summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c6
-rw-r--r--fs/9p/vfs_inode_dotl.c16
-rw-r--r--fs/9p/xattr.c8
-rw-r--r--fs/9p/xattr.h2
-rw-r--r--fs/adfs/inode.c13
-rw-r--r--fs/affs/amigaffs.c4
-rw-r--r--fs/affs/inode.c17
-rw-r--r--fs/afs/dynroot.c2
-rw-r--r--fs/afs/inode.c8
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/afs/write.c2
-rw-r--r--fs/afs/xattr.c2
-rw-r--r--fs/aio.c2
-rw-r--r--fs/attr.c4
-rw-r--r--fs/autofs/autofs_i.h20
-rw-r--r--fs/autofs/init.c9
-rw-r--r--fs/autofs/inode.c437
-rw-r--r--fs/autofs/root.c6
-rw-r--r--fs/bad_inode.c2
-rw-r--r--fs/bcachefs/fs.c12
-rw-r--r--fs/befs/linuxvfs.c10
-rw-r--r--fs/bfs/dir.c9
-rw-r--r--fs/bfs/inode.c12
-rw-r--r--fs/binfmt_elf_fdpic.c5
-rw-r--r--fs/binfmt_misc.c2
-rw-r--r--fs/btrfs/Kconfig23
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/accessors.h16
-rw-r--r--fs/btrfs/async-thread.c12
-rw-r--r--fs/btrfs/async-thread.h6
-rw-r--r--fs/btrfs/backref.c19
-rw-r--r--fs/btrfs/backref.h13
-rw-r--r--fs/btrfs/bio.c47
-rw-r--r--fs/btrfs/block-group.c186
-rw-r--r--fs/btrfs/block-rsv.c24
-rw-r--r--fs/btrfs/btrfs_inode.h80
-rw-r--r--fs/btrfs/check-integrity.c2871
-rw-r--r--fs/btrfs/check-integrity.h20
-rw-r--r--fs/btrfs/compression.c6
-rw-r--r--fs/btrfs/ctree.c381
-rw-r--r--fs/btrfs/ctree.h145
-rw-r--r--fs/btrfs/defrag.c152
-rw-r--r--fs/btrfs/defrag.h2
-rw-r--r--fs/btrfs/delalloc-space.c6
-rw-r--r--fs/btrfs/delayed-inode.c153
-rw-r--r--fs/btrfs/delayed-inode.h3
-rw-r--r--fs/btrfs/delayed-ref.c241
-rw-r--r--fs/btrfs/delayed-ref.h71
-rw-r--r--fs/btrfs/dev-replace.c17
-rw-r--r--fs/btrfs/dir-item.c8
-rw-r--r--fs/btrfs/dir-item.h9
-rw-r--r--fs/btrfs/disk-io.c164
-rw-r--r--fs/btrfs/disk-io.h3
-rw-r--r--fs/btrfs/extent-io-tree.c272
-rw-r--r--fs/btrfs/extent-io-tree.h7
-rw-r--r--fs/btrfs/extent-tree.c552
-rw-r--r--fs/btrfs/extent-tree.h15
-rw-r--r--fs/btrfs/extent_io.c56
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/file-item.c17
-rw-r--r--fs/btrfs/file.c109
-rw-r--r--fs/btrfs/free-space-cache.c28
-rw-r--r--fs/btrfs/free-space-tree.c17
-rw-r--r--fs/btrfs/fs.h69
-rw-r--r--fs/btrfs/inode-item.c21
-rw-r--r--fs/btrfs/inode-item.h8
-rw-r--r--fs/btrfs/inode.c259
-rw-r--r--fs/btrfs/ioctl.c53
-rw-r--r--fs/btrfs/locking.c20
-rw-r--r--fs/btrfs/locking.h2
-rw-r--r--fs/btrfs/messages.c32
-rw-r--r--fs/btrfs/messages.h14
-rw-r--r--fs/btrfs/ordered-data.c129
-rw-r--r--fs/btrfs/ordered-data.h17
-rw-r--r--fs/btrfs/print-tree.c35
-rw-r--r--fs/btrfs/props.c1
-rw-r--r--fs/btrfs/qgroup.c872
-rw-r--r--fs/btrfs/qgroup.h149
-rw-r--r--fs/btrfs/raid-stripe-tree.c274
-rw-r--r--fs/btrfs/raid-stripe-tree.h50
-rw-r--r--fs/btrfs/ref-verify.c9
-rw-r--r--fs/btrfs/reflink.c5
-rw-r--r--fs/btrfs/relocation.c215
-rw-r--r--fs/btrfs/relocation.h9
-rw-r--r--fs/btrfs/root-tree.c12
-rw-r--r--fs/btrfs/root-tree.h8
-rw-r--r--fs/btrfs/scrub.c78
-rw-r--r--fs/btrfs/send.c6
-rw-r--r--fs/btrfs/space-info.c64
-rw-r--r--fs/btrfs/space-info.h3
-rw-r--r--fs/btrfs/super.c94
-rw-r--r--fs/btrfs/sysfs.c53
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c6
-rw-r--r--fs/btrfs/tests/inode-tests.c12
-rw-r--r--fs/btrfs/transaction.c271
-rw-r--r--fs/btrfs/transaction.h25
-rw-r--r--fs/btrfs/tree-checker.c48
-rw-r--r--fs/btrfs/tree-log.c95
-rw-r--r--fs/btrfs/ulist.c3
-rw-r--r--fs/btrfs/uuid-tree.c6
-rw-r--r--fs/btrfs/verity.c68
-rw-r--r--fs/btrfs/volumes.c539
-rw-r--r--fs/btrfs/volumes.h45
-rw-r--r--fs/btrfs/xattr.c14
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/btrfs/zoned.c452
-rw-r--r--fs/btrfs/zstd.c11
-rw-r--r--fs/buffer.c36
-rw-r--r--fs/ceph/addr.c10
-rw-r--r--fs/ceph/caps.c4
-rw-r--r--fs/ceph/crypto.c9
-rw-r--r--fs/ceph/file.c4
-rw-r--r--fs/ceph/inode.c68
-rw-r--r--fs/ceph/mds_client.c10
-rw-r--r--fs/ceph/snap.c4
-rw-r--r--fs/ceph/super.h2
-rw-r--r--fs/ceph/xattr.c2
-rw-r--r--fs/char_dev.c2
-rw-r--r--fs/coda/coda_linux.c6
-rw-r--r--fs/coda/dir.c2
-rw-r--r--fs/coda/file.c2
-rw-r--r--fs/configfs/inode.c8
-rw-r--r--fs/cramfs/inode.c6
-rw-r--r--fs/crypto/bio.c39
-rw-r--r--fs/crypto/crypto.c163
-rw-r--r--fs/crypto/fname.c6
-rw-r--r--fs/crypto/fscrypt_private.h164
-rw-r--r--fs/crypto/hooks.c4
-rw-r--r--fs/crypto/inline_crypt.c32
-rw-r--r--fs/crypto/keyring.c82
-rw-r--r--fs/crypto/keysetup.c62
-rw-r--r--fs/crypto/keysetup_v1.c20
-rw-r--r--fs/crypto/policy.c83
-rw-r--r--fs/debugfs/inode.c2
-rw-r--r--fs/devpts/inode.c6
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/inode.c2
-rw-r--r--fs/efivarfs/file.c2
-rw-r--r--fs/efivarfs/inode.c2
-rw-r--r--fs/efivarfs/super.c14
-rw-r--r--fs/efs/inode.c5
-rw-r--r--fs/erofs/data.c4
-rw-r--r--fs/erofs/decompressor_lzma.c5
-rw-r--r--fs/erofs/inode.c3
-rw-r--r--fs/erofs/internal.h2
-rw-r--r--fs/erofs/super.c22
-rw-r--r--fs/erofs/xattr.c2
-rw-r--r--fs/erofs/xattr.h4
-rw-r--r--fs/exfat/exfat_fs.h1
-rw-r--r--fs/exfat/file.c7
-rw-r--r--fs/exfat/inode.c31
-rw-r--r--fs/exfat/misc.c8
-rw-r--r--fs/exfat/namei.c31
-rw-r--r--fs/exfat/super.c4
-rw-r--r--fs/ext2/dir.c6
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c13
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext2/xattr.c4
-rw-r--r--fs/ext2/xattr.h2
-rw-r--r--fs/ext4/crypto.c13
-rw-r--r--fs/ext4/ext4.h22
-rw-r--r--fs/ext4/extents.c11
-rw-r--r--fs/ext4/fsmap.c9
-rw-r--r--fs/ext4/ialloc.c4
-rw-r--r--fs/ext4/inline.c4
-rw-r--r--fs/ext4/inode.c19
-rw-r--r--fs/ext4/ioctl.c13
-rw-r--r--fs/ext4/mballoc.c54
-rw-r--r--fs/ext4/namei.c36
-rw-r--r--fs/ext4/super.c56
-rw-r--r--fs/ext4/xattr.c10
-rw-r--r--fs/ext4/xattr.h2
-rw-r--r--fs/f2fs/dir.c6
-rw-r--r--fs/f2fs/f2fs.h11
-rw-r--r--fs/f2fs/file.c14
-rw-r--r--fs/f2fs/inline.c2
-rw-r--r--fs/f2fs/inode.c24
-rw-r--r--fs/f2fs/namei.c4
-rw-r--r--fs/f2fs/recovery.c8
-rw-r--r--fs/f2fs/super.c28
-rw-r--r--fs/f2fs/xattr.c4
-rw-r--r--fs/f2fs/xattr.h2
-rw-r--r--fs/fat/inode.c25
-rw-r--r--fs/fat/misc.c6
-rw-r--r--fs/file.c153
-rw-r--r--fs/file_table.c49
-rw-r--r--fs/freevxfs/vxfs_inode.c6
-rw-r--r--fs/fs-writeback.c52
-rw-r--r--fs/fs_context.c34
-rw-r--r--fs/fsopen.c1
-rw-r--r--fs/fuse/control.c2
-rw-r--r--fs/fuse/dir.c10
-rw-r--r--fs/fuse/fuse_i.h2
-rw-r--r--fs/fuse/inode.c29
-rw-r--r--fs/fuse/readdir.c6
-rw-r--r--fs/fuse/xattr.c2
-rw-r--r--fs/gfs2/bmap.c10
-rw-r--r--fs/gfs2/dir.c10
-rw-r--r--fs/gfs2/glock.c15
-rw-r--r--fs/gfs2/glops.c20
-rw-r--r--fs/gfs2/inode.c7
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/gfs2/quota.h3
-rw-r--r--fs/gfs2/super.c12
-rw-r--r--fs/gfs2/super.h4
-rw-r--r--fs/gfs2/xattr.c4
-rw-r--r--fs/hfs/attr.c2
-rw-r--r--fs/hfs/catalog.c8
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfs/inode.c16
-rw-r--r--fs/hfs/sysdep.c10
-rw-r--r--fs/hfsplus/catalog.c8
-rw-r--r--fs/hfsplus/inode.c22
-rw-r--r--fs/hfsplus/xattr.c2
-rw-r--r--fs/hfsplus/xattr.h2
-rw-r--r--fs/hostfs/hostfs_kern.c12
-rw-r--r--fs/hpfs/dir.c12
-rw-r--r--fs/hpfs/inode.c16
-rw-r--r--fs/hpfs/namei.c22
-rw-r--r--fs/hpfs/super.c10
-rw-r--r--fs/hugetlbfs/inode.c10
-rw-r--r--fs/init.c6
-rw-r--r--fs/inode.c125
-rw-r--r--fs/internal.h22
-rw-r--r--fs/iomap/buffered-io.c42
-rw-r--r--fs/isofs/inode.c4
-rw-r--r--fs/isofs/rock.c18
-rw-r--r--fs/jbd2/commit.c16
-rw-r--r--fs/jbd2/journal.c2
-rw-r--r--fs/jbd2/transaction.c12
-rw-r--r--fs/jffs2/dir.c35
-rw-r--r--fs/jffs2/file.c4
-rw-r--r--fs/jffs2/fs.c20
-rw-r--r--fs/jffs2/os-linux.h4
-rw-r--r--fs/jffs2/xattr.c2
-rw-r--r--fs/jffs2/xattr.h2
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/jfs/jfs_imap.c20
-rw-r--r--fs/jfs/jfs_inode.c4
-rw-r--r--fs/jfs/jfs_logmgr.c33
-rw-r--r--fs/jfs/jfs_logmgr.h2
-rw-r--r--fs/jfs/jfs_mount.c3
-rw-r--r--fs/jfs/jfs_xattr.h2
-rw-r--r--fs/jfs/namei.c20
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/jfs/xattr.c2
-rw-r--r--fs/kernfs/inode.c8
-rw-r--r--fs/kernfs/kernfs-internal.h2
-rw-r--r--fs/libfs.c42
-rw-r--r--fs/lockd/svc.c7
-rw-r--r--fs/lockd/svclock.c43
-rw-r--r--fs/locks.c12
-rw-r--r--fs/minix/bitmap.c2
-rw-r--r--fs/minix/dir.c6
-rw-r--r--fs/minix/inode.c17
-rw-r--r--fs/minix/itree_common.c2
-rw-r--r--fs/namei.c40
-rw-r--r--fs/namespace.c40
-rw-r--r--fs/netfs/buffered_read.c6
-rw-r--r--fs/nfs/blocklayout/blocklayout.h2
-rw-r--r--fs/nfs/blocklayout/dev.c76
-rw-r--r--fs/nfs/callback.c46
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/direct.c134
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c18
-rw-r--r--fs/nfs/fscache.h4
-rw-r--r--fs/nfs/inode.c30
-rw-r--r--fs/nfs/nfs.h2
-rw-r--r--fs/nfs/nfs42proc.c3
-rw-r--r--fs/nfs/nfs4_fs.h2
-rw-r--r--fs/nfs/nfs4client.c6
-rw-r--r--fs/nfs/nfs4proc.c14
-rw-r--r--fs/nfs/nfs4state.c45
-rw-r--r--fs/nfs/pnfs.c33
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfs/write.c27
-rw-r--r--fs/nfsd/Makefile3
-rw-r--r--fs/nfsd/blocklayout.c3
-rw-r--r--fs/nfsd/blocklayoutxdr.c6
-rw-r--r--fs/nfsd/blocklayoutxdr.h4
-rw-r--r--fs/nfsd/export.c32
-rw-r--r--fs/nfsd/export.h4
-rw-r--r--fs/nfsd/filecache.c27
-rw-r--r--fs/nfsd/flexfilelayoutxdr.c6
-rw-r--r--fs/nfsd/flexfilelayoutxdr.h4
-rw-r--r--fs/nfsd/netlink.c32
-rw-r--r--fs/nfsd/netlink.h22
-rw-r--r--fs/nfsd/nfs3proc.c9
-rw-r--r--fs/nfsd/nfs4callback.c97
-rw-r--r--fs/nfsd/nfs4layouts.c6
-rw-r--r--fs/nfsd/nfs4proc.c44
-rw-r--r--fs/nfsd/nfs4state.c156
-rw-r--r--fs/nfsd/nfs4xdr.c2640
-rw-r--r--fs/nfsd/nfsctl.c205
-rw-r--r--fs/nfsd/nfsd.h17
-rw-r--r--fs/nfsd/nfsfh.c2
-rw-r--r--fs/nfsd/nfsfh.h3
-rw-r--r--fs/nfsd/nfssvc.c47
-rw-r--r--fs/nfsd/pnfs.h6
-rw-r--r--fs/nfsd/state.h27
-rw-r--r--fs/nfsd/stats.c4
-rw-r--r--fs/nfsd/stats.h18
-rw-r--r--fs/nfsd/trace.h87
-rw-r--r--fs/nfsd/vfs.c75
-rw-r--r--fs/nfsd/vfs.h4
-rw-r--r--fs/nfsd/xdr4.h154
-rw-r--r--fs/nfsd/xdr4cb.h18
-rw-r--r--fs/nilfs2/dir.c6
-rw-r--r--fs/nilfs2/gcinode.c6
-rw-r--r--fs/nilfs2/inode.c20
-rw-r--r--fs/notify/dnotify/dnotify.c6
-rw-r--r--fs/notify/fanotify/fanotify_user.c25
-rw-r--r--fs/nsfs.c2
-rw-r--r--fs/ntfs/inode.c25
-rw-r--r--fs/ntfs/mft.c2
-rw-r--r--fs/ntfs3/attrib.c12
-rw-r--r--fs/ntfs3/attrlist.c15
-rw-r--r--fs/ntfs3/bitmap.c4
-rw-r--r--fs/ntfs3/dir.c6
-rw-r--r--fs/ntfs3/file.c10
-rw-r--r--fs/ntfs3/frecord.c19
-rw-r--r--fs/ntfs3/fslog.c6
-rw-r--r--fs/ntfs3/fsntfs.c19
-rw-r--r--fs/ntfs3/index.c3
-rw-r--r--fs/ntfs3/inode.c24
-rw-r--r--fs/ntfs3/namei.c6
-rw-r--r--fs/ntfs3/ntfs.h2
-rw-r--r--fs/ntfs3/ntfs_fs.h6
-rw-r--r--fs/ntfs3/record.c74
-rw-r--r--fs/ntfs3/super.c105
-rw-r--r--fs/ntfs3/xattr.c9
-rw-r--r--fs/ocfs2/acl.c4
-rw-r--r--fs/ocfs2/alloc.c6
-rw-r--r--fs/ocfs2/aops.c6
-rw-r--r--fs/ocfs2/cluster/heartbeat.c81
-rw-r--r--fs/ocfs2/dir.c9
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c4
-rw-r--r--fs/ocfs2/dlmglue.c29
-rw-r--r--fs/ocfs2/file.c30
-rw-r--r--fs/ocfs2/inode.c28
-rw-r--r--fs/ocfs2/move_extents.c4
-rw-r--r--fs/ocfs2/namei.c16
-rw-r--r--fs/ocfs2/refcounttree.c12
-rw-r--r--fs/ocfs2/xattr.c8
-rw-r--r--fs/ocfs2/xattr.h2
-rw-r--r--fs/omfs/inode.c12
-rw-r--r--fs/open.c52
-rw-r--r--fs/openpromfs/inode.c4
-rw-r--r--fs/orangefs/orangefs-kernel.h2
-rw-r--r--fs/orangefs/orangefs-utils.c16
-rw-r--r--fs/orangefs/xattr.c2
-rw-r--r--fs/overlayfs/copy_up.c5
-rw-r--r--fs/overlayfs/export.c2
-rw-r--r--fs/overlayfs/file.c26
-rw-r--r--fs/overlayfs/inode.c3
-rw-r--r--fs/overlayfs/ovl_entry.h10
-rw-r--r--fs/overlayfs/params.c126
-rw-r--r--fs/overlayfs/super.c55
-rw-r--r--fs/overlayfs/util.c4
-rw-r--r--fs/pipe.c67
-rw-r--r--fs/proc/base.c4
-rw-r--r--fs/proc/fd.c11
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/internal.h2
-rw-r--r--fs/proc/nommu.c2
-rw-r--r--fs/proc/proc_sysctl.c2
-rw-r--r--fs/proc/self.c2
-rw-r--r--fs/proc/task_mmu.c4
-rw-r--r--fs/proc/task_nommu.c66
-rw-r--r--fs/proc/thread_self.c2
-rw-r--r--fs/pstore/inode.c5
-rw-r--r--fs/qnx4/inode.c6
-rw-r--r--fs/qnx6/inode.c6
-rw-r--r--fs/quota/dquot.c66
-rw-r--r--fs/ramfs/inode.c7
-rw-r--r--fs/reiserfs/inode.c26
-rw-r--r--fs/reiserfs/journal.c56
-rw-r--r--fs/reiserfs/namei.c8
-rw-r--r--fs/reiserfs/procfs.c2
-rw-r--r--fs/reiserfs/reiserfs.h19
-rw-r--r--fs/reiserfs/stree.c5
-rw-r--r--fs/reiserfs/super.c2
-rw-r--r--fs/reiserfs/xattr.c4
-rw-r--r--fs/romfs/super.c5
-rw-r--r--fs/smb/client/cached_dir.c147
-rw-r--r--fs/smb/client/cached_dir.h2
-rw-r--r--fs/smb/client/cifsfs.h2
-rw-r--r--fs/smb/client/cifsglob.h3
-rw-r--r--fs/smb/client/cifsproto.h2
-rw-r--r--fs/smb/client/connect.c28
-rw-r--r--fs/smb/client/file.c18
-rw-r--r--fs/smb/client/fs_context.c1
-rw-r--r--fs/smb/client/fscache.h6
-rw-r--r--fs/smb/client/inode.c19
-rw-r--r--fs/smb/client/misc.c14
-rw-r--r--fs/smb/client/smb2inode.c3
-rw-r--r--fs/smb/client/smb2maperror.c2
-rw-r--r--fs/smb/client/smb2ops.c12
-rw-r--r--fs/smb/client/smb2pdu.c35
-rw-r--r--fs/smb/client/smbdirect.c9
-rw-r--r--fs/smb/client/trace.h2
-rw-r--r--fs/smb/client/transport.c36
-rw-r--r--fs/smb/client/xattr.c2
-rw-r--r--fs/smb/server/connection.c5
-rw-r--r--fs/smb/server/connection.h1
-rw-r--r--fs/smb/server/mgmt/tree_connect.c42
-rw-r--r--fs/smb/server/mgmt/tree_connect.h11
-rw-r--r--fs/smb/server/mgmt/user_session.c11
-rw-r--r--fs/smb/server/mgmt/user_session.h1
-rw-r--r--fs/smb/server/server.c6
-rw-r--r--fs/smb/server/smb2misc.c4
-rw-r--r--fs/smb/server/smb2pdu.c115
-rw-r--r--fs/smb/server/smbacl.c1
-rw-r--r--fs/smb/server/vfs_cache.c30
-rw-r--r--fs/smb/server/vfs_cache.h9
-rw-r--r--fs/squashfs/inode.c6
-rw-r--r--fs/squashfs/squashfs.h2
-rw-r--r--fs/squashfs/xattr.c2
-rw-r--r--fs/stack.c4
-rw-r--r--fs/stat.c49
-rw-r--r--fs/super.c66
-rw-r--r--fs/sysv/dir.c6
-rw-r--r--fs/sysv/ialloc.c2
-rw-r--r--fs/sysv/inode.c12
-rw-r--r--fs/sysv/itree.c2
-rw-r--r--fs/tracefs/event_inode.c146
-rw-r--r--fs/tracefs/inode.c7
-rw-r--r--fs/tracefs/internal.h5
-rw-r--r--fs/ubifs/crypto.c3
-rw-r--r--fs/ubifs/debug.c12
-rw-r--r--fs/ubifs/dir.c23
-rw-r--r--fs/ubifs/file.c16
-rw-r--r--fs/ubifs/journal.c12
-rw-r--r--fs/ubifs/super.c8
-rw-r--r--fs/ubifs/ubifs.h2
-rw-r--r--fs/ubifs/xattr.c2
-rw-r--r--fs/udf/ialloc.c4
-rw-r--r--fs/udf/inode.c38
-rw-r--r--fs/udf/namei.c16
-rw-r--r--fs/ufs/dir.c6
-rw-r--r--fs/ufs/ialloc.c2
-rw-r--r--fs/ufs/inode.c42
-rw-r--r--fs/vboxsf/utils.c15
-rw-r--r--fs/xattr.c6
-rw-r--r--fs/xfs/Kconfig2
-rw-r--r--fs/xfs/libxfs/xfs_ag.c6
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c10
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h22
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c6
-rw-r--r--fs/xfs/libxfs/xfs_sb.c3
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c8
-rw-r--r--fs/xfs/scrub/scrub.c4
-rw-r--r--fs/xfs/scrub/stats.c5
-rw-r--r--fs/xfs/scrub/xfile.c1
-rw-r--r--fs/xfs/xfs_attr_inactive.c1
-rw-r--r--fs/xfs/xfs_attr_item.c7
-rw-r--r--fs/xfs/xfs_bmap_item.c4
-rw-r--r--fs/xfs/xfs_bmap_util.c7
-rw-r--r--fs/xfs/xfs_buf.c22
-rw-r--r--fs/xfs/xfs_buf.h3
-rw-r--r--fs/xfs/xfs_discard.c266
-rw-r--r--fs/xfs/xfs_discard.h6
-rw-r--r--fs/xfs/xfs_export.c14
-rw-r--r--fs/xfs/xfs_extent_busy.c35
-rw-r--r--fs/xfs/xfs_extent_busy.h24
-rw-r--r--fs/xfs/xfs_extfree_item.c4
-rw-r--r--fs/xfs/xfs_fsmap.c25
-rw-r--r--fs/xfs/xfs_icache.c80
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_inode.c235
-rw-r--r--fs/xfs/xfs_inode.h34
-rw-r--r--fs/xfs/xfs_inode_item.c4
-rw-r--r--fs/xfs/xfs_iops.c17
-rw-r--r--fs/xfs/xfs_itable.c23
-rw-r--r--fs/xfs/xfs_log.c17
-rw-r--r--fs/xfs/xfs_log_cil.c145
-rw-r--r--fs/xfs/xfs_log_priv.h19
-rw-r--r--fs/xfs/xfs_log_recover.c4
-rw-r--r--fs/xfs/xfs_mount.h17
-rw-r--r--fs/xfs/xfs_notify_failure.c6
-rw-r--r--fs/xfs/xfs_qm.c16
-rw-r--r--fs/xfs/xfs_refcount_item.c6
-rw-r--r--fs/xfs/xfs_rmap_item.c6
-rw-r--r--fs/xfs/xfs_rtalloc.c30
-rw-r--r--fs/xfs/xfs_super.c130
-rw-r--r--fs/xfs/xfs_trace.h45
-rw-r--r--fs/xfs/xfs_xattr.c13
-rw-r--r--fs/xfs/xfs_xattr.h2
-rw-r--r--fs/zonefs/super.c10
490 files changed, 11219 insertions, 9931 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 0d28ecf668d0..b845ee18a80b 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -260,7 +260,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
inode->i_blocks = 0;
inode->i_rdev = rdev;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mapping->a_ops = &v9fs_addr_operations;
inode->i_private = NULL;
@@ -1150,8 +1150,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
set_nlink(inode, 1);
- inode->i_atime.tv_sec = stat->atime;
- inode->i_mtime.tv_sec = stat->mtime;
+ inode_set_atime(inode, stat->atime, 0);
+ inode_set_mtime(inode, stat->mtime, 0);
inode_set_ctime(inode, stat->mtime, 0);
inode->i_uid = v9ses->dfltuid;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 1312f68965ac..c7319af2f471 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -641,10 +641,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
struct v9fs_inode *v9inode = V9FS_I(inode);
if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
- inode->i_atime.tv_sec = stat->st_atime_sec;
- inode->i_atime.tv_nsec = stat->st_atime_nsec;
- inode->i_mtime.tv_sec = stat->st_mtime_sec;
- inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+ inode_set_atime(inode, stat->st_atime_sec,
+ stat->st_atime_nsec);
+ inode_set_mtime(inode, stat->st_mtime_sec,
+ stat->st_mtime_nsec);
inode_set_ctime(inode, stat->st_ctime_sec,
stat->st_ctime_nsec);
inode->i_uid = stat->st_uid;
@@ -660,12 +660,12 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
inode->i_blocks = stat->st_blocks;
} else {
if (stat->st_result_mask & P9_STATS_ATIME) {
- inode->i_atime.tv_sec = stat->st_atime_sec;
- inode->i_atime.tv_nsec = stat->st_atime_nsec;
+ inode_set_atime(inode, stat->st_atime_sec,
+ stat->st_atime_nsec);
}
if (stat->st_result_mask & P9_STATS_MTIME) {
- inode->i_mtime.tv_sec = stat->st_mtime_sec;
- inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
+ inode_set_mtime(inode, stat->st_mtime_sec,
+ stat->st_mtime_nsec);
}
if (stat->st_result_mask & P9_STATS_CTIME) {
inode_set_ctime(inode, stat->st_ctime_sec,
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index e00cf8109b3f..053d1cef6e13 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -162,27 +162,27 @@ static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
return v9fs_xattr_set(dentry, full_name, value, size, flags);
}
-static struct xattr_handler v9fs_xattr_user_handler = {
+static const struct xattr_handler v9fs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.get = v9fs_xattr_handler_get,
.set = v9fs_xattr_handler_set,
};
-static struct xattr_handler v9fs_xattr_trusted_handler = {
+static const struct xattr_handler v9fs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.get = v9fs_xattr_handler_get,
.set = v9fs_xattr_handler_set,
};
#ifdef CONFIG_9P_FS_SECURITY
-static struct xattr_handler v9fs_xattr_security_handler = {
+static const struct xattr_handler v9fs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.get = v9fs_xattr_handler_get,
.set = v9fs_xattr_handler_set,
};
#endif
-const struct xattr_handler *v9fs_xattr_handlers[] = {
+const struct xattr_handler * const v9fs_xattr_handlers[] = {
&v9fs_xattr_user_handler,
&v9fs_xattr_trusted_handler,
#ifdef CONFIG_9P_FS_SECURITY
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index b5636e544c8a..3ad5a802352a 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -10,7 +10,7 @@
#include <net/9p/9p.h>
#include <net/9p/client.h>
-extern const struct xattr_handler *v9fs_xattr_handlers[];
+extern const struct xattr_handler * const v9fs_xattr_handlers[];
ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
void *buffer, size_t buffer_size);
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 20963002578a..3081edb09e46 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -242,6 +242,7 @@ struct inode *
adfs_iget(struct super_block *sb, struct object_info *obj)
{
struct inode *inode;
+ struct timespec64 ts;
inode = new_inode(sb);
if (!inode)
@@ -268,9 +269,10 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
ADFS_I(inode)->attr = obj->attr;
inode->i_mode = adfs_atts2mode(sb, inode);
- adfs_adfs2unix_time(&inode->i_mtime, inode);
- inode->i_atime = inode->i_mtime;
- inode_set_ctime_to_ts(inode, inode->i_mtime);
+ adfs_adfs2unix_time(&ts, inode);
+ inode_set_atime_to_ts(inode, ts);
+ inode_set_mtime_to_ts(inode, ts);
+ inode_set_ctime_to_ts(inode, ts);
if (S_ISDIR(inode->i_mode)) {
inode->i_op = &adfs_dir_inode_operations;
@@ -321,7 +323,8 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
if (ia_valid & ATTR_MTIME && adfs_inode_is_stamped(inode)) {
adfs_unix2adfs_time(inode, &attr->ia_mtime);
- adfs_adfs2unix_time(&inode->i_mtime, inode);
+ adfs_adfs2unix_time(&attr->ia_mtime, inode);
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
}
/*
@@ -329,7 +332,7 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
* have the ability to represent them in our filesystem?
*/
if (ia_valid & ATTR_ATIME)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
if (ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 7ba93efc1143..fd669daa4e7b 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -60,7 +60,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
mark_buffer_dirty_inode(dir_bh, dir);
affs_brelse(dir_bh);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
mark_inode_dirty(dir);
@@ -114,7 +114,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
affs_brelse(bh);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
mark_inode_dirty(dir);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 060746c63151..0210df8d3500 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -149,13 +149,9 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
break;
}
- inode->i_mtime.tv_sec = inode->i_atime.tv_sec =
- inode_set_ctime(inode,
- (be32_to_cpu(tail->change.days) * 86400LL +
- be32_to_cpu(tail->change.mins) * 60 +
- be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA)
- + sys_tz.tz_minuteswest * 60, 0).tv_sec;
- inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = 0;
+ inode_set_mtime(inode,
+ inode_set_atime(inode, inode_set_ctime(inode, (be32_to_cpu(tail->change.days) * 86400LL + be32_to_cpu(tail->change.mins) * 60 + be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA) + sys_tz.tz_minuteswest * 60, 0).tv_sec, 0).tv_sec,
+ 0);
affs_brelse(bh);
unlock_new_inode(inode);
return inode;
@@ -187,12 +183,13 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
}
tail = AFFS_TAIL(sb, bh);
if (tail->stype == cpu_to_be32(ST_ROOT)) {
- affs_secs_to_datestamp(inode->i_mtime.tv_sec,
+ affs_secs_to_datestamp(inode_get_mtime_sec(inode),
&AFFS_ROOT_TAIL(sb, bh)->root_change);
} else {
tail->protect = cpu_to_be32(AFFS_I(inode)->i_protect);
tail->size = cpu_to_be32(inode->i_size);
- affs_secs_to_datestamp(inode->i_mtime.tv_sec, &tail->change);
+ affs_secs_to_datestamp(inode_get_mtime_sec(inode),
+ &tail->change);
if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) {
uid = i_uid_read(inode);
gid = i_gid_read(inode);
@@ -314,7 +311,7 @@ affs_new_inode(struct inode *dir)
inode->i_gid = current_fsgid();
inode->i_ino = block;
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
atomic_set(&AFFS_I(inode)->i_opencnt, 0);
AFFS_I(inode)->i_blkcnt = 0;
AFFS_I(inode)->i_lc = NULL;
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 95bcbd7654d1..4d04ef2d3ae7 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -88,7 +88,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
set_nlink(inode, 2);
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_blocks = 0;
inode->i_generation = 0;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 1c794a1896aa..78efc9719349 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -91,8 +91,8 @@ static int afs_inode_init_from_status(struct afs_operation *op,
t = status->mtime_client;
inode_set_ctime_to_ts(inode, t);
- inode->i_mtime = t;
- inode->i_atime = t;
+ inode_set_mtime_to_ts(inode, t);
+ inode_set_atime_to_ts(inode, t);
inode->i_flags |= S_NOATIME;
inode->i_uid = make_kuid(&init_user_ns, status->owner);
inode->i_gid = make_kgid(&init_user_ns, status->group);
@@ -204,7 +204,7 @@ static void afs_apply_status(struct afs_operation *op,
}
t = status->mtime_client;
- inode->i_mtime = t;
+ inode_set_mtime_to_ts(inode, t);
if (vp->update_ctime)
inode_set_ctime_to_ts(inode, op->ctime);
@@ -253,7 +253,7 @@ static void afs_apply_status(struct afs_operation *op,
if (change_size) {
afs_set_i_size(vnode, status->size);
inode_set_ctime_to_ts(inode, t);
- inode->i_atime = t;
+ inode_set_atime_to_ts(inode, t);
}
}
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index da73b97e19a9..23e2cc4efe41 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1541,7 +1541,7 @@ int afs_launder_folio(struct folio *);
/*
* xattr.c
*/
-extern const struct xattr_handler *afs_xattr_handlers[];
+extern const struct xattr_handler * const afs_xattr_handlers[];
/*
* yfsclient.c
diff --git a/fs/afs/write.c b/fs/afs/write.c
index e1c45341719b..4a168781936b 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -424,7 +424,7 @@ try_next_key:
op->store.write_iter = iter;
op->store.i_size = max(pos + size, vnode->netfs.remote_i_size);
- op->mtime = vnode->netfs.inode.i_mtime;
+ op->mtime = inode_get_mtime(&vnode->netfs.inode);
afs_wait_for_operation(op);
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 9048d8ccc715..64b2c0224f62 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -353,7 +353,7 @@ static const struct xattr_handler afs_xattr_afs_volume_handler = {
.get = afs_xattr_get_volume,
};
-const struct xattr_handler *afs_xattr_handlers[] = {
+const struct xattr_handler * const afs_xattr_handlers[] = {
&afs_xattr_afs_acl_handler,
&afs_xattr_afs_cell_handler,
&afs_xattr_afs_fid_handler,
diff --git a/fs/aio.c b/fs/aio.c
index a4c2a6bac72c..f8589caef9c1 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -80,7 +80,7 @@ struct aio_ring {
struct kioctx_table {
struct rcu_head rcu;
unsigned nr;
- struct kioctx __rcu *table[];
+ struct kioctx __rcu *table[] __counted_by(nr);
};
struct kioctx_cpu {
diff --git a/fs/attr.c b/fs/attr.c
index a8ae5f6d9b16..bdf5deb06ea9 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -308,9 +308,9 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
i_uid_update(idmap, attr, inode);
i_gid_update(idmap, attr, inode);
if (ia_valid & ATTR_ATIME)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
if (ia_valid & ATTR_MTIME)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
if (ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index d5a44fa88acf..8c1d587b3eef 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -25,6 +25,8 @@
#include <linux/completion.h>
#include <linux/file.h>
#include <linux/magic.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
/* This is the range of ioctl() numbers we claim as ours */
#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
@@ -205,20 +207,34 @@ static inline void managed_dentry_clear_managed(struct dentry *dentry)
/* Initializing function */
-int autofs_fill_super(struct super_block *, void *, int);
+extern const struct fs_parameter_spec autofs_param_specs[];
+int autofs_init_fs_context(struct fs_context *fc);
struct autofs_info *autofs_new_ino(struct autofs_sb_info *);
void autofs_clean_ino(struct autofs_info *);
-static inline int autofs_prepare_pipe(struct file *pipe)
+static inline int autofs_check_pipe(struct file *pipe)
{
if (!(pipe->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
if (!S_ISFIFO(file_inode(pipe)->i_mode))
return -EINVAL;
+ return 0;
+}
+
+static inline void autofs_set_packet_pipe_flags(struct file *pipe)
+{
/* We want a packet pipe */
pipe->f_flags |= O_DIRECT;
/* We don't expect -EAGAIN */
pipe->f_flags &= ~O_NONBLOCK;
+}
+
+static inline int autofs_prepare_pipe(struct file *pipe)
+{
+ int ret = autofs_check_pipe(pipe);
+ if (ret < 0)
+ return ret;
+ autofs_set_packet_pipe_flags(pipe);
return 0;
}
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index d3f55e874338..b5e4dfa04ed0 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -7,16 +7,11 @@
#include <linux/init.h>
#include "autofs_i.h"
-static struct dentry *autofs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_nodev(fs_type, flags, data, autofs_fill_super);
-}
-
struct file_system_type autofs_fs_type = {
.owner = THIS_MODULE,
.name = "autofs",
- .mount = autofs_mount,
+ .init_fs_context = autofs_init_fs_context,
+ .parameters = autofs_param_specs,
.kill_sb = autofs_kill_sb,
};
MODULE_ALIAS_FS("autofs");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 2b49662ed237..a5083d447a62 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -6,7 +6,6 @@
#include <linux/seq_file.h>
#include <linux/pagemap.h>
-#include <linux/parser.h>
#include "autofs_i.h"
@@ -110,189 +109,179 @@ static const struct super_operations autofs_sops = {
.evict_inode = autofs_evict_inode,
};
-enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
- Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire,
- Opt_ignore};
-
-static const match_table_t tokens = {
- {Opt_fd, "fd=%u"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_pgrp, "pgrp=%u"},
- {Opt_minproto, "minproto=%u"},
- {Opt_maxproto, "maxproto=%u"},
- {Opt_indirect, "indirect"},
- {Opt_direct, "direct"},
- {Opt_offset, "offset"},
- {Opt_strictexpire, "strictexpire"},
- {Opt_ignore, "ignore"},
- {Opt_err, NULL}
+enum {
+ Opt_direct,
+ Opt_fd,
+ Opt_gid,
+ Opt_ignore,
+ Opt_indirect,
+ Opt_maxproto,
+ Opt_minproto,
+ Opt_offset,
+ Opt_pgrp,
+ Opt_strictexpire,
+ Opt_uid,
};
-static int parse_options(char *options,
- struct inode *root, int *pgrp, bool *pgrp_set,
- struct autofs_sb_info *sbi)
+const struct fs_parameter_spec autofs_param_specs[] = {
+ fsparam_flag ("direct", Opt_direct),
+ fsparam_fd ("fd", Opt_fd),
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_flag ("ignore", Opt_ignore),
+ fsparam_flag ("indirect", Opt_indirect),
+ fsparam_u32 ("maxproto", Opt_maxproto),
+ fsparam_u32 ("minproto", Opt_minproto),
+ fsparam_flag ("offset", Opt_offset),
+ fsparam_u32 ("pgrp", Opt_pgrp),
+ fsparam_flag ("strictexpire", Opt_strictexpire),
+ fsparam_u32 ("uid", Opt_uid),
+ {}
+};
+
+struct autofs_fs_context {
+ kuid_t uid;
+ kgid_t gid;
+ int pgrp;
+ bool pgrp_set;
+};
+
+/*
+ * Open the fd. We do it here rather than in get_tree so that it's done in the
+ * context of the system call that passed the data and not the one that
+ * triggered the superblock creation, lest the fd gets reassigned.
+ */
+static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- int pipefd = -1;
- kuid_t uid;
- kgid_t gid;
+ struct file *pipe;
+ int ret;
- root->i_uid = current_uid();
- root->i_gid = current_gid();
+ if (param->type == fs_value_is_file) {
+ /* came through the new api */
+ pipe = param->file;
+ param->file = NULL;
+ } else {
+ pipe = fget(result->uint_32);
+ }
+ if (!pipe) {
+ errorf(fc, "could not open pipe file descriptor");
+ return -EBADF;
+ }
- sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
- sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
+ ret = autofs_check_pipe(pipe);
+ if (ret < 0) {
+ errorf(fc, "Invalid/unusable pipe");
+ if (param->type != fs_value_is_file)
+ fput(pipe);
+ return -EBADF;
+ }
- sbi->pipefd = -1;
+ autofs_set_packet_pipe_flags(pipe);
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_fd:
- if (match_int(args, &pipefd))
- return 1;
- sbi->pipefd = pipefd;
- break;
- case Opt_uid:
- if (match_int(args, &option))
- return 1;
- uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(uid))
- return 1;
- root->i_uid = uid;
- break;
- case Opt_gid:
- if (match_int(args, &option))
- return 1;
- gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(gid))
- return 1;
- root->i_gid = gid;
- break;
- case Opt_pgrp:
- if (match_int(args, &option))
- return 1;
- *pgrp = option;
- *pgrp_set = true;
- break;
- case Opt_minproto:
- if (match_int(args, &option))
- return 1;
- sbi->min_proto = option;
- break;
- case Opt_maxproto:
- if (match_int(args, &option))
- return 1;
- sbi->max_proto = option;
- break;
- case Opt_indirect:
- set_autofs_type_indirect(&sbi->type);
- break;
- case Opt_direct:
- set_autofs_type_direct(&sbi->type);
- break;
- case Opt_offset:
- set_autofs_type_offset(&sbi->type);
- break;
- case Opt_strictexpire:
- sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
- break;
- case Opt_ignore:
- sbi->flags |= AUTOFS_SBI_IGNORE;
- break;
- default:
- return 1;
- }
+ if (sbi->pipe)
+ fput(sbi->pipe);
+
+ sbi->pipefd = result->uint_32;
+ sbi->pipe = pipe;
+
+ return 0;
+}
+
+static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct autofs_fs_context *ctx = fc->fs_private;
+ struct autofs_sb_info *sbi = fc->s_fs_info;
+ struct fs_parse_result result;
+ kuid_t uid;
+ kgid_t gid;
+ int opt;
+
+ opt = fs_parse(fc, autofs_param_specs, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_fd:
+ return autofs_parse_fd(fc, sbi, param, &result);
+ case Opt_uid:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ return invalfc(fc, "Invalid uid");
+ ctx->uid = uid;
+ break;
+ case Opt_gid:
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ return invalfc(fc, "Invalid gid");
+ ctx->gid = gid;
+ break;
+ case Opt_pgrp:
+ ctx->pgrp = result.uint_32;
+ ctx->pgrp_set = true;
+ break;
+ case Opt_minproto:
+ sbi->min_proto = result.uint_32;
+ break;
+ case Opt_maxproto:
+ sbi->max_proto = result.uint_32;
+ break;
+ case Opt_indirect:
+ set_autofs_type_indirect(&sbi->type);
+ break;
+ case Opt_direct:
+ set_autofs_type_direct(&sbi->type);
+ break;
+ case Opt_offset:
+ set_autofs_type_offset(&sbi->type);
+ break;
+ case Opt_strictexpire:
+ sbi->flags |= AUTOFS_SBI_STRICTEXPIRE;
+ break;
+ case Opt_ignore:
+ sbi->flags |= AUTOFS_SBI_IGNORE;
}
- return (sbi->pipefd < 0);
+
+ return 0;
}
-int autofs_fill_super(struct super_block *s, void *data, int silent)
+static struct autofs_sb_info *autofs_alloc_sbi(void)
{
- struct inode *root_inode;
- struct dentry *root;
- struct file *pipe;
struct autofs_sb_info *sbi;
- struct autofs_info *ino;
- int pgrp = 0;
- bool pgrp_set = false;
- int ret = -EINVAL;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
- return -ENOMEM;
- pr_debug("starting up, sbi = %p\n", sbi);
+ return NULL;
- s->s_fs_info = sbi;
sbi->magic = AUTOFS_SBI_MAGIC;
- sbi->pipefd = -1;
- sbi->pipe = NULL;
- sbi->exp_timeout = 0;
- sbi->oz_pgrp = NULL;
- sbi->sb = s;
- sbi->version = 0;
- sbi->sub_version = 0;
sbi->flags = AUTOFS_SBI_CATATONIC;
+ sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
+ sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
+ sbi->pipefd = -1;
+
set_autofs_type_indirect(&sbi->type);
- sbi->min_proto = 0;
- sbi->max_proto = 0;
mutex_init(&sbi->wq_mutex);
mutex_init(&sbi->pipe_mutex);
spin_lock_init(&sbi->fs_lock);
- sbi->queues = NULL;
spin_lock_init(&sbi->lookup_lock);
INIT_LIST_HEAD(&sbi->active_list);
INIT_LIST_HEAD(&sbi->expiring_list);
- s->s_blocksize = 1024;
- s->s_blocksize_bits = 10;
- s->s_magic = AUTOFS_SUPER_MAGIC;
- s->s_op = &autofs_sops;
- s->s_d_op = &autofs_dentry_operations;
- s->s_time_gran = 1;
- /*
- * Get the root inode and dentry, but defer checking for errors.
- */
- ino = autofs_new_ino(sbi);
- if (!ino) {
- ret = -ENOMEM;
- goto fail_free;
- }
- root_inode = autofs_get_inode(s, S_IFDIR | 0755);
- root = d_make_root(root_inode);
- if (!root) {
- ret = -ENOMEM;
- goto fail_ino;
- }
- pipe = NULL;
-
- root->d_fsdata = ino;
+ return sbi;
+}
- /* Can this call block? */
- if (parse_options(data, root_inode, &pgrp, &pgrp_set, sbi)) {
- pr_err("called with bogus options\n");
- goto fail_dput;
- }
+static int autofs_validate_protocol(struct fs_context *fc)
+{
+ struct autofs_sb_info *sbi = fc->s_fs_info;
/* Test versions first */
if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
- pr_err("kernel does not match daemon version "
+ errorf(fc, "kernel does not match daemon version "
"daemon (%d, %d) kernel (%d, %d)\n",
sbi->min_proto, sbi->max_proto,
AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
- goto fail_dput;
+ return -EINVAL;
}
/* Establish highest kernel protocol version */
@@ -300,13 +289,62 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
sbi->version = AUTOFS_MAX_PROTO_VERSION;
else
sbi->version = sbi->max_proto;
- sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
- if (pgrp_set) {
- sbi->oz_pgrp = find_get_pid(pgrp);
+ switch (sbi->version) {
+ case 4:
+ sbi->sub_version = 7;
+ break;
+ case 5:
+ sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
+ break;
+ default:
+ sbi->sub_version = 0;
+ }
+
+ return 0;
+}
+
+static int autofs_fill_super(struct super_block *s, struct fs_context *fc)
+{
+ struct autofs_fs_context *ctx = fc->fs_private;
+ struct autofs_sb_info *sbi = s->s_fs_info;
+ struct inode *root_inode;
+ struct dentry *root;
+ struct autofs_info *ino;
+ int ret = -ENOMEM;
+
+ pr_debug("starting up, sbi = %p\n", sbi);
+
+ sbi->sb = s;
+ s->s_blocksize = 1024;
+ s->s_blocksize_bits = 10;
+ s->s_magic = AUTOFS_SUPER_MAGIC;
+ s->s_op = &autofs_sops;
+ s->s_d_op = &autofs_dentry_operations;
+ s->s_time_gran = 1;
+
+ /*
+ * Get the root inode and dentry, but defer checking for errors.
+ */
+ ino = autofs_new_ino(sbi);
+ if (!ino)
+ goto fail;
+
+ root_inode = autofs_get_inode(s, S_IFDIR | 0755);
+ root_inode->i_uid = ctx->uid;
+ root_inode->i_gid = ctx->gid;
+
+ root = d_make_root(root_inode);
+ if (!root)
+ goto fail_ino;
+
+ root->d_fsdata = ino;
+
+ if (ctx->pgrp_set) {
+ sbi->oz_pgrp = find_get_pid(ctx->pgrp);
if (!sbi->oz_pgrp) {
- pr_err("could not find process group %d\n",
- pgrp);
+ ret = invalf(fc, "Could not find process group %d",
+ ctx->pgrp);
goto fail_dput;
}
} else {
@@ -321,16 +359,7 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
pr_debug("pipe fd = %d, pgrp = %u\n",
sbi->pipefd, pid_nr(sbi->oz_pgrp));
- pipe = fget(sbi->pipefd);
- if (!pipe) {
- pr_err("could not open pipe file descriptor\n");
- goto fail_put_pid;
- }
- ret = autofs_prepare_pipe(pipe);
- if (ret < 0)
- goto fail_fput;
- sbi->pipe = pipe;
sbi->flags &= ~AUTOFS_SBI_CATATONIC;
/*
@@ -342,22 +371,82 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
/*
* Failure ... clean up.
*/
-fail_fput:
- pr_err("pipe file descriptor does not contain proper ops\n");
- fput(pipe);
-fail_put_pid:
- put_pid(sbi->oz_pgrp);
fail_dput:
dput(root);
- goto fail_free;
+ goto fail;
fail_ino:
autofs_free_ino(ino);
-fail_free:
- kfree(sbi);
- s->s_fs_info = NULL;
+fail:
return ret;
}
+/*
+ * Validate the parameters and then request a superblock.
+ */
+static int autofs_get_tree(struct fs_context *fc)
+{
+ struct autofs_sb_info *sbi = fc->s_fs_info;
+ int ret;
+
+ ret = autofs_validate_protocol(fc);
+ if (ret)
+ return ret;
+
+ if (sbi->pipefd < 0)
+ return invalf(fc, "No control pipe specified");
+
+ return get_tree_nodev(fc, autofs_fill_super);
+}
+
+static void autofs_free_fc(struct fs_context *fc)
+{
+ struct autofs_fs_context *ctx = fc->fs_private;
+ struct autofs_sb_info *sbi = fc->s_fs_info;
+
+ if (sbi) {
+ if (sbi->pipe)
+ fput(sbi->pipe);
+ kfree(sbi);
+ }
+ kfree(ctx);
+}
+
+static const struct fs_context_operations autofs_context_ops = {
+ .free = autofs_free_fc,
+ .parse_param = autofs_parse_param,
+ .get_tree = autofs_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+int autofs_init_fs_context(struct fs_context *fc)
+{
+ struct autofs_fs_context *ctx;
+ struct autofs_sb_info *sbi;
+
+ ctx = kzalloc(sizeof(struct autofs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ goto nomem;
+
+ ctx->uid = current_uid();
+ ctx->gid = current_gid();
+
+ sbi = autofs_alloc_sbi();
+ if (!sbi)
+ goto nomem_ctx;
+
+ fc->fs_private = ctx;
+ fc->s_fs_info = sbi;
+ fc->ops = &autofs_context_ops;
+ return 0;
+
+nomem_ctx:
+ kfree(ctx);
+nomem:
+ return -ENOMEM;
+}
+
struct inode *autofs_get_inode(struct super_block *sb, umode_t mode)
{
struct inode *inode = new_inode(sb);
@@ -370,7 +459,7 @@ struct inode *autofs_get_inode(struct super_block *sb, umode_t mode)
inode->i_uid = d_inode(sb->s_root)->i_uid;
inode->i_gid = d_inode(sb->s_root)->i_gid;
}
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_ino = get_next_ino();
if (S_ISDIR(mode)) {
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 512b9a26c63d..530d18827e35 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -600,7 +600,7 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap,
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
return 0;
}
@@ -633,7 +633,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
d_inode(dentry)->i_size = 0;
clear_nlink(d_inode(dentry));
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
spin_lock(&sbi->lookup_lock);
__autofs_add_expiring(dentry);
@@ -749,7 +749,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap,
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
inc_nlink(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
return 0;
}
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 83f9566c973b..316d88da2ce1 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -208,7 +208,7 @@ void make_bad_inode(struct inode *inode)
remove_inode_hash(inode);
inode->i_mode = S_IFREG;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &bad_inode_ops;
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &bad_file_ops;
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index bc009ef497d0..6642b88c41a0 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -66,9 +66,9 @@ void bch2_inode_update_after_write(struct btree_trans *trans,
inode->v.i_mode = bi->bi_mode;
if (fields & ATTR_ATIME)
- inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
+ inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
if (fields & ATTR_MTIME)
- inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
+ inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
if (fields & ATTR_CTIME)
inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
@@ -753,8 +753,8 @@ static int bch2_getattr(struct mnt_idmap *idmap,
stat->gid = inode->v.i_gid;
stat->rdev = inode->v.i_rdev;
stat->size = i_size_read(&inode->v);
- stat->atime = inode->v.i_atime;
- stat->mtime = inode->v.i_mtime;
+ stat->atime = inode_get_atime(&inode->v);
+ stat->mtime = inode_get_mtime(&inode->v);
stat->ctime = inode_get_ctime(&inode->v);
stat->blksize = block_bytes(c);
stat->blocks = inode->v.i_blocks;
@@ -1418,8 +1418,8 @@ static int inode_update_times_fn(struct btree_trans *trans,
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime);
- bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime);
+ bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
+ bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
return 0;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 9a16a51fbb88..9acdec56f626 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -360,11 +360,11 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
* for indexing purposes. (PFD, page 54)
*/
- inode->i_mtime.tv_sec =
- fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16;
- inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */
- inode_set_ctime_to_ts(inode, inode->i_mtime);
- inode->i_atime = inode->i_mtime;
+ inode_set_mtime(inode,
+ fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16,
+ 0);/* lower 16 bits are not a time */
+ inode_set_ctime_to_ts(inode, inode_get_mtime(inode));
+ inode_set_atime_to_ts(inode, inode_get_mtime(inode));
befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num);
befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent);
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 12b8af04dcb3..fbc4ae80a4b2 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -97,7 +97,7 @@ static int bfs_create(struct mnt_idmap *idmap, struct inode *dir,
set_bit(ino, info->si_imap);
info->si_freei--;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_blocks = 0;
inode->i_op = &bfs_file_inops;
inode->i_fop = &bfs_file_operations;
@@ -187,7 +187,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
}
de->ino = 0;
mark_buffer_dirty_inode(bh, dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
inode_dec_link_count(inode);
@@ -240,7 +240,7 @@ static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto end_rename;
}
old_de->ino = 0;
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
mark_inode_dirty(old_dir);
if (new_inode) {
inode_set_ctime_current(new_inode);
@@ -294,7 +294,8 @@ static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino)
dir->i_size += BFS_DIRENT_SIZE;
inode_set_ctime_current(dir);
}
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_current(dir));
mark_inode_dirty(dir);
de->ino = cpu_to_le16((u16)ino);
for (i = 0; i < BFS_NAMELEN; i++)
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index e6a76ae9eb44..355957dbce39 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -80,11 +80,9 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
set_nlink(inode, le32_to_cpu(di->i_nlink));
inode->i_size = BFS_FILESIZE(di);
inode->i_blocks = BFS_FILEBLOCKS(di);
- inode->i_atime.tv_sec = le32_to_cpu(di->i_atime);
- inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime);
+ inode_set_atime(inode, le32_to_cpu(di->i_atime), 0);
+ inode_set_mtime(inode, le32_to_cpu(di->i_mtime), 0);
inode_set_ctime(inode, le32_to_cpu(di->i_ctime), 0);
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
brelse(bh);
unlock_new_inode(inode);
@@ -140,9 +138,9 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
di->i_uid = cpu_to_le32(i_uid_read(inode));
di->i_gid = cpu_to_le32(i_gid_read(inode));
di->i_nlink = cpu_to_le32(inode->i_nlink);
- di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
- di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
- di->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec);
+ di->i_atime = cpu_to_le32(inode_get_atime_sec(inode));
+ di->i_mtime = cpu_to_le32(inode_get_mtime_sec(inode));
+ di->i_ctime = cpu_to_le32(inode_get_ctime_sec(inode));
i_sblock = BFS_I(inode)->i_sblock;
di->i_sblock = cpu_to_le32(i_sblock);
di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 43b2a2851ba3..206812ce544a 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -345,10 +345,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
/* there's now no turning back... the old userspace image is dead,
* defunct, deceased, etc.
*/
+ SET_PERSONALITY(exec_params.hdr);
if (elf_check_fdpic(&exec_params.hdr))
- set_personality(PER_LINUX_FDPIC);
- else
- set_personality(PER_LINUX);
+ current->personality |= PER_LINUX_FDPIC;
if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index e0108d17b085..5d2be9b0a0a5 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -547,7 +547,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
if (inode) {
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
return inode;
}
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 3282adc84d52..4fb925e8c981 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -31,7 +31,7 @@ config BTRFS_FS
continue to be mountable and usable by newer kernels.
For more information, please see the web pages at
- http://btrfs.wiki.kernel.org.
+ https://btrfs.readthedocs.io
To compile this file system support as a module, choose M here. The
module will be called btrfs.
@@ -48,27 +48,6 @@ config BTRFS_FS_POSIX_ACL
If you don't know what Access Control Lists are, say N
-config BTRFS_FS_CHECK_INTEGRITY
- bool "Btrfs with integrity check tool compiled in (DEPRECATED)"
- depends on BTRFS_FS
- help
- This feature has been deprecated and will be removed in 6.7.
-
- Adds code that examines all block write requests (including
- writes of the super block). The goal is to verify that the
- state of the filesystem on disk is always consistent, i.e.,
- after a power-loss or kernel panic event the filesystem is
- in a consistent state.
-
- If the integrity check tool is included and activated in
- the mount options, plenty of kernel memory is used, and
- plenty of additional CPU cycles are spent. Enabling this
- functionality is not intended for normal use.
-
- In most cases, unless you are a btrfs developer who needs
- to verify the integrity of (super)-block write requests
- during the run of a regression test, say N
-
config BTRFS_FS_RUN_SANITY_TESTS
bool "Btrfs will run sanity tests upon loading"
depends on BTRFS_FS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 90d53209755b..525af975f61c 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -33,10 +33,9 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
- lru_cache.o
+ lru_cache.o raid-stripe-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
-btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
btrfs-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 8cfc8214109c..aa0844535644 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -4,6 +4,7 @@
#define BTRFS_ACCESSORS_H
#include <linux/stddef.h>
+#include <asm/unaligned.h>
struct btrfs_map_token {
struct extent_buffer *eb;
@@ -305,6 +306,14 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
+BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
+BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
+ struct btrfs_stripe_extent, encoding, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);
+
/* struct btrfs_dev_extent */
BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64);
BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
@@ -349,6 +358,9 @@ BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 3
BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32);
+BTRFS_SETGET_FUNCS(extent_owner_ref_root_id, struct btrfs_extent_owner_ref,
+ root_id, 64);
+
BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
type, 8);
BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
@@ -365,6 +377,8 @@ static inline u32 btrfs_extent_inline_ref_size(int type)
if (type == BTRFS_EXTENT_DATA_REF_KEY)
return sizeof(struct btrfs_extent_data_ref) +
offsetof(struct btrfs_extent_inline_ref, offset);
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY)
+ return sizeof(struct btrfs_extent_inline_ref);
return 0;
}
@@ -966,6 +980,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
flags, 64);
BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
rescan, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_enable_gen, struct btrfs_qgroup_status_item,
+ enable_gen, 64);
/* btrfs_qgroup_info_item */
BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index ce083e99ef68..9e261aac671e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -9,6 +9,7 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
+#include <trace/events/btrfs.h>
#include "async-thread.h"
#include "ctree.h"
@@ -242,7 +243,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
break;
trace_btrfs_ordered_sched(work);
spin_unlock_irqrestore(lock, flags);
- work->ordered_func(work);
+ work->ordered_func(work, false);
/* now take the lock again and drop our item from the list */
spin_lock_irqsave(lock, flags);
@@ -277,7 +278,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
* We don't want to call the ordered free functions with
* the lock held.
*/
- work->ordered_free(work);
+ work->ordered_func(work, true);
/* NB: work must not be dereferenced past this point. */
trace_btrfs_all_work_done(wq->fs_info, work);
}
@@ -285,7 +286,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
spin_unlock_irqrestore(lock, flags);
if (free_self) {
- self->ordered_free(self);
+ self->ordered_func(self, true);
/* NB: self must not be dereferenced past this point. */
trace_btrfs_all_work_done(wq->fs_info, self);
}
@@ -300,7 +301,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
/*
* We should not touch things inside work in the following cases:
- * 1) after work->func() if it has no ordered_free
+ * 1) after work->func() if it has no ordered_func(..., true) to free
* Since the struct is freed in work->func().
* 2) after setting WORK_DONE_BIT
* The work may be freed in other threads almost instantly.
@@ -329,11 +330,10 @@ static void btrfs_work_helper(struct work_struct *normal_work)
}
void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
- btrfs_func_t ordered_func, btrfs_func_t ordered_free)
+ btrfs_ordered_func_t ordered_func)
{
work->func = func;
work->ordered_func = ordered_func;
- work->ordered_free = ordered_free;
INIT_WORK(&work->normal_work, btrfs_work_helper);
INIT_LIST_HEAD(&work->ordered_list);
work->flags = 0;
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 30f66c5e2e6e..62b8a0d57898 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -13,11 +13,11 @@ struct btrfs_fs_info;
struct btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool);
struct btrfs_work {
btrfs_func_t func;
- btrfs_func_t ordered_func;
- btrfs_func_t ordered_free;
+ btrfs_ordered_func_t ordered_func;
/* Don't touch things below */
struct work_struct normal_work;
@@ -35,7 +35,7 @@ struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
struct btrfs_fs_info *fs_info, const char *name,
unsigned int flags);
void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
- btrfs_func_t ordered_func, btrfs_func_t ordered_free);
+ btrfs_ordered_func_t ordered_func);
void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work);
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b7d54efb4728..beed7e459dab 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1129,6 +1129,9 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx,
count, sc, GFP_NOFS);
break;
}
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ ASSERT(btrfs_fs_incompat(ctx->fs_info, SIMPLE_QUOTA));
+ break;
default:
WARN_ON(1);
}
@@ -2998,7 +3001,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter)
}
void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_backref_cache *cache, int is_reloc)
+ struct btrfs_backref_cache *cache, bool is_reloc)
{
int i;
@@ -3196,12 +3199,14 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
* We still need to do a tree search to find out the parents. This is for
* TREE_BLOCK_REF backref (keyed or inlined).
*
+ * @trans: Transaction handle.
* @ref_key: The same as @ref_key in handle_direct_tree_backref()
* @tree_key: The first key of this tree block.
* @path: A clean (released) path, to avoid allocating path every time
* the function get called.
*/
-static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
+static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
+ struct btrfs_backref_cache *cache,
struct btrfs_path *path,
struct btrfs_key *ref_key,
struct btrfs_key *tree_key,
@@ -3315,7 +3320,7 @@ static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
* If we know the block isn't shared we can avoid
* checking its backrefs.
*/
- if (btrfs_block_can_be_shared(root, eb))
+ if (btrfs_block_can_be_shared(trans, root, eb))
upper->checked = 0;
else
upper->checked = 1;
@@ -3363,11 +3368,13 @@ out:
* links aren't yet bi-directional. Needs to finish such links.
* Use btrfs_backref_finish_upper_links() to finish such linkage.
*
+ * @trans: Transaction handle.
* @path: Released path for indirect tree backref lookup
* @iter: Released backref iter for extent tree search
* @node_key: The first key of the tree block
*/
-int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
+int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
+ struct btrfs_backref_cache *cache,
struct btrfs_path *path,
struct btrfs_backref_iter *iter,
struct btrfs_key *node_key,
@@ -3467,8 +3474,8 @@ int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
* offset means the root objectid. We need to search
* the tree to get its parent bytenr.
*/
- ret = handle_indirect_tree_backref(cache, path, &key, node_key,
- cur);
+ ret = handle_indirect_tree_backref(trans, cache, path,
+ &key, node_key, cur);
if (ret < 0)
goto out;
}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 1616e3e3f1e4..ab4ca0eda605 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -247,7 +247,7 @@ struct prelim_ref {
struct rb_node rbnode;
u64 root_id;
struct btrfs_key key_for_search;
- int level;
+ u8 level;
int count;
struct extent_inode_elem *inode_list;
u64 parent;
@@ -440,11 +440,11 @@ struct btrfs_backref_cache {
* Reloction backref cache require more info for reloc root compared
* to generic backref cache.
*/
- unsigned int is_reloc;
+ bool is_reloc;
};
void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_backref_cache *cache, int is_reloc);
+ struct btrfs_backref_cache *cache, bool is_reloc);
struct btrfs_backref_node *btrfs_backref_alloc_node(
struct btrfs_backref_cache *cache, u64 bytenr, int level);
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
@@ -533,14 +533,15 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache);
static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info,
- u64 bytenr, int errno)
+ u64 bytenr, int error)
{
- btrfs_panic(fs_info, errno,
+ btrfs_panic(fs_info, error,
"Inconsistency in backref cache found at offset %llu",
bytenr);
}
-int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
+int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
+ struct btrfs_backref_cache *cache,
struct btrfs_path *path,
struct btrfs_backref_iter *iter,
struct btrfs_key *node_key,
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 12b12443efaa..4f3b693a16b1 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -10,11 +10,11 @@
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
-#include "check-integrity.h"
#include "dev-replace.h"
#include "rcu-string.h"
#include "zoned.h"
#include "file-item.h"
+#include "raid-stripe-tree.h"
static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
@@ -416,6 +416,9 @@ static void btrfs_orig_write_end_io(struct bio *bio)
else
bio->bi_status = BLK_STS_OK;
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
+ stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
btrfs_orig_bbio_end_io(bbio);
btrfs_put_bioc(bioc);
}
@@ -427,6 +430,8 @@ static void btrfs_clone_write_end_io(struct bio *bio)
if (bio->bi_status) {
atomic_inc(&stripe->bioc->error);
btrfs_log_dev_io_error(bio, stripe->dev);
+ } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
}
/* Pass on control to the original bio this one was cloned from */
@@ -463,8 +468,6 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
dev->devid, bio->bi_iter.bi_size);
- btrfsic_check_bio(bio);
-
if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
blkcg_punt_bio_submit(bio);
else
@@ -490,6 +493,7 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
bio->bi_private = &bioc->stripes[dev_nr];
bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
bioc->stripes[dev_nr].bioc = bioc;
+ bioc->size = bio->bi_iter.bi_size;
btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
@@ -499,6 +503,8 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
if (!bioc) {
/* Single mirror read/write fast path. */
btrfs_bio(bio)->mirror_num = mirror_num;
+ if (bio_op(bio) != REQ_OP_READ)
+ btrfs_bio(bio)->orig_physical = smap->physical;
bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
if (bio_op(bio) != REQ_OP_READ)
btrfs_bio(bio)->orig_physical = smap->physical;
@@ -568,13 +574,20 @@ static void run_one_async_start(struct btrfs_work *work)
*
* At IO completion time the csums attached on the ordered extent record are
* inserted into the tree.
+ *
+ * If called with @do_free == true, then it will free the work struct.
*/
-static void run_one_async_done(struct btrfs_work *work)
+static void run_one_async_done(struct btrfs_work *work, bool do_free)
{
struct async_submit_bio *async =
container_of(work, struct async_submit_bio, work);
struct bio *bio = &async->bbio->bio;
+ if (do_free) {
+ kfree(container_of(work, struct async_submit_bio, work));
+ return;
+ }
+
/* If an error occurred we just want to clean up the bio and move on. */
if (bio->bi_status) {
btrfs_orig_bbio_end_io(async->bbio);
@@ -590,11 +603,6 @@ static void run_one_async_done(struct btrfs_work *work)
__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}
-static void run_one_async_free(struct btrfs_work *work)
-{
- kfree(container_of(work, struct async_submit_bio, work));
-}
-
static bool should_async_write(struct btrfs_bio *bbio)
{
/* Submit synchronously if the checksum implementation is fast. */
@@ -636,8 +644,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
async->smap = *smap;
async->mirror_num = mirror_num;
- btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
- run_one_async_free);
+ btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
btrfs_queue_work(fs_info->workers, &async->work);
return true;
}
@@ -657,9 +664,11 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
blk_status_t ret;
int error;
+ smap.is_scrub = !bbio->inode;
+
btrfs_bio_counter_inc_blocked(fs_info);
error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
- &bioc, &smap, &mirror_num, 1);
+ &bioc, &smap, &mirror_num);
if (error) {
ret = errno_to_blk_status(error);
goto fail;
@@ -691,6 +700,18 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
bio->bi_opf |= REQ_OP_ZONE_APPEND;
}
+ if (is_data_bbio(bbio) && bioc &&
+ btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
+ /*
+ * No locking for the list update, as we only add to
+ * the list in the I/O submission path, and list
+ * iteration only happens in the completion path, which
+ * can't happen until after the last submission.
+ */
+ btrfs_get_bioc(bioc);
+ list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
+ }
+
/*
* Csum items for reloc roots have already been cloned at this
* point, so they are handled as part of the no-checksum case.
@@ -779,8 +800,6 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
__bio_add_page(&bio, page, length, pg_offset);
-
- btrfsic_check_bio(&bio);
ret = submit_bio_wait(&bio);
if (ret) {
/* try to remap that extent elsewhere? */
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 0cb1dee965a0..6e5dc68ff661 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -935,7 +935,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
caching_ctl->block_group = cache;
refcount_set(&caching_ctl->count, 2);
atomic_set(&caching_ctl->progress, 0);
- btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL);
spin_lock(&cache->lock);
if (cache->cached != BTRFS_CACHE_NO) {
@@ -1286,7 +1286,7 @@ out:
/* Once for the lookup reference */
btrfs_put_block_group(block_group);
if (remove_rsv)
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
btrfs_free_path(path);
return ret;
}
@@ -2601,7 +2601,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
return ret;
@@ -2709,7 +2709,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
/* Already aborted the transaction if it failed. */
next:
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
}
@@ -2819,8 +2819,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
#endif
list_add_tail(&cache->bg_list, &trans->new_bgs);
- trans->delayed_ref_updates++;
- btrfs_update_delayed_refs_rsv(trans);
+ btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
set_avail_alloc_bits(fs_info, type);
return cache;
@@ -3025,11 +3024,19 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
cache->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
fail:
btrfs_release_path(path);
- /* We didn't update the block group item, need to revert @commit_used. */
- if (ret < 0) {
+ /*
+ * We didn't update the block group item, need to revert commit_used
+ * unless the block group item didn't exist yet - this is to prevent a
+ * race with a concurrent insertion of the block group item, with
+ * insert_block_group_item(), that happened just after we attempted to
+ * update. In that case we would reset commit_used to 0 just after the
+ * insertion set it to a value greater than 0 - if the block group later
+ * becomes with 0 used bytes, we would incorrectly skip its update.
+ */
+ if (ret < 0 && ret != -ENOENT) {
spin_lock(&cache->lock);
cache->commit_used = old_commit_used;
spin_unlock(&cache->lock);
@@ -3043,7 +3050,6 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_root *root = fs_info->tree_root;
struct inode *inode = NULL;
struct extent_changeset *data_reserved = NULL;
u64 alloc_hint = 0;
@@ -3095,7 +3101,7 @@ again:
* time.
*/
BTRFS_I(inode)->generation = 0;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret) {
/*
* So theoretically we could recover from this, simply set the
@@ -3362,7 +3368,7 @@ again:
if (should_put)
btrfs_put_block_group(cache);
if (drop_reserve)
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
/*
* Avoid blocking other tasks for too long. It might even save
* us from writing caches for block groups that are going to be
@@ -3466,8 +3472,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
cache_save_setup(cache, trans, path);
if (!ret)
- ret = btrfs_run_delayed_refs(trans,
- (unsigned long) -1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
cache->io_ctl.inode = NULL;
@@ -3510,7 +3515,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
/* If its not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -3535,12 +3540,12 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc)
{
struct btrfs_fs_info *info = trans->fs_info;
- struct btrfs_block_group *cache = NULL;
- u64 total = num_bytes;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_group *cache;
u64 old_val;
- u64 byte_in_group;
+ bool reclaim = false;
+ bool bg_already_dirty = true;
int factor;
- int ret = 0;
/* Block accounting for super block */
spin_lock(&info->delalloc_root_lock);
@@ -3552,97 +3557,86 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
btrfs_set_super_bytes_used(info->super_copy, old_val);
spin_unlock(&info->delalloc_root_lock);
- while (total) {
- struct btrfs_space_info *space_info;
- bool reclaim = false;
-
- cache = btrfs_lookup_block_group(info, bytenr);
- if (!cache) {
- ret = -ENOENT;
- break;
- }
- space_info = cache->space_info;
- factor = btrfs_bg_type_to_factor(cache->flags);
+ cache = btrfs_lookup_block_group(info, bytenr);
+ if (!cache)
+ return -ENOENT;
- /*
- * If this block group has free space cache written out, we
- * need to make sure to load it if we are removing space. This
- * is because we need the unpinning stage to actually add the
- * space back to the block group, otherwise we will leak space.
- */
- if (!alloc && !btrfs_block_group_done(cache))
- btrfs_cache_block_group(cache, true);
+ /* An extent can not span multiple block groups. */
+ ASSERT(bytenr + num_bytes <= cache->start + cache->length);
- byte_in_group = bytenr - cache->start;
- WARN_ON(byte_in_group > cache->length);
+ space_info = cache->space_info;
+ factor = btrfs_bg_type_to_factor(cache->flags);
- spin_lock(&space_info->lock);
- spin_lock(&cache->lock);
+ /*
+ * If this block group has free space cache written out, we need to make
+ * sure to load it if we are removing space. This is because we need
+ * the unpinning stage to actually add the space back to the block group,
+ * otherwise we will leak space.
+ */
+ if (!alloc && !btrfs_block_group_done(cache))
+ btrfs_cache_block_group(cache, true);
- if (btrfs_test_opt(info, SPACE_CACHE) &&
- cache->disk_cache_state < BTRFS_DC_CLEAR)
- cache->disk_cache_state = BTRFS_DC_CLEAR;
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
- old_val = cache->used;
- num_bytes = min(total, cache->length - byte_in_group);
- if (alloc) {
- old_val += num_bytes;
- cache->used = old_val;
- cache->reserved -= num_bytes;
- space_info->bytes_reserved -= num_bytes;
- space_info->bytes_used += num_bytes;
- space_info->disk_used += num_bytes * factor;
- spin_unlock(&cache->lock);
- spin_unlock(&space_info->lock);
- } else {
- old_val -= num_bytes;
- cache->used = old_val;
- cache->pinned += num_bytes;
- btrfs_space_info_update_bytes_pinned(info, space_info,
- num_bytes);
- space_info->bytes_used -= num_bytes;
- space_info->disk_used -= num_bytes * factor;
+ if (btrfs_test_opt(info, SPACE_CACHE) &&
+ cache->disk_cache_state < BTRFS_DC_CLEAR)
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
- reclaim = should_reclaim_block_group(cache, num_bytes);
+ old_val = cache->used;
+ if (alloc) {
+ old_val += num_bytes;
+ cache->used = old_val;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+ space_info->bytes_used += num_bytes;
+ space_info->disk_used += num_bytes * factor;
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
+ } else {
+ old_val -= num_bytes;
+ cache->used = old_val;
+ cache->pinned += num_bytes;
+ btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
+ space_info->bytes_used -= num_bytes;
+ space_info->disk_used -= num_bytes * factor;
- spin_unlock(&cache->lock);
- spin_unlock(&space_info->lock);
+ reclaim = should_reclaim_block_group(cache, num_bytes);
- set_extent_bit(&trans->transaction->pinned_extents,
- bytenr, bytenr + num_bytes - 1,
- EXTENT_DIRTY, NULL);
- }
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
- spin_lock(&trans->transaction->dirty_bgs_lock);
- if (list_empty(&cache->dirty_list)) {
- list_add_tail(&cache->dirty_list,
- &trans->transaction->dirty_bgs);
- trans->delayed_ref_updates++;
- btrfs_get_block_group(cache);
- }
- spin_unlock(&trans->transaction->dirty_bgs_lock);
+ set_extent_bit(&trans->transaction->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
+ }
- /*
- * No longer have used bytes in this block group, queue it for
- * deletion. We do this after adding the block group to the
- * dirty list to avoid races between cleaner kthread and space
- * cache writeout.
- */
- if (!alloc && old_val == 0) {
- if (!btrfs_test_opt(info, DISCARD_ASYNC))
- btrfs_mark_bg_unused(cache);
- } else if (!alloc && reclaim) {
- btrfs_mark_bg_to_reclaim(cache);
- }
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs);
+ bg_already_dirty = false;
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
- btrfs_put_block_group(cache);
- total -= num_bytes;
- bytenr += num_bytes;
+ /*
+ * No longer have used bytes in this block group, queue it for deletion.
+ * We do this after adding the block group to the dirty list to avoid
+ * races between cleaner kthread and space cache writeout.
+ */
+ if (!alloc && old_val == 0) {
+ if (!btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_mark_bg_unused(cache);
+ } else if (!alloc && reclaim) {
+ btrfs_mark_bg_to_reclaim(cache);
}
+ btrfs_put_block_group(cache);
+
/* Modified block groups are accounted for in the delayed_refs_rsv. */
- btrfs_update_delayed_refs_rsv(trans);
- return ret;
+ if (!bg_already_dirty)
+ btrfs_inc_delayed_refs_rsv_bg_updates(info);
+
+ return 0;
}
/*
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 77684c5e0c8b..ceb5f586a2d5 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -221,7 +221,8 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
if (num_bytes == 0)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ num_bytes, flush);
if (!ret)
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
@@ -261,7 +262,8 @@ int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
if (!ret)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ num_bytes, flush);
if (!ret) {
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
@@ -279,10 +281,10 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *target = NULL;
/*
- * If we are the delayed_rsv then push to the global rsv, otherwise dump
- * into the delayed rsv if it is not full.
+ * If we are a delayed block reserve then push to the global rsv,
+ * otherwise dump into the global delayed reserve if it is not full.
*/
- if (block_rsv == delayed_rsv)
+ if (block_rsv->type == BTRFS_BLOCK_RSV_DELOPS)
target = global_rsv;
else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv))
target = delayed_rsv;
@@ -354,6 +356,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
min_items++;
}
+ if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
+ num_bytes += btrfs_root_used(&fs_info->stripe_root->root_item);
+ min_items++;
+ }
+
/*
* But we also want to reserve enough space so we can do the fallback
* global reserve for an unlink, which is an additional
@@ -405,6 +412,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
case BTRFS_EXTENT_TREE_OBJECTID:
case BTRFS_FREE_SPACE_TREE_OBJECTID:
case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
+ case BTRFS_RAID_STRIPE_TREE_OBJECTID:
root->block_rsv = &fs_info->delayed_refs_rsv;
break;
case BTRFS_ROOT_TREE_OBJECTID:
@@ -517,8 +525,8 @@ again:
block_rsv->type, ret);
}
try_reserve:
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
- BTRFS_RESERVE_NO_FLUSH);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ blocksize, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
/*
@@ -539,7 +547,7 @@ try_reserve:
* one last time to force a reservation if there's enough actual space
* on disk to make the reservation.
*/
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize,
BTRFS_RESERVE_FLUSH_EMERGENCY);
if (!ret)
return block_rsv;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index bda1fdbba666..5572ae52444e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -8,6 +8,8 @@
#include <linux/hash.h>
#include <linux/refcount.h>
+#include <linux/fscrypt.h>
+#include <trace/events/btrfs.h>
#include "extent_map.h"
#include "extent_io.h"
#include "ordered-data.h"
@@ -79,11 +81,21 @@ struct btrfs_inode {
*/
struct btrfs_key location;
+ /* Cached value of inode property 'compression'. */
+ u8 prop_compress;
+
+ /*
+ * Force compression on the file using the defrag ioctl, could be
+ * different from prop_compress and takes precedence if set.
+ */
+ u8 defrag_compress;
+
/*
* Lock for counters and all fields used to determine if the inode is in
* the log or not (last_trans, last_sub_trans, last_log_commit,
- * logged_trans), to access/update new_delalloc_bytes and to update the
- * VFS' inode number of bytes used.
+ * logged_trans), to access/update delalloc_bytes, new_delalloc_bytes,
+ * defrag_bytes, disk_i_size, outstanding_extents, csum_bytes and to
+ * update the VFS' inode number of bytes used.
*/
spinlock_t lock;
@@ -102,8 +114,18 @@ struct btrfs_inode {
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
+ /*
+ * Counters to keep track of the number of extent item's we may use due
+ * to delalloc and such. outstanding_extents is the number of extent
+ * items we think we'll end up using, and reserved_extents is the number
+ * of extent items we've reserved metadata for. Protected by 'lock'.
+ */
+ unsigned outstanding_extents;
+
/* used to order data wrt metadata */
- struct btrfs_ordered_inode_tree ordered_tree;
+ spinlock_t ordered_tree_lock;
+ struct rb_root ordered_tree;
+ struct rb_node *ordered_tree_last;
/* list of all the delalloc inodes in the FS. There are times we need
* to write all the delalloc pages to disk, and this list is used
@@ -122,28 +144,31 @@ struct btrfs_inode {
u64 generation;
/*
- * transid of the trans_handle that last modified this inode
+ * ID of the transaction handle that last modified this inode.
+ * Protected by 'lock'.
*/
u64 last_trans;
/*
- * transid that last logged this inode
+ * ID of the transaction that last logged this inode.
+ * Protected by 'lock'.
*/
u64 logged_trans;
/*
- * log transid when this inode was last modified
+ * Log transaction ID when this inode was last modified.
+ * Protected by 'lock'.
*/
int last_sub_trans;
- /* a local copy of root's last_log_commit */
+ /* A local copy of root's last_log_commit. Protected by 'lock'. */
int last_log_commit;
union {
/*
* Total number of bytes pending delalloc, used by stat to
* calculate the real block usage of the file. This is used
- * only for files.
+ * only for files. Protected by 'lock'.
*/
u64 delalloc_bytes;
/*
@@ -161,7 +186,7 @@ struct btrfs_inode {
* Total number of bytes pending delalloc that fall within a file
* range that is either a hole or beyond EOF (and no prealloc extent
* exists in the range). This is always <= delalloc_bytes and this
- * is used only for files.
+ * is used only for files. Protected by 'lock'.
*/
u64 new_delalloc_bytes;
/*
@@ -172,15 +197,15 @@ struct btrfs_inode {
};
/*
- * total number of bytes pending defrag, used by stat to check whether
- * it needs COW.
+ * Total number of bytes pending defrag, used by stat to check whether
+ * it needs COW. Protected by 'lock'.
*/
u64 defrag_bytes;
/*
- * the size of the file stored in the metadata on disk. data=ordered
+ * The size of the file stored in the metadata on disk. data=ordered
* means the in-memory i_size might be larger than the size on disk
- * because not all the blocks are written yet.
+ * because not all the blocks are written yet. Protected by 'lock'.
*/
u64 disk_i_size;
@@ -214,7 +239,7 @@ struct btrfs_inode {
/*
* Number of bytes outstanding that are going to need csums. This is
- * used in ENOSPC accounting.
+ * used in ENOSPC accounting. Protected by 'lock'.
*/
u64 csum_bytes;
@@ -223,30 +248,13 @@ struct btrfs_inode {
/* Read-only compatibility flags, upper half of inode_item::flags */
u32 ro_flags;
- /*
- * Counters to keep track of the number of extent item's we may use due
- * to delalloc and such. outstanding_extents is the number of extent
- * items we think we'll end up using, and reserved_extents is the number
- * of extent items we've reserved metadata for.
- */
- unsigned outstanding_extents;
-
struct btrfs_block_rsv block_rsv;
- /*
- * Cached values of inode properties
- */
- unsigned prop_compress; /* per-file compression algorithm */
- /*
- * Force compression on the file using the defrag ioctl, could be
- * different from prop_compress and takes precedence if set
- */
- unsigned defrag_compress;
-
struct btrfs_delayed_node *delayed_node;
/* File creation time. */
- struct timespec64 i_otime;
+ u64 i_otime_sec;
+ u32 i_otime_nsec;
/* Hook into fs_info->delayed_iputs */
struct list_head delayed_iput;
@@ -387,7 +395,7 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
spin_lock(&inode->lock);
if (inode->logged_trans == generation &&
inode->last_sub_trans <= inode->last_log_commit &&
- inode->last_sub_trans <= inode->root->last_log_commit)
+ inode->last_sub_trans <= btrfs_get_root_last_log_commit(inode->root))
ret = true;
spin_unlock(&inode->lock);
return ret;
@@ -481,9 +489,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
u64 start, u64 end);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode);
+ struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode);
+ struct btrfs_inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
deleted file mode 100644
index 3caf339c4bb3..000000000000
--- a/fs/btrfs/check-integrity.c
+++ /dev/null
@@ -1,2871 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) STRATO AG 2011. All rights reserved.
- */
-
-/*
- * This module can be used to catch cases when the btrfs kernel
- * code executes write requests to the disk that bring the file
- * system in an inconsistent state. In such a state, a power-loss
- * or kernel panic event would cause that the data on disk is
- * lost or at least damaged.
- *
- * Code is added that examines all block write requests during
- * runtime (including writes of the super block). Three rules
- * are verified and an error is printed on violation of the
- * rules:
- * 1. It is not allowed to write a disk block which is
- * currently referenced by the super block (either directly
- * or indirectly).
- * 2. When a super block is written, it is verified that all
- * referenced (directly or indirectly) blocks fulfill the
- * following requirements:
- * 2a. All referenced blocks have either been present when
- * the file system was mounted, (i.e., they have been
- * referenced by the super block) or they have been
- * written since then and the write completion callback
- * was called and no write error was indicated and a
- * FLUSH request to the device where these blocks are
- * located was received and completed.
- * 2b. All referenced blocks need to have a generation
- * number which is equal to the parent's number.
- *
- * One issue that was found using this module was that the log
- * tree on disk became temporarily corrupted because disk blocks
- * that had been in use for the log tree had been freed and
- * reused too early, while being referenced by the written super
- * block.
- *
- * The search term in the kernel log that can be used to filter
- * on the existence of detected integrity issues is
- * "btrfs: attempt".
- *
- * The integrity check is enabled via mount options. These
- * mount options are only supported if the integrity check
- * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
- *
- * Example #1, apply integrity checks to all metadata:
- * mount /dev/sdb1 /mnt -o check_int
- *
- * Example #2, apply integrity checks to all metadata and
- * to data extents:
- * mount /dev/sdb1 /mnt -o check_int_data
- *
- * Example #3, apply integrity checks to all metadata and dump
- * the tree that the super block references to kernel messages
- * each time after a super block was written:
- * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
- *
- * If the integrity check tool is included and activated in
- * the mount options, plenty of kernel memory is used, and
- * plenty of additional CPU cycles are spent. Enabling this
- * functionality is not intended for normal use. In most
- * cases, unless you are a btrfs developer who needs to verify
- * the integrity of (super)-block write requests, do not
- * enable the config option BTRFS_FS_CHECK_INTEGRITY to
- * include and compile the integrity check tool.
- *
- * Expect millions of lines of information in the kernel log with an
- * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the
- * kernel config to at least 26 (which is 64MB). Usually the value is
- * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be
- * changed like this before LOG_BUF_SHIFT can be set to a high value:
- * config LOG_BUF_SHIFT
- * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
- * range 12 30
- */
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mutex.h>
-#include <linux/blkdev.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <crypto/hash.h>
-#include "messages.h"
-#include "ctree.h"
-#include "disk-io.h"
-#include "transaction.h"
-#include "extent_io.h"
-#include "volumes.h"
-#include "print-tree.h"
-#include "locking.h"
-#include "check-integrity.h"
-#include "rcu-string.h"
-#include "compression.h"
-#include "accessors.h"
-
-#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
-#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
-#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
-#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
-#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
-#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
-#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
-#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
- * excluding " [...]" */
-#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
-
-/*
- * The definition of the bitmask fields for the print_mask.
- * They are specified with the mount option check_integrity_print_mask.
- */
-#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
-#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
-#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
-#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
-#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
-#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
-#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
-#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
-#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
-#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
-#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
-#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
-#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
-#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000
-
-struct btrfsic_dev_state;
-struct btrfsic_state;
-
-struct btrfsic_block {
- u32 magic_num; /* only used for debug purposes */
- unsigned int is_metadata:1; /* if it is meta-data, not data-data */
- unsigned int is_superblock:1; /* if it is one of the superblocks */
- unsigned int is_iodone:1; /* if is done by lower subsystem */
- unsigned int iodone_w_error:1; /* error was indicated to endio */
- unsigned int never_written:1; /* block was added because it was
- * referenced, not because it was
- * written */
- unsigned int mirror_num; /* large enough to hold
- * BTRFS_SUPER_MIRROR_MAX */
- struct btrfsic_dev_state *dev_state;
- u64 dev_bytenr; /* key, physical byte num on disk */
- u64 logical_bytenr; /* logical byte num on disk */
- u64 generation;
- struct btrfs_disk_key disk_key; /* extra info to print in case of
- * issues, will not always be correct */
- struct list_head collision_resolving_node; /* list node */
- struct list_head all_blocks_node; /* list node */
-
- /* the following two lists contain block_link items */
- struct list_head ref_to_list; /* list */
- struct list_head ref_from_list; /* list */
- struct btrfsic_block *next_in_same_bio;
- void *orig_bio_private;
- bio_end_io_t *orig_bio_end_io;
- blk_opf_t submit_bio_bh_rw;
- u64 flush_gen; /* only valid if !never_written */
-};
-
-/*
- * Elements of this type are allocated dynamically and required because
- * each block object can refer to and can be ref from multiple blocks.
- * The key to lookup them in the hashtable is the dev_bytenr of
- * the block ref to plus the one from the block referred from.
- * The fact that they are searchable via a hashtable and that a
- * ref_cnt is maintained is not required for the btrfs integrity
- * check algorithm itself, it is only used to make the output more
- * beautiful in case that an error is detected (an error is defined
- * as a write operation to a block while that block is still referenced).
- */
-struct btrfsic_block_link {
- u32 magic_num; /* only used for debug purposes */
- u32 ref_cnt;
- struct list_head node_ref_to; /* list node */
- struct list_head node_ref_from; /* list node */
- struct list_head collision_resolving_node; /* list node */
- struct btrfsic_block *block_ref_to;
- struct btrfsic_block *block_ref_from;
- u64 parent_generation;
-};
-
-struct btrfsic_dev_state {
- u32 magic_num; /* only used for debug purposes */
- struct block_device *bdev;
- struct btrfsic_state *state;
- struct list_head collision_resolving_node; /* list node */
- struct btrfsic_block dummy_block_for_bio_bh_flush;
- u64 last_flush_gen;
-};
-
-struct btrfsic_block_hashtable {
- struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
-};
-
-struct btrfsic_block_link_hashtable {
- struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
-};
-
-struct btrfsic_dev_state_hashtable {
- struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
-};
-
-struct btrfsic_block_data_ctx {
- u64 start; /* virtual bytenr */
- u64 dev_bytenr; /* physical bytenr on device */
- u32 len;
- struct btrfsic_dev_state *dev;
- char **datav;
- struct page **pagev;
- void *mem_to_free;
-};
-
-/* This structure is used to implement recursion without occupying
- * any stack space, refer to btrfsic_process_metablock() */
-struct btrfsic_stack_frame {
- u32 magic;
- u32 nr;
- int error;
- int i;
- int limit_nesting;
- int num_copies;
- int mirror_num;
- struct btrfsic_block *block;
- struct btrfsic_block_data_ctx *block_ctx;
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx next_block_ctx;
- struct btrfs_header *hdr;
- struct btrfsic_stack_frame *prev;
-};
-
-/* Some state per mounted filesystem */
-struct btrfsic_state {
- u32 print_mask;
- int include_extent_data;
- struct list_head all_blocks_list;
- struct btrfsic_block_hashtable block_hashtable;
- struct btrfsic_block_link_hashtable block_link_hashtable;
- struct btrfs_fs_info *fs_info;
- u64 max_superblock_generation;
- struct btrfsic_block *latest_superblock;
- u32 metablock_size;
- u32 datablock_size;
-};
-
-static int btrfsic_process_metablock(struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- int limit_nesting, int force_iodone_flag);
-static void btrfsic_read_from_block_data(
- struct btrfsic_block_data_ctx *block_ctx,
- void *dst, u32 offset, size_t len);
-static int btrfsic_create_link_to_next_block(
- struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx
- *block_ctx, u64 next_bytenr,
- int limit_nesting,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block **next_blockp,
- int force_iodone_flag,
- int *num_copiesp, int *mirror_nump,
- struct btrfs_disk_key *disk_key,
- u64 parent_generation);
-static int btrfsic_handle_extent_data(struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- u32 item_offset, int force_iodone_flag);
-static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
- struct btrfsic_block_data_ctx *block_ctx_out,
- int mirror_num);
-static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
-static int btrfsic_read_block(struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx);
-static int btrfsic_process_written_superblock(
- struct btrfsic_state *state,
- struct btrfsic_block *const block,
- struct btrfs_super_block *const super_hdr);
-static void btrfsic_bio_end_io(struct bio *bp);
-static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int recursion_level);
-static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
- struct btrfsic_block *const block,
- int recursion_level);
-static void btrfsic_print_add_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l);
-static void btrfsic_print_rem_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l);
-static char btrfsic_get_block_type(const struct btrfsic_state *state,
- const struct btrfsic_block *block);
-static void btrfsic_dump_tree(const struct btrfsic_state *state);
-static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int indent_level);
-static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block *next_block,
- struct btrfsic_block *from_block,
- u64 parent_generation);
-static struct btrfsic_block *btrfsic_block_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx,
- const char *additional_string,
- int is_metadata,
- int is_iodone,
- int never_written,
- int mirror_num,
- int *was_created);
-static int btrfsic_process_superblock_dev_mirror(
- struct btrfsic_state *state,
- struct btrfsic_dev_state *dev_state,
- struct btrfs_device *device,
- int superblock_mirror_num,
- struct btrfsic_dev_state **selected_dev_state,
- struct btrfs_super_block *selected_super);
-static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev);
-static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
- u64 bytenr,
- struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr);
-
-static struct mutex btrfsic_mutex;
-static int btrfsic_is_initialized;
-static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
-
-
-static void btrfsic_block_init(struct btrfsic_block *b)
-{
- b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
- b->dev_state = NULL;
- b->dev_bytenr = 0;
- b->logical_bytenr = 0;
- b->generation = BTRFSIC_GENERATION_UNKNOWN;
- b->disk_key.objectid = 0;
- b->disk_key.type = 0;
- b->disk_key.offset = 0;
- b->is_metadata = 0;
- b->is_superblock = 0;
- b->is_iodone = 0;
- b->iodone_w_error = 0;
- b->never_written = 0;
- b->mirror_num = 0;
- b->next_in_same_bio = NULL;
- b->orig_bio_private = NULL;
- b->orig_bio_end_io = NULL;
- INIT_LIST_HEAD(&b->collision_resolving_node);
- INIT_LIST_HEAD(&b->all_blocks_node);
- INIT_LIST_HEAD(&b->ref_to_list);
- INIT_LIST_HEAD(&b->ref_from_list);
- b->submit_bio_bh_rw = 0;
- b->flush_gen = 0;
-}
-
-static struct btrfsic_block *btrfsic_block_alloc(void)
-{
- struct btrfsic_block *b;
-
- b = kzalloc(sizeof(*b), GFP_NOFS);
- if (NULL != b)
- btrfsic_block_init(b);
-
- return b;
-}
-
-static void btrfsic_block_free(struct btrfsic_block *b)
-{
- BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
- kfree(b);
-}
-
-static void btrfsic_block_link_init(struct btrfsic_block_link *l)
-{
- l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
- l->ref_cnt = 1;
- INIT_LIST_HEAD(&l->node_ref_to);
- INIT_LIST_HEAD(&l->node_ref_from);
- INIT_LIST_HEAD(&l->collision_resolving_node);
- l->block_ref_to = NULL;
- l->block_ref_from = NULL;
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
-{
- struct btrfsic_block_link *l;
-
- l = kzalloc(sizeof(*l), GFP_NOFS);
- if (NULL != l)
- btrfsic_block_link_init(l);
-
- return l;
-}
-
-static void btrfsic_block_link_free(struct btrfsic_block_link *l)
-{
- BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
- kfree(l);
-}
-
-static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
-{
- ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
- ds->bdev = NULL;
- ds->state = NULL;
- INIT_LIST_HEAD(&ds->collision_resolving_node);
- ds->last_flush_gen = 0;
- btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
- ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
- ds->dummy_block_for_bio_bh_flush.dev_state = ds;
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
-{
- struct btrfsic_dev_state *ds;
-
- ds = kzalloc(sizeof(*ds), GFP_NOFS);
- if (NULL != ds)
- btrfsic_dev_state_init(ds);
-
- return ds;
-}
-
-static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
-{
- BUG_ON(!(NULL == ds ||
- BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
- kfree(ds);
-}
-
-static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
-{
- int i;
-
- for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
- INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
- struct btrfsic_block_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(b->dev_bytenr >> 16)) ^
- ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
- (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
-
- list_add(&b->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
-{
- list_del(&b->collision_resolving_node);
-}
-
-static struct btrfsic_block *btrfsic_block_hashtable_lookup(
- struct block_device *bdev,
- u64 dev_bytenr,
- struct btrfsic_block_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(dev_bytenr >> 16)) ^
- ((unsigned int)((uintptr_t)bdev))) &
- (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
- struct btrfsic_block *b;
-
- list_for_each_entry(b, h->table + hashval, collision_resolving_node) {
- if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
- return b;
- }
-
- return NULL;
-}
-
-static void btrfsic_block_link_hashtable_init(
- struct btrfsic_block_link_hashtable *h)
-{
- int i;
-
- for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
- INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_block_link_hashtable_add(
- struct btrfsic_block_link *l,
- struct btrfsic_block_link_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
- ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
- ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
- ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
- & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
-
- BUG_ON(NULL == l->block_ref_to);
- BUG_ON(NULL == l->block_ref_from);
- list_add(&l->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
-{
- list_del(&l->collision_resolving_node);
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
- struct block_device *bdev_ref_to,
- u64 dev_bytenr_ref_to,
- struct block_device *bdev_ref_from,
- u64 dev_bytenr_ref_from,
- struct btrfsic_block_link_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
- ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
- ((unsigned int)((uintptr_t)bdev_ref_to)) ^
- ((unsigned int)((uintptr_t)bdev_ref_from))) &
- (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
- struct btrfsic_block_link *l;
-
- list_for_each_entry(l, h->table + hashval, collision_resolving_node) {
- BUG_ON(NULL == l->block_ref_to);
- BUG_ON(NULL == l->block_ref_from);
- if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
- l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
- l->block_ref_from->dev_state->bdev == bdev_ref_from &&
- l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
- return l;
- }
-
- return NULL;
-}
-
-static void btrfsic_dev_state_hashtable_init(
- struct btrfsic_dev_state_hashtable *h)
-{
- int i;
-
- for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
- INIT_LIST_HEAD(h->table + i);
-}
-
-static void btrfsic_dev_state_hashtable_add(
- struct btrfsic_dev_state *ds,
- struct btrfsic_dev_state_hashtable *h)
-{
- const unsigned int hashval =
- (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) &
- (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
-
- list_add(&ds->collision_resolving_node, h->table + hashval);
-}
-
-static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
-{
- list_del(&ds->collision_resolving_node);
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
- struct btrfsic_dev_state_hashtable *h)
-{
- const unsigned int hashval =
- dev & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1);
- struct btrfsic_dev_state *ds;
-
- list_for_each_entry(ds, h->table + hashval, collision_resolving_node) {
- if (ds->bdev->bd_dev == dev)
- return ds;
- }
-
- return NULL;
-}
-
-static int btrfsic_process_superblock(struct btrfsic_state *state,
- struct btrfs_fs_devices *fs_devices)
-{
- struct btrfs_super_block *selected_super;
- struct list_head *dev_head = &fs_devices->devices;
- struct btrfs_device *device;
- struct btrfsic_dev_state *selected_dev_state = NULL;
- int ret = 0;
- int pass;
-
- selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
- if (!selected_super)
- return -ENOMEM;
-
- list_for_each_entry(device, dev_head, dev_list) {
- int i;
- struct btrfsic_dev_state *dev_state;
-
- if (!device->bdev || !device->name)
- continue;
-
- dev_state = btrfsic_dev_state_lookup(device->bdev->bd_dev);
- BUG_ON(NULL == dev_state);
- for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
- ret = btrfsic_process_superblock_dev_mirror(
- state, dev_state, device, i,
- &selected_dev_state, selected_super);
- if (0 != ret && 0 == i) {
- kfree(selected_super);
- return ret;
- }
- }
- }
-
- if (NULL == state->latest_superblock) {
- pr_info("btrfsic: no superblock found!\n");
- kfree(selected_super);
- return -1;
- }
-
- for (pass = 0; pass < 3; pass++) {
- int num_copies;
- int mirror_num;
- u64 next_bytenr;
-
- switch (pass) {
- case 0:
- next_bytenr = btrfs_super_root(selected_super);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("root@%llu\n", next_bytenr);
- break;
- case 1:
- next_bytenr = btrfs_super_chunk_root(selected_super);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("chunk@%llu\n", next_bytenr);
- break;
- case 2:
- next_bytenr = btrfs_super_log_root(selected_super);
- if (0 == next_bytenr)
- continue;
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("log@%llu\n", next_bytenr);
- break;
- }
-
- num_copies = btrfs_num_copies(state->fs_info, next_bytenr,
- state->metablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
-
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx tmp_next_block_ctx;
- struct btrfsic_block_link *l;
-
- ret = btrfsic_map_block(state, next_bytenr,
- state->metablock_size,
- &tmp_next_block_ctx,
- mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(root @%llu, mirror %d) failed!\n",
- next_bytenr, mirror_num);
- kfree(selected_super);
- return -1;
- }
-
- next_block = btrfsic_block_hashtable_lookup(
- tmp_next_block_ctx.dev->bdev,
- tmp_next_block_ctx.dev_bytenr,
- &state->block_hashtable);
- BUG_ON(NULL == next_block);
-
- l = btrfsic_block_link_hashtable_lookup(
- tmp_next_block_ctx.dev->bdev,
- tmp_next_block_ctx.dev_bytenr,
- state->latest_superblock->dev_state->
- bdev,
- state->latest_superblock->dev_bytenr,
- &state->block_link_hashtable);
- BUG_ON(NULL == l);
-
- ret = btrfsic_read_block(state, &tmp_next_block_ctx);
- if (ret < (int)PAGE_SIZE) {
- pr_info("btrfsic: read @logical %llu failed!\n",
- tmp_next_block_ctx.start);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- kfree(selected_super);
- return -1;
- }
-
- ret = btrfsic_process_metablock(state,
- next_block,
- &tmp_next_block_ctx,
- BTRFS_MAX_LEVEL + 3, 1);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- }
- }
-
- kfree(selected_super);
- return ret;
-}
-
-static int btrfsic_process_superblock_dev_mirror(
- struct btrfsic_state *state,
- struct btrfsic_dev_state *dev_state,
- struct btrfs_device *device,
- int superblock_mirror_num,
- struct btrfsic_dev_state **selected_dev_state,
- struct btrfs_super_block *selected_super)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfs_super_block *super_tmp;
- u64 dev_bytenr;
- struct btrfsic_block *superblock_tmp;
- int pass;
- struct block_device *const superblock_bdev = device->bdev;
- struct page *page;
- struct address_space *mapping = superblock_bdev->bd_inode->i_mapping;
- int ret = 0;
-
- /* super block bytenr is always the unmapped device bytenr */
- dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
- if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
- return -1;
-
- page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS);
- if (IS_ERR(page))
- return -1;
-
- super_tmp = page_address(page);
-
- if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
- btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
- memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
- btrfs_super_nodesize(super_tmp) != state->metablock_size ||
- btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
- ret = 0;
- goto out;
- }
-
- superblock_tmp =
- btrfsic_block_hashtable_lookup(superblock_bdev,
- dev_bytenr,
- &state->block_hashtable);
- if (NULL == superblock_tmp) {
- superblock_tmp = btrfsic_block_alloc();
- if (NULL == superblock_tmp) {
- ret = -1;
- goto out;
- }
- /* for superblock, only the dev_bytenr makes sense */
- superblock_tmp->dev_bytenr = dev_bytenr;
- superblock_tmp->dev_state = dev_state;
- superblock_tmp->logical_bytenr = dev_bytenr;
- superblock_tmp->generation = btrfs_super_generation(super_tmp);
- superblock_tmp->is_metadata = 1;
- superblock_tmp->is_superblock = 1;
- superblock_tmp->is_iodone = 1;
- superblock_tmp->never_written = 0;
- superblock_tmp->mirror_num = 1 + superblock_mirror_num;
- if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- btrfs_info_in_rcu(fs_info,
- "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
- superblock_bdev,
- btrfs_dev_name(device), dev_bytenr,
- dev_state->bdev, dev_bytenr,
- superblock_mirror_num);
- list_add(&superblock_tmp->all_blocks_node,
- &state->all_blocks_list);
- btrfsic_block_hashtable_add(superblock_tmp,
- &state->block_hashtable);
- }
-
- /* select the one with the highest generation field */
- if (btrfs_super_generation(super_tmp) >
- state->max_superblock_generation ||
- 0 == state->max_superblock_generation) {
- memcpy(selected_super, super_tmp, sizeof(*selected_super));
- *selected_dev_state = dev_state;
- state->max_superblock_generation =
- btrfs_super_generation(super_tmp);
- state->latest_superblock = superblock_tmp;
- }
-
- for (pass = 0; pass < 3; pass++) {
- u64 next_bytenr;
- int num_copies;
- int mirror_num;
- const char *additional_string = NULL;
- struct btrfs_disk_key tmp_disk_key;
-
- tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
- tmp_disk_key.offset = 0;
- switch (pass) {
- case 0:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_ROOT_TREE_OBJECTID);
- additional_string = "initial root ";
- next_bytenr = btrfs_super_root(super_tmp);
- break;
- case 1:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_CHUNK_TREE_OBJECTID);
- additional_string = "initial chunk ";
- next_bytenr = btrfs_super_chunk_root(super_tmp);
- break;
- case 2:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_TREE_LOG_OBJECTID);
- additional_string = "initial log ";
- next_bytenr = btrfs_super_log_root(super_tmp);
- if (0 == next_bytenr)
- continue;
- break;
- }
-
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
- state->metablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx tmp_next_block_ctx;
- struct btrfsic_block_link *l;
-
- if (btrfsic_map_block(state, next_bytenr,
- state->metablock_size,
- &tmp_next_block_ctx,
- mirror_num)) {
- pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n",
- next_bytenr, mirror_num);
- ret = -1;
- goto out;
- }
-
- next_block = btrfsic_block_lookup_or_add(
- state, &tmp_next_block_ctx,
- additional_string, 1, 1, 0,
- mirror_num, NULL);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- ret = -1;
- goto out;
- }
-
- next_block->disk_key = tmp_disk_key;
- next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
- l = btrfsic_block_link_lookup_or_add(
- state, &tmp_next_block_ctx,
- next_block, superblock_tmp,
- BTRFSIC_GENERATION_UNKNOWN);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- if (NULL == l) {
- ret = -1;
- goto out;
- }
- }
- }
- if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
- btrfsic_dump_tree_sub(state, superblock_tmp, 0);
-
-out:
- put_page(page);
- return ret;
-}
-
-static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
-{
- struct btrfsic_stack_frame *sf;
-
- sf = kzalloc(sizeof(*sf), GFP_NOFS);
- if (sf)
- sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
- return sf;
-}
-
-static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
-{
- BUG_ON(!(NULL == sf ||
- BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
- kfree(sf);
-}
-
-static noinline_for_stack int btrfsic_process_metablock(
- struct btrfsic_state *state,
- struct btrfsic_block *const first_block,
- struct btrfsic_block_data_ctx *const first_block_ctx,
- int first_limit_nesting, int force_iodone_flag)
-{
- struct btrfsic_stack_frame initial_stack_frame = { 0 };
- struct btrfsic_stack_frame *sf;
- struct btrfsic_stack_frame *next_stack;
- struct btrfs_header *const first_hdr =
- (struct btrfs_header *)first_block_ctx->datav[0];
-
- BUG_ON(!first_hdr);
- sf = &initial_stack_frame;
- sf->error = 0;
- sf->i = -1;
- sf->limit_nesting = first_limit_nesting;
- sf->block = first_block;
- sf->block_ctx = first_block_ctx;
- sf->next_block = NULL;
- sf->hdr = first_hdr;
- sf->prev = NULL;
-
-continue_with_new_stack_frame:
- sf->block->generation = btrfs_stack_header_generation(sf->hdr);
- if (0 == sf->hdr->level) {
- struct btrfs_leaf *const leafhdr =
- (struct btrfs_leaf *)sf->hdr;
-
- if (-1 == sf->i) {
- sf->nr = btrfs_stack_header_nritems(&leafhdr->header);
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("leaf %llu items %d generation %llu owner %llu\n",
- sf->block_ctx->start, sf->nr,
- btrfs_stack_header_generation(
- &leafhdr->header),
- btrfs_stack_header_owner(
- &leafhdr->header));
- }
-
-continue_with_current_leaf_stack_frame:
- if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
- sf->i++;
- sf->num_copies = 0;
- }
-
- if (sf->i < sf->nr) {
- struct btrfs_item disk_item;
- u32 disk_item_offset =
- (uintptr_t)(leafhdr->items + sf->i) -
- (uintptr_t)leafhdr;
- struct btrfs_disk_key *disk_key;
- u8 type;
- u32 item_offset;
- u32 item_size;
-
- if (disk_item_offset + sizeof(struct btrfs_item) >
- sf->block_ctx->len) {
-leaf_item_out_of_bounce_error:
- pr_info(
- "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n",
- sf->block_ctx->start,
- sf->block_ctx->dev->bdev);
- goto one_stack_frame_backwards;
- }
- btrfsic_read_from_block_data(sf->block_ctx,
- &disk_item,
- disk_item_offset,
- sizeof(struct btrfs_item));
- item_offset = btrfs_stack_item_offset(&disk_item);
- item_size = btrfs_stack_item_size(&disk_item);
- disk_key = &disk_item.key;
- type = btrfs_disk_key_type(disk_key);
-
- if (BTRFS_ROOT_ITEM_KEY == type) {
- struct btrfs_root_item root_item;
- u32 root_item_offset;
- u64 next_bytenr;
-
- root_item_offset = item_offset +
- offsetof(struct btrfs_leaf, items);
- if (root_item_offset + item_size >
- sf->block_ctx->len)
- goto leaf_item_out_of_bounce_error;
- btrfsic_read_from_block_data(
- sf->block_ctx, &root_item,
- root_item_offset,
- item_size);
- next_bytenr = btrfs_root_bytenr(&root_item);
-
- sf->error =
- btrfsic_create_link_to_next_block(
- state,
- sf->block,
- sf->block_ctx,
- next_bytenr,
- sf->limit_nesting,
- &sf->next_block_ctx,
- &sf->next_block,
- force_iodone_flag,
- &sf->num_copies,
- &sf->mirror_num,
- disk_key,
- btrfs_root_generation(
- &root_item));
- if (sf->error)
- goto one_stack_frame_backwards;
-
- if (NULL != sf->next_block) {
- struct btrfs_header *const next_hdr =
- (struct btrfs_header *)
- sf->next_block_ctx.datav[0];
-
- next_stack =
- btrfsic_stack_frame_alloc();
- if (NULL == next_stack) {
- sf->error = -1;
- btrfsic_release_block_ctx(
- &sf->
- next_block_ctx);
- goto one_stack_frame_backwards;
- }
-
- next_stack->i = -1;
- next_stack->block = sf->next_block;
- next_stack->block_ctx =
- &sf->next_block_ctx;
- next_stack->next_block = NULL;
- next_stack->hdr = next_hdr;
- next_stack->limit_nesting =
- sf->limit_nesting - 1;
- next_stack->prev = sf;
- sf = next_stack;
- goto continue_with_new_stack_frame;
- }
- } else if (BTRFS_EXTENT_DATA_KEY == type &&
- state->include_extent_data) {
- sf->error = btrfsic_handle_extent_data(
- state,
- sf->block,
- sf->block_ctx,
- item_offset,
- force_iodone_flag);
- if (sf->error)
- goto one_stack_frame_backwards;
- }
-
- goto continue_with_current_leaf_stack_frame;
- }
- } else {
- struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
-
- if (-1 == sf->i) {
- sf->nr = btrfs_stack_header_nritems(&nodehdr->header);
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("node %llu level %d items %d generation %llu owner %llu\n",
- sf->block_ctx->start,
- nodehdr->header.level, sf->nr,
- btrfs_stack_header_generation(
- &nodehdr->header),
- btrfs_stack_header_owner(
- &nodehdr->header));
- }
-
-continue_with_current_node_stack_frame:
- if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
- sf->i++;
- sf->num_copies = 0;
- }
-
- if (sf->i < sf->nr) {
- struct btrfs_key_ptr key_ptr;
- u32 key_ptr_offset;
- u64 next_bytenr;
-
- key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
- (uintptr_t)nodehdr;
- if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
- sf->block_ctx->len) {
- pr_info(
- "btrfsic: node item out of bounce at logical %llu, dev %pg\n",
- sf->block_ctx->start,
- sf->block_ctx->dev->bdev);
- goto one_stack_frame_backwards;
- }
- btrfsic_read_from_block_data(
- sf->block_ctx, &key_ptr, key_ptr_offset,
- sizeof(struct btrfs_key_ptr));
- next_bytenr = btrfs_stack_key_blockptr(&key_ptr);
-
- sf->error = btrfsic_create_link_to_next_block(
- state,
- sf->block,
- sf->block_ctx,
- next_bytenr,
- sf->limit_nesting,
- &sf->next_block_ctx,
- &sf->next_block,
- force_iodone_flag,
- &sf->num_copies,
- &sf->mirror_num,
- &key_ptr.key,
- btrfs_stack_key_generation(&key_ptr));
- if (sf->error)
- goto one_stack_frame_backwards;
-
- if (NULL != sf->next_block) {
- struct btrfs_header *const next_hdr =
- (struct btrfs_header *)
- sf->next_block_ctx.datav[0];
-
- next_stack = btrfsic_stack_frame_alloc();
- if (NULL == next_stack) {
- sf->error = -1;
- goto one_stack_frame_backwards;
- }
-
- next_stack->i = -1;
- next_stack->block = sf->next_block;
- next_stack->block_ctx = &sf->next_block_ctx;
- next_stack->next_block = NULL;
- next_stack->hdr = next_hdr;
- next_stack->limit_nesting =
- sf->limit_nesting - 1;
- next_stack->prev = sf;
- sf = next_stack;
- goto continue_with_new_stack_frame;
- }
-
- goto continue_with_current_node_stack_frame;
- }
- }
-
-one_stack_frame_backwards:
- if (NULL != sf->prev) {
- struct btrfsic_stack_frame *const prev = sf->prev;
-
- /* the one for the initial block is freed in the caller */
- btrfsic_release_block_ctx(sf->block_ctx);
-
- if (sf->error) {
- prev->error = sf->error;
- btrfsic_stack_frame_free(sf);
- sf = prev;
- goto one_stack_frame_backwards;
- }
-
- btrfsic_stack_frame_free(sf);
- sf = prev;
- goto continue_with_new_stack_frame;
- } else {
- BUG_ON(&initial_stack_frame != sf);
- }
-
- return sf->error;
-}
-
-static void btrfsic_read_from_block_data(
- struct btrfsic_block_data_ctx *block_ctx,
- void *dstv, u32 offset, size_t len)
-{
- size_t cur;
- size_t pgoff;
- char *kaddr;
- char *dst = (char *)dstv;
- size_t start_offset = offset_in_page(block_ctx->start);
- unsigned long i = (start_offset + offset) >> PAGE_SHIFT;
-
- WARN_ON(offset + len > block_ctx->len);
- pgoff = offset_in_page(start_offset + offset);
-
- while (len > 0) {
- cur = min(len, ((size_t)PAGE_SIZE - pgoff));
- BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE));
- kaddr = block_ctx->datav[i];
- memcpy(dst, kaddr + pgoff, cur);
-
- dst += cur;
- len -= cur;
- pgoff = 0;
- i++;
- }
-}
-
-static int btrfsic_create_link_to_next_block(
- struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- u64 next_bytenr,
- int limit_nesting,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block **next_blockp,
- int force_iodone_flag,
- int *num_copiesp, int *mirror_nump,
- struct btrfs_disk_key *disk_key,
- u64 parent_generation)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfsic_block *next_block = NULL;
- int ret;
- struct btrfsic_block_link *l;
- int did_alloc_block_link;
- int block_was_created;
-
- *next_blockp = NULL;
- if (0 == *num_copiesp) {
- *num_copiesp = btrfs_num_copies(fs_info, next_bytenr,
- state->metablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, *num_copiesp);
- *mirror_nump = 1;
- }
-
- if (*mirror_nump > *num_copiesp)
- return 0;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic_create_link_to_next_block(mirror_num=%d)\n",
- *mirror_nump);
- ret = btrfsic_map_block(state, next_bytenr,
- state->metablock_size,
- next_block_ctx, *mirror_nump);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
- next_bytenr, *mirror_nump);
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
-
- next_block = btrfsic_block_lookup_or_add(state,
- next_block_ctx, "referenced ",
- 1, force_iodone_flag,
- !force_iodone_flag,
- *mirror_nump,
- &block_was_created);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
- if (block_was_created) {
- l = NULL;
- next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
- } else {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
- if (next_block->logical_bytenr != next_bytenr &&
- !(!next_block->is_metadata &&
- 0 == next_block->logical_bytenr))
- pr_info(
-"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
- next_bytenr, next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr, *mirror_nump,
- btrfsic_get_block_type(state,
- next_block),
- next_block->logical_bytenr);
- else
- pr_info(
- "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n",
- next_bytenr, next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr, *mirror_nump,
- btrfsic_get_block_type(state,
- next_block));
- }
- next_block->logical_bytenr = next_bytenr;
-
- next_block->mirror_num = *mirror_nump;
- l = btrfsic_block_link_hashtable_lookup(
- next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr,
- block_ctx->dev->bdev,
- block_ctx->dev_bytenr,
- &state->block_link_hashtable);
- }
-
- next_block->disk_key = *disk_key;
- if (NULL == l) {
- l = btrfsic_block_link_alloc();
- if (NULL == l) {
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
-
- did_alloc_block_link = 1;
- l->block_ref_to = next_block;
- l->block_ref_from = block;
- l->ref_cnt = 1;
- l->parent_generation = parent_generation;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
-
- list_add(&l->node_ref_to, &block->ref_to_list);
- list_add(&l->node_ref_from, &next_block->ref_from_list);
-
- btrfsic_block_link_hashtable_add(l,
- &state->block_link_hashtable);
- } else {
- did_alloc_block_link = 0;
- if (0 == limit_nesting) {
- l->ref_cnt++;
- l->parent_generation = parent_generation;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
- }
- }
-
- if (limit_nesting > 0 && did_alloc_block_link) {
- ret = btrfsic_read_block(state, next_block_ctx);
- if (ret < (int)next_block_ctx->len) {
- pr_info("btrfsic: read block @logical %llu failed!\n",
- next_bytenr);
- btrfsic_release_block_ctx(next_block_ctx);
- *next_blockp = NULL;
- return -1;
- }
-
- *next_blockp = next_block;
- } else {
- *next_blockp = NULL;
- }
- (*mirror_nump)++;
-
- return 0;
-}
-
-static int btrfsic_handle_extent_data(
- struct btrfsic_state *state,
- struct btrfsic_block *block,
- struct btrfsic_block_data_ctx *block_ctx,
- u32 item_offset, int force_iodone_flag)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfs_file_extent_item file_extent_item;
- u64 file_extent_item_offset;
- u64 next_bytenr;
- u64 num_bytes;
- u64 generation;
- struct btrfsic_block_link *l;
- int ret;
-
- file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
- item_offset;
- if (file_extent_item_offset +
- offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
- block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
- block_ctx->start, block_ctx->dev->bdev);
- return -1;
- }
-
- btrfsic_read_from_block_data(block_ctx, &file_extent_item,
- file_extent_item_offset,
- offsetof(struct btrfs_file_extent_item, disk_num_bytes));
- if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
- btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
- pr_info("extent_data: type %u, disk_bytenr = %llu\n",
- file_extent_item.type,
- btrfs_stack_file_extent_disk_bytenr(
- &file_extent_item));
- return 0;
- }
-
- if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
- block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
- block_ctx->start, block_ctx->dev->bdev);
- return -1;
- }
- btrfsic_read_from_block_data(block_ctx, &file_extent_item,
- file_extent_item_offset,
- sizeof(struct btrfs_file_extent_item));
- next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item);
- if (btrfs_stack_file_extent_compression(&file_extent_item) ==
- BTRFS_COMPRESS_NONE) {
- next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item);
- num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item);
- } else {
- num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item);
- }
- generation = btrfs_stack_file_extent_generation(&file_extent_item);
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
- pr_info("extent_data: type %u, disk_bytenr = %llu, offset = %llu, num_bytes = %llu\n",
- file_extent_item.type,
- btrfs_stack_file_extent_disk_bytenr(&file_extent_item),
- btrfs_stack_file_extent_offset(&file_extent_item),
- num_bytes);
- while (num_bytes > 0) {
- u32 chunk_len;
- int num_copies;
- int mirror_num;
-
- if (num_bytes > state->datablock_size)
- chunk_len = state->datablock_size;
- else
- chunk_len = num_bytes;
-
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
- state->datablock_size);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- struct btrfsic_block_data_ctx next_block_ctx;
- struct btrfsic_block *next_block;
- int block_was_created;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic_handle_extent_data(mirror_num=%d)\n",
- mirror_num);
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
- pr_info("\tdisk_bytenr = %llu, num_bytes %u\n",
- next_bytenr, chunk_len);
- ret = btrfsic_map_block(state, next_bytenr,
- chunk_len, &next_block_ctx,
- mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
- next_bytenr, mirror_num);
- return -1;
- }
-
- next_block = btrfsic_block_lookup_or_add(
- state,
- &next_block_ctx,
- "referenced ",
- 0,
- force_iodone_flag,
- !force_iodone_flag,
- mirror_num,
- &block_was_created);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(&next_block_ctx);
- return -1;
- }
- if (!block_was_created) {
- if ((state->print_mask &
- BTRFSIC_PRINT_MASK_VERBOSE) &&
- next_block->logical_bytenr != next_bytenr &&
- !(!next_block->is_metadata &&
- 0 == next_block->logical_bytenr)) {
- pr_info(
-"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n",
- next_bytenr,
- next_block_ctx.dev->bdev,
- next_block_ctx.dev_bytenr,
- mirror_num,
- next_block->logical_bytenr);
- }
- next_block->logical_bytenr = next_bytenr;
- next_block->mirror_num = mirror_num;
- }
-
- l = btrfsic_block_link_lookup_or_add(state,
- &next_block_ctx,
- next_block, block,
- generation);
- btrfsic_release_block_ctx(&next_block_ctx);
- if (NULL == l)
- return -1;
- }
-
- next_bytenr += chunk_len;
- num_bytes -= chunk_len;
- }
-
- return 0;
-}
-
-static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
- struct btrfsic_block_data_ctx *block_ctx_out,
- int mirror_num)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- int ret;
- u64 length;
- struct btrfs_io_context *bioc = NULL;
- struct btrfs_io_stripe smap, *map;
- struct btrfs_device *device;
-
- length = len;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc,
- NULL, &mirror_num, 0);
- if (ret) {
- block_ctx_out->start = 0;
- block_ctx_out->dev_bytenr = 0;
- block_ctx_out->len = 0;
- block_ctx_out->dev = NULL;
- block_ctx_out->datav = NULL;
- block_ctx_out->pagev = NULL;
- block_ctx_out->mem_to_free = NULL;
-
- return ret;
- }
-
- if (bioc)
- map = &bioc->stripes[0];
- else
- map = &smap;
-
- device = map->dev;
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
- !device->bdev || !device->name)
- block_ctx_out->dev = NULL;
- else
- block_ctx_out->dev = btrfsic_dev_state_lookup(
- device->bdev->bd_dev);
- block_ctx_out->dev_bytenr = map->physical;
- block_ctx_out->start = bytenr;
- block_ctx_out->len = len;
- block_ctx_out->datav = NULL;
- block_ctx_out->pagev = NULL;
- block_ctx_out->mem_to_free = NULL;
-
- kfree(bioc);
- if (NULL == block_ctx_out->dev) {
- ret = -ENXIO;
- pr_info("btrfsic: error, cannot lookup dev (#1)!\n");
- }
-
- return ret;
-}
-
-static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
-{
- if (block_ctx->mem_to_free) {
- unsigned int num_pages;
-
- BUG_ON(!block_ctx->datav);
- BUG_ON(!block_ctx->pagev);
- num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- /* Pages must be unmapped in reverse order */
- while (num_pages > 0) {
- num_pages--;
- if (block_ctx->datav[num_pages])
- block_ctx->datav[num_pages] = NULL;
- if (block_ctx->pagev[num_pages]) {
- __free_page(block_ctx->pagev[num_pages]);
- block_ctx->pagev[num_pages] = NULL;
- }
- }
-
- kfree(block_ctx->mem_to_free);
- block_ctx->mem_to_free = NULL;
- block_ctx->pagev = NULL;
- block_ctx->datav = NULL;
- }
-}
-
-static int btrfsic_read_block(struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx)
-{
- unsigned int num_pages;
- unsigned int i;
- size_t size;
- u64 dev_bytenr;
- int ret;
-
- BUG_ON(block_ctx->datav);
- BUG_ON(block_ctx->pagev);
- BUG_ON(block_ctx->mem_to_free);
- if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) {
- pr_info("btrfsic: read_block() with unaligned bytenr %llu\n",
- block_ctx->dev_bytenr);
- return -1;
- }
-
- num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
- PAGE_SHIFT;
- size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev);
- block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS);
- if (!block_ctx->mem_to_free)
- return -ENOMEM;
- block_ctx->datav = block_ctx->mem_to_free;
- block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
- ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev);
- if (ret)
- return ret;
-
- dev_bytenr = block_ctx->dev_bytenr;
- for (i = 0; i < num_pages;) {
- struct bio *bio;
- unsigned int j;
-
- bio = bio_alloc(block_ctx->dev->bdev, num_pages - i,
- REQ_OP_READ, GFP_NOFS);
- bio->bi_iter.bi_sector = dev_bytenr >> SECTOR_SHIFT;
-
- for (j = i; j < num_pages; j++) {
- ret = bio_add_page(bio, block_ctx->pagev[j],
- PAGE_SIZE, 0);
- if (PAGE_SIZE != ret)
- break;
- }
- if (j == i) {
- pr_info("btrfsic: error, failed to add a single page!\n");
- return -1;
- }
- if (submit_bio_wait(bio)) {
- pr_info("btrfsic: read error at logical %llu dev %pg!\n",
- block_ctx->start, block_ctx->dev->bdev);
- bio_put(bio);
- return -1;
- }
- bio_put(bio);
- dev_bytenr += (j - i) * PAGE_SIZE;
- i = j;
- }
- for (i = 0; i < num_pages; i++)
- block_ctx->datav[i] = page_address(block_ctx->pagev[i]);
-
- return block_ctx->len;
-}
-
-static void btrfsic_dump_database(struct btrfsic_state *state)
-{
- const struct btrfsic_block *b_all;
-
- BUG_ON(NULL == state);
-
- pr_info("all_blocks_list:\n");
- list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
- const struct btrfsic_block_link *l;
-
- pr_info("%c-block @%llu (%pg/%llu/%d)\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num);
-
- list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
- pr_info(
- " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- }
-
- list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
- pr_info(
- " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr,
- l->block_ref_from->mirror_num);
- }
-
- pr_info("\n");
- }
-}
-
-/*
- * Test whether the disk block contains a tree block (leaf or node)
- * (note that this test fails for the super block)
- */
-static noinline_for_stack int btrfsic_test_for_metadata(
- struct btrfsic_state *state,
- char **datav, unsigned int num_pages)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- struct btrfs_header *h;
- u8 csum[BTRFS_CSUM_SIZE];
- unsigned int i;
-
- if (num_pages * PAGE_SIZE < state->metablock_size)
- return 1; /* not metadata */
- num_pages = state->metablock_size >> PAGE_SHIFT;
- h = (struct btrfs_header *)datav[0];
-
- if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE))
- return 1;
-
- shash->tfm = fs_info->csum_shash;
- crypto_shash_init(shash);
-
- for (i = 0; i < num_pages; i++) {
- u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
- size_t sublen = i ? PAGE_SIZE :
- (PAGE_SIZE - BTRFS_CSUM_SIZE);
-
- crypto_shash_update(shash, data, sublen);
- }
- crypto_shash_final(shash, csum);
- if (memcmp(csum, h->csum, fs_info->csum_size))
- return 1;
-
- return 0; /* is metadata */
-}
-
-static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr, char **mapped_datav,
- unsigned int num_pages,
- struct bio *bio, int *bio_is_patched,
- blk_opf_t submit_bio_bh_rw)
-{
- int is_metadata;
- struct btrfsic_block *block;
- struct btrfsic_block_data_ctx block_ctx;
- int ret;
- struct btrfsic_state *state = dev_state->state;
- struct block_device *bdev = dev_state->bdev;
- unsigned int processed_len;
-
- if (NULL != bio_is_patched)
- *bio_is_patched = 0;
-
-again:
- if (num_pages == 0)
- return;
-
- processed_len = 0;
- is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
- num_pages));
-
- block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
- &state->block_hashtable);
- if (NULL != block) {
- u64 bytenr = 0;
- struct btrfsic_block_link *l, *tmp;
-
- if (block->is_superblock) {
- bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
- mapped_datav[0]);
- if (num_pages * PAGE_SIZE <
- BTRFS_SUPER_INFO_SIZE) {
- pr_info("btrfsic: cannot work with too short bios!\n");
- return;
- }
- is_metadata = 1;
- BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE));
- processed_len = BTRFS_SUPER_INFO_SIZE;
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
- pr_info("[before new superblock is written]:\n");
- btrfsic_dump_tree_sub(state, block, 0);
- }
- }
- if (is_metadata) {
- if (!block->is_superblock) {
- if (num_pages * PAGE_SIZE <
- state->metablock_size) {
- pr_info("btrfsic: cannot work with too short bios!\n");
- return;
- }
- processed_len = state->metablock_size;
- bytenr = btrfs_stack_header_bytenr(
- (struct btrfs_header *)
- mapped_datav[0]);
- btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
- dev_state,
- dev_bytenr);
- }
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
- if (block->logical_bytenr != bytenr &&
- !(!block->is_metadata &&
- block->logical_bytenr == 0))
- pr_info(
-"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
- bytenr, dev_state->bdev,
- dev_bytenr,
- block->mirror_num,
- btrfsic_get_block_type(state,
- block),
- block->logical_bytenr);
- else
- pr_info(
- "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
- bytenr, dev_state->bdev,
- dev_bytenr, block->mirror_num,
- btrfsic_get_block_type(state,
- block));
- }
- block->logical_bytenr = bytenr;
- } else {
- if (num_pages * PAGE_SIZE <
- state->datablock_size) {
- pr_info("btrfsic: cannot work with too short bios!\n");
- return;
- }
- processed_len = state->datablock_size;
- bytenr = block->logical_bytenr;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
- bytenr, dev_state->bdev, dev_bytenr,
- block->mirror_num,
- btrfsic_get_block_type(state, block));
- }
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("ref_to_list: %cE, ref_from_list: %cE\n",
- list_empty(&block->ref_to_list) ? ' ' : '!',
- list_empty(&block->ref_from_list) ? ' ' : '!');
- if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
- pr_info(
-"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
- btrfsic_get_block_type(state, block), bytenr,
- dev_state->bdev, dev_bytenr, block->mirror_num,
- block->generation,
- btrfs_disk_key_objectid(&block->disk_key),
- block->disk_key.type,
- btrfs_disk_key_offset(&block->disk_key),
- btrfs_stack_header_generation(
- (struct btrfs_header *) mapped_datav[0]),
- state->max_superblock_generation);
- btrfsic_dump_tree(state);
- }
-
- if (!block->is_iodone && !block->never_written) {
- pr_info(
-"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
- btrfsic_get_block_type(state, block), bytenr,
- dev_state->bdev, dev_bytenr, block->mirror_num,
- block->generation,
- btrfs_stack_header_generation(
- (struct btrfs_header *)
- mapped_datav[0]));
- /* it would not be safe to go on */
- btrfsic_dump_tree(state);
- goto continue_loop;
- }
-
- /*
- * Clear all references of this block. Do not free
- * the block itself even if is not referenced anymore
- * because it still carries valuable information
- * like whether it was ever written and IO completed.
- */
- list_for_each_entry_safe(l, tmp, &block->ref_to_list,
- node_ref_to) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_rem_link(state, l);
- l->ref_cnt--;
- if (0 == l->ref_cnt) {
- list_del(&l->node_ref_to);
- list_del(&l->node_ref_from);
- btrfsic_block_link_hashtable_remove(l);
- btrfsic_block_link_free(l);
- }
- }
-
- block_ctx.dev = dev_state;
- block_ctx.dev_bytenr = dev_bytenr;
- block_ctx.start = bytenr;
- block_ctx.len = processed_len;
- block_ctx.pagev = NULL;
- block_ctx.mem_to_free = NULL;
- block_ctx.datav = mapped_datav;
-
- if (is_metadata || state->include_extent_data) {
- block->never_written = 0;
- block->iodone_w_error = 0;
- if (NULL != bio) {
- block->is_iodone = 0;
- BUG_ON(NULL == bio_is_patched);
- if (!*bio_is_patched) {
- block->orig_bio_private =
- bio->bi_private;
- block->orig_bio_end_io =
- bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- *bio_is_patched = 1;
- } else {
- struct btrfsic_block *chained_block =
- (struct btrfsic_block *)
- bio->bi_private;
-
- BUG_ON(NULL == chained_block);
- block->orig_bio_private =
- chained_block->orig_bio_private;
- block->orig_bio_end_io =
- chained_block->orig_bio_end_io;
- block->next_in_same_bio = chained_block;
- bio->bi_private = block;
- }
- } else {
- block->is_iodone = 1;
- block->orig_bio_private = NULL;
- block->orig_bio_end_io = NULL;
- block->next_in_same_bio = NULL;
- }
- }
-
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = submit_bio_bh_rw;
- if (is_metadata) {
- block->logical_bytenr = bytenr;
- block->is_metadata = 1;
- if (block->is_superblock) {
- BUG_ON(PAGE_SIZE !=
- BTRFS_SUPER_INFO_SIZE);
- ret = btrfsic_process_written_superblock(
- state,
- block,
- (struct btrfs_super_block *)
- mapped_datav[0]);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
- pr_info("[after new superblock is written]:\n");
- btrfsic_dump_tree_sub(state, block, 0);
- }
- } else {
- block->mirror_num = 0; /* unknown */
- ret = btrfsic_process_metablock(
- state,
- block,
- &block_ctx,
- 0, 0);
- }
- if (ret)
- pr_info("btrfsic: btrfsic_process_metablock(root @%llu) failed!\n",
- dev_bytenr);
- } else {
- block->is_metadata = 0;
- block->mirror_num = 0; /* unknown */
- block->generation = BTRFSIC_GENERATION_UNKNOWN;
- if (!state->include_extent_data
- && list_empty(&block->ref_from_list)) {
- /*
- * disk block is overwritten with extent
- * data (not meta data) and we are configured
- * to not include extent data: take the
- * chance and free the block's memory
- */
- btrfsic_block_hashtable_remove(block);
- list_del(&block->all_blocks_node);
- btrfsic_block_free(block);
- }
- }
- btrfsic_release_block_ctx(&block_ctx);
- } else {
- /* block has not been found in hash table */
- u64 bytenr;
-
- if (!is_metadata) {
- processed_len = state->datablock_size;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "written block (%pg/%llu/?) !found in hash table, D\n",
- dev_state->bdev, dev_bytenr);
- if (!state->include_extent_data) {
- /* ignore that written D block */
- goto continue_loop;
- }
-
- /* this is getting ugly for the
- * include_extent_data case... */
- bytenr = 0; /* unknown */
- } else {
- processed_len = state->metablock_size;
- bytenr = btrfs_stack_header_bytenr(
- (struct btrfs_header *)
- mapped_datav[0]);
- btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
- dev_bytenr);
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "written block @%llu (%pg/%llu/?) !found in hash table, M\n",
- bytenr, dev_state->bdev, dev_bytenr);
- }
-
- block_ctx.dev = dev_state;
- block_ctx.dev_bytenr = dev_bytenr;
- block_ctx.start = bytenr;
- block_ctx.len = processed_len;
- block_ctx.pagev = NULL;
- block_ctx.mem_to_free = NULL;
- block_ctx.datav = mapped_datav;
-
- block = btrfsic_block_alloc();
- if (NULL == block) {
- btrfsic_release_block_ctx(&block_ctx);
- goto continue_loop;
- }
- block->dev_state = dev_state;
- block->dev_bytenr = dev_bytenr;
- block->logical_bytenr = bytenr;
- block->is_metadata = is_metadata;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->mirror_num = 0; /* unknown */
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = submit_bio_bh_rw;
- if (NULL != bio) {
- block->is_iodone = 0;
- BUG_ON(NULL == bio_is_patched);
- if (!*bio_is_patched) {
- block->orig_bio_private = bio->bi_private;
- block->orig_bio_end_io = bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- *bio_is_patched = 1;
- } else {
- struct btrfsic_block *chained_block =
- (struct btrfsic_block *)
- bio->bi_private;
-
- BUG_ON(NULL == chained_block);
- block->orig_bio_private =
- chained_block->orig_bio_private;
- block->orig_bio_end_io =
- chained_block->orig_bio_end_io;
- block->next_in_same_bio = chained_block;
- bio->bi_private = block;
- }
- } else {
- block->is_iodone = 1;
- block->orig_bio_private = NULL;
- block->orig_bio_end_io = NULL;
- block->next_in_same_bio = NULL;
- }
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("new written %c-block @%llu (%pg/%llu/%d)\n",
- is_metadata ? 'M' : 'D',
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num);
- list_add(&block->all_blocks_node, &state->all_blocks_list);
- btrfsic_block_hashtable_add(block, &state->block_hashtable);
-
- if (is_metadata) {
- ret = btrfsic_process_metablock(state, block,
- &block_ctx, 0, 0);
- if (ret)
- pr_info("btrfsic: process_metablock(root @%llu) failed!\n",
- dev_bytenr);
- }
- btrfsic_release_block_ctx(&block_ctx);
- }
-
-continue_loop:
- BUG_ON(!processed_len);
- dev_bytenr += processed_len;
- mapped_datav += processed_len >> PAGE_SHIFT;
- num_pages -= processed_len >> PAGE_SHIFT;
- goto again;
-}
-
-static void btrfsic_bio_end_io(struct bio *bp)
-{
- struct btrfsic_block *block = bp->bi_private;
- int iodone_w_error;
-
- /* mutex is not held! This is not save if IO is not yet completed
- * on umount */
- iodone_w_error = 0;
- if (bp->bi_status)
- iodone_w_error = 1;
-
- BUG_ON(NULL == block);
- bp->bi_private = block->orig_bio_private;
- bp->bi_end_io = block->orig_bio_end_io;
-
- do {
- struct btrfsic_block *next_block;
- struct btrfsic_dev_state *const dev_state = block->dev_state;
-
- if ((dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n",
- bp->bi_status,
- btrfsic_get_block_type(dev_state->state, block),
- block->logical_bytenr, dev_state->bdev,
- block->dev_bytenr, block->mirror_num);
- next_block = block->next_in_same_bio;
- block->iodone_w_error = iodone_w_error;
- if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
- dev_state->last_flush_gen++;
- if ((dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io() new %pg flush_gen=%llu\n",
- dev_state->bdev,
- dev_state->last_flush_gen);
- }
- if (block->submit_bio_bh_rw & REQ_FUA)
- block->flush_gen = 0; /* FUA completed means block is
- * on disk */
- block->is_iodone = 1; /* for FLUSH, this releases the block */
- block = next_block;
- } while (NULL != block);
-
- bp->bi_end_io(bp);
-}
-
-static int btrfsic_process_written_superblock(
- struct btrfsic_state *state,
- struct btrfsic_block *const superblock,
- struct btrfs_super_block *const super_hdr)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- int pass;
-
- superblock->generation = btrfs_super_generation(super_hdr);
- if (!(superblock->generation > state->max_superblock_generation ||
- 0 == state->max_superblock_generation)) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info(
- "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n",
- superblock->logical_bytenr,
- superblock->dev_state->bdev,
- superblock->dev_bytenr, superblock->mirror_num,
- btrfs_super_generation(super_hdr),
- state->max_superblock_generation);
- } else {
- if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info(
- "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n",
- superblock->logical_bytenr,
- superblock->dev_state->bdev,
- superblock->dev_bytenr, superblock->mirror_num,
- btrfs_super_generation(super_hdr),
- state->max_superblock_generation);
-
- state->max_superblock_generation =
- btrfs_super_generation(super_hdr);
- state->latest_superblock = superblock;
- }
-
- for (pass = 0; pass < 3; pass++) {
- int ret;
- u64 next_bytenr;
- struct btrfsic_block *next_block;
- struct btrfsic_block_data_ctx tmp_next_block_ctx;
- struct btrfsic_block_link *l;
- int num_copies;
- int mirror_num;
- const char *additional_string = NULL;
- struct btrfs_disk_key tmp_disk_key = {0};
-
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_ROOT_ITEM_KEY);
- btrfs_set_disk_key_objectid(&tmp_disk_key, 0);
-
- switch (pass) {
- case 0:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_ROOT_TREE_OBJECTID);
- additional_string = "root ";
- next_bytenr = btrfs_super_root(super_hdr);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("root@%llu\n", next_bytenr);
- break;
- case 1:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_CHUNK_TREE_OBJECTID);
- additional_string = "chunk ";
- next_bytenr = btrfs_super_chunk_root(super_hdr);
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("chunk@%llu\n", next_bytenr);
- break;
- case 2:
- btrfs_set_disk_key_objectid(&tmp_disk_key,
- BTRFS_TREE_LOG_OBJECTID);
- additional_string = "log ";
- next_bytenr = btrfs_super_log_root(super_hdr);
- if (0 == next_bytenr)
- continue;
- if (state->print_mask &
- BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
- pr_info("log@%llu\n", next_bytenr);
- break;
- }
-
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
- BTRFS_SUPER_INFO_SIZE);
- if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
- pr_info("num_copies(log_bytenr=%llu) = %d\n",
- next_bytenr, num_copies);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- int was_created;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic_process_written_superblock(mirror_num=%d)\n", mirror_num);
- ret = btrfsic_map_block(state, next_bytenr,
- BTRFS_SUPER_INFO_SIZE,
- &tmp_next_block_ctx,
- mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
- next_bytenr, mirror_num);
- return -1;
- }
-
- next_block = btrfsic_block_lookup_or_add(
- state,
- &tmp_next_block_ctx,
- additional_string,
- 1, 0, 1,
- mirror_num,
- &was_created);
- if (NULL == next_block) {
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- return -1;
- }
-
- next_block->disk_key = tmp_disk_key;
- if (was_created)
- next_block->generation =
- BTRFSIC_GENERATION_UNKNOWN;
- l = btrfsic_block_link_lookup_or_add(
- state,
- &tmp_next_block_ctx,
- next_block,
- superblock,
- BTRFSIC_GENERATION_UNKNOWN);
- btrfsic_release_block_ctx(&tmp_next_block_ctx);
- if (NULL == l)
- return -1;
- }
- }
-
- if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)))
- btrfsic_dump_tree(state);
-
- return 0;
-}
-
-static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
- struct btrfsic_block *const block,
- int recursion_level)
-{
- const struct btrfsic_block_link *l;
- int ret = 0;
-
- if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
- /*
- * Note that this situation can happen and does not
- * indicate an error in regular cases. It happens
- * when disk blocks are freed and later reused.
- * The check-integrity module is not aware of any
- * block free operations, it just recognizes block
- * write operations. Therefore it keeps the linkage
- * information for a block until a block is
- * rewritten. This can temporarily cause incorrect
- * and even circular linkage information. This
- * causes no harm unless such blocks are referenced
- * by the most recent super block.
- */
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic: abort cyclic linkage (case 1).\n");
-
- return ret;
- }
-
- /*
- * This algorithm is recursive because the amount of used stack
- * space is very small and the max recursion depth is limited.
- */
- list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n",
- recursion_level,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- if (l->block_ref_to->never_written) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- ret = -1;
- } else if (!l->block_ref_to->is_iodone) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- ret = -1;
- } else if (l->block_ref_to->iodone_w_error) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
- ret = -1;
- } else if (l->parent_generation !=
- l->block_ref_to->generation &&
- BTRFSIC_GENERATION_UNKNOWN !=
- l->parent_generation &&
- BTRFSIC_GENERATION_UNKNOWN !=
- l->block_ref_to->generation) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num,
- l->block_ref_to->generation,
- l->parent_generation);
- ret = -1;
- } else if (l->block_ref_to->flush_gen >
- l->block_ref_to->dev_state->last_flush_gen) {
- pr_info(
-"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev,
- l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num, block->flush_gen,
- l->block_ref_to->dev_state->last_flush_gen);
- ret = -1;
- } else if (-1 == btrfsic_check_all_ref_blocks(state,
- l->block_ref_to,
- recursion_level +
- 1)) {
- ret = -1;
- }
- }
-
- return ret;
-}
-
-static int btrfsic_is_block_ref_by_superblock(
- const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int recursion_level)
-{
- const struct btrfsic_block_link *l;
-
- if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
- /* refer to comment at "abort cyclic linkage (case 1)" */
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("btrfsic: abort cyclic linkage (case 2).\n");
-
- return 0;
- }
-
- /*
- * This algorithm is recursive because the amount of used stack space
- * is very small and the max recursion depth is limited.
- */
- list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info(
- "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
- recursion_level,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num,
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr,
- l->block_ref_from->mirror_num);
- if (l->block_ref_from->is_superblock &&
- state->latest_superblock->dev_bytenr ==
- l->block_ref_from->dev_bytenr &&
- state->latest_superblock->dev_state->bdev ==
- l->block_ref_from->dev_state->bdev)
- return 1;
- else if (btrfsic_is_block_ref_by_superblock(state,
- l->block_ref_from,
- recursion_level +
- 1))
- return 1;
- }
-
- return 0;
-}
-
-static void btrfsic_print_add_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l)
-{
- pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
-}
-
-static void btrfsic_print_rem_link(const struct btrfsic_state *state,
- const struct btrfsic_block_link *l)
-{
- pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
- l->ref_cnt,
- btrfsic_get_block_type(state, l->block_ref_from),
- l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->bdev,
- l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
- btrfsic_get_block_type(state, l->block_ref_to),
- l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
- l->block_ref_to->mirror_num);
-}
-
-static char btrfsic_get_block_type(const struct btrfsic_state *state,
- const struct btrfsic_block *block)
-{
- if (block->is_superblock &&
- state->latest_superblock->dev_bytenr == block->dev_bytenr &&
- state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
- return 'S';
- else if (block->is_superblock)
- return 's';
- else if (block->is_metadata)
- return 'M';
- else
- return 'D';
-}
-
-static void btrfsic_dump_tree(const struct btrfsic_state *state)
-{
- btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
-}
-
-static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
- const struct btrfsic_block *block,
- int indent_level)
-{
- const struct btrfsic_block_link *l;
- int indent_add;
- static char buf[80];
- int cursor_position;
-
- /*
- * Should better fill an on-stack buffer with a complete line and
- * dump it at once when it is time to print a newline character.
- */
-
- /*
- * This algorithm is recursive because the amount of used stack space
- * is very small and the max recursion depth is limited.
- */
- indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)",
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->bdev,
- block->dev_bytenr, block->mirror_num);
- if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
- printk("[...]\n");
- return;
- }
- printk(buf);
- indent_level += indent_add;
- if (list_empty(&block->ref_to_list)) {
- printk("\n");
- return;
- }
- if (block->mirror_num > 1 &&
- !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
- printk(" [...]\n");
- return;
- }
-
- cursor_position = indent_level;
- list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
- while (cursor_position < indent_level) {
- printk(" ");
- cursor_position++;
- }
- if (l->ref_cnt > 1)
- indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
- else
- indent_add = sprintf(buf, " --> ");
- if (indent_level + indent_add >
- BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
- printk("[...]\n");
- cursor_position = 0;
- continue;
- }
-
- printk(buf);
-
- btrfsic_dump_tree_sub(state, l->block_ref_to,
- indent_level + indent_add);
- cursor_position = 0;
- }
-}
-
-static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *next_block_ctx,
- struct btrfsic_block *next_block,
- struct btrfsic_block *from_block,
- u64 parent_generation)
-{
- struct btrfsic_block_link *l;
-
- l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
- next_block_ctx->dev_bytenr,
- from_block->dev_state->bdev,
- from_block->dev_bytenr,
- &state->block_link_hashtable);
- if (NULL == l) {
- l = btrfsic_block_link_alloc();
- if (!l)
- return NULL;
-
- l->block_ref_to = next_block;
- l->block_ref_from = from_block;
- l->ref_cnt = 1;
- l->parent_generation = parent_generation;
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
-
- list_add(&l->node_ref_to, &from_block->ref_to_list);
- list_add(&l->node_ref_from, &next_block->ref_from_list);
-
- btrfsic_block_link_hashtable_add(l,
- &state->block_link_hashtable);
- } else {
- l->ref_cnt++;
- l->parent_generation = parent_generation;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_add_link(state, l);
- }
-
- return l;
-}
-
-static struct btrfsic_block *btrfsic_block_lookup_or_add(
- struct btrfsic_state *state,
- struct btrfsic_block_data_ctx *block_ctx,
- const char *additional_string,
- int is_metadata,
- int is_iodone,
- int never_written,
- int mirror_num,
- int *was_created)
-{
- struct btrfsic_block *block;
-
- block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
- block_ctx->dev_bytenr,
- &state->block_hashtable);
- if (NULL == block) {
- struct btrfsic_dev_state *dev_state;
-
- block = btrfsic_block_alloc();
- if (!block)
- return NULL;
-
- dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev);
- if (NULL == dev_state) {
- pr_info("btrfsic: error, lookup dev_state failed!\n");
- btrfsic_block_free(block);
- return NULL;
- }
- block->dev_state = dev_state;
- block->dev_bytenr = block_ctx->dev_bytenr;
- block->logical_bytenr = block_ctx->start;
- block->is_metadata = is_metadata;
- block->is_iodone = is_iodone;
- block->never_written = never_written;
- block->mirror_num = mirror_num;
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n",
- additional_string,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr, dev_state->bdev,
- block->dev_bytenr, mirror_num);
- list_add(&block->all_blocks_node, &state->all_blocks_list);
- btrfsic_block_hashtable_add(block, &state->block_hashtable);
- if (NULL != was_created)
- *was_created = 1;
- } else {
- if (NULL != was_created)
- *was_created = 0;
- }
-
- return block;
-}
-
-static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
- u64 bytenr,
- struct btrfsic_dev_state *dev_state,
- u64 dev_bytenr)
-{
- struct btrfs_fs_info *fs_info = state->fs_info;
- struct btrfsic_block_data_ctx block_ctx;
- int num_copies;
- int mirror_num;
- int match = 0;
- int ret;
-
- num_copies = btrfs_num_copies(fs_info, bytenr, state->metablock_size);
-
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- ret = btrfsic_map_block(state, bytenr, state->metablock_size,
- &block_ctx, mirror_num);
- if (ret) {
- pr_info("btrfsic: btrfsic_map_block(logical @%llu, mirror %d) failed!\n",
- bytenr, mirror_num);
- continue;
- }
-
- if (dev_state->bdev == block_ctx.dev->bdev &&
- dev_bytenr == block_ctx.dev_bytenr) {
- match++;
- btrfsic_release_block_ctx(&block_ctx);
- break;
- }
- btrfsic_release_block_ctx(&block_ctx);
- }
-
- if (WARN_ON(!match)) {
- pr_info(
-"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n",
- bytenr, dev_state->bdev, dev_bytenr);
- for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
- ret = btrfsic_map_block(state, bytenr,
- state->metablock_size,
- &block_ctx, mirror_num);
- if (ret)
- continue;
-
- pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n",
- bytenr, block_ctx.dev->bdev,
- block_ctx.dev_bytenr, mirror_num);
- }
- }
-}
-
-static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
-{
- return btrfsic_dev_state_hashtable_lookup(dev,
- &btrfsic_dev_state_hashtable);
-}
-
-static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
-{
- unsigned int segs = bio_segments(bio);
- u64 dev_bytenr = 512 * bio->bi_iter.bi_sector;
- u64 cur_bytenr = dev_bytenr;
- struct bvec_iter iter;
- struct bio_vec bvec;
- char **mapped_datav;
- int bio_is_patched = 0;
- int i = 0;
-
- if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info(
-"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
- bio_op(bio), bio->bi_opf, segs,
- bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
-
- mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS);
- if (!mapped_datav)
- return;
-
- bio_for_each_segment(bvec, bio, iter) {
- BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = page_address(bvec.bv_page);
- i++;
-
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
- pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
- i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
- cur_bytenr += bvec.bv_len;
- }
-
- btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs,
- bio, &bio_is_patched, bio->bi_opf);
- kfree(mapped_datav);
-}
-
-static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
-{
- if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_bdev);
-
- if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
- struct btrfsic_block *const block =
- &dev_state->dummy_block_for_bio_bh_flush;
-
- block->is_iodone = 0;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = bio->bi_opf;
- block->orig_bio_private = bio->bi_private;
- block->orig_bio_end_io = bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- } else if ((dev_state->state->print_mask &
- (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE))) {
- pr_info(
-"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->bdev);
- }
-}
-
-void btrfsic_check_bio(struct bio *bio)
-{
- struct btrfsic_dev_state *dev_state;
-
- if (!btrfsic_is_initialized)
- return;
-
- /*
- * We can be called before btrfsic_mount, so there might not be a
- * dev_state.
- */
- dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
- mutex_lock(&btrfsic_mutex);
- if (dev_state) {
- if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio))
- btrfsic_check_write_bio(bio, dev_state);
- else if (bio->bi_opf & REQ_PREFLUSH)
- btrfsic_check_flush_bio(bio, dev_state);
- }
- mutex_unlock(&btrfsic_mutex);
-}
-
-int btrfsic_mount(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices,
- int including_extent_data, u32 print_mask)
-{
- int ret;
- struct btrfsic_state *state;
- struct list_head *dev_head = &fs_devices->devices;
- struct btrfs_device *device;
-
- if (!PAGE_ALIGNED(fs_info->nodesize)) {
- pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n",
- fs_info->nodesize, PAGE_SIZE);
- return -1;
- }
- if (!PAGE_ALIGNED(fs_info->sectorsize)) {
- pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n",
- fs_info->sectorsize, PAGE_SIZE);
- return -1;
- }
- state = kvzalloc(sizeof(*state), GFP_KERNEL);
- if (!state)
- return -ENOMEM;
-
- if (!btrfsic_is_initialized) {
- mutex_init(&btrfsic_mutex);
- btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
- btrfsic_is_initialized = 1;
- }
- mutex_lock(&btrfsic_mutex);
- state->fs_info = fs_info;
- state->print_mask = print_mask;
- state->include_extent_data = including_extent_data;
- state->metablock_size = fs_info->nodesize;
- state->datablock_size = fs_info->sectorsize;
- INIT_LIST_HEAD(&state->all_blocks_list);
- btrfsic_block_hashtable_init(&state->block_hashtable);
- btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
- state->max_superblock_generation = 0;
- state->latest_superblock = NULL;
-
- list_for_each_entry(device, dev_head, dev_list) {
- struct btrfsic_dev_state *ds;
-
- if (!device->bdev || !device->name)
- continue;
-
- ds = btrfsic_dev_state_alloc();
- if (NULL == ds) {
- mutex_unlock(&btrfsic_mutex);
- return -ENOMEM;
- }
- ds->bdev = device->bdev;
- ds->state = state;
- btrfsic_dev_state_hashtable_add(ds,
- &btrfsic_dev_state_hashtable);
- }
-
- ret = btrfsic_process_superblock(state, fs_devices);
- if (0 != ret) {
- mutex_unlock(&btrfsic_mutex);
- btrfsic_unmount(fs_devices);
- return ret;
- }
-
- if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
- btrfsic_dump_database(state);
- if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
- btrfsic_dump_tree(state);
-
- mutex_unlock(&btrfsic_mutex);
- return 0;
-}
-
-void btrfsic_unmount(struct btrfs_fs_devices *fs_devices)
-{
- struct btrfsic_block *b_all, *tmp_all;
- struct btrfsic_state *state;
- struct list_head *dev_head = &fs_devices->devices;
- struct btrfs_device *device;
-
- if (!btrfsic_is_initialized)
- return;
-
- mutex_lock(&btrfsic_mutex);
-
- state = NULL;
- list_for_each_entry(device, dev_head, dev_list) {
- struct btrfsic_dev_state *ds;
-
- if (!device->bdev || !device->name)
- continue;
-
- ds = btrfsic_dev_state_hashtable_lookup(
- device->bdev->bd_dev,
- &btrfsic_dev_state_hashtable);
- if (NULL != ds) {
- state = ds->state;
- btrfsic_dev_state_hashtable_remove(ds);
- btrfsic_dev_state_free(ds);
- }
- }
-
- if (NULL == state) {
- pr_info("btrfsic: error, cannot find state information on umount!\n");
- mutex_unlock(&btrfsic_mutex);
- return;
- }
-
- /*
- * Don't care about keeping the lists' state up to date,
- * just free all memory that was allocated dynamically.
- * Free the blocks and the block_links.
- */
- list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list,
- all_blocks_node) {
- struct btrfsic_block_link *l, *tmp;
-
- list_for_each_entry_safe(l, tmp, &b_all->ref_to_list,
- node_ref_to) {
- if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- btrfsic_print_rem_link(state, l);
-
- l->ref_cnt--;
- if (0 == l->ref_cnt)
- btrfsic_block_link_free(l);
- }
-
- if (b_all->is_iodone || b_all->never_written)
- btrfsic_block_free(b_all);
- else
- pr_info(
-"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n",
- btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->bdev,
- b_all->dev_bytenr, b_all->mirror_num);
- }
-
- mutex_unlock(&btrfsic_mutex);
-
- kvfree(state);
-}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
deleted file mode 100644
index e4c8aed7996f..000000000000
--- a/fs/btrfs/check-integrity.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) STRATO AG 2011. All rights reserved.
- */
-
-#ifndef BTRFS_CHECK_INTEGRITY_H
-#define BTRFS_CHECK_INTEGRITY_H
-
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-void btrfsic_check_bio(struct bio *bio);
-#else
-static inline void btrfsic_check_bio(struct bio *bio) { }
-#endif
-
-int btrfsic_mount(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices,
- int including_extent_data, u32 print_mask);
-void btrfsic_unmount(struct btrfs_fs_devices *fs_devices);
-
-#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8818ed5c390f..19b22b4653c8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -193,12 +193,12 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct folio_batch fbatch;
- const int errno = blk_status_to_errno(cb->bbio.bio.bi_status);
+ const int error = blk_status_to_errno(cb->bbio.bio.bi_status);
int i;
int ret;
- if (errno)
- mapping_set_error(inode->i_mapping, errno);
+ if (error)
+ mapping_set_error(inode->i_mapping, error);
folio_batch_init(&fbatch);
while (index <= end_index) {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a4cb4b642987..2a9344a3fcee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -230,9 +230,9 @@ noinline void btrfs_release_path(struct btrfs_path *p)
* cause could be a bug, eg. due to ENOSPC, and not for common errors that are
* caused by external factors.
*/
-bool __cold abort_should_print_stack(int errno)
+bool __cold abort_should_print_stack(int error)
{
- switch (errno) {
+ switch (error) {
case -EIO:
case -EROFS:
case -ENOMEM:
@@ -316,6 +316,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
int ret = 0;
int level;
struct btrfs_disk_key disk_key;
+ u64 reloc_src_root = 0;
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
@@ -328,9 +329,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
else
btrfs_node_key(buf, &disk_key, 0);
+ if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ reloc_src_root = btrfs_header_owner(buf);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
&disk_key, level, buf->start, 0,
- BTRFS_NESTING_NEW_ROOT);
+ reloc_src_root, BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -359,7 +362,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
return ret;
}
- btrfs_mark_buffer_dirty(cow);
+ btrfs_mark_buffer_dirty(trans, cow);
*cow_ret = cow;
return 0;
}
@@ -367,7 +370,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
/*
* check if the tree block can be shared by multiple trees
*/
-int btrfs_block_can_be_shared(struct btrfs_root *root,
+int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
struct extent_buffer *buf)
{
/*
@@ -376,11 +380,21 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
* not allocated by tree relocation, we know the block is not shared.
*/
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- buf != root->node && buf != root->commit_root &&
+ buf != root->node &&
(btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item) ||
- btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
- return 1;
+ btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) {
+ if (buf != root->commit_root)
+ return 1;
+ /*
+ * An extent buffer that used to be the commit root may still be
+ * shared because the tree height may have increased and it
+ * became a child of a higher level root. This can happen when
+ * snapshotting a subvolume created in the current transaction.
+ */
+ if (btrfs_header_generation(buf) == trans->transid)
+ return 1;
+ }
return 0;
}
@@ -415,7 +429,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
* are only allowed for blocks use full backrefs.
*/
- if (btrfs_block_can_be_shared(root, buf)) {
+ if (btrfs_block_can_be_shared(trans, root, buf)) {
ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
btrfs_header_level(buf), 1,
&refs, &flags);
@@ -507,13 +521,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
* bytes the allocator should try to find free next to the block it returns.
* This is just a hint and may be ignored by the allocator.
*/
-static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf,
- struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size,
- enum btrfs_lock_nesting nest)
+int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret,
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
@@ -522,6 +536,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
int last_ref = 0;
int unlock_orig = 0;
u64 parent_start = 0;
+ u64 reloc_src_root = 0;
if (*cow_ret == buf)
unlock_orig = 1;
@@ -540,12 +555,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
else
btrfs_node_key(buf, &disk_key, 0);
- if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
- parent_start = parent->start;
-
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (parent)
+ parent_start = parent->start;
+ reloc_src_root = btrfs_header_owner(buf);
+ }
cow = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, &disk_key, level,
- search_start, empty_size, nest);
+ search_start, empty_size, reloc_src_root, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -616,7 +633,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
if (last_ref) {
ret = btrfs_tree_mod_log_free_eb(buf);
if (ret) {
@@ -632,7 +649,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (unlock_orig)
btrfs_tree_unlock(buf);
free_extent_buffer_stale(buf);
- btrfs_mark_buffer_dirty(cow);
+ btrfs_mark_buffer_dirty(trans, cow);
*cow_ret = cow;
return 0;
}
@@ -668,11 +685,11 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
}
/*
- * cows a single block, see __btrfs_cow_block for the real work.
+ * COWs a single block, see btrfs_force_cow_block() for the real work.
* This version of it has extra checks so that a block isn't COWed more than
* once per transaction, as long as it hasn't been written yet
*/
-noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
+int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
@@ -682,25 +699,37 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
u64 search_start;
int ret;
- if (test_bit(BTRFS_ROOT_DELETING, &root->state))
- btrfs_err(fs_info,
- "COW'ing blocks on a fs root that's being dropped");
-
- if (trans->transaction != fs_info->running_transaction)
- WARN(1, KERN_CRIT "trans %llu running %llu\n",
- trans->transid,
- fs_info->running_transaction->transid);
+ if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ btrfs_crit(fs_info,
+ "attempt to COW block %llu on root %llu that is being deleted",
+ buf->start, btrfs_root_id(root));
+ return -EUCLEAN;
+ }
- if (trans->transid != fs_info->generation)
- WARN(1, KERN_CRIT "trans %llu running %llu\n",
- trans->transid, fs_info->generation);
+ /*
+ * COWing must happen through a running transaction, which always
+ * matches the current fs generation (it's a transaction with a state
+ * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
+ * into error state to prevent the commit of any transaction.
+ */
+ if (unlikely(trans->transaction != fs_info->running_transaction ||
+ trans->transid != fs_info->generation)) {
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ btrfs_crit(fs_info,
+"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
+ buf->start, btrfs_root_id(root), trans->transid,
+ fs_info->running_transaction->transid,
+ fs_info->generation);
+ return -EUCLEAN;
+ }
if (!should_cow_block(trans, root, buf)) {
*cow_ret = buf;
return 0;
}
- search_start = buf->start & ~((u64)SZ_1G - 1);
+ search_start = round_down(buf->start, SZ_1G);
/*
* Before CoWing this block for later modification, check if it's
@@ -709,8 +738,8 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
* Also We don't care about the error, as it's handled internally.
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
- ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0, nest);
+ ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
+ cow_ret, search_start, 0, nest);
trace_btrfs_cow_block(root, buf, *cow_ret);
@@ -719,49 +748,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
/*
- * helper function for defrag to decide if two blocks pointed to by a
- * node are actually close by
- */
-static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
-{
- if (blocknr < other && other - (blocknr + blocksize) < 32768)
- return 1;
- if (blocknr > other && blocknr - (other + blocksize) < 32768)
- return 1;
- return 0;
-}
-
-#ifdef __LITTLE_ENDIAN
-
-/*
- * Compare two keys, on little-endian the disk order is same as CPU order and
- * we can avoid the conversion.
- */
-static int comp_keys(const struct btrfs_disk_key *disk_key,
- const struct btrfs_key *k2)
-{
- const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
-
- return btrfs_comp_cpu_keys(k1, k2);
-}
-
-#else
-
-/*
- * compare two keys in a memcmp fashion
- */
-static int comp_keys(const struct btrfs_disk_key *disk,
- const struct btrfs_key *k2)
-{
- struct btrfs_key k1;
-
- btrfs_disk_key_to_cpu(&k1, disk);
-
- return btrfs_comp_cpu_keys(&k1, k2);
-}
-#endif
-
-/*
* same as comp_keys only with two btrfs_key's
*/
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
@@ -782,91 +768,6 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke
}
/*
- * this is used by the defrag code to go through all the
- * leaves pointed to by a node and reallocate them so that
- * disk order is close to key order
- */
-int btrfs_realloc_node(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *parent,
- int start_slot, u64 *last_ret,
- struct btrfs_key *progress)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_buffer *cur;
- u64 blocknr;
- u64 search_start = *last_ret;
- u64 last_block = 0;
- u64 other;
- u32 parent_nritems;
- int end_slot;
- int i;
- int err = 0;
- u32 blocksize;
- int progress_passed = 0;
- struct btrfs_disk_key disk_key;
-
- WARN_ON(trans->transaction != fs_info->running_transaction);
- WARN_ON(trans->transid != fs_info->generation);
-
- parent_nritems = btrfs_header_nritems(parent);
- blocksize = fs_info->nodesize;
- end_slot = parent_nritems - 1;
-
- if (parent_nritems <= 1)
- return 0;
-
- for (i = start_slot; i <= end_slot; i++) {
- int close = 1;
-
- btrfs_node_key(parent, &disk_key, i);
- if (!progress_passed && comp_keys(&disk_key, progress) < 0)
- continue;
-
- progress_passed = 1;
- blocknr = btrfs_node_blockptr(parent, i);
- if (last_block == 0)
- last_block = blocknr;
-
- if (i > 0) {
- other = btrfs_node_blockptr(parent, i - 1);
- close = close_blocks(blocknr, other, blocksize);
- }
- if (!close && i < end_slot) {
- other = btrfs_node_blockptr(parent, i + 1);
- close = close_blocks(blocknr, other, blocksize);
- }
- if (close) {
- last_block = blocknr;
- continue;
- }
-
- cur = btrfs_read_node_slot(parent, i);
- if (IS_ERR(cur))
- return PTR_ERR(cur);
- if (search_start == 0)
- search_start = last_block;
-
- btrfs_tree_lock(cur);
- err = __btrfs_cow_block(trans, root, cur, parent, i,
- &cur, search_start,
- min(16 * blocksize,
- (end_slot - i) * blocksize),
- BTRFS_NESTING_COW);
- if (err) {
- btrfs_tree_unlock(cur);
- free_extent_buffer(cur);
- break;
- }
- search_start = cur->start;
- last_block = cur->start;
- *last_ret = search_start;
- btrfs_tree_unlock(cur);
- free_extent_buffer(cur);
- }
- return err;
-}
-
-/*
* Search for a key in the given extent_buffer.
*
* The lower boundary for the search is specified by the slot number @first_slot.
@@ -932,7 +833,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
tmp = &unaligned;
}
- ret = comp_keys(tmp, key);
+ ret = btrfs_comp_keys(tmp, key);
if (ret < 0)
low = mid + 1;
@@ -947,19 +848,19 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
return 1;
}
-static void root_add_used(struct btrfs_root *root, u32 size)
+static void root_add_used_bytes(struct btrfs_root *root)
{
spin_lock(&root->accounting_lock);
btrfs_set_root_used(&root->root_item,
- btrfs_root_used(&root->root_item) + size);
+ btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
spin_unlock(&root->accounting_lock);
}
-static void root_sub_used(struct btrfs_root *root, u32 size)
+static void root_sub_used_bytes(struct btrfs_root *root)
{
spin_lock(&root->accounting_lock);
btrfs_set_root_used(&root->root_item,
- btrfs_root_used(&root->root_item) - size);
+ btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
spin_unlock(&root->accounting_lock);
}
@@ -1075,7 +976,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* once for the path */
free_extent_buffer(mid);
- root_sub_used(root, mid->len);
+ root_sub_used_bytes(root);
btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
@@ -1145,7 +1046,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right = NULL;
goto out;
}
- root_sub_used(root, right->len);
+ root_sub_used_bytes(root);
btrfs_free_tree_block(trans, btrfs_root_id(root), right,
0, 1);
free_extent_buffer_stale(right);
@@ -1160,7 +1061,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_set_node_key(parent, &right_key, pslot + 1);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
}
}
if (btrfs_header_nritems(mid) == 1) {
@@ -1203,7 +1104,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
mid = NULL;
goto out;
}
- root_sub_used(root, mid->len);
+ root_sub_used_bytes(root);
btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
@@ -1218,7 +1119,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_set_node_key(parent, &mid_key, pslot);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
}
/* update the path */
@@ -1325,7 +1226,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
return ret;
}
btrfs_set_node_key(parent, &disk_key, pslot);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
if (btrfs_header_nritems(left) > orig_slot) {
path->nodes[level] = left;
path->slots[level + 1] -= 1;
@@ -1385,7 +1286,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
return ret;
}
btrfs_set_node_key(parent, &disk_key, pslot + 1);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
if (btrfs_header_nritems(mid) <= orig_slot) {
path->nodes[level] = right;
@@ -1969,7 +1870,7 @@ static int search_leaf(struct btrfs_trans_handle *trans,
* the extent buffer's header and we have recently accessed
* the header's level field.
*/
- ret = comp_keys(&first_key, key);
+ ret = btrfs_comp_keys(&first_key, key);
if (ret < 0) {
/*
* The first key is smaller than the key we want
@@ -2054,8 +1955,8 @@ static int search_leaf(struct btrfs_trans_handle *trans,
}
/*
- * btrfs_search_slot - look for a key in a tree and perform necessary
- * modifications to preserve tree invariants.
+ * Look for a key in a tree and perform necessary modifications to preserve
+ * tree invariants.
*
* @trans: Handle of transaction, used when modifying the tree
* @p: Holds all btree nodes along the search path
@@ -2478,7 +2379,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
*/
if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
- ret = comp_keys(&found_key, &orig_key);
+ ret = btrfs_comp_keys(&found_key, &orig_key);
if (ret == 0) {
if (path->slots[0] > 0) {
path->slots[0]--;
@@ -2493,7 +2394,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
}
btrfs_item_key(path->nodes[0], &found_key, 0);
- ret = comp_keys(&found_key, &key);
+ ret = btrfs_comp_keys(&found_key, &key);
/*
* We might have had an item with the previous key in the tree right
* before we released our path. And after we released our path, that
@@ -2641,7 +2542,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
* higher levels
*
*/
-static void fixup_low_keys(struct btrfs_path *path,
+static void fixup_low_keys(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
struct btrfs_disk_key *key, int level)
{
int i;
@@ -2658,7 +2560,7 @@ static void fixup_low_keys(struct btrfs_path *path,
BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(t, key, tslot);
- btrfs_mark_buffer_dirty(path->nodes[i]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[i]);
if (tslot != 0)
break;
}
@@ -2670,10 +2572,11 @@ static void fixup_low_keys(struct btrfs_path *path,
* This function isn't completely safe. It's the caller's responsibility
* that the new key won't break the order
*/
-void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const struct btrfs_key *new_key)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_disk_key disk_key;
struct extent_buffer *eb;
int slot;
@@ -2682,7 +2585,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
slot = path->slots[0];
if (slot > 0) {
btrfs_item_key(eb, &disk_key, slot - 1);
- if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
+ if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
btrfs_print_leaf(eb);
btrfs_crit(fs_info,
"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
@@ -2696,7 +2599,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
}
if (slot < btrfs_header_nritems(eb) - 1) {
btrfs_item_key(eb, &disk_key, slot + 1);
- if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
+ if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
btrfs_print_leaf(eb);
btrfs_crit(fs_info,
"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
@@ -2711,9 +2614,9 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
btrfs_cpu_key_to_disk(&disk_key, new_key);
btrfs_set_item_key(eb, &disk_key, slot);
- btrfs_mark_buffer_dirty(eb);
+ btrfs_mark_buffer_dirty(trans, eb);
if (slot == 0)
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
/*
@@ -2844,8 +2747,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
}
btrfs_set_header_nritems(src, src_nritems - push_items);
btrfs_set_header_nritems(dst, dst_nritems + push_items);
- btrfs_mark_buffer_dirty(src);
- btrfs_mark_buffer_dirty(dst);
+ btrfs_mark_buffer_dirty(trans, src);
+ btrfs_mark_buffer_dirty(trans, dst);
return ret;
}
@@ -2920,8 +2823,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(src, src_nritems - push_items);
btrfs_set_header_nritems(dst, dst_nritems + push_items);
- btrfs_mark_buffer_dirty(src);
- btrfs_mark_buffer_dirty(dst);
+ btrfs_mark_buffer_dirty(trans, src);
+ btrfs_mark_buffer_dirty(trans, dst);
return ret;
}
@@ -2937,7 +2840,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int level)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
u64 lower_gen;
struct extent_buffer *lower;
struct extent_buffer *c;
@@ -2956,11 +2858,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&lower_key, level, root->node->start, 0,
- BTRFS_NESTING_NEW_ROOT);
+ 0, BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
- root_add_used(root, fs_info->nodesize);
+ root_add_used_bytes(root);
btrfs_set_header_nritems(c, 1);
btrfs_set_node_key(c, &lower_key, 0);
@@ -2970,7 +2872,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_set_node_ptr_generation(c, 0, lower_gen);
- btrfs_mark_buffer_dirty(c);
+ btrfs_mark_buffer_dirty(trans, c);
old = root->node;
ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
@@ -3042,7 +2944,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans,
WARN_ON(trans->transid == 0);
btrfs_set_node_ptr_generation(lower, slot, trans->transid);
btrfs_set_header_nritems(lower, nritems + 1);
- btrfs_mark_buffer_dirty(lower);
+ btrfs_mark_buffer_dirty(trans, lower);
return 0;
}
@@ -3100,11 +3002,11 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&disk_key, level, c->start, 0,
- BTRFS_NESTING_SPLIT);
+ 0, BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
- root_add_used(root, fs_info->nodesize);
+ root_add_used_bytes(root);
ASSERT(btrfs_header_level(c) == level);
ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
@@ -3121,8 +3023,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(split, c_nritems - mid);
btrfs_set_header_nritems(c, mid);
- btrfs_mark_buffer_dirty(c);
- btrfs_mark_buffer_dirty(split);
+ btrfs_mark_buffer_dirty(trans, c);
+ btrfs_mark_buffer_dirty(trans, split);
ret = insert_ptr(trans, path, &disk_key, split->start,
path->slots[level + 1] + 1, level + 1);
@@ -3288,15 +3190,15 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(left, left_nritems);
if (left_nritems)
- btrfs_mark_buffer_dirty(left);
+ btrfs_mark_buffer_dirty(trans, left);
else
btrfs_clear_buffer_dirty(trans, left);
- btrfs_mark_buffer_dirty(right);
+ btrfs_mark_buffer_dirty(trans, right);
btrfs_item_key(right, &disk_key, 0);
btrfs_set_node_key(upper, &disk_key, slot + 1);
- btrfs_mark_buffer_dirty(upper);
+ btrfs_mark_buffer_dirty(trans, upper);
/* then fixup the leaf pointer in the path */
if (path->slots[0] >= left_nritems) {
@@ -3508,14 +3410,14 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_set_token_item_offset(&token, i, push_space);
}
- btrfs_mark_buffer_dirty(left);
+ btrfs_mark_buffer_dirty(trans, left);
if (right_nritems)
- btrfs_mark_buffer_dirty(right);
+ btrfs_mark_buffer_dirty(trans, right);
else
btrfs_clear_buffer_dirty(trans, right);
btrfs_item_key(right, &disk_key, 0);
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
@@ -3646,8 +3548,8 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
if (ret < 0)
return ret;
- btrfs_mark_buffer_dirty(right);
- btrfs_mark_buffer_dirty(l);
+ btrfs_mark_buffer_dirty(trans, right);
+ btrfs_mark_buffer_dirty(trans, l);
BUG_ON(path->slots[0] != slot);
if (mid <= slot) {
@@ -3851,13 +3753,13 @@ again:
* use BTRFS_NESTING_NEW_ROOT.
*/
right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
- &disk_key, 0, l->start, 0,
+ &disk_key, 0, l->start, 0, 0,
num_doubles ? BTRFS_NESTING_NEW_ROOT :
BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
- root_add_used(root, fs_info->nodesize);
+ root_add_used_bytes(root);
if (split == 0) {
if (mid <= slot) {
@@ -3888,7 +3790,7 @@ again:
path->nodes[0] = right;
path->slots[0] = 0;
if (path->slots[1] == 0)
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
/*
* We create a new leaf 'right' for the required ins_len and
@@ -3987,7 +3889,8 @@ err:
return ret;
}
-static noinline int split_item(struct btrfs_path *path,
+static noinline int split_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
const struct btrfs_key *new_key,
unsigned long split_offset)
{
@@ -4046,7 +3949,7 @@ static noinline int split_item(struct btrfs_path *path,
write_extent_buffer(leaf, buf + split_offset,
btrfs_item_ptr_offset(leaf, slot),
item_size - split_offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
BUG_ON(btrfs_leaf_free_space(leaf) < 0);
kfree(buf);
@@ -4080,7 +3983,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = split_item(path, new_key, split_offset);
+ ret = split_item(trans, path, new_key, split_offset);
return ret;
}
@@ -4090,7 +3993,8 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
* off the end of the item or if we shift the item to chop bytes off
* the front.
*/
-void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
+void btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
@@ -4166,11 +4070,11 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
btrfs_set_item_key(leaf, &disk_key, slot);
if (slot == 0)
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
btrfs_set_item_size(leaf, slot, new_size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf);
@@ -4181,7 +4085,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
/*
* make the item pointed to by the path bigger, data_size is the added size.
*/
-void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
+void btrfs_extend_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u32 data_size)
{
int slot;
struct extent_buffer *leaf;
@@ -4231,7 +4136,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
data_end = old_data;
old_size = btrfs_item_size(leaf, slot);
btrfs_set_item_size(leaf, slot, old_size + data_size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf);
@@ -4242,6 +4147,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
/*
* Make space in the node before inserting one or more items.
*
+ * @trans: transaction handle
* @root: root we are inserting items to
* @path: points to the leaf/slot where we are going to insert new items
* @batch: information about the batch of items to insert
@@ -4249,7 +4155,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
* Main purpose is to save stack depth by doing the bulk of the work in a
* function that doesn't call btrfs_search_slot
*/
-static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+static void setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_item_batch *batch)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4269,7 +4176,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
*/
if (path->slots[0] == 0) {
btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
@@ -4328,7 +4235,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
}
btrfs_set_header_nritems(leaf, nritems + batch->nr);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf);
@@ -4339,12 +4246,14 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
/*
* Insert a new item into a leaf.
*
+ * @trans: Transaction handle.
* @root: The root of the btree.
* @path: A path pointing to the target leaf and slot.
* @key: The key of the new item.
* @data_size: The size of the data associated with the new key.
*/
-void btrfs_setup_item_for_insert(struct btrfs_root *root,
+void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
struct btrfs_path *path,
const struct btrfs_key *key,
u32 data_size)
@@ -4356,7 +4265,7 @@ void btrfs_setup_item_for_insert(struct btrfs_root *root,
batch.total_data_size = data_size;
batch.nr = 1;
- setup_items_for_insert(root, path, &batch);
+ setup_items_for_insert(trans, root, path, &batch);
}
/*
@@ -4382,7 +4291,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, batch);
+ setup_items_for_insert(trans, root, path, batch);
return 0;
}
@@ -4407,7 +4316,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
leaf = path->nodes[0];
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
write_extent_buffer(leaf, data, ptr, data_size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
btrfs_free_path(path);
return ret;
@@ -4438,7 +4347,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
return ret;
path->slots[0]++;
- btrfs_setup_item_for_insert(root, path, new_key, item_size);
+ btrfs_setup_item_for_insert(trans, root, path, new_key, item_size);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -4496,9 +4405,9 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_node_key(parent, &disk_key, 0);
- fixup_low_keys(path, &disk_key, level + 1);
+ fixup_low_keys(trans, path, &disk_key, level + 1);
}
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
return 0;
}
@@ -4530,7 +4439,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
*/
btrfs_unlock_up_safe(path, 0);
- root_sub_used(root, leaf->len);
+ root_sub_used_bytes(root);
atomic_inc(&leaf->refs);
btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
@@ -4595,7 +4504,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_item_key(leaf, &disk_key, 0);
- fixup_low_keys(path, &disk_key, 1);
+ fixup_low_keys(trans, path, &disk_key, 1);
}
/*
@@ -4660,11 +4569,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* dirtied this buffer
*/
if (path->nodes[0] == leaf)
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
free_extent_buffer(leaf);
}
} else {
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
}
return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9419f4e37a58..196c005c31f6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -6,37 +6,10 @@
#ifndef BTRFS_CTREE_H
#define BTRFS_CTREE_H
-#include <linux/mm.h>
-#include <linux/sched/signal.h>
-#include <linux/highmem.h>
-#include <linux/fs.h>
-#include <linux/rwsem.h>
-#include <linux/semaphore.h>
-#include <linux/completion.h>
-#include <linux/backing-dev.h>
-#include <linux/wait.h>
-#include <linux/slab.h>
-#include <trace/events/btrfs.h>
-#include <asm/unaligned.h>
#include <linux/pagemap.h>
-#include <linux/btrfs.h>
-#include <linux/btrfs_tree.h>
-#include <linux/workqueue.h>
-#include <linux/security.h>
-#include <linux/sizes.h>
-#include <linux/dynamic_debug.h>
-#include <linux/refcount.h>
-#include <linux/crc32c.h>
-#include <linux/iomap.h>
-#include <linux/fscrypt.h>
-#include "extent-io-tree.h"
-#include "extent_io.h"
-#include "extent_map.h"
-#include "async-thread.h"
-#include "block-rsv.h"
#include "locking.h"
-#include "misc.h"
#include "fs.h"
+#include "accessors.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
@@ -218,10 +191,22 @@ struct btrfs_root {
atomic_t log_commit[2];
/* Used only for log trees of subvolumes, not for the log root tree */
atomic_t log_batch;
+ /*
+ * Protected by the 'log_mutex' lock but can be read without holding
+ * that lock to avoid unnecessary lock contention, in which case it
+ * should be read using btrfs_get_root_log_transid() except if it's a
+ * log tree in which case it can be directly accessed. Updates to this
+ * field should always use btrfs_set_root_log_transid(), except for log
+ * trees where the field can be updated directly.
+ */
int log_transid;
/* No matter the commit succeeds or not*/
int log_transid_committed;
- /* Just be updated when the commit succeeds. */
+ /*
+ * Just be updated when the commit succeeds. Use
+ * btrfs_get_root_last_log_commit() and btrfs_set_root_last_log_commit()
+ * to access this field.
+ */
int last_log_commit;
pid_t log_start_pid;
@@ -326,6 +311,9 @@ struct btrfs_root {
/* Used only by log trees, when logging csum items */
struct extent_io_tree log_csum_range;
+ /* Used in simple quotas, track root during relocation. */
+ u64 relocation_src_root;
+
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
@@ -352,6 +340,26 @@ static inline u64 btrfs_root_id(const struct btrfs_root *root)
return root->root_key.objectid;
}
+static inline int btrfs_get_root_log_transid(const struct btrfs_root *root)
+{
+ return READ_ONCE(root->log_transid);
+}
+
+static inline void btrfs_set_root_log_transid(struct btrfs_root *root, int log_transid)
+{
+ WRITE_ONCE(root->log_transid, log_transid);
+}
+
+static inline int btrfs_get_root_last_log_commit(const struct btrfs_root *root)
+{
+ return READ_ONCE(root->last_log_commit);
+}
+
+static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int commit_id)
+{
+ WRITE_ONCE(root->last_log_commit, commit_id);
+}
+
/*
* Structure that conveys information about an extent that is going to replace
* all the extents in a file range.
@@ -470,30 +478,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
((bytes) >> (fs_info)->sectorsize_bits)
-static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
-{
- return crc32c(crc, address, length);
-}
-
-static inline void btrfs_crc32c_final(u32 crc, u8 *result)
-{
- put_unaligned_le32(~crc, result);
-}
-
-static inline u64 btrfs_name_hash(const char *name, int len)
-{
- return crc32c((u32)~1, name, len);
-}
-
-/*
- * Figure the key offset of an extended inode ref
- */
-static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
- int len)
-{
- return (u64) crc32c(parent_objectid, name, len);
-}
-
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
return mapping_gfp_constraint(mapping, ~__GFP_FS);
@@ -513,12 +497,42 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
const struct btrfs_key *key, int *slot);
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
+
+#ifdef __LITTLE_ENDIAN
+
+/*
+ * Compare two keys, on little-endian the disk order is same as CPU order and
+ * we can avoid the conversion.
+ */
+static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk_key,
+ const struct btrfs_key *k2)
+{
+ const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
+
+ return btrfs_comp_cpu_keys(k1, k2);
+}
+
+#else
+
+/* Compare two keys in a memcmp fashion. */
+static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk,
+ const struct btrfs_key *k2)
+{
+ struct btrfs_key k1;
+
+ btrfs_disk_key_to_cpu(&k1, disk);
+
+ return btrfs_comp_cpu_keys(&k1, k2);
+}
+
+#endif
+
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
int btrfs_previous_extent_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid);
-void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
@@ -536,16 +550,26 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
enum btrfs_lock_nesting nest);
+int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ struct extent_buffer *parent, int parent_slot,
+ struct extent_buffer **cow_ret,
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer **cow_ret, u64 new_root_objectid);
-int btrfs_block_can_be_shared(struct btrfs_root *root,
+int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
struct extent_buffer *buf);
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
-void btrfs_extend_item(struct btrfs_path *path, u32 data_size);
-void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end);
+void btrfs_extend_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u32 data_size);
+void btrfs_truncate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -566,10 +590,6 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
const struct btrfs_key *key,
struct btrfs_path *p, int find_higher,
int return_any);
-int btrfs_realloc_node(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *parent,
- int start_slot, u64 *last_ret,
- struct btrfs_key *progress);
void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
@@ -609,7 +629,8 @@ struct btrfs_item_batch {
int nr;
};
-void btrfs_setup_item_for_insert(struct btrfs_root *root,
+void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
struct btrfs_path *path,
const struct btrfs_key *key,
u32 data_size);
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index f2ff4cbe8656..5244561e2016 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -338,13 +338,118 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
}
/*
+ * Check if two blocks addresses are close, used by defrag.
+ */
+static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
+{
+ if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
+ return true;
+ if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
+ return true;
+ return false;
+}
+
+/*
+ * Go through all the leaves pointed to by a node and reallocate them so that
+ * disk order is close to key order.
+ */
+static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *parent,
+ int start_slot, u64 *last_ret,
+ struct btrfs_key *progress)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ const u32 blocksize = fs_info->nodesize;
+ const int end_slot = btrfs_header_nritems(parent) - 1;
+ u64 search_start = *last_ret;
+ u64 last_block = 0;
+ int ret = 0;
+ bool progress_passed = false;
+
+ /*
+ * COWing must happen through a running transaction, which always
+ * matches the current fs generation (it's a transaction with a state
+ * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
+ * into error state to prevent the commit of any transaction.
+ */
+ if (unlikely(trans->transaction != fs_info->running_transaction ||
+ trans->transid != fs_info->generation)) {
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ btrfs_crit(fs_info,
+"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
+ parent->start, btrfs_root_id(root), trans->transid,
+ fs_info->running_transaction->transid,
+ fs_info->generation);
+ return -EUCLEAN;
+ }
+
+ if (btrfs_header_nritems(parent) <= 1)
+ return 0;
+
+ for (int i = start_slot; i <= end_slot; i++) {
+ struct extent_buffer *cur;
+ struct btrfs_disk_key disk_key;
+ u64 blocknr;
+ u64 other;
+ bool close = true;
+
+ btrfs_node_key(parent, &disk_key, i);
+ if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
+ continue;
+
+ progress_passed = true;
+ blocknr = btrfs_node_blockptr(parent, i);
+ if (last_block == 0)
+ last_block = blocknr;
+
+ if (i > 0) {
+ other = btrfs_node_blockptr(parent, i - 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (!close && i < end_slot) {
+ other = btrfs_node_blockptr(parent, i + 1);
+ close = close_blocks(blocknr, other, blocksize);
+ }
+ if (close) {
+ last_block = blocknr;
+ continue;
+ }
+
+ cur = btrfs_read_node_slot(parent, i);
+ if (IS_ERR(cur))
+ return PTR_ERR(cur);
+ if (search_start == 0)
+ search_start = last_block;
+
+ btrfs_tree_lock(cur);
+ ret = btrfs_force_cow_block(trans, root, cur, parent, i,
+ &cur, search_start,
+ min(16 * blocksize,
+ (end_slot - i) * blocksize),
+ BTRFS_NESTING_COW);
+ if (ret) {
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ break;
+ }
+ search_start = cur->start;
+ last_block = cur->start;
+ *last_ret = search_start;
+ btrfs_tree_unlock(cur);
+ free_extent_buffer(cur);
+ }
+ return ret;
+}
+
+/*
* Defrag all the leaves in a given btree.
* Read all the leaves and try to get key order to
* better reflect disk order
*/
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
struct btrfs_path *path = NULL;
struct btrfs_key key;
@@ -461,6 +566,45 @@ done:
}
/*
+ * Defrag a given btree. Every leaf in the btree is read and defragmented.
+ */
+int btrfs_defrag_root(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
+ return 0;
+
+ while (1) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+
+ ret = btrfs_defrag_leaves(trans, root);
+
+ btrfs_end_transaction(trans);
+ btrfs_btree_balance_dirty(fs_info);
+ cond_resched();
+
+ if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
+ break;
+
+ if (btrfs_defrag_cancelled(fs_info)) {
+ btrfs_debug(fs_info, "defrag_root cancelled");
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
+ return ret;
+}
+
+/*
* Defrag specific helper to get an extent map.
*
* Differences between this and btrfs_get_extent() are:
@@ -891,8 +1035,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* very likely resulting in a larger extent after writeback is
* triggered (except in a case of free space fragmentation).
*/
- if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
- EXTENT_DELALLOC, 0, NULL))
+ if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
+ EXTENT_DELALLOC))
goto next;
/*
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
index 5305f2283b5e..5a62763528d1 100644
--- a/fs/btrfs/defrag.h
+++ b/fs/btrfs/defrag.h
@@ -12,7 +12,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u32 extent_thresh);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
-int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_defrag_root(struct btrfs_root *root);
static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
{
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 427abaf608b8..51453d4928fa 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -322,9 +322,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
} else {
if (current->journal_info)
flush = BTRFS_RESERVE_FLUSH_LIMIT;
-
- if (btrfs_transaction_in_commit(fs_info))
- schedule_timeout(1);
}
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
@@ -346,7 +343,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
noflush);
if (ret)
return ret;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
+ meta_reserve, flush);
if (ret) {
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
return ret;
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 53c1211dd60b..7381241334e8 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -313,7 +313,7 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
{
struct btrfs_delayed_item *item;
- item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
+ item = kmalloc(struct_size(item, data, data_len), GFP_NOFS);
if (item) {
item->data_len = data_len;
item->type = type;
@@ -328,7 +328,8 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
}
/*
- * __btrfs_lookup_delayed_item - look up the delayed item by key
+ * Look up the delayed item by key.
+ *
* @delayed_node: pointer to the delayed node
* @index: the dir index value to lookup (offset of a dir index key)
*
@@ -412,6 +413,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
{
+ struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node;
struct rb_root_cached *root;
struct btrfs_delayed_root *delayed_root;
@@ -419,18 +421,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
if (RB_EMPTY_NODE(&delayed_item->rb_node))
return;
- delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
+ /* If it's in a rbtree, then we need to have delayed node locked. */
+ lockdep_assert_held(&delayed_node->mutex);
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
BUG_ON(!delayed_root);
if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
- root = &delayed_item->delayed_node->ins_root;
+ root = &delayed_node->ins_root;
else
- root = &delayed_item->delayed_node->del_root;
+ root = &delayed_node->del_root;
rb_erase_cached(&delayed_item->rb_node, root);
RB_CLEAR_NODE(&delayed_item->rb_node);
- delayed_item->delayed_node->count--;
+ delayed_node->count--;
finish_one_item(delayed_root);
}
@@ -513,7 +518,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
/*
* For insertions we track reserved metadata space by accounting
* for the number of leaves that will be used, based on the delayed
- * node's index_items_size field.
+ * node's curr_index_batch_size and index_item_leaves fields.
*/
if (item->type == BTRFS_DELAYED_DELETION_ITEM)
item->bytes_reserved = num_bytes;
@@ -1026,7 +1031,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode_item);
write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
sizeof(struct btrfs_inode_item));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
goto out;
@@ -1153,20 +1158,33 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
ret = __btrfs_commit_inode_delayed_items(trans, path,
curr_node);
if (ret) {
- btrfs_release_delayed_node(curr_node);
- curr_node = NULL;
btrfs_abort_transaction(trans, ret);
break;
}
prev_node = curr_node;
curr_node = btrfs_next_delayed_node(curr_node);
+ /*
+ * See the comment below about releasing path before releasing
+ * node. If the commit of delayed items was successful the path
+ * should always be released, but in case of an error, it may
+ * point to locked extent buffers (a leaf at the very least).
+ */
+ ASSERT(path->nodes[0] == NULL);
btrfs_release_delayed_node(prev_node);
}
+ /*
+ * Release the path to avoid a potential deadlock and lockdep splat when
+ * releasing the delayed node, as that requires taking the delayed node's
+ * mutex. If another task starts running delayed items before we take
+ * the mutex, it will first lock the mutex and then it may try to lock
+ * the same btree path (leaf).
+ */
+ btrfs_free_path(path);
+
if (curr_node)
btrfs_release_delayed_node(curr_node);
- btrfs_free_path(path);
trans->block_rsv = block_rsv;
return ret;
@@ -1361,8 +1379,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
return -ENOMEM;
async_work->delayed_root = delayed_root;
- btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL,
- NULL);
+ btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL);
async_work->nr = nr;
btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
@@ -1413,7 +1430,29 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
}
-/* Will return 0 or -ENOMEM */
+static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ return;
+
+ /*
+ * Adding the new dir index item does not require touching another
+ * leaf, so we can release 1 unit of metadata that was previously
+ * reserved when starting the transaction. This applies only to
+ * the case where we had a transaction start and excludes the
+ * transaction join case (when replaying log trees).
+ */
+ trace_btrfs_space_reservation(fs_info, "transaction",
+ trans->transid, bytes, 0);
+ btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
+ ASSERT(trans->bytes_reserved >= bytes);
+ trans->bytes_reserved -= bytes;
+}
+
+/* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). */
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
@@ -1455,6 +1494,27 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
mutex_lock(&delayed_node->mutex);
+ /*
+ * First attempt to insert the delayed item. This is to make the error
+ * handling path simpler in case we fail (-EEXIST). There's no risk of
+ * any other task coming in and running the delayed item before we do
+ * the metadata space reservation below, because we are holding the
+ * delayed node's mutex and that mutex must also be locked before the
+ * node's delayed items can be run.
+ */
+ ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
+ if (unlikely(ret)) {
+ btrfs_err(trans->fs_info,
+"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d",
+ name_len, name, index, btrfs_root_id(delayed_node->root),
+ delayed_node->inode_id, dir->index_cnt,
+ delayed_node->index_cnt, ret);
+ btrfs_release_delayed_item(delayed_item);
+ btrfs_release_dir_index_item_space(trans);
+ mutex_unlock(&delayed_node->mutex);
+ goto release_node;
+ }
+
if (delayed_node->index_item_leaves == 0 ||
delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
delayed_node->curr_index_batch_size = data_len;
@@ -1472,36 +1532,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
* impossible.
*/
if (WARN_ON(ret)) {
- mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_item(delayed_item);
+ mutex_unlock(&delayed_node->mutex);
goto release_node;
}
delayed_node->index_item_leaves++;
- } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
- const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
-
- /*
- * Adding the new dir index item does not require touching another
- * leaf, so we can release 1 unit of metadata that was previously
- * reserved when starting the transaction. This applies only to
- * the case where we had a transaction start and excludes the
- * transaction join case (when replaying log trees).
- */
- trace_btrfs_space_reservation(fs_info, "transaction",
- trans->transid, bytes, 0);
- btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
- ASSERT(trans->bytes_reserved >= bytes);
- trans->bytes_reserved -= bytes;
- }
-
- ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
- if (unlikely(ret)) {
- btrfs_err(trans->fs_info,
- "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- name_len, name, delayed_node->root->root_key.objectid,
- delayed_node->inode_id, ret);
- BUG();
+ } else {
+ btrfs_release_dir_index_item_space(trans);
}
mutex_unlock(&delayed_node->mutex);
@@ -1722,8 +1760,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
}
/*
- * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
- *
+ * Read dir info stored in the delayed tree.
*/
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
struct list_head *ins_list)
@@ -1796,24 +1833,22 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_block_group(inode_item, 0);
btrfs_set_stack_timespec_sec(&inode_item->atime,
- inode->i_atime.tv_sec);
+ inode_get_atime_sec(inode));
btrfs_set_stack_timespec_nsec(&inode_item->atime,
- inode->i_atime.tv_nsec);
+ inode_get_atime_nsec(inode));
btrfs_set_stack_timespec_sec(&inode_item->mtime,
- inode->i_mtime.tv_sec);
+ inode_get_mtime_sec(inode));
btrfs_set_stack_timespec_nsec(&inode_item->mtime,
- inode->i_mtime.tv_nsec);
+ inode_get_mtime_nsec(inode));
btrfs_set_stack_timespec_sec(&inode_item->ctime,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
btrfs_set_stack_timespec_nsec(&inode_item->ctime,
- inode_get_ctime(inode).tv_nsec);
+ inode_get_ctime_nsec(inode));
- btrfs_set_stack_timespec_sec(&inode_item->otime,
- BTRFS_I(inode)->i_otime.tv_sec);
- btrfs_set_stack_timespec_nsec(&inode_item->otime,
- BTRFS_I(inode)->i_otime.tv_nsec);
+ btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec);
}
int btrfs_fill_inode(struct inode *inode, u32 *rdev)
@@ -1853,19 +1888,17 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
- inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
- inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
+ inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime),
+ btrfs_stack_timespec_nsec(&inode_item->atime));
- inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
- inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
+ inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime),
+ btrfs_stack_timespec_nsec(&inode_item->mtime));
inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime),
btrfs_stack_timespec_nsec(&inode_item->ctime));
- BTRFS_I(inode)->i_otime.tv_sec =
- btrfs_stack_timespec_sec(&inode_item->otime);
- BTRFS_I(inode)->i_otime.tv_nsec =
- btrfs_stack_timespec_nsec(&inode_item->otime);
+ BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
+ BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
inode->i_generation = BTRFS_I(inode)->generation;
BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1876,9 +1909,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
}
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_delayed_node *delayed_node;
int ret = 0;
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index dc1085b2a397..5cceb31bbd16 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -95,7 +95,7 @@ struct btrfs_delayed_item {
bool logged;
/* The maximum leaf size is 64K, so u16 is more than enough. */
u16 data_len;
- char data[];
+ char data[] __counted_by(data_len);
};
static inline void btrfs_init_delayed_root(
@@ -135,7 +135,6 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode);
int btrfs_fill_inode(struct inode *inode, u32 *rdev);
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 6a13cf00218b..9223934d95f4 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -57,16 +57,20 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
* Release a ref head's reservation.
*
* @fs_info: the filesystem
- * @nr: number of items to drop
+ * @nr_refs: number of delayed refs to drop
+ * @nr_csums: number of csum items to drop
*
* Drops the delayed ref head's count from the delayed refs rsv and free any
* excess reservation we had.
*/
-void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums)
{
struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
- const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr);
- u64 released = 0;
+ u64 num_bytes;
+ u64 released;
+
+ num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs);
+ num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
if (released)
@@ -77,50 +81,135 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
/*
* Adjust the size of the delayed refs rsv.
*
- * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
- * it'll calculate the additional size and add it to the delayed_refs_rsv.
+ * This is to be called anytime we may have adjusted trans->delayed_ref_updates
+ * or trans->delayed_ref_csum_deletions, it'll calculate the additional size and
+ * add it to the delayed_refs_rsv.
*/
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv;
u64 num_bytes;
+ u64 reserved_bytes;
+
+ num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);
+ num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info,
+ trans->delayed_ref_csum_deletions);
- if (!trans->delayed_ref_updates)
+ if (num_bytes == 0)
return;
- num_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
- trans->delayed_ref_updates);
+ /*
+ * Try to take num_bytes from the transaction's local delayed reserve.
+ * If not possible, try to take as much as it's available. If the local
+ * reserve doesn't have enough reserved space, the delayed refs reserve
+ * will be refilled next time btrfs_delayed_refs_rsv_refill() is called
+ * by someone or if a transaction commit is triggered before that, the
+ * global block reserve will be used. We want to minimize using the
+ * global block reserve for cases we can account for in advance, to
+ * avoid exhausting it and reach -ENOSPC during a transaction commit.
+ */
+ spin_lock(&local_rsv->lock);
+ reserved_bytes = min(num_bytes, local_rsv->reserved);
+ local_rsv->reserved -= reserved_bytes;
+ local_rsv->full = (local_rsv->reserved >= local_rsv->size);
+ spin_unlock(&local_rsv->lock);
spin_lock(&delayed_rsv->lock);
delayed_rsv->size += num_bytes;
- delayed_rsv->full = false;
+ delayed_rsv->reserved += reserved_bytes;
+ delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size);
spin_unlock(&delayed_rsv->lock);
trans->delayed_ref_updates = 0;
+ trans->delayed_ref_csum_deletions = 0;
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve for 1 block group item
+ * insertion, used after allocating a block group.
+ */
+void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+
+ spin_lock(&delayed_rsv->lock);
+ /*
+ * Inserting a block group item does not require changing the free space
+ * tree, only the extent tree or the block group tree, so this is all we
+ * need.
+ */
+ delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1);
+ delayed_rsv->full = false;
+ spin_unlock(&delayed_rsv->lock);
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve to release space for 1
+ * block group item insertion.
+ */
+void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ u64 released;
+
+ released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
+ if (released > 0)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, released, 0);
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve for 1 block group item
+ * update.
+ */
+void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+
+ spin_lock(&delayed_rsv->lock);
+ /*
+ * Updating a block group item does not result in new nodes/leaves and
+ * does not require changing the free space tree, only the extent tree
+ * or the block group tree, so this is all we need.
+ */
+ delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1);
+ delayed_rsv->full = false;
+ spin_unlock(&delayed_rsv->lock);
+}
+
+/*
+ * Adjust the size of the delayed refs block reserve to release space for 1
+ * block group item update.
+ */
+void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
+ const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1);
+ u64 released;
+
+ released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
+ if (released > 0)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
+ 0, released, 0);
}
/*
* Transfer bytes to our delayed refs rsv.
*
* @fs_info: the filesystem
- * @src: source block rsv to transfer from
* @num_bytes: number of bytes to transfer
*
- * This transfers up to the num_bytes amount from the src rsv to the
+ * This transfers up to the num_bytes amount, previously reserved, to the
* delayed_refs_rsv. Any extra bytes are returned to the space info.
*/
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *src,
u64 num_bytes)
{
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
u64 to_free = 0;
- spin_lock(&src->lock);
- src->reserved -= num_bytes;
- src->size -= num_bytes;
- spin_unlock(&src->lock);
-
spin_lock(&delayed_refs_rsv->lock);
if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
u64 delta = delayed_refs_rsv->size -
@@ -161,8 +250,11 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush)
{
struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_space_info *space_info = block_rsv->space_info;
u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1);
u64 num_bytes = 0;
+ u64 refilled_bytes;
+ u64 to_free;
int ret = -ENOSPC;
spin_lock(&block_rsv->lock);
@@ -175,12 +267,40 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
if (!num_bytes)
return 0;
- ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
+ ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush);
if (ret)
return ret;
- btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
- trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
- 0, num_bytes, 1);
+
+ /*
+ * We may have raced with someone else, so check again if we the block
+ * reserve is still not full and release any excess space.
+ */
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size) {
+ u64 needed = block_rsv->size - block_rsv->reserved;
+
+ if (num_bytes >= needed) {
+ block_rsv->reserved += needed;
+ block_rsv->full = true;
+ to_free = num_bytes - needed;
+ refilled_bytes = needed;
+ } else {
+ block_rsv->reserved += num_bytes;
+ to_free = 0;
+ refilled_bytes = num_bytes;
+ }
+ } else {
+ to_free = num_bytes;
+ refilled_bytes = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+
+ if (to_free > 0)
+ btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free);
+
+ if (refilled_bytes > 0)
+ trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0,
+ refilled_bytes, 1);
return 0;
}
@@ -398,7 +518,8 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
return 0;
}
-static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
+static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref)
{
@@ -409,9 +530,11 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs,
list_del(&ref->add_list);
btrfs_put_delayed_ref(ref);
atomic_dec(&delayed_refs->num_entries);
+ btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}
-static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
+static bool merge_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref,
u64 seq)
@@ -440,10 +563,10 @@ static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs,
mod = -next->ref_mod;
}
- drop_delayed_ref(delayed_refs, head, next);
+ drop_delayed_ref(fs_info, delayed_refs, head, next);
ref->ref_mod += mod;
if (ref->ref_mod == 0) {
- drop_delayed_ref(delayed_refs, head, ref);
+ drop_delayed_ref(fs_info, delayed_refs, head, ref);
done = true;
} else {
/*
@@ -481,7 +604,7 @@ again:
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
if (seq && ref->seq >= seq)
continue;
- if (merge_ref(delayed_refs, head, ref, seq))
+ if (merge_ref(fs_info, delayed_refs, head, ref, seq))
goto again;
}
}
@@ -560,10 +683,11 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
* Return true if the ref was merged into an existing one (and therefore can be
* freed by the caller).
*/
-static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
+static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *ref)
{
+ struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs;
struct btrfs_delayed_ref_node *exist;
int mod;
@@ -574,6 +698,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
list_add_tail(&ref->add_list, &href->ref_add_list);
atomic_inc(&root->num_entries);
spin_unlock(&href->lock);
+ trans->delayed_ref_updates++;
return false;
}
@@ -602,7 +727,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root,
/* remove existing tail if its ref_mod is zero */
if (exist->ref_mod == 0)
- drop_delayed_ref(root, href, exist);
+ drop_delayed_ref(trans->fs_info, root, href, exist);
spin_unlock(&href->lock);
return true;
}
@@ -623,6 +748,15 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
BUG_ON(existing->is_data != update->is_data);
spin_lock(&existing->lock);
+
+ /*
+ * When freeing an extent, we may not know the owning root when we
+ * first create the head_ref. However, some deref before the last deref
+ * will know it, so we just need to update the head_ref accordingly.
+ */
+ if (!existing->owning_root)
+ existing->owning_root = update->owning_root;
+
if (update->must_insert_reserved) {
/* if the extent was freed and then
* reallocated before the delayed ref
@@ -632,6 +766,7 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
* Set it again here
*/
existing->must_insert_reserved = update->must_insert_reserved;
+ existing->owning_root = update->owning_root;
/*
* update the num_bytes so we make sure the accounting
@@ -671,6 +806,8 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
/*
* If we are going to from a positive ref mod to a negative or vice
* versa we need to make sure to adjust pending_csums accordingly.
+ * We reserve bytes for csum deletion when adding or updating a ref head
+ * see add_delayed_ref_head() for more details.
*/
if (existing->is_data) {
u64 csum_leaves =
@@ -679,11 +816,11 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
delayed_refs->pending_csums -= existing->num_bytes;
- btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
+ btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves);
}
if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
delayed_refs->pending_csums += existing->num_bytes;
- trans->delayed_ref_updates += csum_leaves;
+ trans->delayed_ref_csum_deletions += csum_leaves;
}
}
@@ -694,7 +831,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
struct btrfs_qgroup_extent_record *qrecord,
u64 bytenr, u64 num_bytes, u64 ref_root,
u64 reserved, int action, bool is_data,
- bool is_system)
+ bool is_system, u64 owning_root)
{
int count_mod = 1;
bool must_insert_reserved = false;
@@ -734,7 +871,9 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
head_ref->bytenr = bytenr;
head_ref->num_bytes = num_bytes;
head_ref->ref_mod = count_mod;
+ head_ref->reserved_bytes = reserved;
head_ref->must_insert_reserved = must_insert_reserved;
+ head_ref->owning_root = owning_root;
head_ref->is_data = is_data;
head_ref->is_system = is_system;
head_ref->ref_tree = RB_ROOT_CACHED;
@@ -795,16 +934,21 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
+ /*
+ * We reserve the amount of bytes needed to delete csums when
+ * adding the ref head and not when adding individual drop refs
+ * since the csum items are deleted only after running the last
+ * delayed drop ref (the data extent's ref count drops to 0).
+ */
if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
- trans->delayed_ref_updates +=
+ trans->delayed_ref_csum_deletions +=
btrfs_csum_bytes_to_leaves(trans->fs_info,
head_ref->num_bytes);
}
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
- trans->delayed_ref_updates++;
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
@@ -813,8 +957,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
}
/*
- * init_delayed_ref_common - Initialize the structure which represents a
- * modification to a an extent.
+ * Initialize the structure which represents a modification to a an extent.
*
* @fs_info: Internal to the mounted filesystem mount structure.
*
@@ -885,7 +1028,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
u64 parent = generic_ref->parent;
u8 ref_type;
- is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
+ is_system = (generic_ref->tree_ref.ref_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
@@ -898,8 +1041,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- !generic_ref->skip_qgroup) {
+ if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
@@ -914,15 +1056,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref_type = BTRFS_TREE_BLOCK_REF_KEY;
init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- generic_ref->tree_ref.owning_root, action,
+ generic_ref->tree_ref.ref_root, action,
ref_type);
- ref->root = generic_ref->tree_ref.owning_root;
+ ref->root = generic_ref->tree_ref.ref_root;
ref->parent = parent;
ref->level = level;
init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- generic_ref->tree_ref.owning_root, 0, action,
- false, is_system);
+ generic_ref->tree_ref.ref_root, 0, action,
+ false, is_system, generic_ref->owning_root);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -935,7 +1077,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
- merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
+ merged = insert_delayed_ref(trans, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
@@ -974,7 +1116,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
- u64 ref_root = generic_ref->data_ref.owning_root;
+ u64 ref_root = generic_ref->data_ref.ref_root;
u64 owner = generic_ref->data_ref.ino;
u64 offset = generic_ref->data_ref.offset;
u8 ref_type;
@@ -1002,8 +1144,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- !generic_ref->skip_qgroup) {
+ if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
@@ -1014,7 +1155,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
}
init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
- reserved, action, true, false);
+ reserved, action, true, false, generic_ref->owning_root);
head_ref->extent_op = NULL;
delayed_refs = &trans->transaction->delayed_refs;
@@ -1027,7 +1168,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
- merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node);
+ merged = insert_delayed_ref(trans, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
@@ -1060,7 +1201,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
return -ENOMEM;
init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
- BTRFS_UPDATE_DELAYED_HEAD, false, false);
+ BTRFS_UPDATE_DELAYED_HEAD, false, false, 0);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index b8e14b0ba5f1..62d679d40f4f 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -9,10 +9,16 @@
#include <linux/refcount.h>
/* these are the possible values of struct btrfs_delayed_ref_node->action */
-#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
-#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
-#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
-#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+enum btrfs_delayed_ref_action {
+ /* Add one backref to the tree */
+ BTRFS_ADD_DELAYED_REF = 1,
+ /* Delete one backref from the tree */
+ BTRFS_DROP_DELAYED_REF,
+ /* Record a full extent allocation */
+ BTRFS_ADD_DELAYED_EXTENT,
+ /* Not changing ref count on head ref */
+ BTRFS_UPDATE_DELAYED_HEAD,
+} __packed;
struct btrfs_delayed_ref_node {
struct rb_node ref_node;
@@ -105,6 +111,18 @@ struct btrfs_delayed_ref_head {
int ref_mod;
/*
+ * The root that triggered the allocation when must_insert_reserved is
+ * set to true.
+ */
+ u64 owning_root;
+
+ /*
+ * Track reserved bytes when setting must_insert_reserved. On success
+ * or cleanup, we will need to free the reservation.
+ */
+ u64 reserved_bytes;
+
+ /*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
* until the delayed ref is processed. must_insert_reserved is
@@ -117,6 +135,7 @@ struct btrfs_delayed_ref_head {
* the free has happened.
*/
bool must_insert_reserved;
+
bool is_data;
bool is_system;
bool processing;
@@ -183,13 +202,13 @@ enum btrfs_ref_type {
BTRFS_REF_DATA,
BTRFS_REF_METADATA,
BTRFS_REF_LAST,
-};
+} __packed;
struct btrfs_data_ref {
/* For EXTENT_DATA_REF */
- /* Original root this data extent belongs to */
- u64 owning_root;
+ /* Root which owns this data reference. */
+ u64 ref_root;
/* Inode which refers to this data extent */
u64 ino;
@@ -212,18 +231,18 @@ struct btrfs_tree_ref {
int level;
/*
- * Root which owns this tree block.
+ * Root which owns this tree block reference.
*
* For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
*/
- u64 owning_root;
+ u64 ref_root;
/* For non-skinny metadata, no special member needed */
};
struct btrfs_ref {
enum btrfs_ref_type type;
- int action;
+ enum btrfs_delayed_ref_action action;
/*
* Whether this extent should go through qgroup record.
@@ -239,6 +258,7 @@ struct btrfs_ref {
#endif
u64 bytenr;
u64 len;
+ u64 owning_root;
/* Bytenr of the parent tree block */
u64 parent;
@@ -277,24 +297,37 @@ static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_in
return num_bytes;
}
+static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *fs_info,
+ int num_csum_items)
+{
+ /*
+ * Deleting csum items does not result in new nodes/leaves and does not
+ * require changing the free space tree, only the csum tree, so this is
+ * all we need.
+ */
+ return btrfs_calc_metadata_size(fs_info, num_csum_items);
+}
+
static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
- int action, u64 bytenr, u64 len, u64 parent)
+ int action, u64 bytenr, u64 len,
+ u64 parent, u64 owning_root)
{
generic_ref->action = action;
generic_ref->bytenr = bytenr;
generic_ref->len = len;
generic_ref->parent = parent;
+ generic_ref->owning_root = owning_root;
}
-static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
- int level, u64 root, u64 mod_root, bool skip_qgroup)
+static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level,
+ u64 root, u64 mod_root, bool skip_qgroup)
{
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
/* If @real_root not set, use @root as fallback */
generic_ref->real_root = mod_root ?: root;
#endif
generic_ref->tree_ref.level = level;
- generic_ref->tree_ref.owning_root = root;
+ generic_ref->tree_ref.ref_root = root;
generic_ref->type = BTRFS_REF_METADATA;
if (skip_qgroup || !(is_fstree(root) &&
(!mod_root || is_fstree(mod_root))))
@@ -312,7 +345,7 @@ static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
/* If @real_root not set, use @root as fallback */
generic_ref->real_root = mod_root ?: ref_root;
#endif
- generic_ref->data_ref.owning_root = ref_root;
+ generic_ref->data_ref.ref_root = ref_root;
generic_ref->data_ref.ino = ino;
generic_ref->data_ref.offset = offset;
generic_ref->type = BTRFS_REF_DATA;
@@ -338,7 +371,6 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
- WARN_ON(refcount_read(&ref->refs) == 0);
if (refcount_dec_and_test(&ref->refs)) {
WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
switch (ref->type) {
@@ -402,12 +434,15 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
-void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums);
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
+void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info);
+void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info);
+void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
+void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush);
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *src,
u64 num_bytes);
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index fff22ed55c42..f9544fda38e9 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -17,7 +17,6 @@
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
-#include "check-integrity.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "zoned.h"
@@ -247,6 +246,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
@@ -257,12 +257,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return -EINVAL;
}
- bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev)) {
+ bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ fs_info->bdev_holder, NULL);
+ if (IS_ERR(bdev_handle)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
- return PTR_ERR(bdev);
+ return PTR_ERR(bdev_handle);
}
+ bdev = bdev_handle->bdev;
if (!btrfs_check_device_zone_type(fs_info, bdev)) {
btrfs_err(fs_info,
@@ -313,9 +314,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
+ device->bdev_handle = bdev_handle;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- device->holder = fs_info->bdev_holder;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_devices;
@@ -334,7 +335,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return 0;
error:
- blkdev_put(bdev, fs_info->bdev_holder);
+ bdev_release(bdev_handle);
return ret;
}
@@ -442,7 +443,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
dev_replace->item_needs_writeback = 0;
up_write(&dev_replace->rwsem);
- btrfs_mark_buffer_dirty(eb);
+ btrfs_mark_buffer_dirty(trans, eb);
out:
btrfs_free_path(path);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 082eb0e19598..9c07d5c3e5ad 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -38,7 +38,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
- btrfs_extend_item(path, data_size);
+ btrfs_extend_item(trans, path, data_size);
} else if (ret < 0)
return ERR_PTR(ret);
WARN_ON(ret > 0);
@@ -93,7 +93,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, name, name_ptr, name_len);
write_extent_buffer(leaf, data, data_ptr, data_len);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
return ret;
}
@@ -153,7 +153,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
name_ptr = (unsigned long)(dir_item + 1);
write_extent_buffer(leaf, name->name, name_ptr, name->len);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
second_insert:
/* FIXME, use some real flag for selecting the extra index */
@@ -439,7 +439,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_len - (ptr + sub_item_len - start));
- btrfs_truncate_item(path, item_len - sub_item_len, 1);
+ btrfs_truncate_item(trans, path, item_len - sub_item_len, 1);
}
return ret;
}
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index aab4b7cc7fa0..e40a226373d7 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -3,6 +3,10 @@
#ifndef BTRFS_DIR_ITEM_H
#define BTRFS_DIR_ITEM_H
+#include <linux/crc32c.h>
+
+struct fscrypt_str;
+
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const struct fscrypt_str *name);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
@@ -39,4 +43,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
const char *name,
int name_len);
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+ return crc32c((u32)~1, name, len);
+}
+
#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0a96ea8c1d3a..401ea09ae4b8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,7 +29,6 @@
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"
@@ -245,6 +244,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start = btrfs_header_bytenr(eb);
+ u64 last_trans;
u8 result[BTRFS_CSUM_SIZE];
int ret;
@@ -282,12 +282,12 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
* Also check the generation, the eb reached here must be newer than
* last committed. Or something seriously wrong happened.
*/
- if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
+ last_trans = btrfs_get_last_trans_committed(fs_info);
+ if (unlikely(btrfs_header_generation(eb) <= last_trans)) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"block=%llu bad generation, have %llu expect > %llu",
- eb->start, btrfs_header_generation(eb),
- fs_info->last_trans_committed);
+ eb->start, btrfs_header_generation(eb), last_trans);
goto error;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
@@ -318,9 +318,10 @@ static bool check_tree_block_fsid(struct extent_buffer *eb)
BTRFS_FSID_SIZE);
/*
- * alloc_fs_devices() copies the fsid into metadata_uuid if the
- * metadata_uuid is unset in the superblock, including for a seed device.
- * So, we can use fs_devices->metadata_uuid.
+ * alloc_fsid_devices() copies the fsid into fs_devices::metadata_uuid.
+ * This is then overwritten by metadata_uuid if it is present in the
+ * device_list_add(). The same true for a seed device as well. So use of
+ * fs_devices::metadata_uuid is appropriate here.
*/
if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0)
return false;
@@ -520,6 +521,7 @@ static bool btree_dirty_folio(struct address_space *mapping,
struct folio *folio)
{
struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
+ struct btrfs_subpage_info *spi = fs_info->subpage_info;
struct btrfs_subpage *subpage;
struct extent_buffer *eb;
int cur_bit = 0;
@@ -533,18 +535,19 @@ static bool btree_dirty_folio(struct address_space *mapping,
btrfs_assert_tree_write_locked(eb);
return filemap_dirty_folio(mapping, folio);
}
+
+ ASSERT(spi);
subpage = folio_get_private(folio);
- ASSERT(subpage->dirty_bitmap);
- while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
+ for (cur_bit = spi->dirty_offset;
+ cur_bit < spi->dirty_offset + spi->bitmap_nr_bits;
+ cur_bit++) {
unsigned long flags;
u64 cur;
- u16 tmp = (1 << cur_bit);
spin_lock_irqsave(&subpage->lock, flags);
- if (!(tmp & subpage->dirty_bitmap)) {
+ if (!test_bit(cur_bit, subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
- cur_bit++;
continue;
}
spin_unlock_irqrestore(&subpage->lock, flags);
@@ -557,7 +560,7 @@ static bool btree_dirty_folio(struct address_space *mapping,
btrfs_assert_tree_write_locked(eb);
free_extent_buffer(eb);
- cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
+ cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1;
}
return filemap_dirty_folio(mapping, folio);
}
@@ -673,9 +676,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
refcount_set(&root->refs, 1);
atomic_set(&root->snapshot_force_cow, 0);
atomic_set(&root->nr_swapfiles, 0);
- root->log_transid = 0;
+ btrfs_set_root_log_transid(root, 0);
root->log_transid_committed = -1;
- root->last_log_commit = 0;
+ btrfs_set_root_last_log_commit(root, 0);
root->anon_dev = 0;
if (!dummy) {
extent_io_tree_init(fs_info, &root->dirty_log_pages,
@@ -857,7 +860,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
root->root_key.offset = 0;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
- BTRFS_NESTING_NORMAL);
+ 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
@@ -865,7 +868,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
}
root->node = leaf;
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
root->commit_root = btrfs_root_node(root);
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
@@ -934,13 +937,13 @@ int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
*/
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
- NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
+ NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf))
return PTR_ERR(leaf);
root->node = leaf;
- btrfs_mark_buffer_dirty(root->node);
+ btrfs_mark_buffer_dirty(trans, root->node);
btrfs_tree_unlock(root->node);
return 0;
@@ -1002,9 +1005,9 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root->log_root);
root->log_root = log_root;
- root->log_transid = 0;
+ btrfs_set_root_log_transid(root, 0);
root->log_transid_committed = -1;
- root->last_log_commit = 0;
+ btrfs_set_root_last_log_commit(root, 0);
return 0;
}
@@ -1177,6 +1180,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
return btrfs_grab_root(fs_info->block_group_root);
case BTRFS_FREE_SPACE_TREE_OBJECTID:
return btrfs_grab_root(btrfs_global_root(fs_info, &key));
+ case BTRFS_RAID_STRIPE_TREE_OBJECTID:
+ return btrfs_grab_root(fs_info->stripe_root);
default:
return NULL;
}
@@ -1257,6 +1262,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_put_root(fs_info->fs_root);
btrfs_put_root(fs_info->data_reloc_root);
btrfs_put_root(fs_info->block_group_root);
+ btrfs_put_root(fs_info->stripe_root);
btrfs_check_leaked_roots(fs_info);
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
@@ -1400,7 +1406,8 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
}
/*
- * btrfs_get_fs_root_commit_root - return a root for the given objectid
+ * Return a root for the given objectid.
+ *
* @fs_info: the fs_info
* @objectid: the objectid we need to lookup
*
@@ -1547,7 +1554,7 @@ static int transaction_kthread(void *arg)
delta = ktime_get_seconds() - cur->start_time;
if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
- cur->state < TRANS_STATE_COMMIT_START &&
+ cur->state < TRANS_STATE_COMMIT_PREP &&
delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
delay -= msecs_to_jiffies((delta - 1) * 1000);
@@ -1697,11 +1704,11 @@ static void backup_super_roots(struct btrfs_fs_info *info)
}
/*
- * read_backup_root - Reads a backup root based on the passed priority. Prio 0
- * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
+ * Reads a backup root based on the passed priority. Prio 0 is the newest, prio
+ * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
*
- * fs_info - filesystem whose backup roots need to be read
- * priority - priority of backup root required
+ * @fs_info: filesystem whose backup roots need to be read
+ * @priority: priority of backup root required
*
* Returns backup root index on success and -EINVAL otherwise.
*/
@@ -1801,6 +1808,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
free_root_extent_buffers(info->fs_root);
free_root_extent_buffers(info->data_reloc_root);
free_root_extent_buffers(info->block_group_root);
+ free_root_extent_buffers(info->stripe_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
}
@@ -2260,7 +2268,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
root = btrfs_read_tree_root(tree_root, &location);
if (!IS_ERR(root)) {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
fs_info->quota_root = root;
}
@@ -2277,6 +2284,20 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
fs_info->uuid_root = root;
}
+ if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
+ location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->stripe_root = root;
+ }
+ }
+
return 0;
out:
btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
@@ -2379,7 +2400,8 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
- if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
+ if (!fs_info->fs_devices->temp_fsid &&
+ memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
"superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
sb->fsid, fs_info->fs_devices->fsid);
@@ -2632,7 +2654,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
/* All successful */
fs_info->generation = btrfs_header_generation(tree_root->node);
- fs_info->last_trans_committed = fs_info->generation;
+ btrfs_set_last_trans_committed(fs_info, fs_info->generation);
fs_info->last_reloc_trans = 0;
/* Always begin writing backup roots after the one being used */
@@ -2682,8 +2704,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
- btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start,
- BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep,
+ BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
BTRFS_LOCKDEP_TRANS_UNBLOCKED);
btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
@@ -2733,9 +2755,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->ordered_root_lock);
btrfs_init_scrub(fs_info);
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- fs_info->check_integrity_print_mask = 0;
-#endif
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
@@ -3155,7 +3174,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
u32 nodesize;
u32 stripesize;
u64 generation;
- u64 features;
u16 csum_type;
struct btrfs_super_block *disk_super;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -3237,15 +3255,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
disk_super = fs_info->super_copy;
-
- features = btrfs_super_flags(disk_super);
- if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
- features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
- btrfs_set_super_flags(disk_super, features);
- btrfs_info(fs_info,
- "found metadata UUID change in progress flag, clearing");
- }
-
memcpy(fs_info->super_for_commit, fs_info->super_copy,
sizeof(*fs_info->super_for_commit));
@@ -3507,18 +3516,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
"auto enabling async discard");
}
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
- ret = btrfsic_mount(fs_info, fs_devices,
- btrfs_test_opt(fs_info,
- CHECK_INTEGRITY_DATA) ? 1 : 0,
- fs_info->check_integrity_print_mask);
- if (ret)
- btrfs_warn(fs_info,
- "failed to initialize integrity check module: %d",
- ret);
- }
-#endif
ret = btrfs_read_qgroup_config(fs_info);
if (ret)
goto fail_trans_kthread;
@@ -3818,8 +3815,6 @@ static int write_dev_supers(struct btrfs_device *device,
*/
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
bio->bi_opf |= REQ_FUA;
-
- btrfsic_check_bio(bio);
submit_bio(bio);
if (btrfs_advance_sb_log(device, i))
@@ -3915,28 +3910,11 @@ static void write_dev_flush(struct btrfs_device *device)
device->last_flush_error = BLK_STS_OK;
-#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- /*
- * When a disk has write caching disabled, we skip submission of a bio
- * with flush and sync requests before writing the superblock, since
- * it's not needed. However when the integrity checker is enabled, this
- * results in reports that there are metadata blocks referred by a
- * superblock that were not properly flushed. So don't skip the bio
- * submission only when the integrity checker is enabled for the sake
- * of simplicity, since this is a debug tool and not meant for use in
- * non-debug builds.
- */
- if (!bdev_write_cache(device->bdev))
- return;
-#endif
-
bio_init(bio, device->bdev, NULL, 0,
REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
bio->bi_end_io = btrfs_end_empty_barrier;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
-
- btrfsic_check_bio(bio);
submit_bio(bio);
set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}
@@ -4412,16 +4390,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
iput(fs_info->btree_inode);
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
- btrfsic_unmount(fs_info->fs_devices);
-#endif
-
btrfs_mapping_tree_free(&fs_info->mapping_tree);
btrfs_close_devices(fs_info->fs_devices);
}
-void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
+void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
+ struct extent_buffer *buf)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
u64 transid = btrfs_header_generation(buf);
@@ -4435,21 +4409,16 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
return;
#endif
+ /* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */
+ ASSERT(trans->transid == fs_info->generation);
btrfs_assert_tree_write_locked(buf);
- if (transid != fs_info->generation)
- WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
- buf->start, transid, fs_info->generation);
- set_extent_buffer_dirty(buf);
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- /*
- * btrfs_check_leaf() won't check item data if we don't have WRITTEN
- * set, so this will only validate the basic structure of the items.
- */
- if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) {
- btrfs_print_leaf(buf);
- ASSERT(0);
+ if (unlikely(transid != fs_info->generation)) {
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ btrfs_crit(fs_info,
+"dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu",
+ buf->start, transid, fs_info->generation);
}
-#endif
+ set_extent_buffer_dirty(buf);
}
static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
@@ -4609,6 +4578,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
list_del(&ref->add_list);
atomic_dec(&delayed_refs->num_entries);
btrfs_put_delayed_ref(ref);
+ btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}
if (head->must_insert_reserved)
pin_bytes = true;
@@ -4806,7 +4776,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
spin_unlock(&cur_trans->dirty_bgs_lock);
btrfs_put_block_group(cache);
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4870,7 +4840,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
while (!list_empty(&fs_info->trans_list)) {
t = list_first_entry(&fs_info->trans_list,
struct btrfs_transaction, list);
- if (t->state >= TRANS_STATE_COMMIT_START) {
+ if (t->state >= TRANS_STATE_COMMIT_PREP) {
refcount_inc(&t->use_count);
spin_unlock(&fs_info->trans_lock);
btrfs_wait_for_commit(fs_info, t->transid);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 02b645744a82..50dab8f639dc 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -104,7 +104,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
}
void btrfs_put_root(struct btrfs_root *root);
-void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
+ struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf,
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index ff8e117a1ace..ea149be28dff 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -105,32 +105,40 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
+/*
+ * Empty an io tree, removing and freeing every extent state record from the
+ * tree. This should be called once we are sure no other task can access the
+ * tree anymore, so no tree updates happen after we empty the tree and there
+ * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never
+ * set on any extent state when calling this function).
+ */
void extent_io_tree_release(struct extent_io_tree *tree)
{
+ struct rb_root root;
+ struct extent_state *state;
+ struct extent_state *tmp;
+
spin_lock(&tree->lock);
- /*
- * Do a single barrier for the waitqueue_active check here, the state
- * of the waitqueue should not change once extent_io_tree_release is
- * called.
- */
- smp_mb();
- while (!RB_EMPTY_ROOT(&tree->state)) {
- struct rb_node *node;
- struct extent_state *state;
-
- node = rb_first(&tree->state);
- state = rb_entry(node, struct extent_state, rb_node);
- rb_erase(&state->rb_node, &tree->state);
+ root = tree->state;
+ tree->state = RB_ROOT;
+ rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) {
+ /* Clear node to keep free_extent_state() happy. */
RB_CLEAR_NODE(&state->rb_node);
+ ASSERT(!(state->state & EXTENT_LOCKED));
/*
- * btree io trees aren't supposed to have tasks waiting for
- * changes in the flags of extent states ever.
+ * No need for a memory barrier here, as we are holding the tree
+ * lock and we only change the waitqueue while holding that lock
+ * (see wait_extent_bit()).
*/
ASSERT(!waitqueue_active(&state->wq));
free_extent_state(state);
-
cond_resched_lock(&tree->lock);
}
+ /*
+ * Should still be empty even after a reschedule, no other task should
+ * be accessing the tree anymore.
+ */
+ ASSERT(RB_EMPTY_ROOT(&tree->state));
spin_unlock(&tree->lock);
}
@@ -327,6 +335,36 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
"locking error: extent tree was modified by another thread while locked");
}
+static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+ struct extent_state *prev;
+
+ prev = prev_state(state);
+ if (prev && prev->end == state->start - 1 && prev->state == state->state) {
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode, state, prev);
+ state->start = prev->start;
+ rb_erase(&prev->rb_node, &tree->state);
+ RB_CLEAR_NODE(&prev->rb_node);
+ free_extent_state(prev);
+ }
+}
+
+static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+ struct extent_state *next;
+
+ next = next_state(state);
+ if (next && next->start == state->end + 1 && next->state == state->state) {
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode, state, next);
+ state->end = next->end;
+ rb_erase(&next->rb_node, &tree->state);
+ RB_CLEAR_NODE(&next->rb_node);
+ free_extent_state(next);
+ }
+}
+
/*
* Utility function to look for merge candidates inside a given range. Any
* extents with matching state are merged together into a single extent in the
@@ -338,31 +376,11 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
*/
static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
{
- struct extent_state *other;
-
if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
return;
- other = prev_state(state);
- if (other && other->end == state->start - 1 &&
- other->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, other);
- state->start = other->start;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
- other = next_state(state);
- if (other && other->start == state->end + 1 &&
- other->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, other);
- state->end = other->end;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
+ merge_prev_state(tree, state);
+ merge_next_state(tree, state);
}
static void set_state_bits(struct extent_io_tree *tree,
@@ -384,19 +402,27 @@ static void set_state_bits(struct extent_io_tree *tree,
* Insert an extent_state struct into the tree. 'bits' are set on the
* struct before it is inserted.
*
- * This may return -EEXIST if the extent is already there, in which case the
- * state struct is freed.
+ * Returns a pointer to the struct extent_state record containing the range
+ * requested for insertion, which may be the same as the given struct or it
+ * may be an existing record in the tree that was expanded to accommodate the
+ * requested range. In case of an extent_state different from the one that was
+ * given, the later can be freed or reused by the caller.
+ *
+ * On error it returns an error pointer.
*
* The tree lock is not taken internally. This is a utility function and
* probably isn't what you want to call (see set/clear_extent_bit).
*/
-static int insert_state(struct extent_io_tree *tree,
- struct extent_state *state,
- u32 bits, struct extent_changeset *changeset)
+static struct extent_state *insert_state(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits,
+ struct extent_changeset *changeset)
{
struct rb_node **node;
struct rb_node *parent = NULL;
- const u64 end = state->end;
+ const u64 start = state->start - 1;
+ const u64 end = state->end + 1;
+ const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY));
set_state_bits(tree, state, bits, changeset);
@@ -407,23 +433,42 @@ static int insert_state(struct extent_io_tree *tree,
parent = *node;
entry = rb_entry(parent, struct extent_state, rb_node);
- if (end < entry->start) {
+ if (state->end < entry->start) {
+ if (try_merge && end == entry->start &&
+ state->state == entry->state) {
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode,
+ state, entry);
+ entry->start = state->start;
+ merge_prev_state(tree, entry);
+ state->state = 0;
+ return entry;
+ }
node = &(*node)->rb_left;
- } else if (end > entry->end) {
+ } else if (state->end > entry->end) {
+ if (try_merge && entry->end == start &&
+ state->state == entry->state) {
+ if (tree->inode)
+ btrfs_merge_delalloc_extent(tree->inode,
+ state, entry);
+ entry->end = state->end;
+ merge_next_state(tree, entry);
+ state->state = 0;
+ return entry;
+ }
node = &(*node)->rb_right;
} else {
btrfs_err(tree->fs_info,
"found node %llu %llu on insert of %llu %llu",
- entry->start, entry->end, state->start, end);
- return -EEXIST;
+ entry->start, entry->end, state->start, state->end);
+ return ERR_PTR(-EEXIST);
}
}
rb_link_node(&state->rb_node, parent, node);
rb_insert_color(&state->rb_node, &tree->state);
- merge_state(tree, state);
- return 0;
+ return state;
}
/*
@@ -708,26 +753,13 @@ out:
}
-static void wait_on_state(struct extent_io_tree *tree,
- struct extent_state *state)
- __releases(tree->lock)
- __acquires(tree->lock)
-{
- DEFINE_WAIT(wait);
- prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&tree->lock);
- schedule();
- spin_lock(&tree->lock);
- finish_wait(&state->wq, &wait);
-}
-
/*
* Wait for one or more bits to clear on a range in the state tree.
* The range [start, end] is inclusive.
* The tree lock is taken by this function
*/
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached_state)
+static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state)
{
struct extent_state *state;
@@ -758,9 +790,15 @@ process_node:
goto out;
if (state->state & bits) {
+ DEFINE_WAIT(wait);
+
start = state->start;
refcount_inc(&state->refs);
- wait_on_state(tree, state);
+ prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&tree->lock);
+ schedule();
+ spin_lock(&tree->lock);
+ finish_wait(&state->wq, &wait);
free_extent_state(state);
goto again;
}
@@ -847,10 +885,19 @@ bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
if (state->end == start - 1 && extent_state_in_tree(state)) {
while ((state = next_state(state)) != NULL) {
if (state->state & bits)
- goto got_it;
+ break;
}
+ /*
+ * If we found the next extent state, clear cached_state
+ * so that we can cache the next extent state below and
+ * avoid future calls going over the same extent state
+ * again. If we haven't found any, clear as well since
+ * it's now useless.
+ */
free_extent_state(*cached_state);
*cached_state = NULL;
+ if (state)
+ goto got_it;
goto out;
}
free_extent_state(*cached_state);
@@ -1133,6 +1180,8 @@ hit_next:
*/
if (state->start > start) {
u64 this_end;
+ struct extent_state *inserted_state;
+
if (end < last_start)
this_end = end;
else
@@ -1148,12 +1197,15 @@ hit_next:
*/
prealloc->start = start;
prealloc->end = this_end;
- err = insert_state(tree, prealloc, bits, changeset);
- if (err)
+ inserted_state = insert_state(tree, prealloc, bits, changeset);
+ if (IS_ERR(inserted_state)) {
+ err = PTR_ERR(inserted_state);
extent_io_tree_panic(tree, err);
+ }
- cache_state(prealloc, cached_state);
- prealloc = NULL;
+ cache_state(inserted_state, cached_state);
+ if (inserted_state == prealloc)
+ prealloc = NULL;
start = this_end + 1;
goto search_again;
}
@@ -1356,6 +1408,8 @@ hit_next:
*/
if (state->start > start) {
u64 this_end;
+ struct extent_state *inserted_state;
+
if (end < last_start)
this_end = end;
else
@@ -1373,11 +1427,14 @@ hit_next:
*/
prealloc->start = start;
prealloc->end = this_end;
- err = insert_state(tree, prealloc, bits, NULL);
- if (err)
+ inserted_state = insert_state(tree, prealloc, bits, NULL);
+ if (IS_ERR(inserted_state)) {
+ err = PTR_ERR(inserted_state);
extent_io_tree_panic(tree, err);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
+ }
+ cache_state(inserted_state, cached_state);
+ if (inserted_state == prealloc)
+ prealloc = NULL;
start = this_end + 1;
goto search_again;
}
@@ -1640,15 +1697,46 @@ search:
}
/*
- * Search a range in the state tree for a given mask. If 'filled' == 1, this
- * returns 1 only if every extent in the tree has the bits set. Otherwise, 1
- * is returned if any bit in the range is found set.
+ * Check if the single @bit exists in the given range.
+ */
+bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
+{
+ struct extent_state *state = NULL;
+ bool bitset = false;
+
+ ASSERT(is_power_of_2(bit));
+
+ spin_lock(&tree->lock);
+ state = tree_search(tree, start);
+ while (state && start <= end) {
+ if (state->start > end)
+ break;
+
+ if (state->state & bit) {
+ bitset = true;
+ break;
+ }
+
+ /* If state->end is (u64)-1, start will overflow to 0 */
+ start = state->end + 1;
+ if (start > end || start == 0)
+ break;
+ state = next_state(state);
+ }
+ spin_unlock(&tree->lock);
+ return bitset;
+}
+
+/*
+ * Check if the whole range [@start,@end) contains the single @bit set.
*/
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int filled, struct extent_state *cached)
+bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+ struct extent_state *cached)
{
struct extent_state *state = NULL;
- int bitset = 0;
+ bool bitset = true;
+
+ ASSERT(is_power_of_2(bit));
spin_lock(&tree->lock);
if (cached && extent_state_in_tree(cached) && cached->start <= start &&
@@ -1657,35 +1745,35 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
else
state = tree_search(tree, start);
while (state && start <= end) {
- if (filled && state->start > start) {
- bitset = 0;
+ if (state->start > start) {
+ bitset = false;
break;
}
if (state->start > end)
break;
- if (state->state & bits) {
- bitset = 1;
- if (!filled)
- break;
- } else if (filled) {
- bitset = 0;
+ if ((state->state & bit) == 0) {
+ bitset = false;
break;
}
if (state->end == (u64)-1)
break;
+ /*
+ * Last entry (if state->end is (u64)-1 and overflow happens),
+ * or next entry starts after the range.
+ */
start = state->end + 1;
- if (start > end)
+ if (start > end || start == 0)
break;
state = next_state(state);
}
/* We ran out of states and were still inside of our range. */
- if (filled && !state)
- bitset = 0;
+ if (!state)
+ bitset = false;
spin_unlock(&tree->lock);
return bitset;
}
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 28c23a23d121..5602b0137fcd 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -131,8 +131,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
struct extent_state **cached_state);
void free_extent_state(struct extent_state *state);
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int filled, struct extent_state *cached_state);
+bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+ struct extent_state *cached_state);
+bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -192,7 +193,5 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached_state);
#endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f356f08b55cb..c8e5b4715b49 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -42,14 +42,16 @@
#include "file-item.h"
#include "orphan.h"
#include "tree-checker.h"
+#include "raid-stripe-tree.h"
#undef SCRAMBLE_DELAYED_REFS
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
- u64 owner_offset, int refs_to_drop,
+ u64 owner_offset,
struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
@@ -57,7 +59,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
- struct btrfs_key *ins, int ref_mod);
+ struct btrfs_key *ins, int ref_mod, u64 oref_root);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op);
@@ -344,9 +346,15 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
int type = btrfs_extent_inline_ref_type(eb, iref);
u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ return type;
+ }
+
if (type == BTRFS_TREE_BLOCK_REF_KEY ||
type == BTRFS_SHARED_BLOCK_REF_KEY ||
type == BTRFS_SHARED_DATA_REF_KEY ||
@@ -355,26 +363,25 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_TREE_BLOCK_REF_KEY)
return type;
if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
- ASSERT(eb->fs_info);
+ ASSERT(fs_info);
/*
* Every shared one has parent tree block,
* which must be aligned to sector size.
*/
- if (offset &&
- IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
return type;
}
} else if (is_data == BTRFS_REF_TYPE_DATA) {
if (type == BTRFS_EXTENT_DATA_REF_KEY)
return type;
if (type == BTRFS_SHARED_DATA_REF_KEY) {
- ASSERT(eb->fs_info);
+ ASSERT(fs_info);
/*
* Every shared one has parent tree block,
* which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ IS_ALIGNED(offset, fs_info->sectorsize))
return type;
}
} else {
@@ -385,7 +392,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
WARN_ON(1);
btrfs_print_leaf(eb);
- btrfs_err(eb->fs_info,
+ btrfs_err(fs_info,
"eb %llu iref 0x%lx invalid extent inline ref type %d",
eb->start, (unsigned long)iref, type);
@@ -399,11 +406,11 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
__le64 lenum;
lenum = cpu_to_le64(root_objectid);
- high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+ high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(owner);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(offset);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
@@ -575,7 +582,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
}
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
fail:
btrfs_release_path(path);
@@ -623,7 +630,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
return ret;
}
@@ -789,7 +796,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
int type;
int want;
int ret;
- int err = 0;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
int needed;
@@ -816,10 +822,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
/*
* We may be a newly converted file system which still has the old fat
@@ -846,7 +850,7 @@ again:
}
if (ret && !insert) {
- err = -ENOENT;
+ ret = -ENOENT;
goto out;
} else if (WARN_ON(ret)) {
btrfs_print_leaf(path->nodes[0]);
@@ -854,18 +858,18 @@ again:
"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
bytenr, num_bytes, parent, root_objectid, owner,
offset);
- err = -EIO;
+ ret = -EUCLEAN;
goto out;
}
leaf = path->nodes[0];
item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_err(fs_info,
"unexpected extent item size, has %llu expect >= %zu",
item_size, sizeof(*ei));
- btrfs_abort_transaction(trans, err);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -885,22 +889,17 @@ again:
else
needed = BTRFS_REF_TYPE_BLOCK;
- err = -ENOENT;
- while (1) {
- if (ptr >= end) {
- if (ptr > end) {
- err = -EUCLEAN;
- btrfs_print_leaf(path->nodes[0]);
- btrfs_crit(fs_info,
-"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
- path->slots[0], root_objectid, owner, offset, parent);
- }
- break;
- }
+ ret = -ENOENT;
+ while (ptr < end) {
iref = (struct btrfs_extent_inline_ref *)ptr;
type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ ptr += btrfs_extent_inline_ref_size(type);
+ continue;
+ }
if (type == BTRFS_REF_TYPE_INVALID) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
goto out;
}
@@ -916,7 +915,7 @@ again:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
if (match_extent_data_ref(leaf, dref, root_objectid,
owner, offset)) {
- err = 0;
+ ret = 0;
break;
}
if (hash_extent_data_ref_item(leaf, dref) <
@@ -927,14 +926,14 @@ again:
ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
if (parent > 0) {
if (parent == ref_offset) {
- err = 0;
+ ret = 0;
break;
}
if (ref_offset < parent)
break;
} else {
if (root_objectid == ref_offset) {
- err = 0;
+ ret = 0;
break;
}
if (ref_offset < root_objectid)
@@ -943,10 +942,20 @@ again:
}
ptr += btrfs_extent_inline_ref_size(type);
}
- if (err == -ENOENT && insert) {
+
+ if (unlikely(ptr > end)) {
+ ret = -EUCLEAN;
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_crit(fs_info,
+"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
+ path->slots[0], root_objectid, owner, offset, parent);
+ goto out;
+ }
+
+ if (ret == -ENOENT && insert) {
if (item_size + extra_size >=
BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
/*
@@ -958,7 +967,7 @@ again:
if (find_next_key(path, 0, &key) == 0 &&
key.objectid == bytenr &&
key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out;
}
}
@@ -969,14 +978,14 @@ out:
path->search_for_extension = 0;
btrfs_unlock_up_safe(path, 1);
}
- return err;
+ return ret;
}
/*
* helper to add new inline back ref
*/
static noinline_for_stack
-void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
+void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
u64 parent, u64 root_objectid,
@@ -999,7 +1008,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
type = extent_ref_type(parent, owner);
size = btrfs_extent_inline_ref_size(type);
- btrfs_extend_item(path, size);
+ btrfs_extend_item(trans, path, size);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
@@ -1033,7 +1042,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
} else {
btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
@@ -1066,7 +1075,9 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
/*
* helper to update/remove inline back ref
*/
-static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *path,
+static noinline_for_stack int update_inline_extent_backref(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_mod,
struct btrfs_delayed_extent_op *extent_op)
@@ -1174,9 +1185,9 @@ static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *pa
memmove_extent_buffer(leaf, ptr, ptr + size,
end - ptr - size);
item_size -= size;
- btrfs_truncate_item(path, item_size, 1);
+ btrfs_truncate_item(trans, path, item_size, 1);
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
return 0;
}
@@ -1206,9 +1217,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
bytenr, num_bytes, root_objectid, path->slots[0]);
return -EUCLEAN;
}
- ret = update_inline_extent_backref(path, iref, refs_to_add, extent_op);
+ ret = update_inline_extent_backref(trans, path, iref,
+ refs_to_add, extent_op);
} else if (ret == -ENOENT) {
- setup_inline_extent_backref(trans->fs_info, path, iref, parent,
+ setup_inline_extent_backref(trans, path, iref, parent,
root_objectid, owner, offset,
refs_to_add, extent_op);
ret = 0;
@@ -1226,7 +1238,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
BUG_ON(!is_data && refs_to_drop != 1);
if (iref)
- ret = update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
+ ret = update_inline_extent_backref(trans, path, iref,
+ -refs_to_drop, NULL);
else if (is_data)
ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
else
@@ -1422,7 +1435,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
generic_ref->action);
BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
- generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
+ generic_ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
@@ -1435,7 +1448,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
}
/*
- * __btrfs_inc_extent_ref - insert backreference for a given extent
+ * Insert backreference for a given extent.
*
* The counterpart is in __btrfs_free_extent(), with examples and more details
* how it works.
@@ -1465,8 +1478,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
* always passed as 0. For data extents it is the fileoffset
* this extent belongs to.
*
- * @refs_to_add Number of references to add
- *
* @extent_op Pointer to a structure, holding information necessary when
* updating a tree block's flags
*
@@ -1474,7 +1485,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int refs_to_add,
+ u64 owner, u64 offset,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_path *path;
@@ -1484,6 +1495,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
u64 refs;
+ int refs_to_add = node->ref_mod;
int ret;
path = btrfs_alloc_path();
@@ -1510,19 +1522,18 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, item);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/* now insert the actual backref */
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- BUG_ON(refs_to_add != 1);
+ if (owner < BTRFS_FIRST_FREE_OBJECTID)
ret = insert_tree_block_ref(trans, path, bytenr, parent,
root_objectid);
- } else {
+ else
ret = insert_extent_data_ref(trans, path, bytenr, parent,
root_objectid, owner, offset,
refs_to_add);
- }
+
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -1531,44 +1542,57 @@ out:
}
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
int ret = 0;
struct btrfs_delayed_data_ref *ref;
- struct btrfs_key ins;
u64 parent = 0;
- u64 ref_root = 0;
u64 flags = 0;
- ins.objectid = node->bytenr;
- ins.offset = node->num_bytes;
- ins.type = BTRFS_EXTENT_ITEM_KEY;
-
ref = btrfs_delayed_node_to_data_ref(node);
trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
if (node->type == BTRFS_SHARED_DATA_REF_KEY)
parent = ref->parent;
- ref_root = ref->root;
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+ struct btrfs_key key;
+ struct btrfs_squota_delta delta = {
+ .root = href->owning_root,
+ .num_bytes = node->num_bytes,
+ .rsv_bytes = href->reserved_bytes,
+ .is_data = true,
+ .is_inc = true,
+ .generation = trans->transid,
+ };
+
if (extent_op)
flags |= extent_op->flags_to_set;
- ret = alloc_reserved_file_extent(trans, parent, ref_root,
+
+ key.objectid = node->bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = node->num_bytes;
+
+ ret = alloc_reserved_file_extent(trans, parent, ref->root,
flags, ref->objectid,
- ref->offset, &ins,
- node->ref_mod);
+ ref->offset, &key,
+ node->ref_mod, href->owning_root);
+ if (!ret)
+ ret = btrfs_record_squota_delta(trans->fs_info, &delta);
+ else
+ btrfs_qgroup_free_refroot(trans->fs_info, delta.root,
+ delta.rsv_bytes, BTRFS_QGROUP_RSV_DATA);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
+ ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root,
ref->objectid, ref->offset,
- node->ref_mod, extent_op);
+ extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, node, parent,
- ref_root, ref->objectid,
- ref->offset, node->ref_mod,
- extent_op);
+ ret = __btrfs_free_extent(trans, href, node, parent,
+ ref->root, ref->objectid,
+ ref->offset, extent_op);
} else {
BUG();
}
@@ -1605,7 +1629,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
u32 item_size;
int ret;
- int err = 0;
int metadata = 1;
if (TRANS_ABORTED(trans))
@@ -1632,10 +1655,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
- err = ret;
goto out;
- }
- if (ret > 0) {
+ } else if (ret > 0) {
if (metadata) {
if (path->slots[0] > 0) {
path->slots[0]--;
@@ -1656,7 +1677,10 @@ again:
goto again;
}
} else {
- err = -EIO;
+ ret = -EUCLEAN;
+ btrfs_err(fs_info,
+ "missing extent item for extent %llu num_bytes %llu level %d",
+ head->bytenr, head->num_bytes, extent_op->level);
goto out;
}
}
@@ -1665,29 +1689,31 @@ again:
item_size = btrfs_item_size(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_err(fs_info,
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
- btrfs_abort_transaction(trans, err);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
__run_delayed_extent_op(extent_op, leaf, ei);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
- return err;
+ return ret;
}
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
int ret = 0;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_tree_ref *ref;
u64 parent = 0;
u64 ref_root = 0;
@@ -1699,22 +1725,33 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
parent = ref->parent;
ref_root = ref->root;
- if (node->ref_mod != 1) {
+ if (unlikely(node->ref_mod != 1)) {
btrfs_err(trans->fs_info,
- "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
+ "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
node->bytenr, node->ref_mod, node->action, ref_root,
parent);
- return -EIO;
+ return -EUCLEAN;
}
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+ struct btrfs_squota_delta delta = {
+ .root = href->owning_root,
+ .num_bytes = fs_info->nodesize,
+ .rsv_bytes = 0,
+ .is_data = false,
+ .is_inc = true,
+ .generation = trans->transid,
+ };
+
BUG_ON(!extent_op || !extent_op->update_flags);
ret = alloc_reserved_tree_block(trans, node, extent_op);
+ if (!ret)
+ btrfs_record_squota_delta(fs_info, &delta);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ref->level, 0, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, node, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ret = __btrfs_free_extent(trans, href, node, parent, ref_root,
+ ref->level, 0, extent_op);
} else {
BUG();
}
@@ -1723,6 +1760,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
@@ -1737,12 +1775,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
node->type == BTRFS_SHARED_BLOCK_REF_KEY)
- ret = run_delayed_tree_ref(trans, node, extent_op,
+ ret = run_delayed_tree_ref(trans, href, node, extent_op,
insert_reserved);
else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
node->type == BTRFS_SHARED_DATA_REF_KEY)
- ret = run_delayed_data_ref(trans, node, extent_op,
+ ret = run_delayed_data_ref(trans, href, node, extent_op,
insert_reserved);
+ else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY)
+ ret = 0;
else
BUG();
if (ret && insert_reserved)
@@ -1821,28 +1861,37 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
return ret ? ret : 1;
}
-void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
- int nr_items = 1; /* Dropping this ref head update. */
-
/*
* We had csum deletions accounted for in our delayed refs rsv, we need
* to drop the csum leaves for this update from our delayed_refs_rsv.
*/
if (head->total_ref_mod < 0 && head->is_data) {
+ int nr_csums;
+
spin_lock(&delayed_refs->lock);
delayed_refs->pending_csums -= head->num_bytes;
spin_unlock(&delayed_refs->lock);
- nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+ nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+
+ btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);
+
+ return btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
}
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE &&
+ head->must_insert_reserved && head->is_data)
+ btrfs_qgroup_free_refroot(fs_info, head->owning_root,
+ head->reserved_bytes, BTRFS_QGROUP_RSV_DATA);
- btrfs_delayed_refs_rsv_release(fs_info, nr_items);
+ return 0;
}
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head)
+ struct btrfs_delayed_ref_head *head,
+ u64 *bytes_released)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1887,7 +1936,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
}
}
- btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+ *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
@@ -1929,7 +1978,8 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
}
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *locked_ref)
+ struct btrfs_delayed_ref_head *locked_ref,
+ u64 *bytes_released)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -1983,8 +2033,10 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
locked_ref->extent_op = NULL;
spin_unlock(&locked_ref->lock);
- ret = run_one_delayed_ref(trans, ref, extent_op,
+ ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op,
must_insert_reserved);
+ btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
+ *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
@@ -2008,15 +2060,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
* Returns -ENOMEM or -EIO on failure and will abort the transaction.
*/
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- unsigned long nr)
+ u64 min_bytes)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *locked_ref = NULL;
int ret;
unsigned long count = 0;
+ unsigned long max_count = 0;
+ u64 bytes_processed = 0;
delayed_refs = &trans->transaction->delayed_refs;
+ if (min_bytes == 0) {
+ max_count = delayed_refs->num_heads_ready;
+ min_bytes = U64_MAX;
+ }
+
do {
if (!locked_ref) {
locked_ref = btrfs_obtain_ref_head(trans);
@@ -2044,7 +2103,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_lock(&locked_ref->lock);
btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
- ret = btrfs_run_delayed_refs_for_head(trans, locked_ref);
+ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
if (ret < 0 && ret != -EAGAIN) {
/*
* Error, btrfs_run_delayed_refs_for_head already
@@ -2056,7 +2115,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
* Success, perform the usual cleanup of a processed
* head
*/
- ret = cleanup_ref_head(trans, locked_ref);
+ ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
if (ret > 0 ) {
/* We dropped our lock, we need to loop. */
ret = 0;
@@ -2073,7 +2132,9 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
locked_ref = NULL;
cond_resched();
- } while ((nr != -1 && count < nr) || locked_ref);
+ } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
+ (max_count > 0 && count < max_count) ||
+ locked_ref);
return 0;
}
@@ -2122,24 +2183,25 @@ static u64 find_middle(struct rb_root *root)
#endif
/*
- * this starts processing the delayed reference count updates and
- * extent insertions we have queued up so far. count can be
- * 0, which means to process everything in the tree at the start
- * of the run (but not newly added entries), or it can be some target
- * number you'd like to process.
+ * Start processing the delayed reference count updates and extent insertions
+ * we have queued up so far.
+ *
+ * @trans: Transaction handle.
+ * @min_bytes: How many bytes of delayed references to process. After this
+ * many bytes we stop processing delayed references if there are
+ * any more. If 0 it means to run all existing delayed references,
+ * but not new ones added after running all existing ones.
+ * Use (u64)-1 (U64_MAX) to run all existing delayed references
+ * plus any new ones that are added.
*
* Returns 0 on success or if called with an aborted transaction
* Returns <0 on error and aborts the transaction
*/
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- unsigned long count)
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_delayed_ref_head *head;
int ret;
- int run_all = count == (unsigned long)-1;
/* We'll clean this up in btrfs_cleanup_transaction */
if (TRANS_ABORTED(trans))
@@ -2149,42 +2211,30 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
return 0;
delayed_refs = &trans->transaction->delayed_refs;
- if (count == 0)
- count = delayed_refs->num_heads_ready;
-
again:
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
- ret = __btrfs_run_delayed_refs(trans, count);
+ ret = __btrfs_run_delayed_refs(trans, min_bytes);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
return ret;
}
- if (run_all) {
+ if (min_bytes == U64_MAX) {
btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
- node = rb_first_cached(&delayed_refs->href_root);
- if (!node) {
+ if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
spin_unlock(&delayed_refs->lock);
- goto out;
+ return 0;
}
- head = rb_entry(node, struct btrfs_delayed_ref_head,
- href_node);
- refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
- /* Mutex was contended, block until it's released and retry. */
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
-
- btrfs_put_delayed_ref_head(head);
cond_resched();
goto again;
}
-out:
+
return 0;
}
@@ -2309,6 +2359,7 @@ static noinline int check_committed_ref(struct btrfs_root *root,
struct btrfs_extent_item *ei;
struct btrfs_key key;
u32 item_size;
+ u32 expected_size;
int type;
int ret;
@@ -2335,10 +2386,22 @@ static noinline int check_committed_ref(struct btrfs_root *root,
ret = 1;
item_size = btrfs_item_size(leaf, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
+
+ /* No inline refs; we need to bail before checking for owner ref. */
+ if (item_size == sizeof(*ei))
+ goto out;
+
+ /* Check for an owner ref; skip over it to the real inline refs. */
+ iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
+ if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
+ iref = (struct btrfs_extent_inline_ref *)(iref + 1);
+ }
/* If extent item has more than 1 inline ref then it's shared */
- if (item_size != sizeof(*ei) +
- btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
+ if (item_size != expected_size)
goto out;
/*
@@ -2350,8 +2413,6 @@ static noinline int check_committed_ref(struct btrfs_root *root,
btrfs_root_last_snapshot(&root->root_item)))
goto out;
- iref = (struct btrfs_extent_inline_ref *)(ei + 1);
-
/* If this extent has SHARED_DATA_REF then it's shared */
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
if (type != BTRFS_EXTENT_DATA_REF_KEY)
@@ -2448,7 +2509,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
key.offset -= btrfs_file_extent_offset(buf, fi);
btrfs_init_generic_ref(&generic_ref, action, bytenr,
- num_bytes, parent);
+ num_bytes, parent, ref_root);
btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
key.offset, root->root_key.objectid,
for_reloc);
@@ -2461,8 +2522,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = fs_info->nodesize;
+ /* We don't know the owning_root, use 0. */
btrfs_init_generic_ref(&generic_ref, action, bytenr,
- num_bytes, parent);
+ num_bytes, parent, 0);
btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
root->root_key.objectid, for_reloc);
if (inc)
@@ -2563,16 +2625,13 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans,
return 0;
}
-/*
- * this function must be called within transaction
- */
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes)
+ const struct extent_buffer *eb)
{
struct btrfs_block_group *cache;
int ret;
- cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
+ cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
if (!cache)
return -EINVAL;
@@ -2584,10 +2643,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- pin_down_extent(trans, cache, bytenr, num_bytes, 0);
+ pin_down_extent(trans, cache, eb->start, eb->len, 0);
/* remove us from the free space cache (if we're there at all) */
- ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
+ ret = btrfs_remove_free_space(cache, eb->start, eb->len);
out:
btrfs_put_block_group(cache);
return ret;
@@ -2842,12 +2901,61 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
return 0;
}
+/*
+ * Parse an extent item's inline extents looking for a simple quotas owner ref.
+ *
+ * @fs_info: the btrfs_fs_info for this mount
+ * @leaf: a leaf in the extent tree containing the extent item
+ * @slot: the slot in the leaf where the extent item is found
+ *
+ * Returns the objectid of the root that originally allocated the extent item
+ * if the inline owner ref is expected and present, otherwise 0.
+ *
+ * If an extent item has an owner ref item, it will be the first inline ref
+ * item. Therefore the logic is to check whether there are any inline ref
+ * items, then check the type of the first one.
+ */
+u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, int slot)
+{
+ struct btrfs_extent_item *ei;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_extent_owner_ref *oref;
+ unsigned long ptr;
+ unsigned long end;
+ int type;
+
+ if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA))
+ return 0;
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ ptr = (unsigned long)(ei + 1);
+ end = (unsigned long)ei + btrfs_item_size(leaf, slot);
+
+ /* No inline ref items of any kind, can't check type. */
+ if (ptr == end)
+ return 0;
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
+
+ /* We found an owner ref, get the root out of it. */
+ if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
+ oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+ return btrfs_extent_owner_ref_root_id(leaf, oref);
+ }
+
+ /* We have inline refs, but not an owner ref. */
+ return 0;
+}
+
static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, bool is_data)
+ u64 bytenr, struct btrfs_squota_delta *delta)
{
int ret;
+ u64 num_bytes = delta->num_bytes;
- if (is_data) {
+ if (delta->is_data) {
struct btrfs_root *csum_root;
csum_root = btrfs_csum_root(trans->fs_info, bytenr);
@@ -2856,6 +2964,18 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
return ret;
}
+
+ ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ }
+
+ ret = btrfs_record_squota_delta(trans->fs_info, delta);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
}
ret = add_to_free_space_tree(trans, bytenr, num_bytes);
@@ -2938,9 +3058,10 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
* And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
*/
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
- u64 owner_offset, int refs_to_drop,
+ u64 owner_offset,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *info = trans->fs_info;
@@ -2955,11 +3076,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int extent_slot = 0;
int found_extent = 0;
int num_to_del = 1;
+ int refs_to_drop = node->ref_mod;
u32 item_size;
u64 refs;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
+ u64 delayed_ref_root = href->owning_root;
extent_root = btrfs_extent_root(info, bytenr);
ASSERT(extent_root);
@@ -3149,7 +3272,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
} else {
btrfs_set_extent_refs(leaf, ei, refs);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
if (found_extent) {
ret = remove_extent_backref(trans, extent_root, path,
@@ -3160,6 +3283,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
} else {
+ struct btrfs_squota_delta delta = {
+ .root = delayed_ref_root,
+ .num_bytes = num_bytes,
+ .rsv_bytes = 0,
+ .is_data = is_data,
+ .is_inc = false,
+ .generation = btrfs_extent_generation(leaf, ei),
+ };
+
/* In this branch refs == 1 */
if (found_extent) {
if (is_data && refs_to_drop !=
@@ -3198,6 +3330,16 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
num_to_del = 2;
}
}
+ /*
+ * We can't infer the data owner from the delayed ref, so we need
+ * to try to get it from the owning ref item.
+ *
+ * If it is not present, then that extent was not written under
+ * simple quotas mode, so we don't need to account for its deletion.
+ */
+ if (is_data)
+ delta.root = btrfs_get_extent_owner_root(trans->fs_info,
+ leaf, extent_slot);
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
@@ -3207,7 +3349,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
+ ret = do_free_extent_accounting(trans, bytenr, &delta);
}
btrfs_release_path(path);
@@ -3281,7 +3423,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
int ret;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
- buf->start, buf->len, parent);
+ buf->start, buf->len, parent, btrfs_header_owner(buf));
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
root_id, 0, false);
@@ -3368,10 +3510,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
* tree, just update pinning info and exit early.
*/
if ((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
- /* unlocks the pinned mutex */
+ ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
@@ -3381,9 +3522,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
}
if (!((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
+ ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
btrfs_ref_tree_mod(fs_info, ref);
return ret;
@@ -4440,8 +4581,8 @@ loop:
}
/*
- * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
- * hole that is at least as big as @num_bytes.
+ * Entry point to the extent allocator. Tries to find a hole that is at least
+ * as big as @num_bytes.
*
* @root - The root that will contain this extent
*
@@ -4560,20 +4701,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
- u64 len)
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
+ const struct extent_buffer *eb)
{
struct btrfs_block_group *cache;
int ret = 0;
- cache = btrfs_lookup_block_group(trans->fs_info, start);
+ cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
if (!cache) {
btrfs_err(trans->fs_info, "unable to find block group for %llu",
- start);
+ eb->start);
return -ENOSPC;
}
- ret = pin_down_extent(trans, cache, start, len, 1);
+ ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
btrfs_put_block_group(cache);
return ret;
}
@@ -4603,24 +4744,29 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
- struct btrfs_key *ins, int ref_mod)
+ struct btrfs_key *ins, int ref_mod, u64 oref_root)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *extent_root;
int ret;
struct btrfs_extent_item *extent_item;
+ struct btrfs_extent_owner_ref *oref;
struct btrfs_extent_inline_ref *iref;
struct btrfs_path *path;
struct extent_buffer *leaf;
int type;
u32 size;
+ const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
if (parent > 0)
type = BTRFS_SHARED_DATA_REF_KEY;
else
type = BTRFS_EXTENT_DATA_REF_KEY;
- size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
+ size = sizeof(*extent_item);
+ if (simple_quota)
+ size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
+ size += btrfs_extent_inline_ref_size(type);
path = btrfs_alloc_path();
if (!path)
@@ -4642,7 +4788,14 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
flags | BTRFS_EXTENT_FLAG_DATA);
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+ if (simple_quota) {
+ btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY);
+ oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+ btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root);
+ iref = (struct btrfs_extent_inline_ref *)(oref + 1);
+ }
btrfs_set_extent_inline_ref_type(leaf, iref, type);
+
if (parent > 0) {
struct btrfs_shared_data_ref *ref;
ref = (struct btrfs_shared_data_ref *)(iref + 1);
@@ -4657,7 +4810,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
}
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_free_path(path);
return alloc_reserved_extent(trans, ins->objectid, ins->offset);
@@ -4732,7 +4885,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_free_path(path);
return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
@@ -4744,12 +4897,17 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins)
{
struct btrfs_ref generic_ref = { 0 };
+ u64 root_objectid = root->root_key.objectid;
+ u64 owning_root = root_objectid;
+
+ BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
- BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
+ owning_root = root->relocation_src_root;
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
- ins->objectid, ins->offset, 0);
- btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
+ ins->objectid, ins->offset, 0, owning_root);
+ btrfs_init_data_ref(&generic_ref, root_objectid, owner,
offset, 0, false);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
@@ -4769,6 +4927,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
+ struct btrfs_squota_delta delta = {
+ .root = root_objectid,
+ .num_bytes = ins->offset,
+ .generation = trans->transid,
+ .rsv_bytes = 0,
+ .is_data = true,
+ .is_inc = true,
+ };
/*
* Mixed block groups will exclude before processing the log so we only
@@ -4794,13 +4960,36 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
spin_unlock(&space_info->lock);
ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
- offset, ins, 1);
+ offset, ins, 1, root_objectid);
if (ret)
btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
+ ret = btrfs_record_squota_delta(fs_info, &delta);
btrfs_put_block_group(block_group);
return ret;
}
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Extra safety check in case the extent tree is corrupted and extent allocator
+ * chooses to use a tree block which is already used and locked.
+ */
+static bool check_eb_lock_owner(const struct extent_buffer *eb)
+{
+ if (eb->lock_owner == current->pid) {
+ btrfs_err_rl(eb->fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+ eb->start, btrfs_header_owner(eb), current->pid);
+ return true;
+ }
+ return false;
+}
+#else
+static bool check_eb_lock_owner(struct extent_buffer *eb)
+{
+ return false;
+}
+#endif
+
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 bytenr, int level, u64 owner,
@@ -4814,15 +5003,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (IS_ERR(buf))
return buf;
- /*
- * Extra safety check in case the extent tree is corrupted and extent
- * allocator chooses to use a tree block which is already used and
- * locked.
- */
- if (buf->lock_owner == current->pid) {
- btrfs_err_rl(fs_info,
-"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
- buf->start, btrfs_header_owner(buf), current->pid);
+ if (check_eb_lock_owner(buf)) {
free_extent_buffer(buf);
return ERR_PTR(-EUCLEAN);
}
@@ -4899,6 +5080,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
const struct btrfs_disk_key *key,
int level, u64 hint,
u64 empty_size,
+ u64 reloc_src_root,
enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4911,6 +5093,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
int ret;
u32 blocksize = fs_info->nodesize;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
+ u64 owning_root;
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (btrfs_is_testing(fs_info)) {
@@ -4937,11 +5120,13 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
ret = PTR_ERR(buf);
goto out_free_reserved;
}
+ owning_root = btrfs_header_owner(buf);
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
if (parent == 0)
parent = ins.objectid;
flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ owning_root = reloc_src_root;
} else
BUG_ON(parent > 0);
@@ -4961,7 +5146,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->level = level;
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
- ins.objectid, ins.offset, parent);
+ ins.objectid, ins.offset, parent, owning_root);
btrfs_init_tree_ref(&generic_ref, level, root_objectid,
root->root_key.objectid, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
@@ -5382,7 +5567,8 @@ skip:
find_next_key(path, level, &wc->drop_progress);
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- fs_info->nodesize, parent);
+ fs_info->nodesize, parent,
+ btrfs_header_owner(next));
btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
0, false);
ret = btrfs_free_extent(trans, &ref);
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 88c249c37516..0716f65d9753 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -7,6 +7,7 @@
#include "block-group.h"
struct btrfs_free_cluster;
+struct btrfs_delayed_ref_head;
enum btrfs_extent_allocation_policy {
BTRFS_EXTENT_ALLOC_CLUSTERED,
@@ -91,8 +92,8 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
-int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count);
-void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes);
+u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
@@ -102,7 +103,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes);
+ const struct extent_buffer *eb);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr, bool strict,
@@ -113,6 +114,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
const struct btrfs_disk_key *key,
int level, u64 hint,
u64 empty_size,
+ u64 reloc_src_root,
enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
u64 root_id,
@@ -136,12 +138,15 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
+u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, int slot);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
-int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len);
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
+ const struct extent_buffer *eb);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref);
-int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ac3fca5a5e41..03cef28d9e37 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -21,7 +21,6 @@
#include "ctree.h"
#include "btrfs_inode.h"
#include "bio.h"
-#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
@@ -395,7 +394,7 @@ again:
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
- EXTENT_DELALLOC, 1, cached_state);
+ EXTENT_DELALLOC, cached_state);
if (!ret) {
unlock_extent(tree, delalloc_start, delalloc_end,
&cached_state);
@@ -484,10 +483,8 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio)
bvec->bv_offset, bvec->bv_len);
btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error);
- if (error) {
- btrfs_page_clear_uptodate(fs_info, page, start, len);
+ if (error)
mapping_set_error(page->mapping, error);
- }
btrfs_page_clear_writeback(fs_info, page, start, len);
}
@@ -1456,8 +1453,6 @@ done:
if (ret) {
btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start,
PAGE_SIZE, !ret);
- btrfs_page_clear_uptodate(btrfs_sb(inode->i_sb), page,
- page_start, PAGE_SIZE);
mapping_set_error(page->mapping, ret);
}
unlock_page(page);
@@ -1624,8 +1619,6 @@ static void extent_buffer_write_end_io(struct btrfs_bio *bbio)
struct page *page = bvec->bv_page;
u32 len = bvec->bv_len;
- if (!uptodate)
- btrfs_page_clear_uptodate(fs_info, page, start, len);
btrfs_page_clear_writeback(fs_info, page, start, len);
bio_offset += len;
}
@@ -2201,7 +2194,6 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page,
if (ret) {
btrfs_mark_ordered_io_finished(BTRFS_I(inode), page,
cur, cur_len, !ret);
- btrfs_page_clear_uptodate(fs_info, page, cur, cur_len);
mapping_set_error(page->mapping, ret);
}
btrfs_page_unlock_writer(fs_info, page, cur, cur_len);
@@ -2301,7 +2293,7 @@ static int try_release_extent_state(struct extent_io_tree *tree,
u64 end = start + PAGE_SIZE - 1;
int ret = 1;
- if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
+ if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
ret = 0;
} else {
u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
@@ -2360,9 +2352,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
free_extent_map(em);
break;
}
- if (test_range_bit(tree, em->start,
- extent_map_end(em) - 1,
- EXTENT_LOCKED, 0, NULL))
+ if (test_range_bit_exists(tree, em->start,
+ extent_map_end(em) - 1,
+ EXTENT_LOCKED))
goto next;
/*
* If it's not in the list of modified extents, used
@@ -3462,6 +3454,12 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
start, fs_info->nodesize);
return -EINVAL;
}
+ if (!IS_ALIGNED(start, fs_info->nodesize) &&
+ !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) {
+ btrfs_warn(fs_info,
+"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
+ start, fs_info->nodesize);
+ }
return 0;
}
@@ -4002,8 +4000,14 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
char *dst = (char *)dstv;
unsigned long i = get_eb_page_index(start);
- if (check_eb_range(eb, start, len))
+ if (check_eb_range(eb, start, len)) {
+ /*
+ * Invalid range hit, reset the memory, so callers won't get
+ * some random garbage for their uninitialzed memory.
+ */
+ memset(dstv, 0, len);
return;
+ }
offset = get_eb_offset_in_page(eb, start);
@@ -4249,14 +4253,14 @@ void copy_extent_buffer(const struct extent_buffer *dst,
}
/*
- * eb_bitmap_offset() - calculate the page and offset of the byte containing the
- * given bit number
- * @eb: the extent buffer
- * @start: offset of the bitmap item in the extent buffer
- * @nr: bit number
- * @page_index: return index of the page in the extent buffer that contains the
- * given bit number
- * @page_offset: return offset into the page given by page_index
+ * Calculate the page and offset of the byte containing the given bit number.
+ *
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number
+ * @page_index: return index of the page in the extent buffer that contains
+ * the given bit number
+ * @page_offset: return offset into the page given by page_index
*
* This helper hides the ugliness of finding the byte in an extent buffer which
* contains a given bit.
@@ -4615,7 +4619,8 @@ int try_release_extent_buffer(struct page *page)
}
/*
- * btrfs_readahead_tree_block - attempt to readahead a child block
+ * Attempt to readahead a child block.
+ *
* @fs_info: the fs_info
* @bytenr: bytenr to read
* @owner_root: objectid of the root that owns this eb
@@ -4654,7 +4659,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
}
/*
- * btrfs_readahead_node_child - readahead a node's child block
+ * Readahead a node's child block.
+ *
* @node: parent node we're reading from
* @slot: slot in the parent node for the child we want to read
*
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 68368ba99321..2171057a4477 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -80,16 +80,16 @@ struct extent_buffer {
spinlock_t refs_lock;
atomic_t refs;
int read_mirror;
- struct rcu_head rcu_head;
- pid_t lock_owner;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
s8 log_index;
+ struct rcu_head rcu_head;
struct rw_semaphore lock;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
+ pid_t lock_owner;
#endif
};
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1ce5dd154499..45cae356e89b 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -194,7 +194,7 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_encryption(leaf, item, 0);
btrfs_set_file_extent_other_encoding(leaf, item, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
return ret;
@@ -811,11 +811,12 @@ blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio)
* This calls btrfs_truncate_item with the correct args based on the overlap,
* and fixes up the key as required.
*/
-static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
+static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_key *key,
u64 bytenr, u64 len)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct extent_buffer *leaf;
const u32 csum_size = fs_info->csum_size;
u64 csum_end;
@@ -836,7 +837,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
*/
u32 new_size = (bytenr - key->offset) >> blocksize_bits;
new_size *= csum_size;
- btrfs_truncate_item(path, new_size, 1);
+ btrfs_truncate_item(trans, path, new_size, 1);
} else if (key->offset >= bytenr && csum_end > end_byte &&
end_byte > key->offset) {
/*
@@ -848,10 +849,10 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
u32 new_size = (csum_end - end_byte) >> blocksize_bits;
new_size *= csum_size;
- btrfs_truncate_item(path, new_size, 0);
+ btrfs_truncate_item(trans, path, new_size, 0);
key->offset = end_byte;
- btrfs_set_item_key_safe(fs_info, path, key);
+ btrfs_set_item_key_safe(trans, path, key);
} else {
BUG();
}
@@ -994,7 +995,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
key.offset = end_byte - 1;
} else {
- truncate_one_csum(fs_info, path, &key, bytenr, len);
+ truncate_one_csum(trans, path, &key, bytenr, len);
if (key.offset < bytenr)
break;
}
@@ -1202,7 +1203,7 @@ extend_csum:
diff /= csum_size;
diff *= csum_size;
- btrfs_extend_item(path, diff);
+ btrfs_extend_item(trans, path, diff);
ret = 0;
goto csum;
}
@@ -1249,7 +1250,7 @@ found:
ins_size /= csum_size;
total_bytes += ins_size * fs_info->sectorsize;
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
if (total_bytes < sums->len) {
btrfs_release_path(path);
cond_resched();
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ca46a529d56b..f47731c45bb5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -17,6 +17,7 @@
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
+#include <linux/iomap.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -368,12 +369,13 @@ next_slot:
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - args->start);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
- disk_bytenr, num_bytes, 0);
+ disk_bytenr, num_bytes, 0,
+ root->root_key.objectid);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
new_key.objectid,
@@ -405,13 +407,13 @@ next_slot:
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = args->end;
- btrfs_set_item_key_safe(fs_info, path, &new_key);
+ btrfs_set_item_key_safe(trans, path, &new_key);
extent_offset += args->end - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - args->end);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0)
args->bytes_found += args->end - key.offset;
break;
@@ -431,7 +433,7 @@ next_slot:
btrfs_set_file_extent_num_bytes(leaf, fi,
args->start - key.offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (update_refs && disk_bytenr > 0)
args->bytes_found += extent_end - args->start;
if (args->end == extent_end)
@@ -463,7 +465,8 @@ delete_extent_item:
} else if (update_refs && disk_bytenr > 0) {
btrfs_init_generic_ref(&ref,
BTRFS_DROP_DELAYED_REF,
- disk_bytenr, num_bytes, 0);
+ disk_bytenr, num_bytes, 0,
+ root->root_key.objectid);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key.objectid,
@@ -536,7 +539,8 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
+ btrfs_setup_item_for_insert(trans, root, path, &key,
+ args->extent_item_size);
args->extent_inserted = true;
}
@@ -593,7 +597,6 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct btrfs_path *path;
@@ -664,7 +667,7 @@ again:
ino, bytenr, orig_offset,
&other_start, &other_end)) {
new_key.offset = end;
- btrfs_set_item_key_safe(fs_info, path, &new_key);
+ btrfs_set_item_key_safe(trans, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi,
@@ -679,7 +682,7 @@ again:
trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
end - other_start);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
}
@@ -698,7 +701,7 @@ again:
trans->transid);
path->slots[0]++;
new_key.offset = start;
- btrfs_set_item_key_safe(fs_info, path, &new_key);
+ btrfs_set_item_key_safe(trans, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -708,7 +711,7 @@ again:
other_end - start);
btrfs_set_file_extent_offset(leaf, fi,
start - orig_offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
}
@@ -742,10 +745,10 @@ again:
btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - split);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
- num_bytes, 0);
+ num_bytes, 0, root->root_key.objectid);
btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
@@ -771,7 +774,7 @@ again:
other_start = end;
other_end = 0;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- num_bytes, 0);
+ num_bytes, 0, root->root_key.objectid);
btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
0, false);
if (extent_mergeable(leaf, path->slots[0] + 1,
@@ -814,7 +817,7 @@ again:
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
} else {
fi = btrfs_item_ptr(leaf, del_slot - 1,
struct btrfs_file_extent_item);
@@ -823,7 +826,7 @@ again:
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - key.offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (ret < 0) {
@@ -1106,6 +1109,26 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}
+static void update_time_for_write(struct inode *inode)
+{
+ struct timespec64 now, ts;
+
+ if (IS_NOCMTIME(inode))
+ return;
+
+ now = current_time(inode);
+ ts = inode_get_mtime(inode);
+ if (!timespec64_equal(&ts, &now))
+ inode_set_mtime_to_ts(inode, now);
+
+ ts = inode_get_ctime(inode);
+ if (!timespec64_equal(&ts, &now))
+ inode_set_ctime_to_ts(inode, now);
+
+ if (IS_I_VERSION(inode))
+ inode_inc_iversion(inode);
+}
+
static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
size_t count)
{
@@ -1137,10 +1160,7 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
* need to start yet another transaction to update the inode as we will
* update the inode when we finish writing whatever data we write.
*/
- if (!IS_NOCMTIME(inode)) {
- inode->i_mtime = inode_set_ctime_current(inode);
- inode_inc_iversion(inode);
- }
+ update_time_for_write(inode);
start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
@@ -1451,8 +1471,13 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_NOWAIT)
ilock_flags |= BTRFS_ILOCK_TRY;
- /* If the write DIO is within EOF, use a shared lock */
- if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
+ /*
+ * If the write DIO is within EOF, use a shared lock and also only if
+ * security bits will likely not be dropped by file_remove_privs() called
+ * from btrfs_write_check(). Either will need to be rechecked after the
+ * lock was acquired.
+ */
+ if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
ilock_flags |= BTRFS_ILOCK_SHARED;
relock:
@@ -1460,6 +1485,13 @@ relock:
if (err < 0)
return err;
+ /* Shared lock cannot be used with security bits set. */
+ if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ ilock_flags &= ~BTRFS_ILOCK_SHARED;
+ goto relock;
+ }
+
err = generic_write_checks(iocb, from);
if (err <= 0) {
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
@@ -1718,7 +1750,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
struct btrfs_inode *inode = BTRFS_I(ctx->inode);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- if (btrfs_inode_in_log(inode, fs_info->generation) &&
+ if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
list_empty(&ctx->ordered_extents))
return true;
@@ -1729,7 +1761,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
* and for a fast fsync we don't wait for that, we only wait for the
* writeback to complete.
*/
- if (inode->last_trans <= fs_info->last_trans_committed &&
+ if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
(test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
list_empty(&ctx->ordered_extents)))
return true;
@@ -1858,7 +1890,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
- smp_mb();
if (skip_inode_logging(&ctx)) {
/*
* We've had everything committed since the last time we were
@@ -2076,7 +2107,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
@@ -2084,7 +2115,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
u64 num_bytes;
key.offset = offset;
- btrfs_set_item_key_safe(fs_info, path, &key);
+ btrfs_set_item_key_safe(trans, path, &key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
@@ -2093,7 +2124,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
goto out;
}
btrfs_release_path(path);
@@ -2245,7 +2276,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
if (extent_info->is_new_extent)
btrfs_set_file_extent_generation(leaf, extent, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
@@ -2275,7 +2306,8 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
extent_info->disk_offset,
- extent_info->disk_len, 0);
+ extent_info->disk_len, 0,
+ root->root_key.objectid);
ref_offset = extent_info->file_offset - extent_info->data_offset;
btrfs_init_data_ref(&ref, root->root_key.objectid,
btrfs_ino(inode), ref_offset, 0, false);
@@ -2445,9 +2477,10 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
inode_inc_iversion(&inode->vfs_inode);
if (!extent_info || extent_info->update_times)
- inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode);
+ inode_set_mtime_to_ts(&inode->vfs_inode,
+ inode_set_ctime_current(&inode->vfs_inode));
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
break;
@@ -2686,8 +2719,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
ASSERT(trans != NULL);
inode_inc_iversion(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
updated_inode = true;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -2706,14 +2739,14 @@ out_only_mutex:
struct timespec64 now = inode_set_ctime_current(inode);
inode_inc_iversion(inode);
- inode->i_mtime = now;
+ inode_set_mtime_to_ts(inode, now);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
} else {
int ret2;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
if (!ret)
ret = ret2;
@@ -2780,7 +2813,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
inode_set_ctime_current(inode);
i_size_write(inode, end);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
return ret ? ret : ret2;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 27fad70451aa..6f93c9a2c3e3 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -57,6 +57,11 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes, bool update_stats);
+static void btrfs_crc32c_final(u32 crc, u8 *result)
+{
+ put_unaligned_le32(~crc, result);
+}
+
static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
struct btrfs_free_space *info;
@@ -195,7 +200,7 @@ static int __create_free_space_inode(struct btrfs_root *root,
btrfs_set_inode_nlink(leaf, inode_item, 1);
btrfs_set_inode_transid(leaf, inode_item, trans->transid);
btrfs_set_inode_block_group(leaf, inode_item, offset);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -213,7 +218,7 @@ static int __create_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header);
memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
btrfs_set_free_space_key(leaf, header, &disk_key);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
return 0;
@@ -354,7 +359,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
if (ret)
goto fail;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
fail:
if (locked)
@@ -540,7 +545,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
- crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+ crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
btrfs_crc32c_final(crc, (u8 *)&crc);
io_ctl_unmap_page(io_ctl);
tmp = page_address(io_ctl->pages[0]);
@@ -562,7 +567,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
val = *tmp;
io_ctl_map_page(io_ctl, 0);
- crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
+ crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
btrfs_crc32c_final(crc, (u8 *)&crc);
if (val != crc) {
btrfs_err_rl(io_ctl->fs_info,
@@ -1185,7 +1190,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
btrfs_set_free_space_entries(leaf, header, entries);
btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
btrfs_set_free_space_generation(leaf, header, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
return 0;
@@ -1321,7 +1326,7 @@ out:
"failed to write free space cache for block group %llu error %d",
block_group->start, ret);
}
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ btrfs_update_inode(trans, BTRFS_I(inode));
if (block_group) {
/* the dirty list is protected by the dirty_bgs_lock */
@@ -1362,7 +1367,6 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
/*
* Write out cached info to an inode.
*
- * @root: root the inode belongs to
* @inode: freespace inode we are writing out
* @ctl: free space cache we are going to write out
* @block_group: block_group for this cache if it belongs to a block_group
@@ -1373,7 +1377,7 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
* on mount. This will return 0 if it was successful in writing the cache out,
* or an errno if it was not.
*/
-static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+static int __btrfs_write_out_cache(struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
@@ -1506,7 +1510,7 @@ out:
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ btrfs_update_inode(trans, BTRFS_I(inode));
if (must_iput)
iput(inode);
return ret;
@@ -1532,8 +1536,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return 0;
- ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
- block_group, &block_group->io_ctl, trans);
+ ret = __btrfs_write_out_cache(inode, ctl, block_group,
+ &block_group->io_ctl, trans);
if (ret) {
btrfs_debug(fs_info,
"failed to write free space cache for block group %llu error %d",
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index c0e734082dcc..7b598b070700 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -89,7 +89,7 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_free_space_info);
btrfs_set_free_space_extent_count(leaf, info, 0);
btrfs_set_free_space_flags(leaf, info, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
out:
@@ -287,7 +287,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
if (extent_count != expected_extent_count) {
@@ -324,7 +324,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
write_extent_buffer(leaf, bitmap_cursor, ptr,
data_size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
i += extent_size;
@@ -430,7 +430,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
nrbits = block_group->length >> block_group->fs_info->sectorsize_bits;
@@ -495,7 +495,7 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
extent_count += new_extents;
btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
@@ -533,7 +533,8 @@ int free_space_test_bit(struct btrfs_block_group *block_group,
return !!extent_buffer_test_bit(leaf, ptr, i);
}
-static void free_space_set_bits(struct btrfs_block_group *block_group,
+static void free_space_set_bits(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 *start, u64 *size,
int bit)
{
@@ -563,7 +564,7 @@ static void free_space_set_bits(struct btrfs_block_group *block_group,
extent_buffer_bitmap_set(leaf, ptr, first, last - first);
else
extent_buffer_bitmap_clear(leaf, ptr, first, last - first);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
*size -= end - *start;
*start = end;
@@ -656,7 +657,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
cur_start = start;
cur_size = size;
while (1) {
- free_space_set_bits(block_group, path, &cur_start, &cur_size,
+ free_space_set_bits(trans, block_group, path, &cur_start, &cur_size,
!remove);
if (cur_size == 0)
break;
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index a523d64d5491..318df6f9d9cb 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -139,6 +139,12 @@ enum {
*/
BTRFS_FS_FEATURE_CHANGED,
+ /*
+ * Indicate that we have found a tree block which is only aligned to
+ * sectorsize, but not to nodesize. This should be rare nowadays.
+ */
+ BTRFS_FS_UNALIGNED_TREE_BLOCK,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
@@ -171,19 +177,17 @@ enum {
BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
- BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19),
- BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20),
- BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21),
- BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22),
- BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23),
- BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24),
- BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25),
- BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26),
- BTRFS_MOUNT_REF_VERIFY = (1UL << 27),
- BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28),
- BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29),
- BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30),
- BTRFS_MOUNT_NODISCARD = (1UL << 31),
+ BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 19),
+ BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 20),
+ BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 21),
+ BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 22),
+ BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 23),
+ BTRFS_MOUNT_NOLOGREPLAY = (1UL << 24),
+ BTRFS_MOUNT_REF_VERIFY = (1UL << 25),
+ BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 26),
+ BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27),
+ BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28),
+ BTRFS_MOUNT_NODISCARD = (1UL << 29),
};
/*
@@ -216,7 +220,8 @@ enum {
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
- BTRFS_FEATURE_INCOMPAT_ZONED)
+ BTRFS_FEATURE_INCOMPAT_ZONED | \
+ BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA)
#ifdef CONFIG_BTRFS_DEBUG
/*
@@ -225,6 +230,7 @@ enum {
*/
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \
+ BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \
BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
#else
@@ -369,6 +375,7 @@ struct btrfs_fs_info {
struct btrfs_root *uuid_root;
struct btrfs_root *data_reloc_root;
struct btrfs_root *block_group_root;
+ struct btrfs_root *stripe_root;
/* The log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -409,7 +416,17 @@ struct btrfs_fs_info {
struct btrfs_block_rsv empty_block_rsv;
+ /*
+ * Updated while holding the lock 'trans_lock'. Due to the life cycle of
+ * a transaction, it can be directly read while holding a transaction
+ * handle, everywhere else must be read with btrfs_get_fs_generation().
+ * Should always be updated using btrfs_set_fs_generation().
+ */
u64 generation;
+ /*
+ * Always use btrfs_get_last_trans_committed() and
+ * btrfs_set_last_trans_committed() to read and update this field.
+ */
u64 last_trans_committed;
/*
* Generation of the last transaction used for block group relocation
@@ -645,9 +662,6 @@ struct btrfs_fs_info {
struct btrfs_discard_ctl discard_ctl;
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- u32 check_integrity_print_mask;
-#endif
/* Is qgroup tracking in a consistent state? */
u64 qgroup_flags;
@@ -683,6 +697,7 @@ struct btrfs_fs_info {
/* Protected by qgroup_rescan_lock */
bool qgroup_rescan_running;
u8 qgroup_drop_subtree_thres;
+ u64 qgroup_enable_gen;
/*
* If this is not 0, then it indicates a serious filesystem error has
@@ -812,6 +827,26 @@ struct btrfs_fs_info {
#endif
};
+static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->generation);
+}
+
+static inline void btrfs_set_fs_generation(struct btrfs_fs_info *fs_info, u64 gen)
+{
+ WRITE_ONCE(fs_info->generation, gen);
+}
+
+static inline u64 btrfs_get_last_trans_committed(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->last_trans_committed);
+}
+
+static inline void btrfs_set_last_trans_committed(struct btrfs_fs_info *fs_info, u64 gen)
+{
+ WRITE_ONCE(fs_info->last_trans_committed, gen);
+}
+
static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
u64 gen)
{
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 4c322b720a80..7d734830e514 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -167,7 +167,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
memmove_extent_buffer(leaf, ptr, ptr + del_len,
item_size - (ptr + del_len - item_start));
- btrfs_truncate_item(path, item_size - del_len, 1);
+ btrfs_truncate_item(trans, path, item_size - del_len, 1);
out:
btrfs_free_path(path);
@@ -229,7 +229,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
- btrfs_truncate_item(path, item_size - sub_item_len, 1);
+ btrfs_truncate_item(trans, path, item_size - sub_item_len, 1);
out:
btrfs_free_path(path);
@@ -247,7 +247,7 @@ out:
}
/*
- * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ * Insert an extended inode ref into a tree.
*
* The caller must have checked against BTRFS_LINK_MAX already.
*/
@@ -282,7 +282,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
name))
goto out;
- btrfs_extend_item(path, ins_len);
+ btrfs_extend_item(trans, path, ins_len);
ret = 0;
}
if (ret < 0)
@@ -299,7 +299,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)&extref->name;
write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
out:
btrfs_free_path(path);
@@ -338,7 +338,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
goto out;
old_size = btrfs_item_size(path->nodes[0], path->slots[0]);
- btrfs_extend_item(path, ins_len);
+ btrfs_extend_item(trans, path, ins_len);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
@@ -364,7 +364,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ref + 1);
}
write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
out:
btrfs_free_path(path);
@@ -591,7 +591,7 @@ search_again:
num_dec = (orig_num_bytes - extent_num_bytes);
if (extent_start != 0)
control->sub_bytes += num_dec;
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
} else {
extent_num_bytes =
btrfs_file_extent_disk_num_bytes(leaf, fi);
@@ -617,7 +617,7 @@ search_again:
btrfs_set_file_extent_ram_bytes(leaf, fi, size);
size = btrfs_file_extent_calc_inline_size(size);
- btrfs_truncate_item(path, size, 1);
+ btrfs_truncate_item(trans, path, size, 1);
} else if (!del_item) {
/*
* We have to bail so the last_size is set to
@@ -676,7 +676,8 @@ delete:
bytes_deleted += extent_num_bytes;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
- extent_start, extent_num_bytes, 0);
+ extent_start, extent_num_bytes, 0,
+ root->root_key.objectid);
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
control->ino, extent_offset,
root->root_key.objectid, false);
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index ede43b6c6559..4337bb26f419 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -4,6 +4,7 @@
#define BTRFS_INODE_ITEM_H
#include <linux/types.h>
+#include <linux/crc32c.h>
struct btrfs_trans_handle;
struct btrfs_root;
@@ -12,6 +13,7 @@ struct btrfs_key;
struct btrfs_inode_extref;
struct btrfs_inode;
struct extent_buffer;
+struct fscrypt_str;
/*
* Return this if we need to call truncate_block for the last bit of the
@@ -76,6 +78,12 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags,
*ro_flags = (u32)(inode_item_flags >> 32);
}
+/* Figure the key offset of an extended inode ref. */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len)
+{
+ return (u64)crc32c(parent_objectid, name, len);
+}
+
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_truncate_control *control);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f09fbdc43f0f..5e3fccddde0c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@
#include "super.h"
#include "orphan.h"
#include "backref.h"
+#include "raid-stripe-tree.h"
struct btrfs_iget_args {
u64 ino;
@@ -348,7 +349,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
}
/*
- * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
+ * Lock inode i_rwsem based on arguments passed.
*
* ilock_flags can have the following bit set:
*
@@ -382,7 +383,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
}
/*
- * btrfs_inode_unlock - unock inode i_rwsem
+ * Unock inode i_rwsem.
*
* ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
* to decide whether the lock acquired is shared or exclusive.
@@ -573,7 +574,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
kunmap_local(kaddr);
put_page(page);
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/*
@@ -670,7 +671,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
}
btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -1085,9 +1086,6 @@ static void submit_uncompressed_range(struct btrfs_inode *inode,
btrfs_mark_ordered_io_finished(inode, locked_page,
page_start, PAGE_SIZE,
!ret);
- btrfs_page_clear_uptodate(inode->root->fs_info,
- locked_page, page_start,
- PAGE_SIZE);
mapping_set_error(locked_page->mapping, ret);
unlock_page(locked_page);
}
@@ -1568,8 +1566,11 @@ out_unlock:
* Phase two of compressed writeback. This is the ordered portion of the code,
* which only gets called in the order the work was queued. We walk all the
* async extents created by compress_file_range and send them down to the disk.
+ *
+ * If called with @do_free == true then it'll try to finish the work and free
+ * the work struct eventually.
*/
-static noinline void submit_compressed_extents(struct btrfs_work *work)
+static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
{
struct async_chunk *async_chunk = container_of(work, struct async_chunk,
work);
@@ -1578,6 +1579,21 @@ static noinline void submit_compressed_extents(struct btrfs_work *work)
unsigned long nr_pages;
u64 alloc_hint = 0;
+ if (do_free) {
+ struct async_chunk *async_chunk;
+ struct async_cow *async_cow;
+
+ async_chunk = container_of(work, struct async_chunk, work);
+ btrfs_add_delayed_iput(async_chunk->inode);
+ if (async_chunk->blkcg_css)
+ css_put(async_chunk->blkcg_css);
+
+ async_cow = async_chunk->async_cow;
+ if (atomic_dec_and_test(&async_cow->num_chunks))
+ kvfree(async_cow);
+ return;
+ }
+
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
@@ -1594,21 +1610,6 @@ static noinline void submit_compressed_extents(struct btrfs_work *work)
cond_wake_up_nomb(&fs_info->async_submit_wait);
}
-static noinline void async_cow_free(struct btrfs_work *work)
-{
- struct async_chunk *async_chunk;
- struct async_cow *async_cow;
-
- async_chunk = container_of(work, struct async_chunk, work);
- btrfs_add_delayed_iput(async_chunk->inode);
- if (async_chunk->blkcg_css)
- css_put(async_chunk->blkcg_css);
-
- async_cow = async_chunk->async_cow;
- if (atomic_dec_and_test(&async_cow->num_chunks))
- kvfree(async_cow);
-}
-
static bool run_delalloc_compressed(struct btrfs_inode *inode,
struct page *locked_page, u64 start,
u64 end, struct writeback_control *wbc)
@@ -1686,7 +1687,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
}
btrfs_init_work(&async_chunk[i].work, compress_file_range,
- submit_compressed_extents, async_cow_free);
+ submit_compressed_extents);
nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
atomic_add(nr_pages, &fs_info->async_delalloc_pages);
@@ -2238,8 +2239,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
if (inode->defrag_bytes &&
- test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
- 0, NULL))
+ test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
return false;
return true;
}
@@ -2791,7 +2791,6 @@ out_page:
mapping_set_error(page->mapping, ret);
btrfs_mark_ordered_io_finished(inode, page, page_start,
PAGE_SIZE, !ret);
- btrfs_page_clear_uptodate(fs_info, page, page_start, PAGE_SIZE);
clear_page_dirty_for_io(page);
}
btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
@@ -2851,7 +2850,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
ihold(inode);
btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
get_page(page);
- btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
fixup->page = page;
fixup->inode = BTRFS_I(inode);
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
@@ -2916,7 +2915,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(struct btrfs_file_extent_item));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
/*
@@ -3074,7 +3073,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
trans->block_rsv = &inode->block_rsv;
- ret = btrfs_update_inode_fallback(trans, root, inode);
+ ret = btrfs_update_inode_fallback(trans, inode);
if (ret) /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3095,6 +3094,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &inode->block_rsv;
+ ret = btrfs_insert_raid_extent(trans, ordered_extent);
+ if (ret)
+ goto out;
+
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
@@ -3140,7 +3143,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
&cached_state);
btrfs_inode_safe_disk_i_size_write(inode, 0);
- ret = btrfs_update_inode_fallback(trans, root, inode);
+ ret = btrfs_update_inode_fallback(trans, inode);
if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3228,7 +3231,8 @@ out:
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
{
if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
- !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+ !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
+ list_empty(&ordered->bioc_list))
btrfs_finish_ordered_zoned(ordered);
return btrfs_finish_one_ordered(ordered);
}
@@ -3286,7 +3290,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
if (btrfs_is_data_reloc_root(inode->root) &&
test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
- 1, NULL)) {
+ NULL)) {
/* Skip the range without csum for data reloc inode */
clear_extent_bits(&inode->io_tree, file_offset, end,
EXTENT_NODATASUM);
@@ -3310,7 +3314,7 @@ zeroit:
}
/*
- * btrfs_add_delayed_iput - perform a delayed iput on @inode
+ * Perform a delayed iput on @inode.
*
* @inode: The inode we want to perform iput on
*
@@ -3758,19 +3762,17 @@ static int btrfs_read_locked_inode(struct inode *inode,
btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
round_up(i_size_read(inode), fs_info->sectorsize));
- inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
- inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
+ inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
+ btrfs_timespec_nsec(leaf, &inode_item->atime));
- inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
- inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
+ inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
+ btrfs_timespec_nsec(leaf, &inode_item->mtime));
inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
btrfs_timespec_nsec(leaf, &inode_item->ctime));
- BTRFS_I(inode)->i_otime.tv_sec =
- btrfs_timespec_sec(leaf, &inode_item->otime);
- BTRFS_I(inode)->i_otime.tv_nsec =
- btrfs_timespec_nsec(leaf, &inode_item->otime);
+ BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
+ BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
@@ -3796,7 +3798,7 @@ cache_index:
* This is required for both inode re-read from disk and delayed inode
* in delayed_nodes_tree.
*/
- if (BTRFS_I(inode)->last_trans == fs_info->generation)
+ if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
@@ -3926,24 +3928,22 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
btrfs_set_token_timespec_sec(&token, &item->atime,
- inode->i_atime.tv_sec);
+ inode_get_atime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode->i_atime.tv_nsec);
+ inode_get_atime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode->i_mtime.tv_sec);
+ inode_get_mtime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode->i_mtime.tv_nsec);
+ inode_get_mtime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime(inode).tv_nsec);
+ inode_get_ctime_nsec(inode));
- btrfs_set_token_timespec_sec(&token, &item->otime,
- BTRFS_I(inode)->i_otime.tv_sec);
- btrfs_set_token_timespec_nsec(&token, &item->otime,
- BTRFS_I(inode)->i_otime.tv_nsec);
+ btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);
btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
btrfs_set_token_inode_generation(&token, item,
@@ -3961,8 +3961,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* copy everything in the in-memory inode into the btree.
*/
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode)
{
struct btrfs_inode_item *inode_item;
struct btrfs_path *path;
@@ -3973,7 +3972,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
+ ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -3985,7 +3984,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item);
fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_set_inode_last_trans(trans, inode);
ret = 0;
failed:
@@ -3996,10 +3995,10 @@ failed:
/*
* copy everything in the in-memory inode into the btree.
*/
-noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode)
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -4015,23 +4014,23 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
btrfs_update_root_times(trans, root);
- ret = btrfs_delayed_update_inode(trans, root, inode);
+ ret = btrfs_delayed_update_inode(trans, inode);
if (!ret)
btrfs_set_inode_last_trans(trans, inode);
return ret;
}
- return btrfs_update_inode_item(trans, root, inode);
+ return btrfs_update_inode_item(trans, inode);
}
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode)
+ struct btrfs_inode *inode)
{
int ret;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret == -ENOSPC)
- return btrfs_update_inode_item(trans, root, inode);
+ return btrfs_update_inode_item(trans, inode);
return ret;
}
@@ -4136,9 +4135,8 @@ err:
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode_set_ctime_current(&inode->vfs_inode);
- dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
- ret = btrfs_update_inode(trans, root, dir);
+ inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
+ ret = btrfs_update_inode(trans, dir);
out:
return ret;
}
@@ -4152,7 +4150,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
if (!ret) {
drop_nlink(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, inode->root, inode);
+ ret = btrfs_update_inode(trans, inode);
}
return ret;
}
@@ -4310,8 +4308,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
inode_inc_iversion(&dir->vfs_inode);
- dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
- ret = btrfs_update_inode_fallback(trans, root, dir);
+ inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
+ ret = btrfs_update_inode_fallback(trans, dir);
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -4645,7 +4643,8 @@ out_notrans:
}
/*
- * btrfs_truncate_block - read, zero a chunk and write a block
+ * Read, zero a chunk and write a block.
+ *
* @inode - inode that we're zeroing
* @from - the offset to start zeroing
* @len - the length to zero, 0 to zero the entire range respective to the
@@ -4795,9 +4794,9 @@ out:
return ret;
}
-static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
- u64 offset, u64 len)
+static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_drop_extents_args drop_args = { 0 };
@@ -4837,7 +4836,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
btrfs_abort_transaction(trans, ret);
} else {
btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, inode);
}
btrfs_end_transaction(trans);
return ret;
@@ -4893,8 +4892,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
struct extent_map *hole_em;
- err = maybe_insert_hole(root, inode, cur_offset,
- hole_size);
+ err = maybe_insert_hole(inode, cur_offset, hole_size);
if (err)
break;
@@ -4920,7 +4918,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
- hole_em->generation = fs_info->generation;
+ hole_em->generation = btrfs_get_fs_generation(fs_info);
err = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
@@ -4960,7 +4958,8 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
if (newsize != oldsize) {
inode_inc_iversion(inode);
if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
}
}
@@ -4988,7 +4987,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
i_size_write(inode, newsize);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
pagecache_isize_extended(inode, oldsize, newsize);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
@@ -5586,6 +5585,7 @@ static struct inode *new_simple_dir(struct inode *dir,
struct btrfs_key *key,
struct btrfs_root *root)
{
+ struct timespec64 ts;
struct inode *inode = new_inode(dir->i_sb);
if (!inode)
@@ -5604,9 +5604,13 @@ static struct inode *new_simple_dir(struct inode *dir,
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- inode->i_mtime = inode_set_ctime_current(inode);
- inode->i_atime = dir->i_atime;
- BTRFS_I(inode)->i_otime = inode->i_mtime;
+
+ ts = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, ts);
+ inode_set_atime_to_ts(inode, inode_get_atime(dir));
+ BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
+ BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
+
inode->i_uid = dir->i_uid;
inode->i_gid = dir->i_gid;
@@ -5769,20 +5773,24 @@ out:
static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
{
- if (dir->index_cnt == (u64)-1) {
- int ret;
+ int ret = 0;
+ btrfs_inode_lock(dir, 0);
+ if (dir->index_cnt == (u64)-1) {
ret = btrfs_inode_delayed_dir_index_count(dir);
if (ret) {
ret = btrfs_set_inode_index_count(dir);
if (ret)
- return ret;
+ goto out;
}
}
- *index = dir->index_cnt;
+ /* index_cnt is the index number of next new entry, so decrement it. */
+ *index = dir->index_cnt - 1;
+out:
+ btrfs_inode_unlock(dir, 0);
- return 0;
+ return ret;
}
/*
@@ -5817,6 +5825,19 @@ static int btrfs_opendir(struct inode *inode, struct file *file)
return 0;
}
+static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct btrfs_file_private *private = file->private_data;
+ int ret;
+
+ ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
+ &private->last_index);
+ if (ret)
+ return ret;
+
+ return generic_file_llseek(file, offset, whence);
+}
+
struct dir_entry {
u64 ino;
u64 offset;
@@ -5987,15 +6008,15 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
+ ret = btrfs_update_inode(trans, inode);
+ if (ret == -ENOSPC || ret == -EDQUOT) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
}
btrfs_end_transaction(trans);
if (inode->delayed_node)
@@ -6011,7 +6032,7 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
static int btrfs_update_time(struct inode *inode, int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- bool dirty = flags & ~S_VERSION;
+ bool dirty;
if (btrfs_root_readonly(root))
return -EROFS;
@@ -6147,6 +6168,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_new_inode_args *args)
{
+ struct timespec64 ts;
struct inode *dir = args->dir;
struct inode *inode = args->inode;
const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
@@ -6264,9 +6286,9 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
goto discard;
}
- inode->i_mtime = inode_set_ctime_current(inode);
- inode->i_atime = inode->i_mtime;
- BTRFS_I(inode)->i_otime = inode->i_mtime;
+ ts = simple_inode_init_ts(inode);
+ BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
+ BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
/*
* We're going to fill the inode item now, so at this point the inode
@@ -6297,7 +6319,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
}
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
/*
* We don't need the path anymore, plus inheriting properties, adding
* ACLs, security xattrs, orphan item or adding the link, will result in
@@ -6431,10 +6453,10 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
* values (the ones it had when the fsync was done).
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
- parent_inode->vfs_inode.i_mtime =
- inode_set_ctime_current(&parent_inode->vfs_inode);
+ inode_set_mtime_to_ts(&parent_inode->vfs_inode,
+ inode_set_ctime_current(&parent_inode->vfs_inode));
- ret = btrfs_update_inode(trans, root, parent_inode);
+ ret = btrfs_update_inode(trans, parent_inode);
if (ret)
btrfs_abort_transaction(trans, ret);
return ret;
@@ -6585,7 +6607,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
} else {
struct dentry *parent = dentry->d_parent;
- err = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ err = btrfs_update_inode(trans, BTRFS_I(inode));
if (err)
goto fail;
if (inode->i_nlink == 1) {
@@ -7090,8 +7112,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
range_end = round_up(offset + nocow_args.num_bytes,
root->fs_info->sectorsize) - 1;
- ret = test_range_bit(io_tree, offset, range_end,
- EXTENT_DELALLOC, 0, NULL);
+ ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
if (ret) {
ret = -EAGAIN;
goto out;
@@ -7992,11 +8013,11 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, &cached_state);
- spin_lock_irq(&inode->ordered_tree.lock);
+ spin_lock_irq(&inode->ordered_tree_lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
ordered->truncated_len = min(ordered->truncated_len,
cur - ordered->file_offset);
- spin_unlock_irq(&inode->ordered_tree.lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
/*
* If the ordered extent has finished, we're safe to delete all
@@ -8326,7 +8347,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
if (ret != -ENOSPC && ret != -EAGAIN)
break;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
break;
@@ -8379,7 +8400,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
int ret2;
trans->block_rsv = &fs_info->trans_block_rsv;
- ret2 = btrfs_update_inode(trans, root, inode);
+ ret2 = btrfs_update_inode(trans, inode);
if (ret2 && !ret)
ret = ret2;
@@ -8468,8 +8489,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->delayed_node = NULL;
- ei->i_otime.tv_sec = 0;
- ei->i_otime.tv_nsec = 0;
+ ei->i_otime_sec = 0;
+ ei->i_otime_nsec = 0;
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
@@ -8478,7 +8499,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
extent_io_tree_init(fs_info, &ei->file_extent_tree,
IO_TREE_INODE_FILE_EXTENT);
mutex_init(&ei->log_mutex);
- btrfs_ordered_inode_tree_init(&ei->ordered_tree);
+ spin_lock_init(&ei->ordered_tree_lock);
+ ei->ordered_tree = RB_ROOT;
+ ei->ordered_tree_last = NULL;
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
@@ -8621,8 +8644,8 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
stat->result_mask |= STATX_BTIME;
- stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
- stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
+ stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
+ stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
if (bi_flags & BTRFS_INODE_APPEND)
stat->attributes |= STATX_ATTR_APPEND;
if (bi_flags & BTRFS_INODE_COMPRESS)
@@ -8810,7 +8833,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(old_dentry->d_inode),
old_name, &old_rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -8825,7 +8848,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_dentry->d_inode),
new_name, &new_rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9070,7 +9093,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
BTRFS_I(d_inode(old_dentry)),
&old_fname.disk_name, &rename_ctx);
if (!ret)
- ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9195,7 +9218,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);
return work;
}
@@ -9433,7 +9456,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
ptr = btrfs_file_extent_inline_start(ei);
write_extent_buffer(leaf, symname, ptr, name_len);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_free_path(path);
d_instantiate_new(dentry, inode);
@@ -9626,7 +9649,7 @@ next:
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -10868,7 +10891,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
};
static const struct file_operations btrfs_dir_file_operations = {
- .llseek = generic_file_llseek,
+ .llseek = btrfs_dir_llseek,
.read = generic_read_dir,
.iterate_shared = btrfs_real_readdir,
.open = btrfs_opendir,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a18ee7b5a166..752acff2c734 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -385,7 +385,7 @@ update_flags:
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
out_end_trans:
btrfs_end_transaction(trans);
@@ -652,18 +652,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
/* Tree log can't currently deal with an inode which is a new root. */
btrfs_set_log_full_commit(trans);
- ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
+ ret = btrfs_qgroup_inherit(trans, 0, objectid, root->root_key.objectid, inherit);
if (ret)
goto out;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
- BTRFS_NESTING_NORMAL);
+ 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto out;
}
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
inode_item = &root_item->inode;
btrfs_set_stack_inode_generation(inode_item, 1);
@@ -1958,6 +1958,13 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
goto out_put;
}
+ /*
+ * We don't need the path anymore, so release it and
+ * avoid deadlocks and lockdep warnings in case
+ * btrfs_iget() needs to lookup the inode from its root
+ * btree and lock the same leaf.
+ */
+ btrfs_release_path(path);
temp_inode = btrfs_iget(sb, key2.objectid, root);
if (IS_ERR(temp_inode)) {
ret = PTR_ERR(temp_inode);
@@ -1978,7 +1985,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
goto out_put;
}
- btrfs_release_path(path);
key.objectid = key.offset;
key.offset = (u64)-1;
dirid = key.objectid;
@@ -2629,6 +2635,12 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
return -EINVAL;
}
+ if (fs_info->fs_devices->temp_fsid) {
+ btrfs_err(fs_info,
+ "device add not supported on cloned temp-fsid mount");
+ return -EINVAL;
+ }
+
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
@@ -2670,8 +2682,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
- struct block_device *bdev = NULL;
- void *holder;
+ struct bdev_handle *bdev_handle = NULL;
int ret;
bool cancel = false;
@@ -2708,7 +2719,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto err_drop;
/* Exclusive operation is now claimed */
- ret = btrfs_rm_device(fs_info, &args, &bdev, &holder);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
btrfs_exclop_finish(fs_info);
@@ -2722,8 +2733,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
}
err_drop:
mnt_drop_write_file(file);
- if (bdev)
- blkdev_put(bdev, holder);
+ if (bdev_handle)
+ bdev_release(bdev_handle);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2736,8 +2747,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
- struct block_device *bdev = NULL;
- void *holder;
+ struct bdev_handle *bdev_handle = NULL;
int ret;
bool cancel = false;
@@ -2764,15 +2774,15 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, &args, &bdev, &holder);
+ ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
mnt_drop_write_file(file);
- if (bdev)
- blkdev_put(bdev, holder);
+ if (bdev_handle)
+ bdev_release(bdev_handle);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2816,7 +2826,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
}
if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
- fi_args->generation = fs_info->generation;
+ fi_args->generation = btrfs_get_fs_generation(fs_info);
fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
}
@@ -2941,7 +2951,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
@@ -2972,7 +2982,7 @@ static void get_block_group_info(struct list_head *groups_list,
static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
void __user *arg)
{
- struct btrfs_ioctl_space_args space_args;
+ struct btrfs_ioctl_space_args space_args = { 0 };
struct btrfs_ioctl_space_info space;
struct btrfs_ioctl_space_info *dest;
struct btrfs_ioctl_space_info *dest_orig;
@@ -3125,7 +3135,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
return PTR_ERR(trans);
/* No running transaction, don't bother */
- transid = root->fs_info->last_trans_committed;
+ transid = btrfs_get_last_trans_committed(root->fs_info);
goto out;
}
transid = trans->transid;
@@ -3691,7 +3701,8 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
switch (sa->cmd) {
case BTRFS_QUOTA_CTL_ENABLE:
- ret = btrfs_quota_enable(fs_info);
+ case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA:
+ ret = btrfs_quota_enable(fs_info, sa);
break;
case BTRFS_QUOTA_CTL_DISABLE:
ret = btrfs_quota_disable(fs_info);
@@ -4332,7 +4343,7 @@ static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat
if (compat) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
- struct btrfs_ioctl_send_args_32 args32;
+ struct btrfs_ioctl_send_args_32 args32 = { 0 };
ret = copy_from_user(&args32, argp, sizeof(args32));
if (ret)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 7979449a58d6..74d8e2003f58 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -8,6 +8,7 @@
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
+#include <trace/events/btrfs.h>
#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
@@ -73,6 +74,7 @@ static struct btrfs_lockdep_keyset {
{ .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
+ { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
{ .id = 0, DEFINE_NAME("tree") },
};
@@ -102,6 +104,15 @@ void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buff
#endif
+#ifdef CONFIG_BTRFS_DEBUG
+static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner)
+{
+ eb->lock_owner = owner;
+}
+#else
+static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
+#endif
+
/*
* Extent buffer locking
* =====================
@@ -164,7 +175,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
if (down_write_trylock(&eb->lock)) {
- eb->lock_owner = current->pid;
+ btrfs_set_eb_lock_owner(eb, current->pid);
trace_btrfs_try_tree_write_lock(eb);
return 1;
}
@@ -181,7 +192,8 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
}
/*
- * __btrfs_tree_lock - lock eb for write
+ * Lock eb for write.
+ *
* @eb: the eb to lock
* @nest: the nesting to use for the lock
*
@@ -196,7 +208,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
start_ns = ktime_get_ns();
down_write_nested(&eb->lock, nest);
- eb->lock_owner = current->pid;
+ btrfs_set_eb_lock_owner(eb, current->pid);
trace_btrfs_tree_lock(eb, start_ns);
}
@@ -211,7 +223,7 @@ void btrfs_tree_lock(struct extent_buffer *eb)
void btrfs_tree_unlock(struct extent_buffer *eb)
{
trace_btrfs_tree_unlock(eb);
- eb->lock_owner = 0;
+ btrfs_set_eb_lock_owner(eb, 0);
up_write(&eb->lock);
}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index edb9b4a0dba1..7d6ee1e609bf 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -79,7 +79,7 @@ enum btrfs_lock_nesting {
};
enum btrfs_lockdep_trans_states {
- BTRFS_LOCKDEP_TRANS_COMMIT_START,
+ BTRFS_LOCKDEP_TRANS_COMMIT_PREP,
BTRFS_LOCKDEP_TRANS_UNBLOCKED,
BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED,
BTRFS_LOCKDEP_TRANS_COMPLETED,
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 7695decc7243..b8f9c9e56c8c 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -72,11 +72,11 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
* over the error. Each subsequent error that doesn't have any context
* of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
*/
-const char * __attribute_const__ btrfs_decode_error(int errno)
+const char * __attribute_const__ btrfs_decode_error(int error)
{
char *errstr = "unknown";
- switch (errno) {
+ switch (error) {
case -ENOENT: /* -2 */
errstr = "No such entry";
break;
@@ -110,12 +110,12 @@ const char * __attribute_const__ btrfs_decode_error(int errno)
}
/*
- * __btrfs_handle_fs_error decodes expected errors from the caller and
- * invokes the appropriate error response.
+ * Decodes expected errors from the caller and invokes the appropriate error
+ * response.
*/
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
+ unsigned int line, int error, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
#ifdef CONFIG_PRINTK
@@ -132,11 +132,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
* Special case: if the error is EROFS, and we're already under
* SB_RDONLY, then it is safe here.
*/
- if (errno == -EROFS && sb_rdonly(sb))
+ if (error == -EROFS && sb_rdonly(sb))
return;
#ifdef CONFIG_PRINTK
- errstr = btrfs_decode_error(errno);
+ errstr = btrfs_decode_error(error);
btrfs_state_to_string(fs_info, statestr);
if (fmt) {
struct va_format vaf;
@@ -147,11 +147,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
vaf.va = &args;
pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
- sb->s_id, statestr, function, line, errno, errstr, &vaf);
+ sb->s_id, statestr, function, line, error, errstr, &vaf);
va_end(args);
} else {
pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
- sb->s_id, statestr, function, line, errno, errstr);
+ sb->s_id, statestr, function, line, error, errstr);
}
#endif
@@ -159,7 +159,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
* Today we only save the error info to memory. Long term we'll also
* send it down to the disk.
*/
- WRITE_ONCE(fs_info->fs_error, errno);
+ WRITE_ONCE(fs_info->fs_error, error);
/* Don't go through full error handling during mount. */
if (!(sb->s_flags & SB_BORN))
@@ -283,12 +283,12 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
#endif
/*
- * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an
- * alert, and either panics or BUGs, depending on mount options.
+ * Decode unexpected, fatal errors from the caller, issue an alert, and either
+ * panic or BUGs, depending on mount options.
*/
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
+ unsigned int line, int error, const char *fmt, ...)
{
char *s_id = "<unknown>";
const char *errstr;
@@ -301,13 +301,13 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
va_start(args, fmt);
vaf.va = &args;
- errstr = btrfs_decode_error(errno);
+ errstr = btrfs_decode_error(error);
if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
- s_id, function, line, &vaf, errno, errstr);
+ s_id, function, line, &vaf, error, errstr);
btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
- function, line, &vaf, errno, errstr);
+ function, line, &vaf, error, errstr);
va_end(args);
/* Caller calls BUG() */
}
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 1ae6f8e23e07..4d04c1fa5899 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -184,25 +184,25 @@ do { \
__printf(5, 6)
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...);
+ unsigned int line, int error, const char *fmt, ...);
-const char * __attribute_const__ btrfs_decode_error(int errno);
+const char * __attribute_const__ btrfs_decode_error(int error);
-#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+#define btrfs_handle_fs_error(fs_info, error, fmt, args...) \
__btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args)
+ (error), fmt, ##args)
__printf(5, 6)
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...);
+ unsigned int line, int error, const char *fmt, ...);
/*
* If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
* will panic(). Otherwise we BUG() here.
*/
-#define btrfs_panic(fs_info, errno, fmt, args...) \
+#define btrfs_panic(fs_info, error, fmt, args...) \
do { \
- __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+ __btrfs_panic(fs_info, __func__, __LINE__, error, fmt, ##args); \
BUG(); \
} while (0)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b46ab348e8e5..574e8a55e24a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,25 +124,24 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
* look find the first ordered struct that has this offset, otherwise
* the first one less than this offset
*/
-static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
- u64 file_offset)
+static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode,
+ u64 file_offset)
{
- struct rb_root *root = &tree->tree;
struct rb_node *prev = NULL;
struct rb_node *ret;
struct btrfs_ordered_extent *entry;
- if (tree->last) {
- entry = rb_entry(tree->last, struct btrfs_ordered_extent,
+ if (inode->ordered_tree_last) {
+ entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent,
rb_node);
if (in_range(file_offset, entry->file_offset, entry->num_bytes))
- return tree->last;
+ return inode->ordered_tree_last;
}
- ret = __tree_search(root, file_offset, &prev);
+ ret = __tree_search(&inode->ordered_tree, file_offset, &prev);
if (!ret)
ret = prev;
if (ret)
- tree->last = ret;
+ inode->ordered_tree_last = ret;
return ret;
}
@@ -191,6 +190,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
INIT_LIST_HEAD(&entry->log_list);
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
+ INIT_LIST_HEAD(&entry->bioc_list);
init_completion(&entry->completion);
/*
@@ -208,7 +208,6 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
{
struct btrfs_inode *inode = BTRFS_I(entry->inode);
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
@@ -221,13 +220,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
/* One ref for the tree. */
refcount_inc(&entry->refs);
- spin_lock_irq(&tree->lock);
- node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node);
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = tree_insert(&inode->ordered_tree, entry->file_offset,
+ &entry->rb_node);
if (node)
btrfs_panic(fs_info, -EEXIST,
"inconsistency in ordered tree at offset %llu",
entry->file_offset);
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
spin_lock(&root->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
@@ -287,12 +287,11 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum)
{
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_inode *inode = BTRFS_I(entry->inode);
- tree = &BTRFS_I(entry->inode)->ordered_tree;
- spin_lock_irq(&tree->lock);
+ spin_lock_irq(&inode->ordered_tree_lock);
list_add_tail(&sum->list, &entry->list);
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
}
static void finish_ordered_fn(struct btrfs_work *work)
@@ -310,7 +309,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
struct btrfs_inode *inode = BTRFS_I(ordered->inode);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- lockdep_assert_held(&inode->ordered_tree.lock);
+ lockdep_assert_held(&inode->ordered_tree_lock);
if (page) {
ASSERT(page->mapping);
@@ -364,7 +363,7 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
fs_info->endio_freespace_worker : fs_info->endio_write_workers;
- btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL);
btrfs_queue_work(wq, &ordered->work);
}
@@ -378,9 +377,9 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
- spin_lock_irqsave(&inode->ordered_tree.lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
- spin_unlock_irqrestore(&inode->ordered_tree.lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
if (ret)
btrfs_queue_ordered_fn(ordered);
@@ -404,7 +403,6 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
struct page *page, u64 file_offset,
u64 num_bytes, bool uptodate)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
@@ -414,13 +412,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
file_offset + num_bytes - 1,
uptodate);
- spin_lock_irqsave(&tree->lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
while (cur < file_offset + num_bytes) {
u64 entry_end;
u64 end;
u32 len;
- node = tree_search(tree, cur);
+ node = ordered_tree_search(inode, cur);
/* No ordered extents at all */
if (!node)
break;
@@ -467,13 +465,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
len = end + 1 - cur;
if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) {
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
btrfs_queue_ordered_fn(entry);
- spin_lock_irqsave(&tree->lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
}
cur += len;
}
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
}
/*
@@ -497,19 +495,18 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
bool finished = false;
- spin_lock_irqsave(&tree->lock, flags);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
if (cached && *cached) {
entry = *cached;
goto have_entry;
}
- node = tree_search(tree, file_offset);
+ node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -540,7 +537,7 @@ out:
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
}
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
return finished;
}
@@ -578,7 +575,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry)
{
- struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = btrfs_inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
@@ -609,16 +605,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
fs_info->delalloc_batch);
- tree = &btrfs_inode->ordered_tree;
- spin_lock_irq(&tree->lock);
+ spin_lock_irq(&btrfs_inode->ordered_tree_lock);
node = &entry->rb_node;
- rb_erase(node, &tree->tree);
+ rb_erase(node, &btrfs_inode->ordered_tree);
RB_CLEAR_NODE(node);
- if (tree->last == node)
- tree->last = NULL;
+ if (btrfs_inode->ordered_tree_last == node)
+ btrfs_inode->ordered_tree_last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&btrfs_inode->ordered_tree_lock);
/*
* The current running transaction is waiting on us, we need to let it
@@ -639,7 +634,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
refcount_inc(&trans->use_count);
spin_unlock(&fs_info->trans_lock);
- ASSERT(trans);
+ ASSERT(trans || BTRFS_FS_ERROR(fs_info));
if (trans) {
if (atomic_dec_and_test(&trans->pending_ordered))
wake_up(&trans->pending_wait);
@@ -711,7 +706,7 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
spin_unlock(&root->ordered_extent_lock);
btrfs_init_work(&ordered->flush_work,
- btrfs_run_ordered_extent_work, NULL, NULL);
+ btrfs_run_ordered_extent_work, NULL);
list_add_tail(&ordered->work_list, &works);
btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
@@ -875,14 +870,12 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset)
{
- struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
- tree = &inode->ordered_tree;
- spin_lock_irqsave(&tree->lock, flags);
- node = tree_search(tree, file_offset);
+ spin_lock_irqsave(&inode->ordered_tree_lock, flags);
+ node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -894,7 +887,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
trace_btrfs_ordered_extent_lookup(inode, entry);
}
out:
- spin_unlock_irqrestore(&tree->lock, flags);
+ spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
return entry;
}
@@ -904,15 +897,13 @@ out:
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode, u64 file_offset, u64 len)
{
- struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &inode->ordered_tree;
- spin_lock_irq(&tree->lock);
- node = tree_search(tree, file_offset);
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = ordered_tree_search(inode, file_offset);
if (!node) {
- node = tree_search(tree, file_offset + len);
+ node = ordered_tree_search(inode, file_offset + len);
if (!node)
goto out;
}
@@ -936,7 +927,7 @@ out:
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_lookup_range(inode, entry);
}
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
return entry;
}
@@ -947,13 +938,12 @@ out:
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
struct list_head *list)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *n;
ASSERT(inode_is_locked(&inode->vfs_inode));
- spin_lock_irq(&tree->lock);
- for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ spin_lock_irq(&inode->ordered_tree_lock);
+ for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
struct btrfs_ordered_extent *ordered;
ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
@@ -966,7 +956,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
refcount_inc(&ordered->refs);
trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
}
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
}
/*
@@ -976,13 +966,11 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
- struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &inode->ordered_tree;
- spin_lock_irq(&tree->lock);
- node = tree_search(tree, file_offset);
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = ordered_tree_search(inode, file_offset);
if (!node)
goto out;
@@ -990,7 +978,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
refcount_inc(&entry->refs);
trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
return entry;
}
@@ -1006,15 +994,14 @@ out:
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
struct btrfs_inode *inode, u64 file_offset, u64 len)
{
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct rb_node *cur;
struct rb_node *prev;
struct rb_node *next;
struct btrfs_ordered_extent *entry = NULL;
- spin_lock_irq(&tree->lock);
- node = tree->tree.rb_node;
+ spin_lock_irq(&inode->ordered_tree_lock);
+ node = inode->ordered_tree.rb_node;
/*
* Here we don't want to use tree_search() which will use tree->last
* and screw up the search order.
@@ -1068,7 +1055,7 @@ out:
trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
}
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
return entry;
}
@@ -1147,7 +1134,6 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
struct btrfs_ordered_extent *ordered, u64 len)
{
struct btrfs_inode *inode = BTRFS_I(ordered->inode);
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 file_offset = ordered->file_offset;
@@ -1187,13 +1173,13 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
refcount_inc(&new->refs);
spin_lock_irq(&root->ordered_extent_lock);
- spin_lock(&tree->lock);
+ spin_lock(&inode->ordered_tree_lock);
/* Remove from tree once */
node = &ordered->rb_node;
- rb_erase(node, &tree->tree);
+ rb_erase(node, &inode->ordered_tree);
RB_CLEAR_NODE(node);
- if (tree->last == node)
- tree->last = NULL;
+ if (inode->ordered_tree_last == node)
+ inode->ordered_tree_last = NULL;
ordered->file_offset += len;
ordered->disk_bytenr += len;
@@ -1224,18 +1210,19 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
}
/* Re-insert the node */
- node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
+ node = tree_insert(&inode->ordered_tree, ordered->file_offset,
+ &ordered->rb_node);
if (node)
btrfs_panic(fs_info, -EEXIST,
"zoned: inconsistency in ordered tree at offset %llu",
ordered->file_offset);
- node = tree_insert(&tree->tree, new->file_offset, &new->rb_node);
+ node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node);
if (node)
btrfs_panic(fs_info, -EEXIST,
"zoned: inconsistency in ordered tree at offset %llu",
new->file_offset);
- spin_unlock(&tree->lock);
+ spin_unlock(&inode->ordered_tree_lock);
list_add_tail(&new->root_extent_list, &root->ordered_extents);
root->nr_ordered_extents++;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 173bd5c5df26..567a6d3d4712 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -6,13 +6,6 @@
#ifndef BTRFS_ORDERED_DATA_H
#define BTRFS_ORDERED_DATA_H
-/* one of these per inode */
-struct btrfs_ordered_inode_tree {
- spinlock_t lock;
- struct rb_root tree;
- struct rb_node *last;
-};
-
struct btrfs_ordered_sum {
/*
* Logical start address and length for of the blocks covered by
@@ -151,15 +144,9 @@ struct btrfs_ordered_extent {
struct completion completion;
struct btrfs_work flush_work;
struct list_head work_list;
-};
-static inline void
-btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
-{
- spin_lock_init(&t->lock);
- t->tree = RB_ROOT;
- t->last = NULL;
-}
+ struct list_head bioc_list;
+};
int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent);
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0c93439e929f..7e46aa8a0444 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -9,6 +9,8 @@
#include "print-tree.h"
#include "accessors.h"
#include "tree-checker.h"
+#include "volumes.h"
+#include "raid-stripe-tree.h"
struct root_name_map {
u64 id;
@@ -28,6 +30,7 @@ static const struct root_name_map root_map[] = {
{ BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
{ BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },
{ BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
+ { BTRFS_RAID_STRIPE_TREE_OBJECTID, "RAID_STRIPE_TREE" },
};
const char *btrfs_root_name(const struct btrfs_key *key, char *buf)
@@ -80,12 +83,20 @@ static void print_extent_data_ref(const struct extent_buffer *eb,
btrfs_extent_data_ref_count(eb, ref));
}
+static void print_extent_owner_ref(const struct extent_buffer *eb,
+ const struct btrfs_extent_owner_ref *ref)
+{
+ ASSERT(btrfs_fs_incompat(eb->fs_info, SIMPLE_QUOTA));
+ pr_cont("extent data owner root %llu\n", btrfs_extent_owner_ref_root_id(eb, ref));
+}
+
static void print_extent_item(const struct extent_buffer *eb, int slot, int type)
{
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
+ struct btrfs_extent_owner_ref *oref;
struct btrfs_disk_key key;
unsigned long end;
unsigned long ptr;
@@ -161,6 +172,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type
"\t\t\t(parent %llu not aligned to sectorsize %u)\n",
offset, eb->fs_info->sectorsize);
break;
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
+ print_extent_owner_ref(eb, oref);
+ break;
default:
pr_cont("(extent %llu has INVALID ref type %d)\n",
eb->start, type);
@@ -189,6 +204,22 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset,
}
}
+static void print_raid_stripe_key(const struct extent_buffer *eb, u32 item_size,
+ struct btrfs_stripe_extent *stripe)
+{
+ const int num_stripes = btrfs_num_raid_stripes(item_size);
+ const u8 encoding = btrfs_stripe_extent_encoding(eb, stripe);
+
+ pr_info("\t\t\tencoding: %s\n",
+ (encoding && encoding < BTRFS_NR_RAID_TYPES) ?
+ btrfs_raid_array[encoding].raid_name : "unknown");
+
+ for (int i = 0; i < num_stripes; i++)
+ pr_info("\t\t\tstride %d devid %llu physical %llu\n",
+ i, btrfs_raid_stride_devid(eb, &stripe->strides[i]),
+ btrfs_raid_stride_physical(eb, &stripe->strides[i]));
+}
+
/*
* Helper to output refs and locking status of extent buffer. Useful to debug
* race condition related problems.
@@ -349,6 +380,10 @@ void btrfs_print_leaf(const struct extent_buffer *l)
print_uuid_item(l, btrfs_item_ptr_offset(l, i),
btrfs_item_size(l, i));
break;
+ case BTRFS_RAID_STRIPE_KEY:
+ print_raid_stripe_key(l, btrfs_item_size(l, i),
+ btrfs_item_ptr(l, i, struct btrfs_stripe_extent));
+ break;
}
}
}
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 0755af0e53e3..f9bf591a0718 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -15,6 +15,7 @@
#include "fs.h"
#include "accessors.h"
#include "super.h"
+#include "dir-item.h"
#define BTRFS_PROP_HANDLERS_HT_BITS 8
static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b99230db3c82..edb84cc03237 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -30,6 +30,25 @@
#include "root-tree.h"
#include "tree-checker.h"
+enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info)
+{
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return BTRFS_QGROUP_MODE_DISABLED;
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
+ return BTRFS_QGROUP_MODE_SIMPLE;
+ return BTRFS_QGROUP_MODE_FULL;
+}
+
+bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED;
+}
+
+bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info)
+{
+ return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL;
+}
+
/*
* Helpers to access qgroup reservation
*
@@ -146,16 +165,6 @@ struct btrfs_qgroup_list {
struct btrfs_qgroup *member;
};
-static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
-{
- return (u64)(uintptr_t)qg;
-}
-
-static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
-{
- return (struct btrfs_qgroup *)(uintptr_t)n->aux;
-}
-
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags);
@@ -180,34 +189,46 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
return NULL;
}
-/* must be called with qgroup_lock held */
+/*
+ * Add qgroup to the filesystem's qgroup tree.
+ *
+ * Must be called with qgroup_lock held and @prealloc preallocated.
+ *
+ * The control on the lifespan of @prealloc would be transfered to this
+ * function, thus caller should no longer touch @prealloc.
+ */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *prealloc,
u64 qgroupid)
{
struct rb_node **p = &fs_info->qgroup_tree.rb_node;
struct rb_node *parent = NULL;
struct btrfs_qgroup *qgroup;
+ /* Caller must have pre-allocated @prealloc. */
+ ASSERT(prealloc);
+
while (*p) {
parent = *p;
qgroup = rb_entry(parent, struct btrfs_qgroup, node);
- if (qgroup->qgroupid < qgroupid)
+ if (qgroup->qgroupid < qgroupid) {
p = &(*p)->rb_left;
- else if (qgroup->qgroupid > qgroupid)
+ } else if (qgroup->qgroupid > qgroupid) {
p = &(*p)->rb_right;
- else
+ } else {
+ kfree(prealloc);
return qgroup;
+ }
}
- qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
- if (!qgroup)
- return ERR_PTR(-ENOMEM);
-
+ qgroup = prealloc;
qgroup->qgroupid = qgroupid;
INIT_LIST_HEAD(&qgroup->groups);
INIT_LIST_HEAD(&qgroup->members);
INIT_LIST_HEAD(&qgroup->dirty);
+ INIT_LIST_HEAD(&qgroup->iterator);
+ INIT_LIST_HEAD(&qgroup->nested_iterator);
rb_link_node(&qgroup->node, parent, p);
rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
@@ -254,27 +275,26 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
/*
* Add relation specified by two qgroups.
*
- * Must be called with qgroup_lock held.
+ * Must be called with qgroup_lock held, the ownership of @prealloc is
+ * transferred to this function and caller should not touch it anymore.
*
* Return: 0 on success
* -ENOENT if one of the qgroups is NULL
* <0 other errors
*/
-static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
+static int __add_relation_rb(struct btrfs_qgroup_list *prealloc,
+ struct btrfs_qgroup *member,
+ struct btrfs_qgroup *parent)
{
- struct btrfs_qgroup_list *list;
-
- if (!member || !parent)
+ if (!member || !parent) {
+ kfree(prealloc);
return -ENOENT;
+ }
- list = kzalloc(sizeof(*list), GFP_ATOMIC);
- if (!list)
- return -ENOMEM;
-
- list->group = parent;
- list->member = member;
- list_add_tail(&list->next_group, &member->groups);
- list_add_tail(&list->next_member, &parent->members);
+ prealloc->group = parent;
+ prealloc->member = member;
+ list_add_tail(&prealloc->next_group, &member->groups);
+ list_add_tail(&prealloc->next_member, &parent->members);
return 0;
}
@@ -288,7 +308,9 @@ static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p
* -ENOENT if one of the ids does not exist
* <0 other errors
*/
-static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
+static int add_relation_rb(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_list *prealloc,
+ u64 memberid, u64 parentid)
{
struct btrfs_qgroup *member;
struct btrfs_qgroup *parent;
@@ -296,7 +318,7 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 pare
member = find_qgroup_rb(fs_info, memberid);
parent = find_qgroup_rb(fs_info, parentid);
- return __add_relation_rb(member, parent);
+ return __add_relation_rb(prealloc, member, parent);
}
/* Must be called with qgroup_lock held */
@@ -340,11 +362,22 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
{
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ return;
fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
}
+static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, int slot,
+ struct btrfs_qgroup_status_item *ptr)
+{
+ ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr));
+ fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr);
+}
+
/*
* The full config is read in one go, only called from open_ctree()
* It doesn't use any locking, as at this point we're still single-threaded
@@ -361,7 +394,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
u64 flags = 0;
u64 rescan_progress = 0;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!fs_info->quota_root)
return 0;
fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
@@ -411,14 +444,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
"old qgroup version, quota disabled");
goto out;
}
- if (btrfs_qgroup_status_generation(l, ptr) !=
- fs_info->generation) {
+ fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) {
+ qgroup_read_enable_gen(fs_info, l, slot, ptr);
+ } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) {
qgroup_mark_inconsistent(fs_info);
btrfs_err(fs_info,
"qgroup generation mismatch, marked as inconsistent");
}
- fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
- ptr);
rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
goto next1;
}
@@ -434,11 +467,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
qgroup_mark_inconsistent(fs_info);
}
if (!qgroup) {
- qgroup = add_qgroup_rb(fs_info, found_key.offset);
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
+ struct btrfs_qgroup *prealloc;
+
+ prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
+ if (!prealloc) {
+ ret = -ENOMEM;
goto out;
}
+ qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0)
@@ -489,6 +525,8 @@ next1:
if (ret)
goto out;
while (1) {
+ struct btrfs_qgroup_list *list = NULL;
+
slot = path->slots[0];
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
@@ -502,8 +540,14 @@ next1:
goto next2;
}
- ret = add_relation_rb(fs_info, found_key.objectid,
+ list = kzalloc(sizeof(*list), GFP_KERNEL);
+ if (!list) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = add_relation_rb(fs_info, list, found_key.objectid,
found_key.offset);
+ list = NULL;
if (ret == -ENOENT) {
btrfs_warn(fs_info,
"orphan qgroup relation 0x%llx->0x%llx",
@@ -522,13 +566,12 @@ next2:
out:
btrfs_free_path(path);
fs_info->qgroup_flags |= flags;
- if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
- clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
- ret >= 0)
- ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
-
- if (ret < 0) {
+ if (ret >= 0) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
+ } else {
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
@@ -550,7 +593,7 @@ bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
struct rb_node *node;
bool ret = false;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
return ret;
/*
* Since we're unmounting, there is no race and no need to grab qgroup
@@ -622,7 +665,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_free_path(path);
return ret;
@@ -700,7 +743,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
@@ -719,7 +762,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
out:
@@ -808,7 +851,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
- btrfs_mark_buffer_dirty(l);
+ btrfs_mark_buffer_dirty(trans, l);
out:
btrfs_free_path(path);
@@ -854,7 +897,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
- btrfs_mark_buffer_dirty(l);
+ btrfs_mark_buffer_dirty(trans, l);
out:
btrfs_free_path(path);
@@ -896,7 +939,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
btrfs_set_qgroup_status_rescan(l, ptr,
fs_info->qgroup_rescan_progress.objectid);
- btrfs_mark_buffer_dirty(l);
+ btrfs_mark_buffer_dirty(trans, l);
out:
btrfs_free_path(path);
@@ -949,7 +992,8 @@ out:
return ret;
}
-int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
+int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_quota_ctl_args *quota_ctl_args)
{
struct btrfs_root *quota_root;
struct btrfs_root *tree_root = fs_info->tree_root;
@@ -959,8 +1003,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_qgroup *qgroup = NULL;
+ struct btrfs_qgroup *prealloc = NULL;
struct btrfs_trans_handle *trans = NULL;
struct ulist *ulist = NULL;
+ const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA);
int ret = 0;
int slot;
@@ -1063,13 +1109,18 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
struct btrfs_qgroup_status_item);
btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
- fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON;
+ if (simple) {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
+ btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid);
+ } else {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ }
btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAGS_MASK);
btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
key.objectid = 0;
key.type = BTRFS_ROOT_REF_KEY;
@@ -1094,6 +1145,15 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
/* Release locks on tree_root before we access quota_root */
btrfs_release_path(path);
+ /* We should not have a stray @prealloc pointer. */
+ ASSERT(prealloc == NULL);
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
+ goto out_free_path;
+ }
+
ret = add_qgroup_item(trans, quota_root,
found_key.offset);
if (ret) {
@@ -1101,7 +1161,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
goto out_free_path;
}
- qgroup = add_qgroup_rb(fs_info, found_key.offset);
+ qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
+ prealloc = NULL;
if (IS_ERR(qgroup)) {
ret = PTR_ERR(qgroup);
btrfs_abort_transaction(trans, ret);
@@ -1144,18 +1205,22 @@ out_add_root:
goto out_free_path;
}
- qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
- btrfs_abort_transaction(trans, ret);
+ ASSERT(prealloc == NULL);
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
goto out_free_path;
}
+ qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
+ prealloc = NULL;
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
+ fs_info->qgroup_enable_gen = trans->transid;
+
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
* Commit the transaction while not holding qgroup_ioctl_lock, to avoid
@@ -1180,8 +1245,14 @@ out_add_root:
spin_lock(&fs_info->qgroup_lock);
fs_info->quota_root = quota_root;
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ if (simple)
+ btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
spin_unlock(&fs_info->qgroup_lock);
+ /* Skip rescan for simple qgroups. */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ goto out_free_path;
+
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
@@ -1222,6 +1293,39 @@ out:
else if (trans)
ret = btrfs_end_transaction(trans);
ulist_free(ulist);
+ kfree(prealloc);
+ return ret;
+}
+
+/*
+ * It is possible to have outstanding ordered extents which reserved bytes
+ * before we disabled. We need to fully flush delalloc, ordered extents, and a
+ * commit to ensure that we don't leak such reservations, only to have them
+ * come back if we re-enable.
+ *
+ * - enable simple quotas
+ * - reserve space
+ * - release it, store rsv_bytes in OE
+ * - disable quotas
+ * - enable simple quotas (qgroup rsv are all 0)
+ * - OE finishes
+ * - run delayed refs
+ * - free rsv_bytes, resulting in miscounting or even underflow
+ */
+static int flush_reservations(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
+ if (ret)
+ return ret;
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ btrfs_commit_transaction(trans);
+
return ret;
}
@@ -1269,6 +1373,10 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
btrfs_qgroup_wait_for_completion(fs_info, false);
+ ret = flush_reservations(fs_info);
+ if (ret)
+ goto out_unlock_cleaner;
+
/*
* 1 For the root item
*
@@ -1295,6 +1403,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
spin_unlock(&fs_info->qgroup_lock);
@@ -1329,7 +1438,8 @@ out:
if (ret && trans)
btrfs_end_transaction(trans);
else if (trans)
- ret = btrfs_end_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
+out_unlock_cleaner:
mutex_unlock(&fs_info->cleaner_mutex);
return ret;
@@ -1342,6 +1452,24 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}
+static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup)
+{
+ if (!list_empty(&qgroup->iterator))
+ return;
+
+ list_add_tail(&qgroup->iterator, head);
+}
+
+static void qgroup_iterator_clean(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ struct btrfs_qgroup *qgroup;
+
+ qgroup = list_first_entry(head, struct btrfs_qgroup, iterator);
+ list_del_init(&qgroup->iterator);
+ }
+}
+
/*
* The easy accounting, we're updating qgroup relationship whose child qgroup
* only has exclusive extents.
@@ -1356,14 +1484,12 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
*
* Caller should hold fs_info->qgroup_lock.
*/
-static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
- struct ulist *tmp, u64 ref_root,
+static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup *src, int sign)
{
struct btrfs_qgroup *qgroup;
- struct btrfs_qgroup_list *glist;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
+ struct btrfs_qgroup *cur;
+ LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
int ret = 0;
@@ -1371,53 +1497,30 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
if (!qgroup)
goto out;
- qgroup->rfer += sign * num_bytes;
- qgroup->rfer_cmpr += sign * num_bytes;
-
- WARN_ON(sign < 0 && qgroup->excl < num_bytes);
- qgroup->excl += sign * num_bytes;
- qgroup->excl_cmpr += sign * num_bytes;
-
- if (sign > 0)
- qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
- else
- qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
-
- qgroup_dirty(fs_info, qgroup);
-
- /* Get all of the parent groups that contain this qgroup */
- list_for_each_entry(glist, &qgroup->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(cur, &qgroup_list, iterator) {
+ struct btrfs_qgroup_list *glist;
- /* Iterate all of the parents and adjust their reference counts */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- qgroup = unode_aux_to_qgroup(unode);
qgroup->rfer += sign * num_bytes;
qgroup->rfer_cmpr += sign * num_bytes;
+
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes;
+
if (sign > 0)
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
else
qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
- qgroup->excl_cmpr += sign * num_bytes;
qgroup_dirty(fs_info, qgroup);
- /* Add any parents of the parents */
- list_for_each_entry(glist, &qgroup->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+ /* Append parent qgroups to @qgroup_list. */
+ list_for_each_entry(glist, &qgroup->groups, next_group)
+ qgroup_iterator_add(&qgroup_list, glist->group);
}
ret = 0;
out:
+ qgroup_iterator_clean(&qgroup_list);
return ret;
}
@@ -1434,8 +1537,7 @@ out:
* Return < 0 for other error.
*/
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
- struct ulist *tmp, u64 src, u64 dst,
- int sign)
+ u64 src, u64 dst, int sign)
{
struct btrfs_qgroup *qgroup;
int ret = 1;
@@ -1446,8 +1548,7 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info,
goto out;
if (qgroup->excl == qgroup->rfer) {
ret = 0;
- err = __qgroup_excl_accounting(fs_info, tmp, dst,
- qgroup, sign);
+ err = __qgroup_excl_accounting(fs_info, dst, qgroup, sign);
if (err < 0) {
ret = err;
goto out;
@@ -1459,28 +1560,19 @@ out:
return ret;
}
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
- u64 dst)
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
- struct ulist *tmp;
- unsigned int nofs_flag;
+ struct btrfs_qgroup_list *prealloc = NULL;
int ret = 0;
/* Check the level of src and dst first */
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
return -EINVAL;
- /* We hold a transaction handle open, must do a NOFS allocation. */
- nofs_flag = memalloc_nofs_save();
- tmp = ulist_alloc(GFP_KERNEL);
- memalloc_nofs_restore(nofs_flag);
- if (!tmp)
- return -ENOMEM;
-
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
@@ -1501,6 +1593,11 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
}
}
+ prealloc = kzalloc(sizeof(*list), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
+ goto out;
+ }
ret = add_qgroup_relation_item(trans, src, dst);
if (ret)
goto out;
@@ -1512,16 +1609,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
}
spin_lock(&fs_info->qgroup_lock);
- ret = __add_relation_rb(member, parent);
+ ret = __add_relation_rb(prealloc, member, parent);
+ prealloc = NULL;
if (ret < 0) {
spin_unlock(&fs_info->qgroup_lock);
goto out;
}
- ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
+ ret = quick_update_accounting(fs_info, src, dst, 1);
spin_unlock(&fs_info->qgroup_lock);
out:
+ kfree(prealloc);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
- ulist_free(tmp);
return ret;
}
@@ -1532,19 +1630,10 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
- struct ulist *tmp;
bool found = false;
- unsigned int nofs_flag;
int ret = 0;
int ret2;
- /* We hold a transaction handle open, must do a NOFS allocation. */
- nofs_flag = memalloc_nofs_save();
- tmp = ulist_alloc(GFP_KERNEL);
- memalloc_nofs_restore(nofs_flag);
- if (!tmp)
- return -ENOMEM;
-
if (!fs_info->quota_root) {
ret = -ENOTCONN;
goto out;
@@ -1582,11 +1671,10 @@ delete_item:
if (found) {
spin_lock(&fs_info->qgroup_lock);
del_relation_rb(fs_info, src, dst);
- ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
+ ret = quick_update_accounting(fs_info, src, dst, -1);
spin_unlock(&fs_info->qgroup_lock);
}
out:
- ulist_free(tmp);
return ret;
}
@@ -1608,8 +1696,12 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *prealloc = NULL;
int ret = 0;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
+ return 0;
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
@@ -1622,21 +1714,25 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
goto out;
}
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
ret = add_qgroup_item(trans, quota_root, qgroupid);
if (ret)
goto out;
spin_lock(&fs_info->qgroup_lock);
- qgroup = add_qgroup_rb(fs_info, qgroupid);
+ qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
+ prealloc = NULL;
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
- goto out;
- }
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ kfree(prealloc);
return ret;
}
@@ -1771,6 +1867,17 @@ out:
return ret;
}
+/*
+ * Inform qgroup to trace one dirty extent, its info is recorded in @record.
+ * So qgroup can account it at transaction committing time.
+ *
+ * No lock version, caller must acquire delayed ref lock and allocated memory,
+ * then call btrfs_qgroup_trace_extent_post() after exiting lock context.
+ *
+ * Return 0 for success insert
+ * Return >0 for existing record, caller can free @record safely.
+ * Error is not possible
+ */
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record)
@@ -1780,6 +1887,9 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_extent_record *entry;
u64 bytenr = record->bytenr;
+ if (!btrfs_qgroup_full_accounting(fs_info))
+ return 0;
+
lockdep_assert_held(&delayed_refs->lock);
trace_btrfs_qgroup_trace_extent(fs_info, record);
@@ -1806,12 +1916,35 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
return 0;
}
+/*
+ * Post handler after qgroup_trace_extent_nolock().
+ *
+ * NOTE: Current qgroup does the expensive backref walk at transaction
+ * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
+ * new transaction.
+ * This is designed to allow btrfs_find_all_roots() to get correct new_roots
+ * result.
+ *
+ * However for old_roots there is no need to do backref walk at that time,
+ * since we search commit roots to walk backref and result will always be
+ * correct.
+ *
+ * Due to the nature of no lock version, we can't do backref there.
+ * So we must call btrfs_qgroup_trace_extent_post() after exiting
+ * spinlock context.
+ *
+ * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
+ * using current root, then we can move all expensive backref walk out of
+ * transaction committing, but not now as qgroup accounting will be wrong again.
+ */
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord)
{
struct btrfs_backref_walk_ctx ctx = { 0 };
int ret;
+ if (!btrfs_qgroup_full_accounting(trans->fs_info))
+ return 0;
/*
* We are always called in a context where we are already holding a
* transaction handle. Often we are called when adding a data delayed
@@ -1859,6 +1992,19 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
return 0;
}
+/*
+ * Inform qgroup to trace one dirty extent, specified by @bytenr and
+ * @num_bytes.
+ * So qgroup can account it at commit trans time.
+ *
+ * Better encapsulated version, with memory allocation and backref walk for
+ * commit roots.
+ * So this can sleep.
+ *
+ * Return 0 if the operation is done.
+ * Return <0 for error, like memory allocation failure or invalid parameter
+ * (NULL trans)
+ */
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes)
{
@@ -1867,8 +2013,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
struct btrfs_delayed_ref_root *delayed_refs;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
- || bytenr == 0 || num_bytes == 0)
+ if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0)
return 0;
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record)
@@ -1889,6 +2034,12 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
return btrfs_qgroup_trace_extent_post(trans, record);
}
+/*
+ * Inform qgroup to trace all leaf items of data
+ *
+ * Return 0 for success
+ * Return <0 for error(ENOMEM)
+ */
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
@@ -1900,7 +2051,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
u64 bytenr, num_bytes;
/* We can be called directly from walk_up_proc() */
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
for (i = 0; i < nr; i++) {
@@ -2276,7 +2427,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
int level;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
/* Wrong parameter order */
@@ -2319,6 +2470,16 @@ out:
return ret;
}
+/*
+ * Inform qgroup to trace a whole subtree, including all its child tree
+ * blocks and data.
+ * The root tree block is specified by @root_eb.
+ *
+ * Normally used by relocation(tree block swap) and subvolume deletion.
+ *
+ * Return 0 for success
+ * Return <0 for error(ENOMEM or tree search error)
+ */
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level)
@@ -2333,7 +2494,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
BUG_ON(root_eb == NULL);
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
spin_lock(&fs_info->qgroup_lock);
@@ -2445,62 +2606,64 @@ out:
return ret;
}
+static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup)
+{
+ if (!list_empty(&qgroup->nested_iterator))
+ return;
+
+ list_add_tail(&qgroup->nested_iterator, head);
+}
+
+static void qgroup_iterator_nested_clean(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ struct btrfs_qgroup *qgroup;
+
+ qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator);
+ list_del_init(&qgroup->nested_iterator);
+ }
+}
+
#define UPDATE_NEW 0
#define UPDATE_OLD 1
/*
* Walk all of the roots that points to the bytenr and adjust their refcnts.
*/
-static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
- struct ulist *roots, struct ulist *tmp,
- struct ulist *qgroups, u64 seq, int update_old)
+static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
+ struct ulist *roots, struct list_head *qgroups,
+ u64 seq, int update_old)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
- struct ulist_node *tmp_unode;
- struct ulist_iterator tmp_uiter;
struct btrfs_qgroup *qg;
- int ret = 0;
if (!roots)
- return 0;
+ return;
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(roots, &uiter))) {
+ LIST_HEAD(tmp);
+
qg = find_qgroup_rb(fs_info, unode->val);
if (!qg)
continue;
- ulist_reinit(tmp);
- ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ULIST_ITER_INIT(&tmp_uiter);
- while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+ qgroup_iterator_nested_add(qgroups, qg);
+ qgroup_iterator_add(&tmp, qg);
+ list_for_each_entry(qg, &tmp, iterator) {
struct btrfs_qgroup_list *glist;
- qg = unode_aux_to_qgroup(tmp_unode);
if (update_old)
btrfs_qgroup_update_old_refcnt(qg, seq, 1);
else
btrfs_qgroup_update_new_refcnt(qg, seq, 1);
+
list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(qgroups, glist->group->qgroupid,
- qgroup_to_aux(glist->group),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, glist->group->qgroupid,
- qgroup_to_aux(glist->group),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
+ qgroup_iterator_nested_add(qgroups, glist->group);
+ qgroup_iterator_add(&tmp, glist->group);
}
}
+ qgroup_iterator_clean(&tmp);
}
- return 0;
}
/*
@@ -2539,22 +2702,16 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
* But this time we don't need to consider other things, the codes and logic
* is easy to understand now.
*/
-static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
- struct ulist *qgroups,
- u64 nr_old_roots,
- u64 nr_new_roots,
- u64 num_bytes, u64 seq)
+static void qgroup_update_counters(struct btrfs_fs_info *fs_info,
+ struct list_head *qgroups, u64 nr_old_roots,
+ u64 nr_new_roots, u64 num_bytes, u64 seq)
{
- struct ulist_node *unode;
- struct ulist_iterator uiter;
struct btrfs_qgroup *qg;
- u64 cur_new_count, cur_old_count;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(qgroups, &uiter))) {
+ list_for_each_entry(qg, qgroups, nested_iterator) {
+ u64 cur_new_count, cur_old_count;
bool dirty = false;
- qg = unode_aux_to_qgroup(unode);
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
@@ -2625,7 +2782,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
if (dirty)
qgroup_dirty(fs_info, qg);
}
- return 0;
}
/*
@@ -2662,8 +2818,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
struct ulist *new_roots)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct ulist *qgroups = NULL;
- struct ulist *tmp = NULL;
+ LIST_HEAD(qgroups);
u64 seq;
u64 nr_new_roots = 0;
u64 nr_old_roots = 0;
@@ -2673,7 +2828,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
* If quotas get disabled meanwhile, the resources need to be freed and
* we can't just exit here.
*/
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (!btrfs_qgroup_full_accounting(fs_info) ||
fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
goto out_free;
@@ -2697,17 +2852,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
num_bytes, nr_old_roots, nr_new_roots);
- qgroups = ulist_alloc(GFP_NOFS);
- if (!qgroups) {
- ret = -ENOMEM;
- goto out_free;
- }
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp) {
- ret = -ENOMEM;
- goto out_free;
- }
-
mutex_lock(&fs_info->qgroup_rescan_lock);
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
@@ -2722,29 +2866,21 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
seq = fs_info->qgroup_seq;
/* Update old refcnts using old_roots */
- ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
- UPDATE_OLD);
- if (ret < 0)
- goto out;
+ qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD);
/* Update new refcnts using new_roots */
- ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
- UPDATE_NEW);
- if (ret < 0)
- goto out;
+ qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW);
- qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
+ qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots,
num_bytes, seq);
/*
* Bump qgroup_seq to avoid seq overlap
*/
fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
-out:
spin_unlock(&fs_info->qgroup_lock);
out_free:
- ulist_free(tmp);
- ulist_free(qgroups);
+ qgroup_iterator_nested_clean(&qgroups);
ulist_free(old_roots);
ulist_free(new_roots);
return ret;
@@ -2761,6 +2897,9 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
u64 qgroup_to_skip;
int ret = 0;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ return 0;
+
delayed_refs = &trans->transaction->delayed_refs;
qgroup_to_skip = delayed_refs->qgroup_to_skip;
while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
@@ -2876,7 +3015,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
qgroup_mark_inconsistent(fs_info);
spin_lock(&fs_info->qgroup_lock);
}
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (btrfs_qgroup_enabled(fs_info))
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
else
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
@@ -2889,6 +3028,47 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
return ret;
}
+static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info,
+ u64 inode_rootid,
+ struct btrfs_qgroup_inherit **inherit)
+{
+ int i = 0;
+ u64 num_qgroups = 0;
+ struct btrfs_qgroup *inode_qg;
+ struct btrfs_qgroup_list *qg_list;
+ struct btrfs_qgroup_inherit *res;
+ size_t struct_sz;
+ u64 *qgids;
+
+ if (*inherit)
+ return -EEXIST;
+
+ inode_qg = find_qgroup_rb(fs_info, inode_rootid);
+ if (!inode_qg)
+ return -ENOENT;
+
+ num_qgroups = list_count_nodes(&inode_qg->groups);
+
+ if (!num_qgroups)
+ return 0;
+
+ struct_sz = struct_size(res, qgroups, num_qgroups);
+ if (struct_sz == SIZE_MAX)
+ return -ERANGE;
+
+ res = kzalloc(struct_sz, GFP_NOFS);
+ if (!res)
+ return -ENOMEM;
+ res->num_qgroups = num_qgroups;
+ qgids = res->qgroups;
+
+ list_for_each_entry(qg_list, &inode_qg->groups, next_group)
+ qgids[i] = qg_list->group->qgroupid;
+
+ *inherit = res;
+ return 0;
+}
+
/*
* Copy the accounting information between qgroups. This is necessary
* when a snapshot or a subvolume is created. Throwing an error will
@@ -2896,7 +3076,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
* when a readonly fs is a reasonable outcome.
*/
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
- u64 objectid, struct btrfs_qgroup_inherit *inherit)
+ u64 objectid, u64 inode_rootid,
+ struct btrfs_qgroup_inherit *inherit)
{
int ret = 0;
int i;
@@ -2906,10 +3087,17 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
+ struct btrfs_qgroup *prealloc;
+ struct btrfs_qgroup_list **qlist_prealloc = NULL;
+ bool free_inherit = false;
bool need_rescan = false;
u32 level_size = 0;
u64 nums;
+ prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
+ if (!prealloc)
+ return -ENOMEM;
+
/*
* There are only two callers of this function.
*
@@ -2929,7 +3117,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
if (!committing)
mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_enabled(fs_info))
goto out;
quota_root = fs_info->quota_root;
@@ -2938,6 +3126,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
goto out;
}
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) {
+ ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit);
+ if (ret)
+ goto out;
+ free_inherit = true;
+ }
+
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
@@ -2982,16 +3177,28 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
goto out;
}
ret = 0;
- }
+ qlist_prealloc = kcalloc(inherit->num_qgroups,
+ sizeof(struct btrfs_qgroup_list *),
+ GFP_NOFS);
+ if (!qlist_prealloc) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (int i = 0; i < inherit->num_qgroups; i++) {
+ qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list),
+ GFP_NOFS);
+ if (!qlist_prealloc[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ }
spin_lock(&fs_info->qgroup_lock);
- dstgroup = add_qgroup_rb(fs_info, objectid);
- if (IS_ERR(dstgroup)) {
- ret = PTR_ERR(dstgroup);
- goto unlock;
- }
+ dstgroup = add_qgroup_rb(fs_info, prealloc, objectid);
+ prealloc = NULL;
if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
dstgroup->lim_flags = inherit->lim.flags;
@@ -3003,7 +3210,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
qgroup_dirty(fs_info, dstgroup);
}
- if (srcid) {
+ if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) {
srcgroup = find_qgroup_rb(fs_info, srcid);
if (!srcgroup)
goto unlock;
@@ -3038,7 +3245,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i) {
if (*i_qgroups) {
- ret = add_relation_rb(fs_info, objectid, *i_qgroups);
+ ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid,
+ *i_qgroups);
+ qlist_prealloc[i] = NULL;
if (ret)
goto unlock;
}
@@ -3102,6 +3311,14 @@ out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (need_rescan)
qgroup_mark_inconsistent(fs_info);
+ if (qlist_prealloc) {
+ for (int i = 0; i < inherit->num_qgroups; i++)
+ kfree(qlist_prealloc[i]);
+ kfree(qlist_prealloc);
+ }
+ if (free_inherit)
+ kfree(inherit);
+ kfree(prealloc);
return ret;
}
@@ -3125,8 +3342,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
int ret = 0;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
+ LIST_HEAD(qgroup_list);
if (!is_fstree(ref_root))
return 0;
@@ -3146,49 +3362,28 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
if (!qgroup)
goto out;
- /*
- * in a first step, we check all affected qgroups if any limits would
- * be exceeded
- */
- ulist_reinit(fs_info->qgroup_ulist);
- ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
- qgroup_to_aux(qgroup), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
- struct btrfs_qgroup *qg;
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
struct btrfs_qgroup_list *glist;
- qg = unode_aux_to_qgroup(unode);
-
- if (enforce && !qgroup_check_limits(qg, num_bytes)) {
+ if (enforce && !qgroup_check_limits(qgroup, num_bytes)) {
ret = -EDQUOT;
goto out;
}
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(fs_info->qgroup_ulist,
- glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+ list_for_each_entry(glist, &qgroup->groups, next_group)
+ qgroup_iterator_add(&qgroup_list, glist->group);
}
+
ret = 0;
/*
* no limits exceeded, now record the reservation into all qgroups
*/
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
- struct btrfs_qgroup *qg;
-
- qg = unode_aux_to_qgroup(unode);
-
- qgroup_rsv_add(fs_info, qg, num_bytes, type);
- }
+ list_for_each_entry(qgroup, &qgroup_list, iterator)
+ qgroup_rsv_add(fs_info, qgroup, num_bytes, type);
out:
+ qgroup_iterator_clean(&qgroup_list);
spin_unlock(&fs_info->qgroup_lock);
return ret;
}
@@ -3207,9 +3402,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
enum btrfs_qgroup_rsv_type type)
{
struct btrfs_qgroup *qgroup;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- int ret = 0;
+ LIST_HEAD(qgroup_list);
if (!is_fstree(ref_root))
return;
@@ -3237,30 +3430,17 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
*/
num_bytes = qgroup->rsv.values[type];
- ulist_reinit(fs_info->qgroup_ulist);
- ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
- qgroup_to_aux(qgroup), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
- struct btrfs_qgroup *qg;
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
struct btrfs_qgroup_list *glist;
- qg = unode_aux_to_qgroup(unode);
-
- qgroup_rsv_release(fs_info, qg, num_bytes, type);
-
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(fs_info->qgroup_ulist,
- glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
+ qgroup_rsv_release(fs_info, qgroup, num_bytes, type);
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ qgroup_iterator_add(&qgroup_list, glist->group);
}
}
-
out:
+ qgroup_iterator_clean(&qgroup_list);
spin_unlock(&fs_info->qgroup_lock);
}
@@ -3295,6 +3475,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
int slot;
int ret;
+ if (!btrfs_qgroup_full_accounting(fs_info))
+ return 1;
+
mutex_lock(&fs_info->qgroup_rescan_lock);
extent_root = btrfs_extent_root(fs_info,
fs_info->qgroup_rescan_progress.objectid);
@@ -3375,10 +3558,15 @@ out:
static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
- return btrfs_fs_closing(fs_info) ||
- test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
- !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
+ if (btrfs_fs_closing(fs_info))
+ return true;
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ return true;
+ if (!btrfs_qgroup_enabled(fs_info))
+ return true;
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
+ return true;
+ return false;
}
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -3392,6 +3580,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
bool stopped = false;
bool did_leaf_rescans = false;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ return;
+
path = btrfs_alloc_path();
if (!path)
goto out;
@@ -3495,6 +3686,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
{
int ret = 0;
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
+ btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode");
+ return -EINVAL;
+ }
+
if (!init_flags) {
/* we're resuming qgroup rescan at mount time */
if (!(fs_info->qgroup_flags &
@@ -3525,7 +3721,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
- } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
/* Quota disable is in progress */
ret = -EBUSY;
}
@@ -3546,7 +3742,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
mutex_unlock(&fs_info->qgroup_rescan_lock);
btrfs_init_work(&fs_info->qgroup_rescan_work,
- btrfs_qgroup_rescan_worker, NULL, NULL);
+ btrfs_qgroup_rescan_worker, NULL);
return 0;
}
@@ -3784,7 +3980,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode,
u64 to_reserve;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
+ if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid) || len == 0)
return 0;
@@ -3916,8 +4112,12 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
int trace_op = QGROUP_RELEASE;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
- return 0;
+ if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
+ extent_changeset_init(&changeset);
+ return clear_record_extent_bits(&inode->io_tree, start,
+ start + len - 1,
+ EXTENT_QGROUP_RESERVED, &changeset);
+ }
/* In release case, we shouldn't have @reserved */
WARN_ON(!free && reserved);
@@ -4027,7 +4227,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid) || num_bytes == 0)
return 0;
@@ -4064,11 +4264,15 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
}
+/*
+ * Per-transaction meta reservation should be all freed at transaction commit
+ * time
+ */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid))
return;
@@ -4084,7 +4288,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
{
struct btrfs_fs_info *fs_info = root->fs_info;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid))
return;
@@ -4104,9 +4308,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
int num_bytes)
{
struct btrfs_qgroup *qgroup;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- int ret = 0;
+ LIST_HEAD(qgroup_list);
if (num_bytes == 0)
return;
@@ -4117,39 +4319,35 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
qgroup = find_qgroup_rb(fs_info, ref_root);
if (!qgroup)
goto out;
- ulist_reinit(fs_info->qgroup_ulist);
- ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
- qgroup_to_aux(qgroup), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
- struct btrfs_qgroup *qg;
- struct btrfs_qgroup_list *glist;
- qg = unode_aux_to_qgroup(unode);
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
+ struct btrfs_qgroup_list *glist;
- qgroup_rsv_release(fs_info, qg, num_bytes,
+ qgroup_rsv_release(fs_info, qgroup, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
- qgroup_rsv_add(fs_info, qg, num_bytes,
+ qgroup_rsv_add(fs_info, qgroup, num_bytes,
BTRFS_QGROUP_RSV_META_PERTRANS);
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(fs_info->qgroup_ulist,
- glist->group->qgroupid,
- qgroup_to_aux(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+
+ list_for_each_entry(glist, &qgroup->groups, next_group)
+ qgroup_iterator_add(&qgroup_list, glist->group);
}
out:
+ qgroup_iterator_clean(&qgroup_list);
spin_unlock(&fs_info->qgroup_lock);
}
+/*
+ * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
+ *
+ * This is called when preallocated meta reservation needs to be used.
+ * Normally after btrfs_join_transaction() call.
+ */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
!is_fstree(root->root_key.objectid))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
@@ -4257,7 +4455,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
int level = btrfs_header_level(subvol_parent) - 1;
int ret = 0;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
@@ -4367,7 +4565,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
int ret = 0;
int i;
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
return 0;
@@ -4450,3 +4648,53 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
}
*root = RB_ROOT;
}
+
+int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
+ struct btrfs_squota_delta *delta)
+{
+ int ret;
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup *qg;
+ LIST_HEAD(qgroup_list);
+ u64 root = delta->root;
+ u64 num_bytes = delta->num_bytes;
+ const int sign = (delta->is_inc ? 1 : -1);
+
+ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
+ return 0;
+
+ if (!is_fstree(root))
+ return 0;
+
+ /* If the extent predates enabling quotas, don't count it. */
+ if (delta->generation < fs_info->qgroup_enable_gen)
+ return 0;
+
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, root);
+ if (!qgroup) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = 0;
+ qgroup_iterator_add(&qgroup_list, qgroup);
+ list_for_each_entry(qg, &qgroup_list, iterator) {
+ struct btrfs_qgroup_list *glist;
+
+ qg->excl += num_bytes * sign;
+ qg->rfer += num_bytes * sign;
+ qgroup_dirty(fs_info, qg);
+
+ list_for_each_entry(glist, &qg->groups, next_group)
+ qgroup_iterator_add(&qgroup_list, glist->group);
+ }
+ qgroup_iterator_clean(&qgroup_list);
+
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ if (!ret && delta->rsv_bytes)
+ btrfs_qgroup_free_refroot(fs_info, root, delta->rsv_bytes,
+ BTRFS_QGROUP_RSV_DATA);
+ return ret;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 7bffa10589d6..855a4f978761 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -101,8 +101,15 @@
* subtree rescan for them.
*/
-#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3)
-#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4)
+/*
+ * These flags share the flags field of the btrfs_qgroup_status_item with the
+ * persisted flags defined in btrfs_tree.h.
+ *
+ * To minimize the chance of collision with new persisted status flags, these
+ * count backwards from the MSB.
+ */
+#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63)
+#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62)
/*
* Record a dirty extent, and info qgroup to update quota on it
@@ -220,6 +227,33 @@ struct btrfs_qgroup {
struct list_head groups; /* groups this group is member of */
struct list_head members; /* groups that are members of this group */
struct list_head dirty; /* dirty groups */
+
+ /*
+ * For qgroup iteration usage.
+ *
+ * The iteration list should always be empty until qgroup_iterator_add()
+ * is called. And should be reset to empty after the iteration is
+ * finished.
+ */
+ struct list_head iterator;
+
+ /*
+ * For nested iterator usage.
+ *
+ * Here we support at most one level of nested iterator calls like:
+ *
+ * LIST_HEAD(all_qgroups);
+ * {
+ * LIST_HEAD(local_qgroups);
+ * qgroup_iterator_add(local_qgroups, qg);
+ * qgroup_iterator_nested_add(all_qgroups, qg);
+ * do_some_work(local_qgroups);
+ * qgroup_iterator_clean(local_qgroups);
+ * }
+ * do_some_work(all_qgroups);
+ * qgroup_iterator_nested_clean(all_qgroups);
+ */
+ struct list_head nested_iterator;
struct rb_node node; /* tree of qgroups */
/*
@@ -235,6 +269,21 @@ struct btrfs_qgroup {
struct kobject kobj;
};
+struct btrfs_squota_delta {
+ /* The fstree root this delta counts against. */
+ u64 root;
+ /* The number of bytes in the extent being counted. */
+ u64 num_bytes;
+ /* The number of bytes reserved for this extent. */
+ u64 rsv_bytes;
+ /* The generation the extent was created in. */
+ u64 generation;
+ /* Whether we are using or freeing the extent. */
+ bool is_inc;
+ /* Whether the extent is data or metadata. */
+ bool is_data;
+};
+
static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
{
return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
@@ -249,14 +298,23 @@ enum {
ENUM_BIT(QGROUP_FREE),
};
-int btrfs_quota_enable(struct btrfs_fs_info *fs_info);
+enum btrfs_qgroup_mode {
+ BTRFS_QGROUP_MODE_DISABLED,
+ BTRFS_QGROUP_MODE_FULL,
+ BTRFS_QGROUP_MODE_SIMPLE
+};
+
+enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info);
+bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info);
+bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info);
+int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_quota_ctl_args *quota_ctl_args);
int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
bool interruptible);
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
- u64 dst);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst);
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
@@ -267,80 +325,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
-/*
- * Inform qgroup to trace one dirty extent, its info is recorded in @record.
- * So qgroup can account it at transaction committing time.
- *
- * No lock version, caller must acquire delayed ref lock and allocated memory,
- * then call btrfs_qgroup_trace_extent_post() after exiting lock context.
- *
- * Return 0 for success insert
- * Return >0 for existing record, caller can free @record safely.
- * Error is not possible
- */
int btrfs_qgroup_trace_extent_nolock(
struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record);
-
-/*
- * Post handler after qgroup_trace_extent_nolock().
- *
- * NOTE: Current qgroup does the expensive backref walk at transaction
- * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
- * new transaction.
- * This is designed to allow btrfs_find_all_roots() to get correct new_roots
- * result.
- *
- * However for old_roots there is no need to do backref walk at that time,
- * since we search commit roots to walk backref and result will always be
- * correct.
- *
- * Due to the nature of no lock version, we can't do backref there.
- * So we must call btrfs_qgroup_trace_extent_post() after exiting
- * spinlock context.
- *
- * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
- * using current root, then we can move all expensive backref walk out of
- * transaction committing, but not now as qgroup accounting will be wrong again.
- */
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord);
-
-/*
- * Inform qgroup to trace one dirty extent, specified by @bytenr and
- * @num_bytes.
- * So qgroup can account it at commit trans time.
- *
- * Better encapsulated version, with memory allocation and backref walk for
- * commit roots.
- * So this can sleep.
- *
- * Return 0 if the operation is done.
- * Return <0 for error, like memory allocation failure or invalid parameter
- * (NULL trans)
- */
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes);
-
-/*
- * Inform qgroup to trace all leaf items of data
- *
- * Return 0 for success
- * Return <0 for error(ENOMEM)
- */
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
struct extent_buffer *eb);
-/*
- * Inform qgroup to trace a whole subtree, including all its child tree
- * blocks and data.
- * The root tree block is specified by @root_eb.
- *
- * Normally used by relocation(tree block swap) and subvolume deletion.
- *
- * Return 0 for success
- * Return <0 for error(ENOMEM or tree search error)
- */
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level);
@@ -350,7 +344,8 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
- u64 objectid, struct btrfs_qgroup_inherit *inherit);
+ u64 objectid, u64 inode_rootid,
+ struct btrfs_qgroup_inherit *inherit);
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type);
@@ -408,20 +403,8 @@ static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
BTRFS_QGROUP_RSV_META_PREALLOC);
}
-/*
- * Per-transaction meta reservation should be all freed at transaction commit
- * time
- */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
-
-/*
- * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
- *
- * This is called when preallocated meta reservation needs to be used.
- * Normally after btrfs_join_transaction() call.
- */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
-
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
/* btrfs_qgroup_swapped_blocks related functions */
@@ -439,5 +422,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
+int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
+ struct btrfs_squota_delta *delta);
#endif
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
new file mode 100644
index 000000000000..944e8f1862aa
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/btrfs_tree.h>
+#include "ctree.h"
+#include "fs.h"
+#include "accessors.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "raid-stripe-tree.h"
+#include "volumes.h"
+#include "misc.h"
+#include "print-tree.h"
+
+int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *stripe_root = fs_info->stripe_root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ u64 found_start;
+ u64 found_end;
+ u64 end = start + length;
+ int slot;
+ int ret;
+
+ if (!stripe_root)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (1) {
+ key.objectid = start;
+ key.type = BTRFS_RAID_STRIPE_KEY;
+ key.offset = length;
+
+ ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
+ if (path->slots[0] == 0)
+ break;
+ path->slots[0]--;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ found_start = key.objectid;
+ found_end = found_start + key.offset;
+
+ /* That stripe ends before we start, we're done. */
+ if (found_end <= start)
+ break;
+
+ trace_btrfs_raid_extent_delete(fs_info, start, end,
+ found_start, found_end);
+
+ ASSERT(found_start >= start && found_end <= end);
+ ret = btrfs_del_item(trans, stripe_root, path);
+ if (ret)
+ break;
+
+ btrfs_release_path(path);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_io_context *bioc)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_key stripe_key;
+ struct btrfs_root *stripe_root = fs_info->stripe_root;
+ const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type);
+ u8 encoding = btrfs_bg_flags_to_raid_index(bioc->map_type);
+ struct btrfs_stripe_extent *stripe_extent;
+ const size_t item_size = struct_size(stripe_extent, strides, num_stripes);
+ int ret;
+
+ stripe_extent = kzalloc(item_size, GFP_NOFS);
+ if (!stripe_extent) {
+ btrfs_abort_transaction(trans, -ENOMEM);
+ btrfs_end_transaction(trans);
+ return -ENOMEM;
+ }
+
+ trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size,
+ num_stripes);
+ btrfs_set_stack_stripe_extent_encoding(stripe_extent, encoding);
+ for (int i = 0; i < num_stripes; i++) {
+ u64 devid = bioc->stripes[i].dev->devid;
+ u64 physical = bioc->stripes[i].physical;
+ u64 length = bioc->stripes[i].length;
+ struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i];
+
+ if (length == 0)
+ length = bioc->size;
+
+ btrfs_set_stack_raid_stride_devid(raid_stride, devid);
+ btrfs_set_stack_raid_stride_physical(raid_stride, physical);
+ }
+
+ stripe_key.objectid = bioc->logical;
+ stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+ stripe_key.offset = bioc->size;
+
+ ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
+ item_size);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+
+ kfree(stripe_extent);
+
+ return ret;
+}
+
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_extent *ordered_extent)
+{
+ struct btrfs_io_context *bioc;
+ int ret;
+
+ if (!btrfs_fs_incompat(trans->fs_info, RAID_STRIPE_TREE))
+ return 0;
+
+ list_for_each_entry(bioc, &ordered_extent->bioc_list, rst_ordered_entry) {
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret)
+ return ret;
+ }
+
+ while (!list_empty(&ordered_extent->bioc_list)) {
+ bioc = list_first_entry(&ordered_extent->bioc_list,
+ typeof(*bioc), rst_ordered_entry);
+ list_del(&bioc->rst_ordered_entry);
+ btrfs_put_bioc(bioc);
+ }
+
+ return ret;
+}
+
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length, u64 map_type,
+ u32 stripe_index, struct btrfs_io_stripe *stripe)
+{
+ struct btrfs_root *stripe_root = fs_info->stripe_root;
+ struct btrfs_stripe_extent *stripe_extent;
+ struct btrfs_key stripe_key;
+ struct btrfs_key found_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ const u64 end = logical + *length;
+ int num_stripes;
+ u8 encoding;
+ u64 offset;
+ u64 found_logical;
+ u64 found_length;
+ u64 found_end;
+ int slot;
+ int ret;
+
+ stripe_key.objectid = logical;
+ stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+ stripe_key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (stripe->is_scrub) {
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ }
+
+ ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
+ if (ret < 0)
+ goto free_path;
+ if (ret) {
+ if (path->slots[0] != 0)
+ path->slots[0]--;
+ }
+
+ while (1) {
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ found_logical = found_key.objectid;
+ found_length = found_key.offset;
+ found_end = found_logical + found_length;
+
+ if (found_logical > end) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (in_range(logical, found_logical, found_length))
+ break;
+
+ ret = btrfs_next_item(stripe_root, path);
+ if (ret)
+ goto out;
+ }
+
+ offset = logical - found_logical;
+
+ /*
+ * If we have a logically contiguous, but physically non-continuous
+ * range, we need to split the bio. Record the length after which we
+ * must split the bio.
+ */
+ if (end > found_end)
+ *length -= end - found_end;
+
+ num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
+ stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+ encoding = btrfs_stripe_extent_encoding(leaf, stripe_extent);
+
+ if (encoding != btrfs_bg_flags_to_raid_index(map_type)) {
+ ret = -EUCLEAN;
+ btrfs_handle_fs_error(fs_info, ret,
+ "on-disk stripe encoding %d doesn't match RAID index %d",
+ encoding,
+ btrfs_bg_flags_to_raid_index(map_type));
+ goto out;
+ }
+
+ for (int i = 0; i < num_stripes; i++) {
+ struct btrfs_raid_stride *stride = &stripe_extent->strides[i];
+ u64 devid = btrfs_raid_stride_devid(leaf, stride);
+ u64 physical = btrfs_raid_stride_physical(leaf, stride);
+
+ if (devid != stripe->dev->devid)
+ continue;
+
+ if ((map_type & BTRFS_BLOCK_GROUP_DUP) && stripe_index != i)
+ continue;
+
+ stripe->physical = physical + offset;
+
+ trace_btrfs_get_raid_extent_offset(fs_info, logical, *length,
+ stripe->physical, devid);
+
+ ret = 0;
+ goto free_path;
+ }
+
+ /* If we're here, we haven't found the requested devid in the stripe. */
+ ret = -ENOENT;
+out:
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret && ret != -EIO && !stripe->is_scrub) {
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
+ btrfs_print_tree(leaf, 1);
+ btrfs_err(fs_info,
+ "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",
+ logical, logical + *length, stripe->dev->devid,
+ btrfs_bg_type_to_raid_name(map_type));
+ }
+free_path:
+ btrfs_free_path(path);
+
+ return ret;
+}
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
new file mode 100644
index 000000000000..cdb58b38fcb5
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023 Western Digital Corporation or its affiliates.
+ */
+
+#ifndef BTRFS_RAID_STRIPE_TREE_H
+#define BTRFS_RAID_STRIPE_TREE_H
+
+#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \
+ BTRFS_BLOCK_GROUP_RAID1_MASK | \
+ BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID10)
+
+struct btrfs_io_context;
+struct btrfs_io_stripe;
+struct btrfs_ordered_extent;
+struct btrfs_trans_handle;
+
+int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length);
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length, u64 map_type,
+ u32 stripe_index, struct btrfs_io_stripe *stripe);
+int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_ordered_extent *ordered_extent);
+
+static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
+ u64 map_type)
+{
+ u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
+ u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE))
+ return false;
+
+ if (type != BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ if (profile & BTRFS_RST_SUPP_BLOCK_GROUP_MASK)
+ return true;
+
+ return false;
+}
+
+static inline int btrfs_num_raid_stripes(u32 item_size)
+{
+ return (item_size - offsetof(struct btrfs_stripe_extent, strides)) /
+ sizeof(struct btrfs_raid_stride);
+}
+
+#endif
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 95d28497de7c..1f62976bee82 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -485,6 +485,9 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
ret = add_shared_data_ref(fs_info, offset, count,
key->objectid, key->offset);
break;
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ break;
default:
btrfs_err(fs_info, "invalid key type in iref");
ret = -EINVAL;
@@ -652,7 +655,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info,
}
/*
- * btrfs_ref_tree_mod: called when we modify a ref for a bytenr
+ * Called when we modify a ref for a bytenr.
*
* This will add an action item to the given bytenr and do sanity checks to make
* sure we haven't messed something up. If we are making a new allocation and
@@ -681,10 +684,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
if (generic_ref->type == BTRFS_REF_METADATA) {
if (!parent)
- ref_root = generic_ref->tree_ref.owning_root;
+ ref_root = generic_ref->tree_ref.ref_root;
owner = generic_ref->tree_ref.level;
} else if (!parent) {
- ref_root = generic_ref->data_ref.owning_root;
+ ref_root = generic_ref->data_ref.ref_root;
owner = generic_ref->data_ref.ino;
offset = generic_ref->data_ref.offset;
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 65d2bd6910f2..f88b0c2ac3fe 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -25,12 +25,11 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
const u64 olen,
int no_time_update)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
inode_inc_iversion(inode);
if (!no_time_update) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
}
/*
* We round up to the block size at eof when determining which
@@ -43,7 +42,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 9951a0caf5bb..f5d9e5f74a52 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -111,8 +111,8 @@ struct tree_block {
}; /* Use rb_simple_node for search/insert */
u64 owner;
struct btrfs_key key;
- unsigned int level:8;
- unsigned int key_ready:1;
+ u8 level;
+ bool key_ready;
};
#define MAX_EXTENTS 128
@@ -122,6 +122,13 @@ struct file_extent_cluster {
u64 end;
u64 boundary[MAX_EXTENTS];
unsigned int nr;
+ u64 owning_root;
+};
+
+/* Stages of data relocation. */
+enum reloc_stage {
+ MOVE_DATA_EXTENTS,
+ UPDATE_DATA_PTRS
};
struct reloc_control {
@@ -155,16 +162,12 @@ struct reloc_control {
u64 search_start;
u64 extents_found;
- unsigned int stage:8;
- unsigned int create_reloc_tree:1;
- unsigned int merge_reloc_tree:1;
- unsigned int found_file_extent:1;
+ enum reloc_stage stage;
+ bool create_reloc_tree;
+ bool merge_reloc_tree;
+ bool found_file_extent;
};
-/* stages of data relocation */
-#define MOVE_DATA_EXTENTS 0
-#define UPDATE_DATA_PTRS 1
-
static void mark_block_processed(struct reloc_control *rc,
struct btrfs_backref_node *node)
{
@@ -180,13 +183,6 @@ static void mark_block_processed(struct reloc_control *rc,
node->processed = 1;
}
-
-static void mapping_tree_init(struct mapping_tree *tree)
-{
- tree->rb_root = RB_ROOT;
- spin_lock_init(&tree->lock);
-}
-
/*
* walk up backref nodes until reach node presents tree root
*/
@@ -299,7 +295,7 @@ static int update_backref_cache(struct btrfs_trans_handle *trans,
return 1;
}
-static bool reloc_root_is_dead(struct btrfs_root *root)
+static bool reloc_root_is_dead(const struct btrfs_root *root)
{
/*
* Pair with set_bit/clear_bit in clean_dirty_subvols and
@@ -320,7 +316,7 @@ static bool reloc_root_is_dead(struct btrfs_root *root)
* from no reloc root. But btrfs_should_ignore_reloc_root() below is a
* special case.
*/
-static bool have_reloc_root(struct btrfs_root *root)
+static bool have_reloc_root(const struct btrfs_root *root)
{
if (reloc_root_is_dead(root))
return false;
@@ -329,31 +325,30 @@ static bool have_reloc_root(struct btrfs_root *root)
return true;
}
-int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
+bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
{
struct btrfs_root *reloc_root;
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- return 0;
+ return false;
/* This root has been merged with its reloc tree, we can ignore it */
if (reloc_root_is_dead(root))
- return 1;
+ return true;
reloc_root = root->reloc_root;
if (!reloc_root)
- return 0;
+ return false;
if (btrfs_header_generation(reloc_root->commit_root) ==
root->fs_info->running_transaction->transid)
- return 0;
+ return false;
/*
- * if there is reloc tree and it was created in previous
- * transaction backref lookup can find the reloc tree,
- * so backref node for the fs tree root is useless for
- * relocation.
+ * If there is reloc tree and it was created in previous transaction
+ * backref lookup can find the reloc tree, so backref node for the fs
+ * tree root is useless for relocation.
*/
- return 1;
+ return true;
}
/*
@@ -466,6 +461,7 @@ static bool handle_useless_nodes(struct reloc_control *rc,
* cached.
*/
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
+ struct btrfs_trans_handle *trans,
struct reloc_control *rc, struct btrfs_key *node_key,
int level, u64 bytenr)
{
@@ -499,8 +495,8 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
/* Breadth-first search to build backref cache */
do {
- ret = btrfs_backref_add_tree_node(cache, path, iter, node_key,
- cur);
+ ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
+ node_key, cur);
if (ret < 0) {
err = ret;
goto out;
@@ -546,7 +542,7 @@ out:
*/
static int clone_backref_node(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
- struct btrfs_root *src,
+ const struct btrfs_root *src,
struct btrfs_root *dest)
{
struct btrfs_root *reloc_root = src->reloc_root;
@@ -631,7 +627,7 @@ fail:
/*
* helper to add 'address of tree root -> reloc tree' mapping
*/
-static int __must_check __add_reloc_root(struct btrfs_root *root)
+static int __add_reloc_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
@@ -1158,7 +1154,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(leaf, fi);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
- num_bytes, parent);
+ num_bytes, parent, root->root_key.objectid);
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
key.objectid, key.offset,
root->root_key.objectid, false);
@@ -1169,7 +1165,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
}
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
- num_bytes, parent);
+ num_bytes, parent, root->root_key.objectid);
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
key.objectid, key.offset,
root->root_key.objectid, false);
@@ -1180,15 +1176,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
}
}
if (dirty)
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (inode)
btrfs_add_delayed_iput(BTRFS_I(inode));
return ret;
}
-static noinline_for_stack
-int memcmp_node_keys(struct extent_buffer *eb, int slot,
- struct btrfs_path *path, int level)
+static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
+ int slot, const struct btrfs_path *path,
+ int level)
{
struct btrfs_disk_key key1;
struct btrfs_disk_key key2;
@@ -1373,16 +1369,17 @@ again:
*/
btrfs_set_node_blockptr(parent, slot, new_bytenr);
btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
- btrfs_mark_buffer_dirty(parent);
+ btrfs_mark_buffer_dirty(trans, parent);
btrfs_set_node_blockptr(path->nodes[level],
path->slots[level], old_bytenr);
btrfs_set_node_ptr_generation(path->nodes[level],
path->slots[level], old_ptr_gen);
- btrfs_mark_buffer_dirty(path->nodes[level]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[level]);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
- blocksize, path->nodes[level]->start);
+ blocksize, path->nodes[level]->start,
+ src->root_key.objectid);
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
@@ -1391,7 +1388,7 @@ again:
break;
}
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
- blocksize, 0);
+ blocksize, 0, dest->root_key.objectid);
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
true);
ret = btrfs_inc_extent_ref(trans, &ref);
@@ -1400,8 +1397,9 @@ again:
break;
}
+ /* We don't know the real owning_root, use 0. */
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
- blocksize, path->nodes[level]->start);
+ blocksize, path->nodes[level]->start, 0);
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
0, true);
ret = btrfs_free_extent(trans, &ref);
@@ -1410,8 +1408,9 @@ again:
break;
}
+ /* We don't know the real owning_root, use 0. */
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
- blocksize, 0);
+ blocksize, 0, 0);
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
0, true);
ret = btrfs_free_extent(trans, &ref);
@@ -1517,8 +1516,8 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
* [min_key, max_key)
*/
static int invalidate_extent_cache(struct btrfs_root *root,
- struct btrfs_key *min_key,
- struct btrfs_key *max_key)
+ const struct btrfs_key *min_key,
+ const struct btrfs_key *max_key)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode = NULL;
@@ -1896,7 +1895,7 @@ again:
}
}
- rc->merge_reloc_tree = 1;
+ rc->merge_reloc_tree = true;
while (!list_empty(&rc->reloc_roots)) {
reloc_root = list_entry(rc->reloc_roots.next,
@@ -2516,11 +2515,12 @@ static int do_relocation(struct btrfs_trans_handle *trans,
node->eb->start);
btrfs_set_node_ptr_generation(upper->eb, slot,
trans->transid);
- btrfs_mark_buffer_dirty(upper->eb);
+ btrfs_mark_buffer_dirty(trans, upper->eb);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
node->eb->start, blocksize,
- upper->eb->start);
+ upper->eb->start,
+ btrfs_header_owner(upper->eb));
btrfs_init_tree_ref(&ref, node->level,
btrfs_header_owner(upper->eb),
root->root_key.objectid, false);
@@ -2632,7 +2632,7 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
u32 blocksize = rc->extent_root->fs_info->nodesize;
if (test_range_bit(&rc->processed_blocks, bytenr,
- bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
+ bytenr + blocksize - 1, EXTENT_DIRTY, NULL))
return 1;
return 0;
}
@@ -2659,7 +2659,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
else
btrfs_node_key_to_cpu(eb, &block->key, 0);
free_extent_buffer(eb);
- block->key_ready = 1;
+ block->key_ready = true;
return 0;
}
@@ -2803,7 +2803,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
/* Do tree relocation */
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
- node = build_backref_tree(rc, &block->key,
+ node = build_backref_tree(trans, rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
err = PTR_ERR(node);
@@ -2829,7 +2829,7 @@ out_free_blocks:
static noinline_for_stack int prealloc_file_extent_cluster(
struct btrfs_inode *inode,
- struct file_extent_cluster *cluster)
+ const struct file_extent_cluster *cluster)
{
u64 alloc_hint = 0;
u64 start;
@@ -2964,7 +2964,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
/*
* Allow error injection to test balance/relocation cancellation
*/
-noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
+noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info)
{
return atomic_read(&fs_info->balance_cancel_req) ||
atomic_read(&fs_info->reloc_cancel_req) ||
@@ -2972,7 +2972,7 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
-static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
+static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
int cluster_nr)
{
/* Last extent, use cluster end directly */
@@ -2984,7 +2984,7 @@ static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
}
static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
- struct file_extent_cluster *cluster,
+ const struct file_extent_cluster *cluster,
int *cluster_nr, unsigned long page_index)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3119,7 +3119,7 @@ release_page:
}
static int relocate_file_extent_cluster(struct inode *inode,
- struct file_extent_cluster *cluster)
+ const struct file_extent_cluster *cluster)
{
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
@@ -3157,11 +3157,12 @@ out:
return ret;
}
-static noinline_for_stack
-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
- struct file_extent_cluster *cluster)
+static noinline_for_stack int relocate_data_extent(struct inode *inode,
+ const struct btrfs_key *extent_key,
+ struct file_extent_cluster *cluster)
{
int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
ret = relocate_file_extent_cluster(inode, cluster);
@@ -3170,8 +3171,38 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
cluster->nr = 0;
}
- if (!cluster->nr)
+ /*
+ * Under simple quotas, we set root->relocation_src_root when we find
+ * the extent. If adjacent extents have different owners, we can't merge
+ * them while relocating. Handle this by storing the owning root that
+ * started a cluster and if we see an extent from a different root break
+ * cluster formation (just like the above case of non-adjacent extents).
+ *
+ * Without simple quotas, relocation_src_root is always 0, so we should
+ * never see a mismatch, and it should have no effect on relocation
+ * clusters.
+ */
+ if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) {
+ u64 tmp = root->relocation_src_root;
+
+ /*
+ * root->relocation_src_root is the state that actually affects
+ * the preallocation we do here, so set it to the root owning
+ * the cluster we need to relocate.
+ */
+ root->relocation_src_root = cluster->owning_root;
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
+ /* And reset it back for the current extent's owning root. */
+ root->relocation_src_root = tmp;
+ }
+
+ if (!cluster->nr) {
cluster->start = extent_key->objectid;
+ cluster->owning_root = root->relocation_src_root;
+ }
else
BUG_ON(cluster->nr >= MAX_EXTENTS);
cluster->end = extent_key->objectid + extent_key->offset - 1;
@@ -3192,7 +3223,7 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
* the major work is getting the generation and level of the block
*/
static int add_tree_block(struct reloc_control *rc,
- struct btrfs_key *extent_key,
+ const struct btrfs_key *extent_key,
struct btrfs_path *path,
struct rb_root *blocks)
{
@@ -3277,7 +3308,7 @@ static int add_tree_block(struct reloc_control *rc,
block->key.objectid = rc->extent_root->fs_info->nodesize;
block->key.offset = generation;
block->level = level;
- block->key_ready = 0;
+ block->key_ready = false;
block->owner = owner;
rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
@@ -3443,11 +3474,10 @@ static int delete_v1_space_cache(struct extent_buffer *leaf,
/*
* helper to find all tree blocks that reference a given data extent
*/
-static noinline_for_stack
-int add_data_references(struct reloc_control *rc,
- struct btrfs_key *extent_key,
- struct btrfs_path *path,
- struct rb_root *blocks)
+static noinline_for_stack int add_data_references(struct reloc_control *rc,
+ const struct btrfs_key *extent_key,
+ struct btrfs_path *path,
+ struct rb_root *blocks)
{
struct btrfs_backref_walk_ctx ctx = { 0 };
struct ulist_iterator leaf_uiter;
@@ -3621,7 +3651,7 @@ int prepare_to_relocate(struct reloc_control *rc)
if (ret)
return ret;
- rc->create_reloc_tree = 1;
+ rc->create_reloc_tree = true;
set_reloc_control(rc);
trans = btrfs_join_transaction(rc->extent_root);
@@ -3701,6 +3731,21 @@ restart:
struct btrfs_extent_item);
flags = btrfs_extent_flags(path->nodes[0], ei);
+ /*
+ * If we are relocating a simple quota owned extent item, we
+ * need to note the owner on the reloc data root so that when
+ * we allocate the replacement item, we can attribute it to the
+ * correct eventual owner (rather than the reloc data root).
+ */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
+ struct btrfs_root *root = BTRFS_I(rc->data_inode)->root;
+ u64 owning_root_id = btrfs_get_extent_owner_root(fs_info,
+ path->nodes[0],
+ path->slots[0]);
+
+ root->relocation_src_root = owning_root_id;
+ }
+
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = add_tree_block(rc, &key, path, &blocks);
} else if (rc->stage == UPDATE_DATA_PTRS &&
@@ -3733,7 +3778,7 @@ restart:
if (rc->stage == MOVE_DATA_EXTENTS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
- rc->found_file_extent = 1;
+ rc->found_file_extent = true;
ret = relocate_data_extent(rc->data_inode,
&key, &rc->cluster);
if (ret < 0) {
@@ -3770,7 +3815,7 @@ restart:
err = ret;
}
- rc->create_reloc_tree = 0;
+ rc->create_reloc_tree = false;
set_reloc_control(rc);
btrfs_backref_release_cache(&rc->backref_cache);
@@ -3788,7 +3833,7 @@ restart:
merge_reloc_roots(rc);
- rc->merge_reloc_tree = 0;
+ rc->merge_reloc_tree = false;
unset_reloc_control(rc);
btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
@@ -3834,7 +3879,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
BTRFS_INODE_PREALLOC);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
return ret;
@@ -3873,9 +3918,9 @@ out:
* helper to create inode for data relocation.
* the inode is in data relocation tree and its link count is 0
*/
-static noinline_for_stack
-struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group *group)
+static noinline_for_stack struct inode *create_reloc_inode(
+ struct btrfs_fs_info *fs_info,
+ const struct btrfs_block_group *group)
{
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
@@ -3970,8 +4015,9 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&rc->reloc_roots);
INIT_LIST_HEAD(&rc->dirty_subvol_roots);
- btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
- mapping_tree_init(&rc->reloc_root_tree);
+ btrfs_backref_init_cache(fs_info, &rc->backref_cache, true);
+ rc->reloc_root_tree.rb_root = RB_ROOT;
+ spin_lock_init(&rc->reloc_root_tree.lock);
extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
return rc;
}
@@ -4003,7 +4049,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
block_group->start, buf);
}
-static const char *stage_to_string(int stage)
+static const char *stage_to_string(enum reloc_stage stage)
{
if (stage == MOVE_DATA_EXTENTS)
return "move data extents";
@@ -4119,7 +4165,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
WARN_ON(ret && ret != -EAGAIN);
while (1) {
- int finishes_stage;
+ enum reloc_stage finishes_stage;
mutex_lock(&fs_info->cleaner_mutex);
ret = relocate_block_group(rc);
@@ -4302,7 +4348,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
goto out_unset;
}
- rc->merge_reloc_tree = 1;
+ rc->merge_reloc_tree = true;
while (!list_empty(&reloc_roots)) {
reloc_root = list_entry(reloc_roots.next,
@@ -4421,7 +4467,8 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
}
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf,
+ struct btrfs_root *root,
+ const struct extent_buffer *buf,
struct extent_buffer *cow)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4560,7 +4607,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
*
* Return U64_MAX if no running relocation.
*/
-u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info)
+u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
{
u64 logical = U64_MAX;
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 77d69f6ae967..5fb60f2deb53 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -10,15 +10,16 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf,
+ struct btrfs_root *root,
+ const struct extent_buffer *buf,
struct extent_buffer *cow);
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
-int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
+int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info);
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr);
-int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
-u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info);
+bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root);
+u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 859874579456..603ad1459368 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -51,7 +51,8 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
}
/*
- * btrfs_find_root - lookup the root by the key.
+ * Lookup the root by the key.
+ *
* root: the root of the root tree
* search_key: the key to search
* path: the path we search
@@ -191,7 +192,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
write_extent_buffer(l, item, ptr, sizeof(*item));
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
out:
btrfs_free_path(path);
return ret;
@@ -438,7 +439,7 @@ again:
btrfs_set_root_ref_name_len(leaf, ref, name->len);
ptr = (unsigned long)(ref + 1);
write_extent_buffer(leaf, name->name, ptr, name->len);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(path);
@@ -485,7 +486,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
}
/*
- * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * Reserve space for subvolume operation.
+ *
* root: the root of the parent directory
* rsv: block reservation
* items: the number of items that we need do reservation
@@ -508,7 +510,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ if (btrfs_qgroup_enabled(fs_info)) {
/* One for parent inode, two for dir entries */
qgroup_num_bytes = 3 * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_prealloc(root,
diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h
index cbbaca32126e..8b2c3859e464 100644
--- a/fs/btrfs/root-tree.h
+++ b/fs/btrfs/root-tree.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_ROOT_TREE_H
#define BTRFS_ROOT_TREE_H
+struct fscrypt_str;
+
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
@@ -18,10 +20,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key
int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key,
struct btrfs_root_item *item);
-int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_key *key,
- struct btrfs_root_item *item);
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_root_item *item);
int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
struct btrfs_path *path, struct btrfs_root_item *root_item,
struct btrfs_key *root_key);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b877203f1dc5..9ce5be21b036 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -16,7 +16,6 @@
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
-#include "check-integrity.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"
@@ -24,6 +23,7 @@
#include "accessors.h"
#include "file-item.h"
#include "scrub.h"
+#include "raid-stripe-tree.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -897,7 +897,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
ASSERT(stripe->mirror_num >= 1);
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
stripe->logical, &mapped_len, &bioc,
- NULL, NULL, 1);
+ NULL, NULL);
/*
* If we failed, dev will be NULL, and later detailed reports
* will just be skipped.
@@ -1635,6 +1635,71 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe)
}
}
+static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
+ struct scrub_stripe *stripe)
+{
+ struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ struct btrfs_bio *bbio = NULL;
+ u64 stripe_len = BTRFS_STRIPE_LEN;
+ int mirror = stripe->mirror_num;
+ int i;
+
+ atomic_inc(&stripe->pending_io);
+
+ for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
+ struct page *page = scrub_stripe_get_page(stripe, i);
+ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
+
+ /* The current sector cannot be merged, submit the bio. */
+ if (bbio &&
+ ((i > 0 &&
+ !test_bit(i - 1, &stripe->extent_sector_bitmap)) ||
+ bbio->bio.bi_iter.bi_size >= stripe_len)) {
+ ASSERT(bbio->bio.bi_iter.bi_size);
+ atomic_inc(&stripe->pending_io);
+ btrfs_submit_bio(bbio, mirror);
+ bbio = NULL;
+ }
+
+ if (!bbio) {
+ struct btrfs_io_stripe io_stripe = {};
+ struct btrfs_io_context *bioc = NULL;
+ const u64 logical = stripe->logical +
+ (i << fs_info->sectorsize_bits);
+ int err;
+
+ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
+ fs_info, scrub_read_endio, stripe);
+ bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
+
+ io_stripe.is_scrub = true;
+ err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ &stripe_len, &bioc, &io_stripe,
+ &mirror);
+ btrfs_put_bioc(bioc);
+ if (err) {
+ btrfs_bio_end_io(bbio,
+ errno_to_blk_status(err));
+ return;
+ }
+ }
+
+ __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+ }
+
+ if (bbio) {
+ ASSERT(bbio->bio.bi_iter.bi_size);
+ atomic_inc(&stripe->pending_io);
+ btrfs_submit_bio(bbio, mirror);
+ }
+
+ if (atomic_dec_and_test(&stripe->pending_io)) {
+ wake_up(&stripe->io_wait);
+ INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
+ queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
+ }
+}
+
static void scrub_submit_initial_read(struct scrub_ctx *sctx,
struct scrub_stripe *stripe)
{
@@ -1646,6 +1711,11 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
ASSERT(stripe->mirror_num > 0);
ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
+ if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
+ scrub_submit_extent_sector_read(sctx, stripe);
+ return;
+ }
+
bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
scrub_read_endio, stripe);
@@ -1952,7 +2022,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
- &length, &bioc, NULL, NULL, 1);
+ &length, &bioc, NULL, NULL);
if (ret < 0) {
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
@@ -2717,7 +2787,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (scrub_dev->fs_devices != fs_info->fs_devices)
gen = scrub_dev->generation;
else
- gen = fs_info->last_trans_committed;
+ gen = btrfs_get_last_trans_committed(fs_info);
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3a566150c531..3b929f0e8f04 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -796,7 +796,7 @@ static int send_cmd(struct send_ctx *sctx)
put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
put_unaligned_le32(0, &hdr->crc);
- crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+ crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
put_unaligned_le32(crc, &hdr->crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -5669,8 +5669,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
hdr->crc = 0;
- crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size);
- crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
+ crc = crc32c(0, sctx->send_buf, sctx->send_size);
+ crc = crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
hdr->crc = cpu_to_le32(crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d7e8cd4f140c..571bb13587d5 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -345,8 +345,10 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
{
+ struct btrfs_space_info *data_sinfo;
u64 profile;
u64 avail;
+ u64 data_chunk_size;
int factor;
if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
@@ -364,6 +366,36 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
*/
factor = btrfs_bg_type_to_factor(profile);
avail = div_u64(avail, factor);
+ if (avail == 0)
+ return 0;
+
+ /*
+ * Calculate the data_chunk_size, space_info->chunk_size is the
+ * "optimal" chunk size based on the fs size. However when we actually
+ * allocate the chunk we will strip this down further, making it no more
+ * than 10% of the disk or 1G, whichever is smaller.
+ */
+ data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+ data_chunk_size = min(data_sinfo->chunk_size,
+ mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
+ data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+
+ /*
+ * Since data allocations immediately use block groups as part of the
+ * reservation, because we assume that data reservations will == actual
+ * usage, we could potentially overcommit and then immediately have that
+ * available space used by a data allocation, which could put us in a
+ * bind when we get close to filling the file system.
+ *
+ * To handle this simply remove the data_chunk_size from the available
+ * space. If we are relatively empty this won't affect our ability to
+ * overcommit much, and if we're very close to full it'll keep us from
+ * getting into a position where we've given ourselves very little
+ * metadata wiggle room.
+ */
+ if (avail <= data_chunk_size)
+ return 0;
+ avail -= data_chunk_size;
/*
* If we aren't flushing all things, let us overcommit up to
@@ -556,18 +588,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
return nr;
}
-static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info,
- u64 to_reclaim)
-{
- const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1);
- u64 nr;
-
- nr = div64_u64(to_reclaim, bytes);
- if (!nr)
- nr = 1;
- return nr;
-}
-
#define EXTENT_SIZE_PER_ITEM SZ_256K
/*
@@ -749,10 +769,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
}
if (state == FLUSH_DELAYED_REFS_NR)
- nr = calc_delayed_refs_nr(fs_info, num_bytes);
+ btrfs_run_delayed_refs(trans, num_bytes);
else
- nr = 0;
- btrfs_run_delayed_refs(trans, nr);
+ btrfs_run_delayed_refs(trans, 0);
btrfs_end_transaction(trans);
break;
case ALLOC_CHUNK:
@@ -978,7 +997,8 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
}
/*
- * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
+ * We've exhausted our flushing, start failing tickets.
+ *
* @fs_info - fs_info for this fs
* @space_info - the space info we were flushing
*
@@ -1742,7 +1762,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* Try to reserve metadata bytes from the block_rsv's space.
*
* @fs_info: the filesystem
- * @block_rsv: block_rsv we're allocating for
+ * @space_info: the space_info we're allocating for
* @orig_bytes: number of bytes we want
* @flush: whether or not we can flush to make our reservation
*
@@ -1754,21 +1774,19 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* space already.
*/
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
+ struct btrfs_space_info *space_info,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
int ret;
- ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
+ ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush);
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
- block_rsv->space_info->flags,
- orig_bytes, 1);
+ space_info->flags, orig_bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_dump_space_info(fs_info, block_rsv->space_info,
- orig_bytes, 0);
+ btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0);
}
return ret;
}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 0bb9d14e60a8..92c595fed1b0 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -3,6 +3,7 @@
#ifndef BTRFS_SPACE_INFO_H
#define BTRFS_SPACE_INFO_H
+#include <trace/events/btrfs.h>
#include "volumes.h"
/*
@@ -212,7 +213,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
+ struct btrfs_space_info *space_info,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush);
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 09bfe68d2ea3..6ecf78d09694 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -26,6 +26,7 @@
#include <linux/ratelimit.h>
#include <linux/crc32c.h>
#include <linux/btrfs.h>
+#include <linux/security.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
@@ -129,9 +130,6 @@ enum {
Opt_inode_cache, Opt_noinode_cache,
/* Debugging options */
- Opt_check_integrity,
- Opt_check_integrity_including_extent_data,
- Opt_check_integrity_print_mask,
Opt_enospc_debug, Opt_noenospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
@@ -200,9 +198,6 @@ static const match_table_t tokens = {
{Opt_recovery, "recovery"},
/* Debugging options */
- {Opt_check_integrity, "check_int"},
- {Opt_check_integrity_including_extent_data, "check_int_data"},
- {Opt_check_integrity_print_mask, "check_int_print_mask=%u"},
{Opt_enospc_debug, "enospc_debug"},
{Opt_noenospc_debug, "noenospc_debug"},
#ifdef CONFIG_BTRFS_DEBUG
@@ -707,44 +702,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_skip_balance:
btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
break;
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- case Opt_check_integrity_including_extent_data:
- btrfs_warn(info,
- "integrity checker is deprecated and will be removed in 6.7");
- btrfs_info(info,
- "enabling check integrity including extent data");
- btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA);
- btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
- break;
- case Opt_check_integrity:
- btrfs_warn(info,
- "integrity checker is deprecated and will be removed in 6.7");
- btrfs_info(info, "enabling check integrity");
- btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
- break;
- case Opt_check_integrity_print_mask:
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info,
- "unrecognized check_integrity_print_mask value %s",
- args[0].from);
- goto out;
- }
- info->check_integrity_print_mask = intarg;
- btrfs_warn(info,
- "integrity checker is deprecated and will be removed in 6.7");
- btrfs_info(info, "check_integrity_print_mask 0x%x",
- info->check_integrity_print_mask);
- break;
-#else
- case Opt_check_integrity_including_extent_data:
- case Opt_check_integrity:
- case Opt_check_integrity_print_mask:
- btrfs_err(info,
- "support for check_integrity* not compiled in!");
- ret = -EINVAL;
- goto out;
-#endif
case Opt_fatal_errors:
if (strcmp(args[0].from, "panic") == 0) {
btrfs_set_opt(info->mount_opt,
@@ -889,7 +846,7 @@ static int btrfs_parse_device_options(const char *options, blk_mode_t flags)
error = -ENOMEM;
goto out;
}
- device = btrfs_scan_one_device(device_name, flags);
+ device = btrfs_scan_one_device(device_name, flags, false);
kfree(device_name);
if (IS_ERR(device)) {
error = PTR_ERR(device);
@@ -1305,15 +1262,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",autodefrag");
if (btrfs_test_opt(info, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA))
- seq_puts(seq, ",check_int_data");
- else if (btrfs_test_opt(info, CHECK_INTEGRITY))
- seq_puts(seq, ",check_int");
- if (info->check_integrity_print_mask)
- seq_printf(seq, ",check_int_print_mask=%d",
- info->check_integrity_print_mask);
-#endif
if (info->metadata_ratio)
seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio);
if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
@@ -1484,7 +1432,12 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
goto error_fs_info;
}
- device = btrfs_scan_one_device(device_name, mode);
+ /*
+ * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
+ * either a valid device or an error.
+ */
+ device = btrfs_scan_one_device(device_name, mode, true);
+ ASSERT(device != NULL);
if (IS_ERR(device)) {
mutex_unlock(&uuid_mutex);
error = PTR_ERR(device);
@@ -2117,7 +2070,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
* calculated f_bavail.
*/
if (!mixed && block_rsv->space_info->full &&
- total_free_meta - thresh < block_rsv->size)
+ (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size))
buf->f_bavail = 0;
buf->f_type = BTRFS_SUPER_MAGIC;
@@ -2150,7 +2103,7 @@ static struct file_system_type btrfs_fs_type = {
.name = "btrfs",
.mount = btrfs_mount,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_MGTIME,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
};
static struct file_system_type btrfs_root_fs_type = {
@@ -2158,8 +2111,7 @@ static struct file_system_type btrfs_root_fs_type = {
.name = "btrfs",
.mount = btrfs_mount_root,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA |
- FS_ALLOW_IDMAP | FS_MGTIME,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("btrfs");
@@ -2197,7 +2149,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
mutex_lock(&uuid_mutex);
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
+ /*
+ * Scanning outside of mount can return NULL which would turn
+ * into 0 error code.
+ */
+ device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
@@ -2211,8 +2167,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ);
- if (IS_ERR(device)) {
+ /*
+ * Scanning outside of mount can return NULL which would turn
+ * into 0 error code.
+ */
+ device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+ if (IS_ERR_OR_NULL(device)) {
mutex_unlock(&uuid_mutex);
ret = PTR_ERR(device);
break;
@@ -2257,6 +2217,7 @@ static int check_dev_super(struct btrfs_device *dev)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
struct btrfs_super_block *sb;
+ u64 last_trans;
u16 csum_type;
int ret = 0;
@@ -2292,10 +2253,10 @@ static int check_dev_super(struct btrfs_device *dev)
if (ret < 0)
goto out;
- if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
+ last_trans = btrfs_get_last_trans_committed(fs_info);
+ if (btrfs_super_generation(sb) != last_trans) {
btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
- btrfs_super_generation(sb),
- fs_info->last_trans_committed);
+ btrfs_super_generation(sb), last_trans);
ret = -EUCLEAN;
goto out;
}
@@ -2405,9 +2366,6 @@ static int __init btrfs_print_mod_info(void)
#ifdef CONFIG_BTRFS_ASSERT
", assert=on"
#endif
-#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- ", integrity-checker=on"
-#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
", ref-verify=on"
#endif
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b1d1ac25237b..e6b51fb3ddc1 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -291,12 +291,15 @@ BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
+BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA);
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
#endif
#ifdef CONFIG_BTRFS_DEBUG
/* Remove once support for extent tree v2 is feature complete */
BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
+/* Remove once support for raid stripe tree is feature complete. */
+BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE);
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
@@ -322,11 +325,13 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(free_space_tree),
BTRFS_FEAT_ATTR_PTR(raid1c34),
BTRFS_FEAT_ATTR_PTR(block_group_tree),
+ BTRFS_FEAT_ATTR_PTR(simple_quota),
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_PTR(zoned),
#endif
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
+ BTRFS_FEAT_ATTR_PTR(raid_stripe_tree),
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_PTR(verity),
@@ -420,6 +425,13 @@ static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *bu
}
BTRFS_ATTR(static_feature, acl, acl_show);
+static ssize_t temp_fsid_supported_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ return sysfs_emit(buf, "0\n");
+}
+BTRFS_ATTR(static_feature, temp_fsid, temp_fsid_supported_show);
+
/*
* Features which only depend on kernel version.
*
@@ -433,6 +445,7 @@ static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, send_stream_version),
BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
BTRFS_ATTR_PTR(static_feature, supported_sectorsizes),
+ BTRFS_ATTR_PTR(static_feature, temp_fsid),
NULL
};
@@ -1196,10 +1209,19 @@ static ssize_t btrfs_generation_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return sysfs_emit(buf, "%llu\n", fs_info->generation);
+ return sysfs_emit(buf, "%llu\n", btrfs_get_fs_generation(fs_info));
}
BTRFS_ATTR(, generation, btrfs_generation_show);
+static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", fs_info->fs_devices->temp_fsid);
+}
+BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
+
static const char * const btrfs_read_policy_name[] = { "pid" };
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
@@ -1302,6 +1324,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, read_policy),
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
BTRFS_ATTR_PTR(, commit_stats),
+ BTRFS_ATTR_PTR(, temp_fsid),
NULL,
};
@@ -2086,6 +2109,33 @@ static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
}
BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show);
+static ssize_t qgroup_mode_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ ssize_t ret = 0;
+
+ spin_lock(&fs_info->qgroup_lock);
+ ASSERT(btrfs_qgroup_enabled(fs_info));
+ switch (btrfs_qgroup_mode(fs_info)) {
+ case BTRFS_QGROUP_MODE_FULL:
+ ret = sysfs_emit(buf, "qgroup\n");
+ break;
+ case BTRFS_QGROUP_MODE_SIMPLE:
+ ret = sysfs_emit(buf, "squota\n");
+ break;
+ default:
+ btrfs_warn(fs_info, "unexpected qgroup mode %d\n",
+ btrfs_qgroup_mode(fs_info));
+ break;
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return ret;
+}
+BTRFS_ATTR(qgroups, mode, qgroup_mode_show);
+
static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj,
struct kobj_attribute *a,
char *buf)
@@ -2148,6 +2198,7 @@ static struct attribute *qgroups_attrs[] = {
BTRFS_ATTR_PTR(qgroups, enabled),
BTRFS_ATTR_PTR(qgroups, inconsistent),
BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold),
+ BTRFS_ATTR_PTR(qgroups, mode),
NULL
};
ATTRIBUTE_GROUPS(qgroups);
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index 5ef0b90e25c3..6a43a64ba55a 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -61,7 +61,11 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- btrfs_setup_item_for_insert(root, path, &key, value_len);
+ /*
+ * Passing a NULL trans handle is fine here, we have a dummy root eb
+ * and the tree is a single node (level 0).
+ */
+ btrfs_setup_item_for_insert(NULL, root, path, &key, value_len);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 05b03f5eab83..492d69d2fa73 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -34,7 +34,11 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- btrfs_setup_item_for_insert(root, &path, &key, value_len);
+ /*
+ * Passing a NULL trans handle is fine here, we have a dummy root eb
+ * and the tree is a single node (level 0).
+ */
+ btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
@@ -64,7 +68,11 @@ static void insert_inode_item_key(struct btrfs_root *root)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- btrfs_setup_item_for_insert(root, &path, &key, value_len);
+ /*
+ * Passing a NULL trans handle is fine here, we have a dummy root eb
+ * and the tree is a single node (level 0).
+ */
+ btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len);
}
/*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 874e4394df86..6e63816dddcb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -56,12 +56,17 @@ static struct kmem_cache *btrfs_trans_handle_cachep;
* | Call btrfs_commit_transaction() on any trans handle attached to
* | transaction N
* V
- * Transaction N [[TRANS_STATE_COMMIT_START]]
+ * Transaction N [[TRANS_STATE_COMMIT_PREP]]
+ * |
+ * | If there are simultaneous calls to btrfs_commit_transaction() one will win
+ * | the race and the rest will wait for the winner to commit the transaction.
* |
- * | Will wait for previous running transaction to completely finish if there
- * | is one
+ * | The winner will wait for previous running transaction to completely finish
+ * | if there is one.
* |
- * | Then one of the following happes:
+ * Transaction N [[TRANS_STATE_COMMIT_START]]
+ * |
+ * | Then one of the following happens:
* | - Wait for all other trans handle holders to release.
* | The btrfs_commit_transaction() caller will do the commit work.
* | - Wait for current transaction to be committed by others.
@@ -112,6 +117,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep;
*/
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
[TRANS_STATE_RUNNING] = 0U,
+ [TRANS_STATE_COMMIT_PREP] = 0U,
[TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH),
[TRANS_STATE_COMMIT_DOING] = (__TRANS_START |
__TRANS_ATTACH |
@@ -380,7 +386,7 @@ loop:
IO_TREE_TRANS_DIRTY_PAGES);
extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
IO_TREE_FS_PINNED_EXTENTS);
- fs_info->generation++;
+ btrfs_set_fs_generation(fs_info, fs_info->generation + 1);
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
cur_trans->aborted = 0;
@@ -555,6 +561,69 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
return true;
}
+static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
+ enum btrfs_reserve_flush_enum flush,
+ u64 num_bytes,
+ u64 *delayed_refs_bytes)
+{
+ struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
+ u64 extra_delayed_refs_bytes = 0;
+ u64 bytes;
+ int ret;
+
+ /*
+ * If there's a gap between the size of the delayed refs reserve and
+ * its reserved space, than some tasks have added delayed refs or bumped
+ * its size otherwise (due to block group creation or removal, or block
+ * group item update). Also try to allocate that gap in order to prevent
+ * using (and possibly abusing) the global reserve when committing the
+ * transaction.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+ !btrfs_block_rsv_full(delayed_refs_rsv)) {
+ spin_lock(&delayed_refs_rsv->lock);
+ if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
+ extra_delayed_refs_bytes = delayed_refs_rsv->size -
+ delayed_refs_rsv->reserved;
+ spin_unlock(&delayed_refs_rsv->lock);
+ }
+
+ bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
+
+ /*
+ * We want to reserve all the bytes we may need all at once, so we only
+ * do 1 enospc flushing cycle per transaction start.
+ */
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ if (ret == 0) {
+ if (extra_delayed_refs_bytes > 0)
+ btrfs_migrate_to_delayed_refs_rsv(fs_info,
+ extra_delayed_refs_bytes);
+ return 0;
+ }
+
+ if (extra_delayed_refs_bytes > 0) {
+ bytes -= extra_delayed_refs_bytes;
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ if (ret == 0)
+ return 0;
+ }
+
+ /*
+ * If we are an emergency flush, which can steal from the global block
+ * reserve, then attempt to not reserve space for the delayed refs, as
+ * we will consume space for them from the global block reserve.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ bytes -= *delayed_refs_bytes;
+ *delayed_refs_bytes = 0;
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+ }
+
+ return ret;
+}
+
static struct btrfs_trans_handle *
start_transaction(struct btrfs_root *root, unsigned int num_items,
unsigned int type, enum btrfs_reserve_flush_enum flush,
@@ -562,10 +631,12 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
u64 num_bytes = 0;
u64 qgroup_reserved = 0;
+ u64 delayed_refs_bytes = 0;
bool reloc_reserved = false;
bool do_chunk_alloc = false;
int ret;
@@ -588,9 +659,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
* the appropriate flushing if need be.
*/
if (num_items && root != fs_info->chunk_root) {
- struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
- u64 delayed_refs_bytes = 0;
-
qgroup_reserved = num_items * fs_info->nodesize;
/*
* Use prealloc for now, as there might be a currently running
@@ -602,20 +670,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
if (ret)
return ERR_PTR(ret);
+ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
/*
- * We want to reserve all the bytes we may need all at once, so
- * we only do 1 enospc flushing cycle per transaction start. We
- * accomplish this by simply assuming we'll do num_items worth
- * of delayed refs updates in this trans handle, and refill that
- * amount for whatever is missing in the reserve.
+ * If we plan to insert/update/delete "num_items" from a btree,
+ * we will also generate delayed refs for extent buffers in the
+ * respective btree paths, so reserve space for the delayed refs
+ * that will be generated by the caller as it modifies btrees.
+ * Try to reserve them to avoid excessive use of the global
+ * block reserve.
*/
- num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
- if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- !btrfs_block_rsv_full(delayed_refs_rsv)) {
- delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info,
- num_items);
- num_bytes += delayed_refs_bytes;
- }
+ delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items);
/*
* Do the reservation for the relocation root creation
@@ -625,16 +689,14 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
reloc_reserved = true;
}
- ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush);
+ ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes,
+ &delayed_refs_bytes);
if (ret)
goto reserve_fail;
- if (delayed_refs_bytes) {
- btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv,
- delayed_refs_bytes);
- num_bytes -= delayed_refs_bytes;
- }
- if (rsv->space_info->force_alloc)
+ btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true);
+
+ if (trans_rsv->space_info->force_alloc)
do_chunk_alloc = true;
} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
!btrfs_block_rsv_full(delayed_refs_rsv)) {
@@ -694,6 +756,7 @@ again:
h->type = type;
INIT_LIST_HEAD(&h->new_bgs);
+ btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS);
smp_mb();
if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
@@ -706,8 +769,17 @@ again:
if (num_bytes) {
trace_btrfs_space_reservation(fs_info, "transaction",
h->transid, num_bytes, 1);
- h->block_rsv = &fs_info->trans_block_rsv;
+ h->block_rsv = trans_rsv;
h->bytes_reserved = num_bytes;
+ if (delayed_refs_bytes > 0) {
+ trace_btrfs_space_reservation(fs_info,
+ "local_delayed_refs_rsv",
+ h->transid,
+ delayed_refs_bytes, 1);
+ h->delayed_refs_bytes_reserved = delayed_refs_bytes;
+ btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true);
+ delayed_refs_bytes = 0;
+ }
h->reloc_reserved = reloc_reserved;
}
@@ -763,8 +835,10 @@ join_fail:
kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
if (num_bytes)
- btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
- num_bytes, NULL);
+ btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL);
+ if (delayed_refs_bytes)
+ btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info,
+ delayed_refs_bytes);
reserve_fail:
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
return ERR_PTR(ret);
@@ -811,7 +885,7 @@ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *roo
}
/*
- * btrfs_attach_transaction() - catch the running transaction
+ * Catch the running transaction.
*
* It is used when we want to commit the current the transaction, but
* don't want to start a new one.
@@ -830,7 +904,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
}
/*
- * btrfs_attach_transaction_barrier() - catch the running transaction
+ * Catch the running transaction.
*
* It is similar to the above function, the difference is this one
* will wait for all the inactive transactions until they fully
@@ -906,7 +980,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
int ret = 0;
if (transid) {
- if (transid <= fs_info->last_trans_committed)
+ if (transid <= btrfs_get_last_trans_committed(fs_info))
goto out;
/* find specified transaction */
@@ -930,7 +1004,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
* raced with btrfs_commit_transaction
*/
if (!cur_trans) {
- if (transid > fs_info->last_trans_committed)
+ if (transid > btrfs_get_last_trans_committed(fs_info))
ret = -EINVAL;
goto out;
}
@@ -985,11 +1059,14 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
if (!trans->block_rsv) {
ASSERT(!trans->bytes_reserved);
+ ASSERT(!trans->delayed_refs_bytes_reserved);
return;
}
- if (!trans->bytes_reserved)
+ if (!trans->bytes_reserved) {
+ ASSERT(!trans->delayed_refs_bytes_reserved);
return;
+ }
ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
trace_btrfs_space_reservation(fs_info, "transaction",
@@ -997,6 +1074,16 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
btrfs_block_rsv_release(fs_info, trans->block_rsv,
trans->bytes_reserved, NULL);
trans->bytes_reserved = 0;
+
+ if (!trans->delayed_refs_bytes_reserved)
+ return;
+
+ trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv",
+ trans->transid,
+ trans->delayed_refs_bytes_reserved, 0);
+ btrfs_block_rsv_release(fs_info, &trans->delayed_rsv,
+ trans->delayed_refs_bytes_reserved, NULL);
+ trans->delayed_refs_bytes_reserved = 0;
}
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -1328,7 +1415,7 @@ again:
}
/* Now flush any delayed refs generated by updating all of the roots */
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
return ret;
@@ -1343,7 +1430,7 @@ again:
* so we want to keep this flushing in this loop to make sure
* everything gets run.
*/
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
return ret;
}
@@ -1478,45 +1565,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
}
/*
- * defrag a given btree.
- * Every leaf in the btree is read and defragged.
- */
-int btrfs_defrag_root(struct btrfs_root *root)
-{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_trans_handle *trans;
- int ret;
-
- if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
- return 0;
-
- while (1) {
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
-
- ret = btrfs_defrag_leaves(trans, root);
-
- btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(info);
- cond_resched();
-
- if (btrfs_fs_closing(info) || ret != -EAGAIN)
- break;
-
- if (btrfs_defrag_cancelled(info)) {
- btrfs_debug(info, "defrag_root cancelled");
- ret = -EAGAIN;
- break;
- }
- }
- clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
- return ret;
-}
-
-/*
* Do all special snapshot related qgroup dirty hack.
*
* Will do all needed qgroup inherit and dirty hack like switch commit
@@ -1533,11 +1581,10 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
int ret;
/*
- * Save some performance in the case that qgroups are not
- * enabled. If this check races with the ioctl, rescan will
- * kick in anyway.
+ * Save some performance in the case that qgroups are not enabled. If
+ * this check races with the ioctl, rescan will kick in anyway.
*/
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
/*
@@ -1561,7 +1608,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* for now flush the delayed refs to narrow the race window where the
* qgroup counters could end up wrong.
*/
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -1576,7 +1623,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
/* Now qgroup are all updated, we can inherit it to new qgroups */
ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
- inherit);
+ parent->root_key.objectid, inherit);
if (ret < 0)
goto out;
@@ -1726,6 +1773,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
+ ret = btrfs_create_qgroup(trans, objectid);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
+
/*
* pull in the delayed directory update
* and the delayed inode item
@@ -1837,8 +1890,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* To co-operate with that hack, we do hack again.
* Or snapshot will be greatly slowed down by a subtree qgroup rescan
*/
- ret = qgroup_account_snapshot(trans, root, parent_root,
- pending->inherit, objectid);
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL)
+ ret = qgroup_account_snapshot(trans, root, parent_root,
+ pending->inherit, objectid);
+ else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
+ ret = btrfs_qgroup_inherit(trans, root->root_key.objectid, objectid,
+ parent_root->root_key.objectid, pending->inherit);
if (ret < 0)
goto fail;
@@ -1854,8 +1911,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
fname.disk_name.len * 2);
- parent_inode->i_mtime = inode_set_ctime_current(parent_inode);
- ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
+ inode_set_mtime_to_ts(parent_inode,
+ inode_set_ctime_current(parent_inode));
+ ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1982,7 +2040,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
* Wait for the current transaction commit to start and block
* subsequent transaction joins
*/
- btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
wait_event(fs_info->transaction_blocked_wait,
cur_trans->state >= TRANS_STATE_COMMIT_START ||
TRANS_ABORTED(cur_trans));
@@ -2078,7 +2136,7 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_block_group *block_group, *tmp;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
- btrfs_delayed_refs_rsv_release(fs_info, 1);
+ btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
list_del_init(&block_group->bg_list);
}
}
@@ -2129,7 +2187,7 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans)
return;
lockdep_assert_held(&trans->fs_info->trans_lock);
- ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START);
+ ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP);
list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
}
@@ -2153,7 +2211,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
ktime_t interval;
ASSERT(refcount_read(&trans->use_count) == 1);
- btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
@@ -2213,7 +2271,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
}
spin_lock(&fs_info->trans_lock);
- if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+ if (cur_trans->state >= TRANS_STATE_COMMIT_PREP) {
enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
add_pending_snapshot(trans);
@@ -2225,7 +2283,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
want_state = TRANS_STATE_SUPER_COMMITTED;
btrfs_trans_state_lockdep_release(fs_info,
- BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
ret = btrfs_end_transaction(trans);
wait_for_commit(cur_trans, want_state);
@@ -2237,9 +2295,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
}
- cur_trans->state = TRANS_STATE_COMMIT_START;
+ cur_trans->state = TRANS_STATE_COMMIT_PREP;
wake_up(&fs_info->transaction_blocked_wait);
- btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
if (cur_trans->list.prev != &fs_info->trans_list) {
enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
@@ -2260,11 +2318,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_put_transaction(prev_trans);
if (ret)
goto lockdep_release;
- } else {
- spin_unlock(&fs_info->trans_lock);
+ spin_lock(&fs_info->trans_lock);
}
} else {
- spin_unlock(&fs_info->trans_lock);
/*
* The previous transaction was aborted and was already removed
* from the list of transactions at fs_info->trans_list. So we
@@ -2272,11 +2328,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* corrupt state (pointing to trees with unwritten nodes/leafs).
*/
if (BTRFS_FS_ERROR(fs_info)) {
+ spin_unlock(&fs_info->trans_lock);
ret = -EROFS;
goto lockdep_release;
}
}
+ cur_trans->state = TRANS_STATE_COMMIT_START;
+ wake_up(&fs_info->transaction_blocked_wait);
+ spin_unlock(&fs_info->trans_lock);
+
/*
* Get the time spent on the work done by the commit thread and not
* the time spent waiting on a previous commit
@@ -2394,7 +2455,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto unlock_reloc;
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
if (ret)
goto unlock_reloc;
@@ -2527,7 +2588,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(fs_info);
- fs_info->last_trans_committed = cur_trans->transid;
+ btrfs_set_last_trans_committed(fs_info, cur_trans->transid);
/*
* We needn't acquire the lock here because there is no other task
* which can change it.
@@ -2586,7 +2647,7 @@ lockdep_release:
goto cleanup_transaction;
lockdep_trans_commit_start_release:
- btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
btrfs_end_transaction(trans);
return ret;
}
@@ -2645,18 +2706,18 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
*/
void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno, bool first_hit)
+ unsigned int line, int error, bool first_hit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- WRITE_ONCE(trans->aborted, errno);
- WRITE_ONCE(trans->transaction->aborted, errno);
- if (first_hit && errno == -ENOSPC)
+ WRITE_ONCE(trans->aborted, error);
+ WRITE_ONCE(trans->transaction->aborted, error);
+ if (first_hit && error == -ENOSPC)
btrfs_dump_space_info_for_trans_abort(fs_info);
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
wake_up(&fs_info->transaction_blocked_wait);
- __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
+ __btrfs_handle_fs_error(fs_info, function, line, error, NULL);
}
int __init btrfs_transaction_init(void)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 8e9fa23bd7fe..18c4f6e83b78 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -14,6 +14,7 @@
enum btrfs_trans_state {
TRANS_STATE_RUNNING,
+ TRANS_STATE_COMMIT_PREP,
TRANS_STATE_COMMIT_START,
TRANS_STATE_COMMIT_DOING,
TRANS_STATE_UNBLOCKED,
@@ -117,8 +118,10 @@ enum {
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
+ u64 delayed_refs_bytes_reserved;
u64 chunk_bytes_reserved;
unsigned long delayed_ref_updates;
+ unsigned long delayed_ref_csum_deletions;
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *orig_rsv;
@@ -138,6 +141,7 @@ struct btrfs_trans_handle {
bool in_fsync;
struct btrfs_fs_info *fs_info;
struct list_head new_bgs;
+ struct btrfs_block_rsv delayed_rsv;
};
/*
@@ -171,7 +175,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
{
spin_lock(&inode->lock);
inode->last_trans = trans->transaction->transid;
- inode->last_sub_trans = inode->root->log_transid;
+ inode->last_sub_trans = btrfs_get_root_log_transid(inode->root);
inode->last_log_commit = inode->last_sub_trans - 1;
spin_unlock(&inode->lock);
}
@@ -199,32 +203,32 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
delayed_refs->qgroup_to_skip = 0;
}
-bool __cold abort_should_print_stack(int errno);
+bool __cold abort_should_print_stack(int error);
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact stack trace is reported for some errors.
*/
-#define btrfs_abort_transaction(trans, errno) \
+#define btrfs_abort_transaction(trans, error) \
do { \
bool first = false; \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
first = true; \
- if (WARN(abort_should_print_stack(errno), \
+ if (WARN(abort_should_print_stack(error), \
KERN_ERR \
"BTRFS: Transaction aborted (error %d)\n", \
- (errno))) { \
+ (error))) { \
/* Stack trace printed. */ \
} else { \
- btrfs_debug((trans)->fs_info, \
- "Transaction aborted (error %d)", \
- (errno)); \
+ btrfs_err((trans)->fs_info, \
+ "Transaction aborted (error %d)", \
+ (error)); \
} \
} \
__btrfs_abort_transaction((trans), __func__, \
- __LINE__, (errno), first); \
+ __LINE__, (error), first); \
} while (0)
int btrfs_end_transaction(struct btrfs_trans_handle *trans);
@@ -242,7 +246,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
void btrfs_add_dead_root(struct btrfs_root *root);
-int btrfs_defrag_root(struct btrfs_root *root);
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
@@ -263,7 +266,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno, bool first_hit);
+ unsigned int line, int error, bool first_hit);
int __init btrfs_transaction_init(void);
void __cold btrfs_transaction_exit(void);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index ab08a0b01311..a416cbea75d1 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -29,6 +29,8 @@
#include "accessors.h"
#include "file-item.h"
#include "inode-item.h"
+#include "dir-item.h"
+#include "raid-stripe-tree.h"
/*
* Error message should follow the following format:
@@ -1465,6 +1467,9 @@ static int check_extent_item(struct extent_buffer *leaf,
}
inline_refs += btrfs_shared_data_ref_count(leaf, sref);
break;
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+ WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ break;
default:
extent_err(leaf, slot, "unknown inline ref type: %u",
inline_type);
@@ -1631,6 +1636,44 @@ static int check_inode_ref(struct extent_buffer *leaf,
return 0;
}
+static int check_raid_stripe_extent(const struct extent_buffer *leaf,
+ const struct btrfs_key *key, int slot)
+{
+ struct btrfs_stripe_extent *stripe_extent =
+ btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+
+ if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
+ generic_err(leaf, slot,
+"invalid key objectid for raid stripe extent, have %llu expect aligned to %u",
+ key->objectid, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
+ }
+
+ if (unlikely(!btrfs_fs_incompat(leaf->fs_info, RAID_STRIPE_TREE))) {
+ generic_err(leaf, slot,
+ "RAID_STRIPE_EXTENT present but RAID_STRIPE_TREE incompat bit unset");
+ return -EUCLEAN;
+ }
+
+ switch (btrfs_stripe_extent_encoding(leaf, stripe_extent)) {
+ case BTRFS_STRIPE_RAID0:
+ case BTRFS_STRIPE_RAID1:
+ case BTRFS_STRIPE_DUP:
+ case BTRFS_STRIPE_RAID10:
+ case BTRFS_STRIPE_RAID5:
+ case BTRFS_STRIPE_RAID6:
+ case BTRFS_STRIPE_RAID1C3:
+ case BTRFS_STRIPE_RAID1C4:
+ break;
+ default:
+ generic_err(leaf, slot, "invalid raid stripe encoding %u",
+ btrfs_stripe_extent_encoding(leaf, stripe_extent));
+ return -EUCLEAN;
+ }
+
+ return 0;
+}
+
/*
* Common point to switch the item-specific validation.
*/
@@ -1685,6 +1728,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
case BTRFS_EXTENT_DATA_REF_KEY:
ret = check_extent_data_ref(leaf, key, slot);
break;
+ case BTRFS_RAID_STRIPE_KEY:
+ ret = check_raid_stripe_extent(leaf, key, slot);
+ break;
}
if (ret)
@@ -2005,7 +2051,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
* So we only checks tree blocks which is read from disk, whose
* generation <= fs_info->last_trans_committed.
*/
- if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
+ if (btrfs_header_generation(eb) > btrfs_get_last_trans_committed(fs_info))
return 0;
/* We have @first_key, so this @eb must have at least one item */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d1e46b839519..7d6729d9fd2f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -347,8 +347,7 @@ static int process_one_buffer(struct btrfs_root *log,
}
if (wc->pin) {
- ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
- eb->len);
+ ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
if (ret)
return ret;
@@ -504,9 +503,9 @@ insert:
found_size = btrfs_item_size(path->nodes[0],
path->slots[0]);
if (found_size > item_size)
- btrfs_truncate_item(path, item_size, 1);
+ btrfs_truncate_item(trans, path, item_size, 1);
else if (found_size < item_size)
- btrfs_extend_item(path, item_size - found_size);
+ btrfs_extend_item(trans, path, item_size - found_size);
} else if (ret) {
return ret;
}
@@ -574,7 +573,7 @@ insert:
}
}
no_copy:
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
return 0;
}
@@ -767,7 +766,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
} else if (ret == 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
- ins.objectid, ins.offset, 0);
+ ins.objectid, ins.offset, 0,
+ root->root_key.objectid);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key->objectid, offset, 0, false);
@@ -890,7 +890,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
update_inode:
btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
out:
iput(inode);
return ret;
@@ -1445,7 +1445,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
goto out;
}
@@ -1483,8 +1483,7 @@ out:
return ret;
}
-static int count_inode_extrefs(struct btrfs_root *root,
- struct btrfs_inode *inode, struct btrfs_path *path)
+static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
{
int ret = 0;
int name_len;
@@ -1498,8 +1497,8 @@ static int count_inode_extrefs(struct btrfs_root *root,
struct extent_buffer *leaf;
while (1) {
- ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
- &extref, &offset);
+ ret = btrfs_find_one_extref(inode->root, inode_objectid, offset,
+ path, &extref, &offset);
if (ret)
break;
@@ -1527,8 +1526,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
return nlink;
}
-static int count_inode_refs(struct btrfs_root *root,
- struct btrfs_inode *inode, struct btrfs_path *path)
+static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
{
int ret;
struct btrfs_key key;
@@ -1543,7 +1541,7 @@ static int count_inode_refs(struct btrfs_root *root,
key.offset = (u64)-1;
while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
if (ret < 0)
break;
if (ret > 0) {
@@ -1595,9 +1593,9 @@ process_slot:
* will free the inode.
*/
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct inode *inode)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
int ret;
u64 nlink = 0;
@@ -1607,13 +1605,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = count_inode_refs(root, BTRFS_I(inode), path);
+ ret = count_inode_refs(BTRFS_I(inode), path);
if (ret < 0)
goto out;
nlink = ret;
- ret = count_inode_extrefs(root, BTRFS_I(inode), path);
+ ret = count_inode_extrefs(BTRFS_I(inode), path);
if (ret < 0)
goto out;
@@ -1623,7 +1621,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (nlink != inode->i_nlink) {
set_nlink(inode, nlink);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
goto out;
}
@@ -1685,7 +1683,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
}
- ret = fixup_inode_link_count(trans, root, inode);
+ ret = fixup_inode_link_count(trans, inode);
iput(inode);
if (ret)
break;
@@ -1732,7 +1730,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
set_nlink(inode, 1);
else
inc_nlink(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
} else if (ret == -EEXIST) {
ret = 0;
}
@@ -1939,7 +1937,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
out:
if (!ret && update_size) {
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
- ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+ ret = btrfs_update_inode(trans, BTRFS_I(dir));
}
kfree(name.name);
iput(dir);
@@ -2483,7 +2481,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
drop_args.bytes_found);
/* Update the inode's nbytes. */
ret = btrfs_update_inode(wc->trans,
- root, BTRFS_I(inode));
+ BTRFS_I(inode));
}
iput(inode);
if (ret)
@@ -2574,7 +2572,7 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(eb);
if (trans) {
- ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len);
+ ret = btrfs_pin_reserved_extent(trans, eb);
if (ret)
return ret;
btrfs_redirty_list_add(trans->transaction, eb);
@@ -2848,10 +2846,9 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
}
/*
- * btrfs_sync_log does sends a given tree log down to the disk and
- * updates the super blocks to record it. When this call is done,
- * you know that any inodes previously logged are safely on disk only
- * if it returns 0.
+ * Sends a given tree log down to the disk and updates the super blocks to
+ * record it. When this call is done, you know that any inodes previously
+ * logged are safely on disk only if it returns 0.
*
* Any other return value means you need to call btrfs_commit_transaction.
* Some of the edge cases for fsyncing directories that have had unlinks
@@ -2961,7 +2958,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_set_root_node(&log->root_item, log->node);
memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
- root->log_transid++;
+ btrfs_set_root_log_transid(root, root->log_transid + 1);
log->log_transid = root->log_transid;
root->log_start_pid = 0;
/*
@@ -2999,9 +2996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
ret = update_log_root(trans, log, &new_root_item);
if (ret) {
- if (!list_empty(&root_log_ctx.list))
- list_del_init(&root_log_ctx.list);
-
+ list_del_init(&root_log_ctx.list);
blk_finish_plug(&plug);
btrfs_set_log_full_commit(trans);
if (ret != -ENOSPC)
@@ -3021,7 +3016,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
- index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
ret = btrfs_wait_tree_log_extents(log, mark);
@@ -3136,8 +3130,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* someone else already started it. We use <= and not < because the
* first log transaction has an ID of 0.
*/
- ASSERT(root->last_log_commit <= log_transid);
- root->last_log_commit = log_transid;
+ ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid);
+ btrfs_set_root_last_log_commit(root, log_transid);
out_wake_log_root:
mutex_lock(&log_root_tree->log_mutex);
@@ -3211,8 +3205,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
}
}
- clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
- EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
+ extent_io_tree_release(&log->dirty_log_pages);
extent_io_tree_release(&log->log_csum_range);
btrfs_put_root(log);
@@ -3530,7 +3523,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
last_offset = max(last_offset, curr_end);
}
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
- btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
btrfs_release_path(path);
return 0;
}
@@ -4138,19 +4131,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
btrfs_set_token_timespec_sec(&token, &item->atime,
- inode->i_atime.tv_sec);
+ inode_get_atime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode->i_atime.tv_nsec);
+ inode_get_atime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode->i_mtime.tv_sec);
+ inode_get_mtime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode->i_mtime.tv_nsec);
+ inode_get_mtime_nsec(inode));
btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime(inode).tv_nsec);
+ inode_get_ctime_nsec(inode));
/*
* We do not need to set the nbytes field, in fact during a fast fsync
@@ -4488,7 +4481,7 @@ copy_item:
dst_index++;
}
- btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+ btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]);
btrfs_release_path(dst_path);
out:
kfree(ins_data);
@@ -4693,7 +4686,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, &fi,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(fi));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
@@ -4722,7 +4715,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int slot;
int ins_nr = 0;
- int start_slot;
+ int start_slot = 0;
int ret;
if (!(inode->flags & BTRFS_INODE_PREALLOC))
@@ -4921,12 +4914,12 @@ process:
set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
- spin_lock_irq(&inode->ordered_tree.lock);
+ spin_lock_irq(&inode->ordered_tree_lock);
if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
atomic_inc(&trans->transaction->pending_ordered);
}
- spin_unlock_irq(&inode->ordered_tree.lock);
+ spin_unlock_irq(&inode->ordered_tree_lock);
}
btrfs_put_ordered_extent(ordered);
}
@@ -7204,9 +7197,7 @@ again:
* each subsequent pass.
*/
if (ret == -ENOENT)
- ret = btrfs_pin_extent_for_log_replay(trans,
- log->node->start,
- log->node->len);
+ ret = btrfs_pin_extent_for_log_replay(trans, log->node);
btrfs_put_root(log);
if (!ret)
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 33606025513d..b4ac2b0cd235 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -223,7 +223,8 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
}
/*
- * ulist_del - delete one node from ulist
+ * Delete one node from ulist.
+ *
* @ulist: ulist to remove node from
* @val: value to delete
* @aux: aux to delete
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 7c7001f42b14..5be74f9e47eb 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -124,7 +124,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
* An item with that type already exists.
* Extend the item and store the new subid at the end.
*/
- btrfs_extend_item(path, sizeof(subid_le));
+ btrfs_extend_item(trans, path, sizeof(subid_le));
eb = path->nodes[0];
slot = path->slots[0];
offset = btrfs_item_ptr_offset(eb, slot);
@@ -139,7 +139,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
ret = 0;
subid_le = cpu_to_le64(subid_cpu);
write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le));
- btrfs_mark_buffer_dirty(eb);
+ btrfs_mark_buffer_dirty(trans, eb);
out:
btrfs_free_path(path);
@@ -221,7 +221,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
move_src = offset + sizeof(subid);
move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot));
memmove_extent_buffer(eb, move_dst, move_src, move_len);
- btrfs_truncate_item(path, item_size - sizeof(subid), 1);
+ btrfs_truncate_item(trans, path, item_size - sizeof(subid), 1);
out:
btrfs_free_path(path);
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index c5ff16f9e9fa..66e2270b0dae 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -487,7 +487,7 @@ static int rollback_verity(struct btrfs_inode *inode)
}
inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -554,7 +554,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc,
}
inode->ro_flags |= BTRFS_INODE_RO_VERITY;
btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
goto end_trans;
ret = del_orphan(trans, inode);
@@ -715,7 +715,7 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
- struct page *page;
+ struct folio *folio;
u64 off = (u64)index << PAGE_SHIFT;
loff_t merkle_pos = merkle_file_pos(inode);
int ret;
@@ -726,29 +726,36 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
return ERR_PTR(-EFBIG);
index += merkle_pos >> PAGE_SHIFT;
again:
- page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
- if (page) {
- if (PageUptodate(page))
- return page;
+ folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
+ if (!IS_ERR(folio)) {
+ if (folio_test_uptodate(folio))
+ goto out;
- lock_page(page);
- /*
- * We only insert uptodate pages, so !Uptodate has to be
- * an error
- */
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
+ folio_lock(folio);
+ /* If it's not uptodate after we have the lock, we got a read error. */
+ if (!folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
return ERR_PTR(-EIO);
}
- unlock_page(page);
- return page;
+ folio_unlock(folio);
+ goto out;
}
- page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
- if (!page)
+ folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS),
+ 0);
+ if (!folio)
return ERR_PTR(-ENOMEM);
+ ret = filemap_add_folio(inode->i_mapping, folio, index, GFP_NOFS);
+ if (ret) {
+ folio_put(folio);
+ /* Did someone else insert a folio here? */
+ if (ret == -EEXIST)
+ goto again;
+ return ERR_PTR(ret);
+ }
+
/*
* Merkle item keys are indexed from byte 0 in the merkle tree.
* They have the form:
@@ -756,28 +763,19 @@ again:
* [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ]
*/
ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off,
- page_address(page), PAGE_SIZE, page);
+ folio_address(folio), PAGE_SIZE, &folio->page);
if (ret < 0) {
- put_page(page);
+ folio_put(folio);
return ERR_PTR(ret);
}
if (ret < PAGE_SIZE)
- memzero_page(page, ret, PAGE_SIZE - ret);
+ folio_zero_segment(folio, ret, PAGE_SIZE);
- SetPageUptodate(page);
- ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
- if (!ret) {
- /* Inserted and ready for fsverity */
- unlock_page(page);
- } else {
- put_page(page);
- /* Did someone race us into inserting this page? */
- if (ret == -EEXIST)
- goto again;
- page = ERR_PTR(ret);
- }
- return page;
+out:
+ return folio_file_page(folio, index);
}
/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9621455edebc..c87e18827a0a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -35,6 +35,7 @@
#include "relocation.h"
#include "scrub.h"
#include "super.h"
+#include "raid-stripe-tree.h"
#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID10 | \
@@ -357,21 +358,19 @@ struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
}
/*
- * alloc_fs_devices - allocate struct btrfs_fs_devices
- * @fsid: if not NULL, copy the UUID to fs_devices::fsid
- * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
+ * Allocate new btrfs_fs_devices structure identified by a fsid.
+ *
+ * @fsid: if not NULL, copy the UUID to fs_devices::fsid and to
+ * fs_devices::metadata_fsid
*
* Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
* The returned struct is not linked onto any lists and can be destroyed with
* kfree() right away.
*/
-static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
- const u8 *metadata_fsid)
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
struct btrfs_fs_devices *fs_devs;
- ASSERT(fsid || !metadata_fsid);
-
fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
if (!fs_devs)
return ERR_PTR(-ENOMEM);
@@ -385,8 +384,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
if (fsid) {
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
- memcpy(fs_devs->metadata_uuid,
- metadata_fsid ?: fsid, BTRFS_FSID_SIZE);
+ memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
}
return fs_devs;
@@ -457,91 +455,41 @@ static noinline struct btrfs_fs_devices *find_fsid(
return NULL;
}
-/*
- * First check if the metadata_uuid is different from the fsid in the given
- * fs_devices. Then check if the given fsid is the same as the metadata_uuid
- * in the fs_devices. If it is, return true; otherwise, return false.
- */
-static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices,
- const u8 *fsid)
-{
- return memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) != 0 &&
- memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0;
-}
-
-static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
- struct btrfs_super_block *disk_super)
-{
-
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * Handle scanned device having completed its fsid change but
- * belonging to a fs_devices that was created by first scanning
- * a device which didn't have its fsid/metadata_uuid changed
- * at all and the CHANGING_FSID_V2 flag set.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (!fs_devices->fsid_change)
- continue;
-
- if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid,
- fs_devices->fsid))
- return fs_devices;
- }
-
- /*
- * Handle scanned device having completed its fsid change but
- * belonging to a fs_devices that was created by a device that
- * has an outdated pair of fsid/metadata_uuid and
- * CHANGING_FSID_V2 flag set.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (!fs_devices->fsid_change)
- continue;
-
- if (check_fsid_changed(fs_devices, disk_super->metadata_uuid))
- return fs_devices;
- }
-
- return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
-}
-
-
static int
btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
- int flush, struct block_device **bdev,
+ int flush, struct bdev_handle **bdev_handle,
struct btrfs_super_block **disk_super)
{
+ struct block_device *bdev;
int ret;
- *bdev = blkdev_get_by_path(device_path, flags, holder, NULL);
+ *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL);
- if (IS_ERR(*bdev)) {
- ret = PTR_ERR(*bdev);
+ if (IS_ERR(*bdev_handle)) {
+ ret = PTR_ERR(*bdev_handle);
goto error;
}
+ bdev = (*bdev_handle)->bdev;
if (flush)
- sync_blockdev(*bdev);
- ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
+ sync_blockdev(bdev);
+ ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
- blkdev_put(*bdev, holder);
+ bdev_release(*bdev_handle);
goto error;
}
- invalidate_bdev(*bdev);
- *disk_super = btrfs_read_dev_super(*bdev);
+ invalidate_bdev(bdev);
+ *disk_super = btrfs_read_dev_super(bdev);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
- blkdev_put(*bdev, holder);
+ bdev_release(*bdev_handle);
goto error;
}
return 0;
error:
- *bdev = NULL;
+ *bdev_handle = NULL;
return ret;
}
@@ -562,13 +510,13 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
- int ret = 0;
+ int ret;
+ bool freed = false;
lockdep_assert_held(&uuid_mutex);
- if (devt)
- ret = -ENOENT;
-
+ /* Return good status if there is no instance of devt. */
+ ret = 0;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
mutex_lock(&fs_devices->device_list_mutex);
@@ -579,8 +527,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
if (devt && devt != device->devt)
continue;
if (fs_devices->opened) {
- /* for an already deleted device return 0 */
- if (devt && ret != 0)
+ if (devt)
ret = -EBUSY;
break;
}
@@ -590,7 +537,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
list_del(&device->dev_list);
btrfs_free_device(device);
- ret = 0;
+ freed = true;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -601,9 +548,81 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
}
}
+ /* If there is at least one freed device return 0. */
+ if (freed)
+ return 0;
+
return ret;
}
+static struct btrfs_fs_devices *find_fsid_by_device(
+ struct btrfs_super_block *disk_super,
+ dev_t devt, bool *same_fsid_diff_dev)
+{
+ struct btrfs_fs_devices *fsid_fs_devices;
+ struct btrfs_fs_devices *devt_fs_devices;
+ const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+ bool found_by_devt = false;
+
+ /* Find the fs_device by the usual method, if found use it. */
+ fsid_fs_devices = find_fsid(disk_super->fsid,
+ has_metadata_uuid ? disk_super->metadata_uuid : NULL);
+
+ /* The temp_fsid feature is supported only with single device filesystem. */
+ if (btrfs_super_num_devices(disk_super) != 1)
+ return fsid_fs_devices;
+
+ /*
+ * A seed device is an integral component of the sprout device, which
+ * functions as a multi-device filesystem. So, temp-fsid feature is
+ * not supported.
+ */
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)
+ return fsid_fs_devices;
+
+ /* Try to find a fs_devices by matching devt. */
+ list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) {
+ struct btrfs_device *device;
+
+ list_for_each_entry(device, &devt_fs_devices->devices, dev_list) {
+ if (device->devt == devt) {
+ found_by_devt = true;
+ break;
+ }
+ }
+ if (found_by_devt)
+ break;
+ }
+
+ if (found_by_devt) {
+ /* Existing device. */
+ if (fsid_fs_devices == NULL) {
+ if (devt_fs_devices->opened == 0) {
+ /* Stale device. */
+ return NULL;
+ } else {
+ /* temp_fsid is mounting a subvol. */
+ return devt_fs_devices;
+ }
+ } else {
+ /* Regular or temp_fsid device mounting a subvol. */
+ return devt_fs_devices;
+ }
+ } else {
+ /* New device. */
+ if (fsid_fs_devices == NULL) {
+ return NULL;
+ } else {
+ /* sb::fsid is already used create a new temp_fsid. */
+ *same_fsid_diff_dev = true;
+ return NULL;
+ }
+ }
+
+ /* Not reached. */
+}
+
/*
* This is only used on mount, and we are protected from competing things
* messing with our fs_devices by the uuid_mutex, thus we do not need the
@@ -613,7 +632,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, blk_mode_t flags,
void *holder)
{
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
@@ -624,7 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev, &disk_super);
+ &bdev_handle, &disk_super);
if (ret)
return ret;
@@ -648,21 +667,21 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->seeding = true;
} else {
- if (bdev_read_only(bdev))
+ if (bdev_read_only(bdev_handle->bdev))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
else
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- if (!bdev_nonrot(bdev))
+ if (!bdev_nonrot(bdev_handle->bdev))
fs_devices->rotating = true;
- if (bdev_max_discard_sectors(bdev))
+ if (bdev_max_discard_sectors(bdev_handle->bdev))
fs_devices->discardable = true;
- device->bdev = bdev;
+ device->bdev_handle = bdev_handle;
+ device->bdev = bdev_handle->bdev;
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- device->holder = holder;
fs_devices->open_devices++;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
@@ -676,7 +695,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
error_free_page:
btrfs_release_disk_super(disk_super);
- blkdev_put(bdev, holder);
+ bdev_release(bdev_handle);
return -EINVAL;
}
@@ -690,84 +709,6 @@ u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
}
/*
- * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
- * being created with a disk that has already completed its fsid change. Such
- * disk can belong to an fs which has its FSID changed or to one which doesn't.
- * Handle both cases here.
- */
-static struct btrfs_fs_devices *find_fsid_inprogress(
- struct btrfs_super_block *disk_super)
-{
- struct btrfs_fs_devices *fs_devices;
-
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (fs_devices->fsid_change)
- continue;
-
- if (check_fsid_changed(fs_devices, disk_super->fsid))
- return fs_devices;
- }
-
- return find_fsid(disk_super->fsid, NULL);
-}
-
-static struct btrfs_fs_devices *find_fsid_changed(
- struct btrfs_super_block *disk_super)
-{
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * Handles the case where scanned device is part of an fs that had
- * multiple successful changes of FSID but currently device didn't
- * observe it. Meaning our fsid will be different than theirs. We need
- * to handle two subcases :
- * 1 - The fs still continues to have different METADATA/FSID uuids.
- * 2 - The fs is switched back to its original FSID (METADATA/FSID
- * are equal).
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- /* Changed UUIDs */
- if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) &&
- memcmp(fs_devices->fsid, disk_super->fsid,
- BTRFS_FSID_SIZE) != 0)
- return fs_devices;
-
- /* Unchanged UUIDs */
- if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
- BTRFS_FSID_SIZE) == 0 &&
- memcmp(fs_devices->fsid, disk_super->metadata_uuid,
- BTRFS_FSID_SIZE) == 0)
- return fs_devices;
- }
-
- return NULL;
-}
-
-static struct btrfs_fs_devices *find_fsid_reverted_metadata(
- struct btrfs_super_block *disk_super)
-{
- struct btrfs_fs_devices *fs_devices;
-
- /*
- * Handle the case where the scanned device is part of an fs whose last
- * metadata UUID change reverted it to the original FSID. At the same
- * time fs_devices was first created by another constituent device
- * which didn't fully observe the operation. This results in an
- * btrfs_fs_devices created with metadata/fsid different AND
- * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
- * fs_devices equal to the FSID of the disk.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (!fs_devices->fsid_change)
- continue;
-
- if (check_fsid_changed(fs_devices, disk_super->fsid))
- return fs_devices;
- }
-
- return NULL;
-}
-/*
* Add new device to list of registered devices
*
* Returns:
@@ -785,10 +726,16 @@ static noinline struct btrfs_device *device_list_add(const char *path,
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_t path_devt;
int error;
+ bool same_fsid_diff_dev = false;
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
- bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
- BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+
+ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
+ btrfs_err(NULL,
+"device %s has incomplete metadata_uuid change, please use btrfstune to complete",
+ path);
+ return ERR_PTR(-EAGAIN);
+ }
error = lookup_bdev(path, &path_devt);
if (error) {
@@ -797,27 +744,23 @@ static noinline struct btrfs_device *device_list_add(const char *path,
return ERR_PTR(error);
}
- if (fsid_change_in_progress) {
- if (!has_metadata_uuid)
- fs_devices = find_fsid_inprogress(disk_super);
- else
- fs_devices = find_fsid_changed(disk_super);
- } else if (has_metadata_uuid) {
- fs_devices = find_fsid_with_metadata_uuid(disk_super);
- } else {
- fs_devices = find_fsid_reverted_metadata(disk_super);
- if (!fs_devices)
- fs_devices = find_fsid(disk_super->fsid, NULL);
- }
-
+ fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev);
if (!fs_devices) {
- fs_devices = alloc_fs_devices(disk_super->fsid,
- has_metadata_uuid ? disk_super->metadata_uuid : NULL);
+ fs_devices = alloc_fs_devices(disk_super->fsid);
+ if (has_metadata_uuid)
+ memcpy(fs_devices->metadata_uuid,
+ disk_super->metadata_uuid, BTRFS_FSID_SIZE);
+
if (IS_ERR(fs_devices))
return ERR_CAST(fs_devices);
- fs_devices->fsid_change = fsid_change_in_progress;
+ if (same_fsid_diff_dev) {
+ generate_random_uuid(fs_devices->fsid);
+ fs_devices->temp_fsid = true;
+ pr_info("BTRFS: device %s using temp-fsid %pU\n",
+ path, fs_devices->fsid);
+ }
mutex_lock(&fs_devices->device_list_mutex);
list_add(&fs_devices->fs_list, &fs_uuids);
@@ -832,18 +775,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
mutex_lock(&fs_devices->device_list_mutex);
device = btrfs_find_device(fs_devices, &args);
- /*
- * If this disk has been pulled into an fs devices created by
- * a device which had the CHANGING_FSID_V2 flag then replace the
- * metadata_uuid/fsid values of the fs_devices.
- */
- if (fs_devices->fsid_change &&
- found_transid > fs_devices->latest_generation) {
+ if (found_transid > fs_devices->latest_generation) {
memcpy(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE);
memcpy(fs_devices->metadata_uuid,
btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
- fs_devices->fsid_change = false;
}
}
@@ -997,7 +933,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
lockdep_assert_held(&uuid_mutex);
- fs_devices = alloc_fs_devices(orig->fsid, NULL);
+ fs_devices = alloc_fs_devices(orig->fsid);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -1068,9 +1004,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
continue;
- if (device->bdev) {
- blkdev_put(device->bdev, device->holder);
+ if (device->bdev_handle) {
+ bdev_release(device->bdev_handle);
device->bdev = NULL;
+ device->bdev_handle = NULL;
fs_devices->open_devices--;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1115,7 +1052,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
invalidate_bdev(device->bdev);
}
- blkdev_put(device->bdev, device->holder);
+ bdev_release(device->bdev_handle);
}
static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1356,14 +1293,19 @@ int btrfs_forget_devices(dev_t devt)
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
- * is read via pagecache
+ * is read via pagecache.
+ *
+ * With @mount_arg_dev it's a scan during mount time that will always register
+ * the device or return an error. Multi-device and seeding devices are registered
+ * in both cases.
*/
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
+struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+ bool mount_arg_dev)
{
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
u64 bytenr, bytenr_orig;
int ret;
@@ -1386,31 +1328,49 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags)
* values temporarily, as the device paths of the fsid are the only
* required information for assembling the volume.
*/
- bdev = blkdev_get_by_path(path, flags, NULL, NULL);
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
+ bdev_handle = bdev_open_by_path(path, flags, NULL, NULL);
+ if (IS_ERR(bdev_handle))
+ return ERR_CAST(bdev_handle);
bytenr_orig = btrfs_sb_offset(0);
- ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
+ ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr);
if (ret) {
device = ERR_PTR(ret);
goto error_bdev_put;
}
- disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
+ disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr,
+ bytenr_orig);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
}
+ if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
+ !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
+ dev_t devt;
+
+ ret = lookup_bdev(path, &devt);
+ if (ret)
+ btrfs_warn(NULL, "lookup bdev failed for path %s: %d",
+ path, ret);
+ else
+ btrfs_free_stale_devices(devt, NULL);
+
+ pr_debug("BTRFS: skip registering single non-seed device %s\n", path);
+ device = NULL;
+ goto free_disk_super;
+ }
+
device = device_list_add(path, disk_super, &new_device_added);
if (!IS_ERR(device) && new_device_added)
btrfs_free_stale_devices(device->devt, device);
+free_disk_super:
btrfs_release_disk_super(disk_super);
error_bdev_put:
- blkdev_put(bdev, NULL);
+ bdev_release(bdev_handle);
return device;
}
@@ -1594,7 +1554,7 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 search_start;
u64 hole_size;
u64 max_hole_start;
- u64 max_hole_size;
+ u64 max_hole_size = 0;
u64 extent_end;
u64 search_end = device->total_bytes;
int ret;
@@ -1602,17 +1562,16 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
struct extent_buffer *l;
search_start = dev_extent_search_start(device);
+ max_hole_start = search_start;
WARN_ON(device->zone_info &&
!IS_ALIGNED(num_bytes, device->zone_info->zone_size));
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- max_hole_start = search_start;
- max_hole_size = 0;
-
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
again:
if (search_start >= search_end ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
@@ -1895,7 +1854,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
ptr = btrfs_device_fsid(dev_item);
write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
ptr, BTRFS_FSID_SIZE);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
ret = 0;
out:
@@ -2088,7 +2047,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct block_device **bdev, void **holder)
+ struct bdev_handle **bdev_handle)
{
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
@@ -2197,7 +2156,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
btrfs_assign_next_active_device(device, NULL);
- if (device->bdev) {
+ if (device->bdev_handle) {
cur_devices->open_devices--;
/* remove sysfs entry */
btrfs_sysfs_remove_device(device);
@@ -2213,9 +2172,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
- * write lock, and blkdev_put() will pull in the ->open_mutex on the
- * block device and it's dependencies. Instead just flush the device
- * and let the caller do the final blkdev_put.
+ * write lock, and bdev_release() will pull in the ->open_mutex on
+ * the block device and it's dependencies. Instead just flush the
+ * device and let the caller do the final bdev_release.
*/
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
btrfs_scratch_superblocks(fs_info, device->bdev,
@@ -2226,8 +2185,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
}
- *bdev = device->bdev;
- *holder = device->holder;
+ *bdev_handle = device->bdev_handle;
synchronize_rcu();
btrfs_free_device(device);
@@ -2364,7 +2322,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
const char *path)
{
struct btrfs_super_block *disk_super;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
int ret;
if (!path || !path[0])
@@ -2382,7 +2340,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
}
ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
- &bdev, &disk_super);
+ &bdev_handle, &disk_super);
if (ret) {
btrfs_put_dev_args_from_path(args);
return ret;
@@ -2395,7 +2353,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
else
memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- blkdev_put(bdev, NULL);
+ bdev_release(bdev_handle);
return 0;
}
@@ -2452,7 +2410,7 @@ static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
* Private copy of the seed devices, anchored at
* fs_info->fs_devices->seed_list
*/
- seed_devices = alloc_fs_devices(NULL, NULL);
+ seed_devices = alloc_fs_devices(NULL);
if (IS_ERR(seed_devices))
return seed_devices;
@@ -2598,7 +2556,7 @@ next_slot:
if (device->fs_devices->seeding) {
btrfs_set_device_generation(leaf, dev_item,
device->generation);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
}
path->slots[0]++;
@@ -2615,7 +2573,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
struct super_block *sb = fs_info->sb;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *seed_devices = NULL;
@@ -2628,12 +2586,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
- bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
+ bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+ fs_info->bdev_holder, NULL);
+ if (IS_ERR(bdev_handle))
+ return PTR_ERR(bdev_handle);
- if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) {
ret = -EINVAL;
goto error;
}
@@ -2645,11 +2603,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
locked = true;
}
- sync_blockdev(bdev);
+ sync_blockdev(bdev_handle->bdev);
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
- if (device->bdev == bdev) {
+ if (device->bdev == bdev_handle->bdev) {
ret = -EEXIST;
rcu_read_unlock();
goto error;
@@ -2665,7 +2623,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
device->fs_info = fs_info;
- device->bdev = bdev;
+ device->bdev_handle = bdev_handle;
+ device->bdev = bdev_handle->bdev;
ret = lookup_bdev(device_path, &device->devt);
if (ret)
goto error_free_device;
@@ -2686,12 +2645,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
device->total_bytes =
- round_down(bdev_nr_bytes(bdev), fs_info->sectorsize);
+ round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- device->holder = fs_info->bdev_holder;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
@@ -2727,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
- if (!bdev_nonrot(bdev))
+ if (!bdev_nonrot(device->bdev))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@ -2849,7 +2807,7 @@ error_free_zone:
error_free_device:
btrfs_free_device(device);
error:
- blkdev_put(bdev, fs_info->bdev_holder);
+ bdev_release(bdev_handle);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
@@ -2896,7 +2854,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
btrfs_device_get_disk_total_bytes(device));
btrfs_set_device_bytes_used(leaf, dev_item,
btrfs_device_get_bytes_used(device));
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
@@ -2930,6 +2888,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
btrfs_set_super_total_bytes(super_copy,
round_down(old_total + diff, fs_info->sectorsize));
device->fs_devices->total_rw_bytes += diff;
+ atomic64_add(diff, &fs_info->free_chunk_space);
btrfs_device_set_total_bytes(device, new_size);
btrfs_device_set_disk_total_bytes(device, new_size);
@@ -3028,7 +2987,8 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
}
/*
- * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * Find the mapping containing the given logical extent.
+ *
* @logical: Logical block offset in bytes.
* @length: Length of extent in bytes.
*
@@ -3484,7 +3444,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info,
btrfs_set_balance_flags(leaf, item, bctl->flags);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
out:
btrfs_free_path(path);
err = btrfs_commit_transaction(trans);
@@ -4839,6 +4799,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
u64 old_size = btrfs_device_get_total_bytes(device);
u64 diff;
u64 start;
+ u64 free_diff = 0;
new_size = round_down(new_size, fs_info->sectorsize);
start = new_size;
@@ -4864,7 +4825,19 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
btrfs_device_set_total_bytes(device, new_size);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes -= diff;
- atomic64_sub(diff, &fs_info->free_chunk_space);
+
+ /*
+ * The new free_chunk_space is new_size - used, so we have to
+ * subtract the delta of the old free_chunk_space which included
+ * old_size - used. If used > new_size then just subtract this
+ * entire device's free space.
+ */
+ if (device->bytes_used < new_size)
+ free_diff = (old_size - device->bytes_used) -
+ (new_size - device->bytes_used);
+ else
+ free_diff = old_size - device->bytes_used;
+ atomic64_sub(free_diff, &fs_info->free_chunk_space);
}
/*
@@ -4999,9 +4972,10 @@ done:
if (ret) {
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, old_size);
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes += diff;
- atomic64_add(diff, &fs_info->free_chunk_space);
+ atomic64_add(free_diff, &fs_info->free_chunk_space);
+ }
mutex_unlock(&fs_info->chunk_mutex);
}
return ret;
@@ -5110,7 +5084,7 @@ static void init_alloc_chunk_ctl_policy_regular(
ASSERT(space_info);
ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
- ctl->max_stripe_size = ctl->max_chunk_size;
+ ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G);
if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
@@ -5881,6 +5855,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
}
static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ u64 logical,
u16 total_stripes)
{
struct btrfs_io_context *bioc;
@@ -5900,6 +5875,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
bioc->fs_info = fs_info;
bioc->replace_stripe_src = -1;
bioc->full_stripe_logical = (u64)-1;
+ bioc->logical = logical;
return bioc;
}
@@ -6204,12 +6180,20 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
return U64_MAX;
}
-static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
- u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
+static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+ u64 logical, u64 *length, struct btrfs_io_stripe *dst,
+ struct map_lookup *map, u32 stripe_index,
+ u64 stripe_offset, u64 stripe_nr)
{
dst->dev = map->stripes[stripe_index].dev;
+
+ if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type))
+ return btrfs_get_raid_extent_offset(fs_info, logical, length,
+ map->type, stripe_index, dst);
+
dst->physical = map->stripes[stripe_index].physical +
stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
+ return 0;
}
/*
@@ -6246,16 +6230,11 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *
* For RAID6 profile, mirror > 2 means mark another
* data/P stripe error and rebuild from the remaining
* stripes..
- *
- * @need_raid_map: (Used only for integrity checker) whether the map wants
- * a full stripe map (including all data and P/Q stripes)
- * for RAID56. Should always be 1 except integrity checker.
*/
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret,
- struct btrfs_io_stripe *smap, int *mirror_num_ret,
- int need_raid_map)
+ struct btrfs_io_stripe *smap, int *mirror_num_ret)
{
struct extent_map *em;
struct map_lookup *map;
@@ -6350,8 +6329,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) {
+ if (op != BTRFS_MAP_READ || mirror_num > 1) {
/*
+ * Needs full stripe mapping.
+ *
* Push stripe_nr back to the start of the full stripe
* For those cases needing a full stripe, @stripe_nr
* is the full stripe number.
@@ -6374,19 +6355,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
stripe_index = 0;
stripe_offset = 0;
} else {
- /*
- * Mirror #0 or #1 means the original data block.
- * Mirror #2 is RAID5 parity block.
- * Mirror #3 is RAID6 Q block.
- */
+ ASSERT(mirror_num <= 1);
+ /* Just grab the data stripe directly. */
stripe_index = stripe_nr % data_stripes;
stripe_nr /= data_stripes;
- if (mirror_num > 1)
- stripe_index = data_stripes + mirror_num - 2;
/* We distribute the parity blocks across stripes */
stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
- if (op == BTRFS_MAP_READ && mirror_num <= 1)
+ if (op == BTRFS_MAP_READ && mirror_num < 1)
mirror_num = 1;
}
} else {
@@ -6425,16 +6401,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* I/O context structure.
*/
if (smap && num_alloc_stripes == 1 &&
+ !(btrfs_need_stripe_tree_update(fs_info, map->type) &&
+ op != BTRFS_MAP_READ) &&
!((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
- set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
+ ret = set_io_stripe(fs_info, op, logical, length, smap, map,
+ stripe_index, stripe_offset, stripe_nr);
if (mirror_num_ret)
*mirror_num_ret = mirror_num;
*bioc_ret = NULL;
- ret = 0;
goto out;
}
- bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
+ bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
if (!bioc) {
ret = -ENOMEM;
goto out;
@@ -6448,7 +6426,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
*
* It's still mostly the same as other profiles, just with extra rotation.
*/
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
(op != BTRFS_MAP_READ || mirror_num > 1)) {
/*
* For RAID56 @stripe_nr is already the number of full stripes
@@ -6460,22 +6438,35 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
*/
bioc->full_stripe_logical = em->start +
btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
- for (i = 0; i < num_stripes; i++)
- set_io_stripe(&bioc->stripes[i], map,
- (i + stripe_nr) % num_stripes,
- stripe_offset, stripe_nr);
+ for (int i = 0; i < num_stripes; i++) {
+ ret = set_io_stripe(fs_info, op, logical, length,
+ &bioc->stripes[i], map,
+ (i + stripe_nr) % num_stripes,
+ stripe_offset, stripe_nr);
+ if (ret < 0)
+ break;
+ }
} else {
/*
* For all other non-RAID56 profiles, just copy the target
* stripe into the bioc.
*/
for (i = 0; i < num_stripes; i++) {
- set_io_stripe(&bioc->stripes[i], map, stripe_index,
- stripe_offset, stripe_nr);
+ ret = set_io_stripe(fs_info, op, logical, length,
+ &bioc->stripes[i], map, stripe_index,
+ stripe_offset, stripe_nr);
+ if (ret < 0)
+ break;
stripe_index++;
}
}
+ if (ret) {
+ *bioc_ret = NULL;
+ btrfs_put_bioc(bioc);
+ goto out;
+ }
+
if (op != BTRFS_MAP_READ)
max_errors = btrfs_chunk_max_errors(map);
@@ -6902,7 +6893,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
if (!btrfs_test_opt(fs_info, DEGRADED))
return ERR_PTR(-ENOENT);
- fs_devices = alloc_fs_devices(fsid, NULL);
+ fs_devices = alloc_fs_devices(fsid);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -7535,7 +7526,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
btrfs_set_dev_stats_value(eb, ptr, i,
btrfs_dev_stat_read(device, i));
- btrfs_mark_buffer_dirty(eb);
+ btrfs_mark_buffer_dirty(trans, eb);
out:
btrfs_free_path(path);
@@ -8077,7 +8068,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
ASSERT(mirror_num > 0);
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
- &bioc, smap, &mirror_ret, true);
+ &bioc, smap, &mirror_ret);
if (ret < 0)
return ret;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2128a032c3b7..9cc374864a79 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -90,13 +90,11 @@ struct btrfs_device {
u64 generation;
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
struct btrfs_zoned_device_info *zone_info;
- /* block device holder for blkdev_get/put */
- void *holder;
-
/*
* Device's major-minor number. Must be set even if the device is not
* opened (bdev == NULL), unless the device is missing.
@@ -290,6 +288,19 @@ struct btrfs_fs_devices {
* - Following shall be true at all times:
* - metadata_uuid == btrfs_header::fsid
* - metadata_uuid == btrfs_dev_item::fsid
+ *
+ * - Relations between fsid and metadata_uuid in sb and fs_devices:
+ * - Normal:
+ * fs_devices->fsid == fs_devices->metadata_uuid == sb->fsid
+ * sb->metadata_uuid == 0
+ *
+ * - When the BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag is set:
+ * fs_devices->fsid == sb->fsid
+ * fs_devices->metadata_uuid == sb->metadata_uuid
+ *
+ * - When in-memory fs_devices->temp_fsid is true
+ * fs_devices->fsid = random
+ * fs_devices->metadata_uuid == sb->fsid
*/
u8 metadata_uuid[BTRFS_FSID_SIZE];
@@ -353,9 +364,10 @@ struct btrfs_fs_devices {
bool rotating;
/* Devices support TRIM/discard commands. */
bool discardable;
- bool fsid_change;
/* The filesystem is a seed filesystem. */
bool seeding;
+ /* The mount needs to use a randomly generated fsid. */
+ bool temp_fsid;
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
@@ -381,12 +393,12 @@ struct btrfs_fs_devices {
struct btrfs_io_stripe {
struct btrfs_device *dev;
- union {
- /* Block mapping */
- u64 physical;
- /* For the endio handler */
- struct btrfs_io_context *bioc;
- };
+ /* Block mapping. */
+ u64 physical;
+ u64 length;
+ bool is_scrub;
+ /* For the endio handler. */
+ struct btrfs_io_context *bioc;
};
struct btrfs_discard_stripe {
@@ -419,6 +431,11 @@ struct btrfs_io_context {
atomic_t error;
u16 max_errors;
+ u64 logical;
+ u64 size;
+ /* Raid stripe tree ordered entry. */
+ struct list_head rst_ordered_entry;
+
/*
* The total number of stripes, including the extra duplicated
* stripe for replace.
@@ -596,8 +613,7 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret,
- struct btrfs_io_stripe *smap, int *mirror_num_ret,
- int need_raid_map);
+ struct btrfs_io_stripe *smap, int *mirror_num_ret);
int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
struct btrfs_io_stripe *smap, u64 logical,
u32 length, int mirror_num);
@@ -611,7 +627,8 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
blk_mode_t flags, void *holder);
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags);
+struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+ bool mount_arg_dev);
int btrfs_forget_devices(dev_t devt);
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
@@ -629,7 +646,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
- struct block_device **bdev, void **holder);
+ struct bdev_handle **bdev_handle);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 96828a13dd43..3cf236fb40a4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -188,15 +188,15 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
if (old_data_len + name_len + sizeof(*di) == item_size) {
/* No other xattrs packed in the same leaf item. */
if (size > old_data_len)
- btrfs_extend_item(path, size - old_data_len);
+ btrfs_extend_item(trans, path, size - old_data_len);
else if (size < old_data_len)
- btrfs_truncate_item(path, data_size, 1);
+ btrfs_truncate_item(trans, path, data_size, 1);
} else {
/* There are other xattrs packed in the same item. */
ret = btrfs_delete_one_dir_name(trans, root, path, di);
if (ret)
goto out;
- btrfs_extend_item(path, data_size);
+ btrfs_extend_item(trans, path, data_size);
}
ptr = btrfs_item_ptr(leaf, slot, char);
@@ -205,7 +205,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
btrfs_set_dir_data_len(leaf, di, size);
data_ptr = ((unsigned long)(di + 1)) + name_len;
write_extent_buffer(leaf, value, data_ptr, size);
- btrfs_mark_buffer_dirty(leaf);
+ btrfs_mark_buffer_dirty(trans, leaf);
} else {
/*
* Insert, and we had space for the xattr, so path->slots[0] is
@@ -265,7 +265,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -408,7 +408,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
if (!ret) {
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
if (ret)
btrfs_abort_transaction(trans, ret);
}
@@ -442,7 +442,7 @@ static const struct xattr_handler btrfs_btrfs_xattr_handler = {
.set = btrfs_xattr_handler_set_prop,
};
-const struct xattr_handler *btrfs_xattr_handlers[] = {
+const struct xattr_handler * const btrfs_xattr_handlers[] = {
&btrfs_security_xattr_handler,
&btrfs_trusted_xattr_handler,
&btrfs_user_xattr_handler,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 1cd3fc0a8f17..118118ca3e1d 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -8,7 +8,7 @@
#include <linux/xattr.h>
-extern const struct xattr_handler *btrfs_xattr_handlers[];
+extern const struct xattr_handler * const btrfs_xattr_handlers[];
int btrfs_getxattr(struct inode *inode, const char *name,
void *buffer, size_t size);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 09bc325d075d..3504ade30cb0 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1282,21 +1282,284 @@ out:
return ret;
}
+struct zone_info {
+ u64 physical;
+ u64 capacity;
+ u64 alloc_offset;
+};
+
+static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
+ struct zone_info *info, unsigned long *active,
+ struct map_lookup *map)
+{
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_device *device = map->stripes[zone_idx].dev;
+ int dev_replace_is_ongoing = 0;
+ unsigned int nofs_flag;
+ struct blk_zone zone;
+ int ret;
+
+ info->physical = map->stripes[zone_idx].physical;
+
+ if (!device->bdev) {
+ info->alloc_offset = WP_MISSING_DEV;
+ return 0;
+ }
+
+ /* Consider a zone as active if we can allow any number of active zones. */
+ if (!device->zone_info->max_active_zones)
+ __set_bit(zone_idx, active);
+
+ if (!btrfs_dev_is_sequential(device, info->physical)) {
+ info->alloc_offset = WP_CONVENTIONAL;
+ return 0;
+ }
+
+ /* This zone will be used for allocation, so mark this zone non-empty. */
+ btrfs_dev_clear_zone_empty(device, info->physical);
+
+ down_read(&dev_replace->rwsem);
+ dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
+ btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);
+ up_read(&dev_replace->rwsem);
+
+ /*
+ * The group is mapped to a sequential zone. Get the zone write pointer
+ * to determine the allocation offset within the zone.
+ */
+ WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
+ nofs_flag = memalloc_nofs_save();
+ ret = btrfs_get_dev_zone(device, info->physical, &zone);
+ memalloc_nofs_restore(nofs_flag);
+ if (ret) {
+ if (ret != -EIO && ret != -EOPNOTSUPP)
+ return ret;
+ info->alloc_offset = WP_MISSING_DEV;
+ return 0;
+ }
+
+ if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
+ zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
+ device->devid);
+ return -EIO;
+ }
+
+ info->capacity = (zone.capacity << SECTOR_SHIFT);
+
+ switch (zone.cond) {
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ btrfs_err(fs_info,
+ "zoned: offline/readonly zone %llu on device %s (devid %llu)",
+ (info->physical >> device->zone_info->zone_size_shift),
+ rcu_str_deref(device->name), device->devid);
+ info->alloc_offset = WP_MISSING_DEV;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ info->alloc_offset = 0;
+ break;
+ case BLK_ZONE_COND_FULL:
+ info->alloc_offset = info->capacity;
+ break;
+ default:
+ /* Partially used zone. */
+ info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT);
+ __set_bit(zone_idx, active);
+ break;
+ }
+
+ return 0;
+}
+
+static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
+ struct zone_info *info,
+ unsigned long *active)
+{
+ if (info->alloc_offset == WP_MISSING_DEV) {
+ btrfs_err(bg->fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ info->physical);
+ return -EIO;
+ }
+
+ bg->alloc_offset = info->alloc_offset;
+ bg->zone_capacity = info->capacity;
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ return 0;
+}
+
+static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree");
+ return -EINVAL;
+ }
+
+ if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
+ btrfs_err(bg->fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ zone_info[0].physical);
+ return -EIO;
+ }
+ if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
+ btrfs_err(bg->fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ zone_info[1].physical);
+ return -EIO;
+ }
+ if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
+ btrfs_err(bg->fs_info,
+ "zoned: write pointer offset mismatch of zones in DUP profile");
+ return -EIO;
+ }
+
+ if (test_bit(0, active) != test_bit(1, active)) {
+ if (!btrfs_zone_activate(bg))
+ return -EIO;
+ } else if (test_bit(0, active)) {
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+
+ bg->alloc_offset = zone_info[0].alloc_offset;
+ bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity);
+ return 0;
+}
+
+static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ int i;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
+ !btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_err(fs_info,
+ "zoned: write pointer offset mismatch of zones in %s profile",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EIO;
+ }
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_test_opt(fs_info, DEGRADED) &&
+ !btrfs_zone_activate(bg)) {
+ return -EIO;
+ }
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+ /* In case a device is missing we have a cap of 0, so don't use it. */
+ bg->zone_capacity = min_not_zero(zone_info[0].capacity,
+ zone_info[1].capacity);
+ }
+
+ if (zone_info[0].alloc_offset != WP_MISSING_DEV)
+ bg->alloc_offset = zone_info[0].alloc_offset;
+ else
+ bg->alloc_offset = zone_info[i - 1].alloc_offset;
+
+ return 0;
+}
+
+static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ for (int i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_zone_activate(bg))
+ return -EIO;
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+ bg->zone_capacity += zone_info[i].capacity;
+ bg->alloc_offset += zone_info[i].alloc_offset;
+ }
+
+ return 0;
+}
+
+static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct zone_info *zone_info,
+ unsigned long *active)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
+ for (int i = 0; i < map->num_stripes; i++) {
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+ zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ continue;
+
+ if (test_bit(0, active) != test_bit(i, active)) {
+ if (!btrfs_zone_activate(bg))
+ return -EIO;
+ } else {
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+ }
+
+ if ((i % map->sub_stripes) == 0) {
+ bg->zone_capacity += zone_info[i].capacity;
+ bg->alloc_offset += zone_info[i].alloc_offset;
+ }
+ }
+
+ return 0;
+}
+
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_device *device;
u64 logical = cache->start;
u64 length = cache->length;
+ struct zone_info *zone_info = NULL;
int ret;
int i;
- unsigned int nofs_flag;
- u64 *alloc_offsets = NULL;
- u64 *caps = NULL;
- u64 *physical = NULL;
unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
@@ -1328,20 +1591,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
- alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
- if (!alloc_offsets) {
- ret = -ENOMEM;
- goto out;
- }
-
- caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
- if (!caps) {
- ret = -ENOMEM;
- goto out;
- }
-
- physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
- if (!physical) {
+ zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
+ if (!zone_info) {
ret = -ENOMEM;
goto out;
}
@@ -1353,98 +1604,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
for (i = 0; i < map->num_stripes; i++) {
- bool is_sequential;
- struct blk_zone zone;
- struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- int dev_replace_is_ongoing = 0;
-
- device = map->stripes[i].dev;
- physical[i] = map->stripes[i].physical;
-
- if (device->bdev == NULL) {
- alloc_offsets[i] = WP_MISSING_DEV;
- continue;
- }
-
- is_sequential = btrfs_dev_is_sequential(device, physical[i]);
- if (is_sequential)
- num_sequential++;
- else
- num_conventional++;
-
- /*
- * Consider a zone as active if we can allow any number of
- * active zones.
- */
- if (!device->zone_info->max_active_zones)
- __set_bit(i, active);
-
- if (!is_sequential) {
- alloc_offsets[i] = WP_CONVENTIONAL;
- continue;
- }
-
- /*
- * This zone will be used for allocation, so mark this zone
- * non-empty.
- */
- btrfs_dev_clear_zone_empty(device, physical[i]);
-
- down_read(&dev_replace->rwsem);
- dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
- btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
- up_read(&dev_replace->rwsem);
-
- /*
- * The group is mapped to a sequential zone. Get the zone write
- * pointer to determine the allocation offset within the zone.
- */
- WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
- nofs_flag = memalloc_nofs_save();
- ret = btrfs_get_dev_zone(device, physical[i], &zone);
- memalloc_nofs_restore(nofs_flag);
- if (ret == -EIO || ret == -EOPNOTSUPP) {
- ret = 0;
- alloc_offsets[i] = WP_MISSING_DEV;
- continue;
- } else if (ret) {
- goto out;
- }
-
- if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
- btrfs_err_in_rcu(fs_info,
- "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
- zone.start << SECTOR_SHIFT,
- rcu_str_deref(device->name), device->devid);
- ret = -EIO;
+ ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
+ if (ret)
goto out;
- }
- caps[i] = (zone.capacity << SECTOR_SHIFT);
-
- switch (zone.cond) {
- case BLK_ZONE_COND_OFFLINE:
- case BLK_ZONE_COND_READONLY:
- btrfs_err(fs_info,
- "zoned: offline/readonly zone %llu on device %s (devid %llu)",
- physical[i] >> device->zone_info->zone_size_shift,
- rcu_str_deref(device->name), device->devid);
- alloc_offsets[i] = WP_MISSING_DEV;
- break;
- case BLK_ZONE_COND_EMPTY:
- alloc_offsets[i] = 0;
- break;
- case BLK_ZONE_COND_FULL:
- alloc_offsets[i] = caps[i];
- break;
- default:
- /* Partially used zone */
- alloc_offsets[i] =
- ((zone.wp - zone.start) << SECTOR_SHIFT);
- __set_bit(i, active);
- break;
- }
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ num_conventional++;
+ else
+ num_sequential++;
}
if (num_sequential > 0)
@@ -1468,63 +1635,24 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
case 0: /* single */
- if (alloc_offsets[0] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[0]);
- ret = -EIO;
- goto out;
- }
- cache->alloc_offset = alloc_offsets[0];
- cache->zone_capacity = caps[0];
- if (test_bit(0, active))
- set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
+ ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
break;
case BTRFS_BLOCK_GROUP_DUP:
- if (map->type & BTRFS_BLOCK_GROUP_DATA) {
- btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
- ret = -EINVAL;
- goto out;
- }
- if (alloc_offsets[0] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[0]);
- ret = -EIO;
- goto out;
- }
- if (alloc_offsets[1] == WP_MISSING_DEV) {
- btrfs_err(fs_info,
- "zoned: cannot recover write pointer for zone %llu",
- physical[1]);
- ret = -EIO;
- goto out;
- }
- if (alloc_offsets[0] != alloc_offsets[1]) {
- btrfs_err(fs_info,
- "zoned: write pointer offset mismatch of zones in DUP profile");
- ret = -EIO;
- goto out;
- }
- if (test_bit(0, active) != test_bit(1, active)) {
- if (!btrfs_zone_activate(cache)) {
- ret = -EIO;
- goto out;
- }
- } else {
- if (test_bit(0, active))
- set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
- &cache->runtime_flags);
- }
- cache->alloc_offset = alloc_offsets[0];
- cache->zone_capacity = min(caps[0], caps[1]);
+ ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
break;
case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID0:
+ ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID10:
+ ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
+ break;
case BTRFS_BLOCK_GROUP_RAID5:
case BTRFS_BLOCK_GROUP_RAID6:
- /* non-single profiles are not supported yet */
default:
btrfs_err(fs_info, "zoned: profile %s not yet supported",
btrfs_bg_type_to_raid_name(map->type));
@@ -1570,9 +1698,7 @@ out:
cache->physical_map = NULL;
}
bitmap_free(active);
- kfree(physical);
- kfree(caps);
- kfree(alloc_offsets);
+ kfree(zone_info);
free_extent_map(em);
return ret;
@@ -1609,7 +1735,7 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans,
set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
set_extent_buffer_dirty(eb);
set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
- EXTENT_DIRTY | EXTENT_NOWAIT, NULL);
+ EXTENT_DIRTY, NULL);
}
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
@@ -1887,7 +2013,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
int i, ret;
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &mapped_length, &bioc, NULL, NULL, 1);
+ &mapped_length, &bioc, NULL, NULL);
if (ret || !bioc || mapped_length < PAGE_SIZE) {
ret = -EIO;
goto out_put_bioc;
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index e7ac4ec809a4..5511766485cd 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -145,7 +145,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
}
/*
- * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds
+ * Calculate monotonic memory bounds.
*
* It is possible based on the level configurations that a higher level
* workspace uses less memory than a lower level workspace. In order to reuse
@@ -218,7 +218,8 @@ void zstd_cleanup_workspace_manager(void)
}
/*
- * zstd_find_workspace - find workspace
+ * Find workspace for given level.
+ *
* @level: compression level
*
* This iterates over the set bits in the active_map beginning at the requested
@@ -256,7 +257,8 @@ static struct list_head *zstd_find_workspace(unsigned int level)
}
/*
- * zstd_get_workspace - zstd's get_workspace
+ * Zstd get_workspace for level.
+ *
* @level: compression level
*
* If @level is 0, then any compression level can be used. Therefore, we begin
@@ -296,7 +298,8 @@ again:
}
/*
- * zstd_put_workspace - zstd put_workspace
+ * Zstd put_workspace.
+ *
* @ws: list_head for the workspace
*
* When putting back a workspace, we only need to update the LRU if we are of
diff --git a/fs/buffer.c b/fs/buffer.c
index 2379564e5aea..12e9a71c693d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2011,7 +2011,7 @@ void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
}
EXPORT_SYMBOL(folio_zero_new_buffers);
-static void
+static int
iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
const struct iomap *iomap)
{
@@ -2025,7 +2025,8 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
* current block, then do not map the buffer and let the caller
* handle it.
*/
- BUG_ON(offset >= iomap->offset + iomap->length);
+ if (offset >= iomap->offset + iomap->length)
+ return -EIO;
switch (iomap->type) {
case IOMAP_HOLE:
@@ -2037,7 +2038,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
if (!buffer_uptodate(bh) ||
(offset >= i_size_read(inode)))
set_buffer_new(bh);
- break;
+ return 0;
case IOMAP_DELALLOC:
if (!buffer_uptodate(bh) ||
(offset >= i_size_read(inode)))
@@ -2045,7 +2046,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
set_buffer_uptodate(bh);
set_buffer_mapped(bh);
set_buffer_delay(bh);
- break;
+ return 0;
case IOMAP_UNWRITTEN:
/*
* For unwritten regions, we always need to ensure that regions
@@ -2057,12 +2058,24 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
fallthrough;
case IOMAP_MAPPED:
if ((iomap->flags & IOMAP_F_NEW) ||
- offset >= i_size_read(inode))
+ offset >= i_size_read(inode)) {
+ /*
+ * This can happen if truncating the block device races
+ * with the check in the caller as i_size updates on
+ * block devices aren't synchronized by i_rwsem for
+ * block devices.
+ */
+ if (S_ISBLK(inode->i_mode))
+ return -EIO;
set_buffer_new(bh);
+ }
bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
inode->i_blkbits;
set_buffer_mapped(bh);
- break;
+ return 0;
+ default:
+ WARN_ON_ONCE(1);
+ return -EIO;
}
}
@@ -2103,13 +2116,12 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
- if (get_block) {
+ if (get_block)
err = get_block(inode, block, bh, 1);
- if (err)
- break;
- } else {
- iomap_to_bh(inode, block, bh, iomap);
- }
+ else
+ err = iomap_to_bh(inode, block, bh, iomap);
+ if (err)
+ break;
if (buffer_new(bh)) {
clean_bdev_bh_alias(bh);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index f4863078f7fe..936b9e0b351d 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -750,7 +750,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
dout("writepage %llu~%llu (%llu bytes, %sencrypted)\n",
page_off, len, wlen, IS_ENCRYPTED(inode) ? "" : "not ");
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(osdc, req);
err = ceph_osdc_wait_request(osdc, req);
@@ -1327,7 +1327,7 @@ new_request:
pages = NULL;
}
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(&fsc->client->osdc, req);
req = NULL;
@@ -1875,7 +1875,7 @@ int ceph_uninline_data(struct file *file)
goto out_unlock;
}
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(&fsc->client->osdc, req);
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_osdc_put_request(req);
@@ -1917,7 +1917,7 @@ int ceph_uninline_data(struct file *file)
goto out_put_req;
}
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(&fsc->client->osdc, req);
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -2092,7 +2092,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
0, false, true);
ceph_osdc_start_request(&fsc->client->osdc, rd_req);
- wr_req->r_mtime = ci->netfs.inode.i_mtime;
+ wr_req->r_mtime = inode_get_mtime(&ci->netfs.inode);
ceph_osdc_start_request(&fsc->client->osdc, wr_req);
err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 14215ec646f7..a104669fcf4c 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1421,8 +1421,8 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
arg->old_xattr_buf = NULL;
}
- arg->mtime = inode->i_mtime;
- arg->atime = inode->i_atime;
+ arg->mtime = inode_get_mtime(inode);
+ arg->atime = inode_get_atime(inode);
arg->ctime = inode_get_ctime(inode);
arg->btime = ci->i_btime;
arg->change_attr = inode_peek_iversion_raw(inode);
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index e4d5cd56a80b..e3b1c3fab412 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -133,6 +133,7 @@ static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb)
}
static struct fscrypt_operations ceph_fscrypt_ops = {
+ .needs_bounce_pages = 1,
.get_context = ceph_crypt_get_context,
.set_context = ceph_crypt_set_context,
.get_dummy_policy = ceph_get_dummy_policy,
@@ -249,11 +250,9 @@ static struct inode *parse_longname(const struct inode *parent,
if (!dir) {
/* This can happen if we're not mounting cephfs on the root */
dir = ceph_get_inode(parent->i_sb, vino, NULL);
- if (!dir)
- dir = ERR_PTR(-ENOENT);
+ if (IS_ERR(dir))
+ dout("Can't find inode %s (%s)\n", inode_number, name);
}
- if (IS_ERR(dir))
- dout("Can't find inode %s (%s)\n", inode_number, name);
out:
kfree(inode_number);
@@ -462,7 +461,7 @@ int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname,
out:
fscrypt_fname_free_buffer(&_tname);
out_inode:
- if ((dir != fname->dir) && !IS_ERR(dir)) {
+ if (dir != fname->dir) {
if ((dir->i_state & I_NEW))
discard_new_inode(dir);
else
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index b1da02f5dbe3..649600d0a7b6 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2489,7 +2489,7 @@ static int ceph_zero_partial_object(struct inode *inode,
goto out;
}
- req->r_mtime = inode->i_mtime;
+ req->r_mtime = inode_get_mtime(inode);
ceph_osdc_start_request(&fsc->client->osdc, req);
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
if (ret == -ENOENT)
@@ -2969,7 +2969,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
ret = do_splice_direct(src_file, &src_off, dst_file,
&dst_off, src_objlen, flags);
/* Abort on short copies or on error */
- if (ret < src_objlen) {
+ if (ret < (long)src_objlen) {
dout("Failed partial copy (%zd)\n", ret);
goto out;
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 800ab7920513..2e2a303b9e64 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -185,9 +185,9 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
- inode->i_mtime = parent->i_mtime;
+ inode_set_mtime_to_ts(inode, inode_get_mtime(parent));
inode_set_ctime_to_ts(inode, inode_get_ctime(parent));
- inode->i_atime = parent->i_atime;
+ inode_set_atime_to_ts(inode, inode_get_atime(parent));
ci->i_rbytes = 0;
ci->i_btime = ceph_inode(parent)->i_btime;
@@ -769,9 +769,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
ci->i_truncate_seq = truncate_seq;
/* the MDS should have revoked these caps */
- WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL |
- CEPH_CAP_FILE_RD |
- CEPH_CAP_FILE_WR |
+ WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD |
CEPH_CAP_FILE_LAZYIO));
/*
* If we hold relevant caps, or in the case where we're
@@ -837,28 +835,31 @@ void ceph_fill_file_time(struct inode *inode, int issued,
/* the MDS did a utimes() */
dout("mtime %lld.%09ld -> %lld.%09ld "
"tw %d -> %d\n",
- inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+ inode_get_mtime_sec(inode),
+ inode_get_mtime_nsec(inode),
mtime->tv_sec, mtime->tv_nsec,
ci->i_time_warp_seq, (int)time_warp_seq);
- inode->i_mtime = *mtime;
- inode->i_atime = *atime;
+ inode_set_mtime_to_ts(inode, *mtime);
+ inode_set_atime_to_ts(inode, *atime);
ci->i_time_warp_seq = time_warp_seq;
} else if (time_warp_seq == ci->i_time_warp_seq) {
+ struct timespec64 ts;
+
/* nobody did utimes(); take the max */
- if (timespec64_compare(mtime, &inode->i_mtime) > 0) {
+ ts = inode_get_mtime(inode);
+ if (timespec64_compare(mtime, &ts) > 0) {
dout("mtime %lld.%09ld -> %lld.%09ld inc\n",
- inode->i_mtime.tv_sec,
- inode->i_mtime.tv_nsec,
+ ts.tv_sec, ts.tv_nsec,
mtime->tv_sec, mtime->tv_nsec);
- inode->i_mtime = *mtime;
+ inode_set_mtime_to_ts(inode, *mtime);
}
- if (timespec64_compare(atime, &inode->i_atime) > 0) {
+ ts = inode_get_atime(inode);
+ if (timespec64_compare(atime, &ts) > 0) {
dout("atime %lld.%09ld -> %lld.%09ld inc\n",
- inode->i_atime.tv_sec,
- inode->i_atime.tv_nsec,
+ ts.tv_sec, ts.tv_nsec,
atime->tv_sec, atime->tv_nsec);
- inode->i_atime = *atime;
+ inode_set_atime_to_ts(inode, *atime);
}
} else if (issued & CEPH_CAP_FILE_EXCL) {
/* we did a utimes(); ignore mds values */
@@ -869,8 +870,8 @@ void ceph_fill_file_time(struct inode *inode, int issued,
/* we have no write|excl caps; whatever the MDS says is true */
if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
inode_set_ctime_to_ts(inode, *ctime);
- inode->i_mtime = *mtime;
- inode->i_atime = *atime;
+ inode_set_mtime_to_ts(inode, *mtime);
+ inode_set_atime_to_ts(inode, *atime);
ci->i_time_warp_seq = time_warp_seq;
} else {
warn = 1;
@@ -2553,20 +2554,22 @@ retry:
}
if (ia_valid & ATTR_ATIME) {
+ struct timespec64 atime = inode_get_atime(inode);
+
dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode,
- inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
+ atime.tv_sec, atime.tv_nsec,
attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
if (issued & CEPH_CAP_FILE_EXCL) {
ci->i_time_warp_seq++;
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
dirtied |= CEPH_CAP_FILE_EXCL;
} else if ((issued & CEPH_CAP_FILE_WR) &&
- timespec64_compare(&inode->i_atime,
- &attr->ia_atime) < 0) {
- inode->i_atime = attr->ia_atime;
+ timespec64_compare(&atime,
+ &attr->ia_atime) < 0) {
+ inode_set_atime_to_ts(inode, attr->ia_atime);
dirtied |= CEPH_CAP_FILE_WR;
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- !timespec64_equal(&inode->i_atime, &attr->ia_atime)) {
+ !timespec64_equal(&atime, &attr->ia_atime)) {
ceph_encode_timespec64(&req->r_args.setattr.atime,
&attr->ia_atime);
mask |= CEPH_SETATTR_ATIME;
@@ -2626,20 +2629,21 @@ retry:
}
}
if (ia_valid & ATTR_MTIME) {
+ struct timespec64 mtime = inode_get_mtime(inode);
+
dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
- inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+ mtime.tv_sec, mtime.tv_nsec,
attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
if (issued & CEPH_CAP_FILE_EXCL) {
ci->i_time_warp_seq++;
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
dirtied |= CEPH_CAP_FILE_EXCL;
} else if ((issued & CEPH_CAP_FILE_WR) &&
- timespec64_compare(&inode->i_mtime,
- &attr->ia_mtime) < 0) {
- inode->i_mtime = attr->ia_mtime;
+ timespec64_compare(&mtime, &attr->ia_mtime) < 0) {
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
dirtied |= CEPH_CAP_FILE_WR;
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
- !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) {
+ !timespec64_equal(&mtime, &attr->ia_mtime)) {
ceph_encode_timespec64(&req->r_args.setattr.mtime,
&attr->ia_mtime);
mask |= CEPH_SETATTR_MTIME;
@@ -2653,8 +2657,8 @@ retry:
bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode,
- inode_get_ctime(inode).tv_sec,
- inode_get_ctime(inode).tv_nsec,
+ inode_get_ctime_sec(inode),
+ inode_get_ctime_nsec(inode),
attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
only ? "ctime only" : "ignored");
if (only) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 615db141b6c4..de798444bb97 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -861,8 +861,8 @@ int ceph_wait_on_conflict_unlink(struct dentry *dentry)
if (!d_same_name(udentry, pdentry, &dname))
goto next;
+ found = dget_dlock(udentry);
spin_unlock(&udentry->d_lock);
- found = dget(udentry);
break;
next:
spin_unlock(&udentry->d_lock);
@@ -4353,12 +4353,16 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
+ struct timespec64 ts;
+
rec.v1.cap_id = cpu_to_le64(cap->cap_id);
rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v1.issued = cpu_to_le32(cap->issued);
rec.v1.size = cpu_to_le64(i_size_read(inode));
- ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
- ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
+ ts = inode_get_mtime(inode);
+ ceph_encode_timespec64(&rec.v1.mtime, &ts);
+ ts = inode_get_atime(inode);
+ ceph_encode_timespec64(&rec.v1.atime, &ts);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
rec.v1.pathbase = cpu_to_le64(pathbase);
}
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 813f21add992..6732e1ea97d9 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -658,8 +658,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
BUG_ON(capsnap->writing);
capsnap->size = i_size_read(inode);
- capsnap->mtime = inode->i_mtime;
- capsnap->atime = inode->i_atime;
+ capsnap->mtime = inode_get_mtime(inode);
+ capsnap->atime = inode_get_atime(inode);
capsnap->ctime = inode_get_ctime(inode);
capsnap->btime = ci->i_btime;
capsnap->change_attr = inode_peek_iversion_raw(inode);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 51c7f2b14f6f..98844fc8a2f7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1119,7 +1119,7 @@ ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
-extern const struct xattr_handler *ceph_xattr_handlers[];
+extern const struct xattr_handler * const ceph_xattr_handlers[];
struct ceph_acl_sec_ctx {
#ifdef CONFIG_CEPH_FS_POSIX_ACL
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 0deae4a0f5f1..097ce7f74073 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1446,7 +1446,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx)
* List of handlers for synthetic system.* attributes. Other
* attributes are handled directly.
*/
-const struct xattr_handler *ceph_xattr_handlers[] = {
+const struct xattr_handler * const ceph_xattr_handlers[] = {
&ceph_other_xattr_handler,
NULL,
};
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 950b6919fb87..6ba032442b39 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -350,7 +350,7 @@ static struct kobject *cdev_get(struct cdev *p)
struct module *owner = p->owner;
struct kobject *kobj;
- if (owner && !try_module_get(owner))
+ if (!try_module_get(owner))
return NULL;
kobj = kobject_get_unless_zero(&p->kobj);
if (!kobj)
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index ae023853a98f..1d2dac95f86a 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -123,9 +123,11 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
if (attr->va_size != -1)
inode->i_blocks = (attr->va_size + 511) >> 9;
if (attr->va_atime.tv_sec != -1)
- inode->i_atime = coda_to_timespec64(attr->va_atime);
+ inode_set_atime_to_ts(inode,
+ coda_to_timespec64(attr->va_atime));
if (attr->va_mtime.tv_sec != -1)
- inode->i_mtime = coda_to_timespec64(attr->va_mtime);
+ inode_set_mtime_to_ts(inode,
+ coda_to_timespec64(attr->va_mtime));
if (attr->va_ctime.tv_sec != -1)
inode_set_ctime_to_ts(inode,
coda_to_timespec64(attr->va_ctime));
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index cb512b10473b..4e552ba7bd43 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -111,7 +111,7 @@ static inline void coda_dir_update_mtime(struct inode *dir)
/* optimistically we can also act as if our nose bleeds. The
* granularity of the mtime is coarse anyways so we might actually be
* right most of the time. Note: we only do this for directories. */
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
#endif
}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 42346618b4ed..16acc58311ea 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -84,7 +84,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0);
coda_inode->i_size = file_inode(host_file)->i_size;
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
- coda_inode->i_mtime = inode_set_ctime_current(coda_inode);
+ inode_set_mtime_to_ts(coda_inode, inode_set_ctime_current(coda_inode));
inode_unlock(coda_inode);
file_end_write(host_file);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index fbdcb3582926..dcc22f593e43 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -88,7 +88,7 @@ int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
{
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
@@ -96,8 +96,8 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
inode->i_mode = iattr->ia_mode;
inode->i_uid = iattr->ia_uid;
inode->i_gid = iattr->ia_gid;
- inode->i_atime = iattr->ia_atime;
- inode->i_mtime = iattr->ia_mtime;
+ inode_set_atime_to_ts(inode, iattr->ia_atime);
+ inode_set_mtime_to_ts(inode, iattr->ia_mtime);
inode_set_ctime_to_ts(inode, iattr->ia_ctime);
}
@@ -171,7 +171,7 @@ struct inode *configfs_create(struct dentry *dentry, umode_t mode)
return ERR_PTR(-ENOMEM);
p_inode = d_inode(dentry->d_parent);
- p_inode->i_mtime = inode_set_ctime_current(p_inode);
+ inode_set_mtime_to_ts(p_inode, inode_set_ctime_current(p_inode));
configfs_set_inode_lock_class(sd, inode);
return inode;
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 5ee7d7bbb361..60dbfa0f8805 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -133,8 +133,8 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
}
/* Struct copy intentional */
- inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
- zerotime);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, zerotime)));
/* inode->i_nlink is left 1 - arguably wrong for directories,
but it's the best we can do without reading the directory
contents. 1 yields the right result in GNU find, even
@@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb)
sb->s_mtd = NULL;
} else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) {
sync_blockdev(sb->s_bdev);
- blkdev_put(sb->s_bdev, sb);
+ bdev_release(sb->s_bdev_handle);
}
kfree(sbi);
}
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 62e1a3dd8357..0ad8c30b8fa5 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -111,10 +111,14 @@ out:
int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
sector_t pblk, unsigned int len)
{
- const unsigned int blockbits = inode->i_blkbits;
- const unsigned int blocksize = 1 << blockbits;
- const unsigned int blocks_per_page_bits = PAGE_SHIFT - blockbits;
- const unsigned int blocks_per_page = 1 << blocks_per_page_bits;
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const unsigned int du_bits = ci->ci_data_unit_bits;
+ const unsigned int du_size = 1U << du_bits;
+ const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits;
+ const unsigned int du_per_page = 1U << du_per_page_bits;
+ u64 du_index = (u64)lblk << (inode->i_blkbits - du_bits);
+ u64 du_remaining = (u64)len << (inode->i_blkbits - du_bits);
+ sector_t sector = pblk << (inode->i_blkbits - SECTOR_SHIFT);
struct page *pages[16]; /* write up to 16 pages at a time */
unsigned int nr_pages;
unsigned int i;
@@ -130,8 +134,8 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
len);
BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS);
- nr_pages = min_t(unsigned int, ARRAY_SIZE(pages),
- (len + blocks_per_page - 1) >> blocks_per_page_bits);
+ nr_pages = min_t(u64, ARRAY_SIZE(pages),
+ (du_remaining + du_per_page - 1) >> du_per_page_bits);
/*
* We need at least one page for ciphertext. Allocate the first one
@@ -154,21 +158,22 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS);
do {
- bio->bi_iter.bi_sector = pblk << (blockbits - 9);
+ bio->bi_iter.bi_sector = sector;
i = 0;
offset = 0;
do {
- err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk,
- ZERO_PAGE(0), pages[i],
- blocksize, offset, GFP_NOFS);
+ err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, du_index,
+ ZERO_PAGE(0), pages[i],
+ du_size, offset,
+ GFP_NOFS);
if (err)
goto out;
- lblk++;
- pblk++;
- len--;
- offset += blocksize;
- if (offset == PAGE_SIZE || len == 0) {
+ du_index++;
+ sector += 1U << (du_bits - SECTOR_SHIFT);
+ du_remaining--;
+ offset += du_size;
+ if (offset == PAGE_SIZE || du_remaining == 0) {
ret = bio_add_page(bio, pages[i++], offset, 0);
if (WARN_ON_ONCE(ret != offset)) {
err = -EIO;
@@ -176,13 +181,13 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
}
offset = 0;
}
- } while (i != nr_pages && len != 0);
+ } while (i != nr_pages && du_remaining != 0);
err = submit_bio_wait(bio);
if (err)
goto out;
bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
- } while (len != 0);
+ } while (du_remaining != 0);
err = 0;
out:
bio_put(bio);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 6a837e4b80dc..328470d40dec 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -39,7 +39,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL;
static struct workqueue_struct *fscrypt_read_workqueue;
static DEFINE_MUTEX(fscrypt_init_mutex);
-struct kmem_cache *fscrypt_info_cachep;
+struct kmem_cache *fscrypt_inode_info_cachep;
void fscrypt_enqueue_decrypt_work(struct work_struct *work)
{
@@ -49,6 +49,13 @@ EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work);
struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
{
+ if (WARN_ON_ONCE(!fscrypt_bounce_page_pool)) {
+ /*
+ * Oops, the filesystem called a function that uses the bounce
+ * page pool, but it didn't set needs_bounce_pages.
+ */
+ return NULL;
+ }
return mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
}
@@ -70,44 +77,44 @@ void fscrypt_free_bounce_page(struct page *bounce_page)
EXPORT_SYMBOL(fscrypt_free_bounce_page);
/*
- * Generate the IV for the given logical block number within the given file.
- * For filenames encryption, lblk_num == 0.
+ * Generate the IV for the given data unit index within the given file.
+ * For filenames encryption, index == 0.
*
* Keep this in sync with fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks()
* needs to know about any IV generation methods where the low bits of IV don't
- * simply contain the lblk_num (e.g., IV_INO_LBLK_32).
+ * simply contain the data unit index (e.g., IV_INO_LBLK_32).
*/
-void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
- const struct fscrypt_info *ci)
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
+ const struct fscrypt_inode_info *ci)
{
u8 flags = fscrypt_policy_flags(&ci->ci_policy);
memset(iv, 0, ci->ci_mode->ivsize);
if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) {
- WARN_ON_ONCE(lblk_num > U32_MAX);
+ WARN_ON_ONCE(index > U32_MAX);
WARN_ON_ONCE(ci->ci_inode->i_ino > U32_MAX);
- lblk_num |= (u64)ci->ci_inode->i_ino << 32;
+ index |= (u64)ci->ci_inode->i_ino << 32;
} else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) {
- WARN_ON_ONCE(lblk_num > U32_MAX);
- lblk_num = (u32)(ci->ci_hashed_ino + lblk_num);
+ WARN_ON_ONCE(index > U32_MAX);
+ index = (u32)(ci->ci_hashed_ino + index);
} else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
memcpy(iv->nonce, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE);
}
- iv->lblk_num = cpu_to_le64(lblk_num);
+ iv->index = cpu_to_le64(index);
}
-/* Encrypt or decrypt a single filesystem block of file contents */
-int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
- u64 lblk_num, struct page *src_page,
- struct page *dest_page, unsigned int len,
- unsigned int offs, gfp_t gfp_flags)
+/* Encrypt or decrypt a single "data unit" of file contents. */
+int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
+ fscrypt_direction_t rw, u64 index,
+ struct page *src_page, struct page *dest_page,
+ unsigned int len, unsigned int offs,
+ gfp_t gfp_flags)
{
union fscrypt_iv iv;
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist dst, src;
- struct fscrypt_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
int res = 0;
@@ -116,7 +123,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
if (WARN_ON_ONCE(len % FSCRYPT_CONTENTS_ALIGNMENT != 0))
return -EINVAL;
- fscrypt_generate_iv(&iv, lblk_num, ci);
+ fscrypt_generate_iv(&iv, index, ci);
req = skcipher_request_alloc(tfm, gfp_flags);
if (!req)
@@ -137,28 +144,29 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
skcipher_request_free(req);
if (res) {
- fscrypt_err(inode, "%scryption failed for block %llu: %d",
- (rw == FS_DECRYPT ? "De" : "En"), lblk_num, res);
+ fscrypt_err(ci->ci_inode,
+ "%scryption failed for data unit %llu: %d",
+ (rw == FS_DECRYPT ? "De" : "En"), index, res);
return res;
}
return 0;
}
/**
- * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a
- * pagecache page
- * @page: The locked pagecache page containing the block(s) to encrypt
- * @len: Total size of the block(s) to encrypt. Must be a nonzero
- * multiple of the filesystem's block size.
- * @offs: Byte offset within @page of the first block to encrypt. Must be
- * a multiple of the filesystem's block size.
- * @gfp_flags: Memory allocation flags. See details below.
+ * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache page
+ * @page: the locked pagecache page containing the data to encrypt
+ * @len: size of the data to encrypt, in bytes
+ * @offs: offset within @page of the data to encrypt, in bytes
+ * @gfp_flags: memory allocation flags; see details below
+ *
+ * This allocates a new bounce page and encrypts the given data into it. The
+ * length and offset of the data must be aligned to the file's crypto data unit
+ * size. Alignment to the filesystem block size fulfills this requirement, as
+ * the filesystem block size is always a multiple of the data unit size.
*
- * A new bounce page is allocated, and the specified block(s) are encrypted into
- * it. In the bounce page, the ciphertext block(s) will be located at the same
- * offsets at which the plaintext block(s) were located in the source page; any
- * other parts of the bounce page will be left uninitialized. However, normally
- * blocksize == PAGE_SIZE and the whole page is encrypted at once.
+ * In the bounce page, the ciphertext data will be located at the same offset at
+ * which the plaintext data was located in the source page. Any other parts of
+ * the bounce page will be left uninitialized.
*
* This is for use by the filesystem's ->writepages() method.
*
@@ -176,28 +184,29 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
{
const struct inode *inode = page->mapping->host;
- const unsigned int blockbits = inode->i_blkbits;
- const unsigned int blocksize = 1 << blockbits;
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const unsigned int du_bits = ci->ci_data_unit_bits;
+ const unsigned int du_size = 1U << du_bits;
struct page *ciphertext_page;
- u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) +
- (offs >> blockbits);
+ u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) +
+ (offs >> du_bits);
unsigned int i;
int err;
if (WARN_ON_ONCE(!PageLocked(page)))
return ERR_PTR(-EINVAL);
- if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+ if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
return ERR_PTR(-EINVAL);
ciphertext_page = fscrypt_alloc_bounce_page(gfp_flags);
if (!ciphertext_page)
return ERR_PTR(-ENOMEM);
- for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
- err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num,
- page, ciphertext_page,
- blocksize, i, gfp_flags);
+ for (i = offs; i < offs + len; i += du_size, index++) {
+ err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index,
+ page, ciphertext_page,
+ du_size, i, gfp_flags);
if (err) {
fscrypt_free_bounce_page(ciphertext_page);
return ERR_PTR(err);
@@ -224,30 +233,33 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
* arbitrary page, not necessarily in the original pagecache page. The @inode
* and @lblk_num must be specified, as they can't be determined from @page.
*
+ * This is not compatible with fscrypt_operations::supports_subblock_data_units.
+ *
* Return: 0 on success; -errno on failure
*/
int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
unsigned int len, unsigned int offs,
u64 lblk_num, gfp_t gfp_flags)
{
- return fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, page, page,
- len, offs, gfp_flags);
+ if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
+ return -EOPNOTSUPP;
+ return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT,
+ lblk_num, page, page, len, offs,
+ gfp_flags);
}
EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
/**
- * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a
- * pagecache folio
- * @folio: The locked pagecache folio containing the block(s) to decrypt
- * @len: Total size of the block(s) to decrypt. Must be a nonzero
- * multiple of the filesystem's block size.
- * @offs: Byte offset within @folio of the first block to decrypt. Must be
- * a multiple of the filesystem's block size.
+ * fscrypt_decrypt_pagecache_blocks() - Decrypt data from a pagecache folio
+ * @folio: the pagecache folio containing the data to decrypt
+ * @len: size of the data to decrypt, in bytes
+ * @offs: offset within @folio of the data to decrypt, in bytes
*
- * The specified block(s) are decrypted in-place within the pagecache folio,
- * which must still be locked and not uptodate.
- *
- * This is for use by the filesystem's ->readahead() method.
+ * Decrypt data that has just been read from an encrypted file. The data must
+ * be located in a pagecache folio that is still locked and not yet uptodate.
+ * The length and offset of the data must be aligned to the file's crypto data
+ * unit size. Alignment to the filesystem block size fulfills this requirement,
+ * as the filesystem block size is always a multiple of the data unit size.
*
* Return: 0 on success; -errno on failure
*/
@@ -255,25 +267,26 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
size_t offs)
{
const struct inode *inode = folio->mapping->host;
- const unsigned int blockbits = inode->i_blkbits;
- const unsigned int blocksize = 1 << blockbits;
- u64 lblk_num = ((u64)folio->index << (PAGE_SHIFT - blockbits)) +
- (offs >> blockbits);
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
+ const unsigned int du_bits = ci->ci_data_unit_bits;
+ const unsigned int du_size = 1U << du_bits;
+ u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
+ (offs >> du_bits);
size_t i;
int err;
if (WARN_ON_ONCE(!folio_test_locked(folio)))
return -EINVAL;
- if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize)))
+ if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
return -EINVAL;
- for (i = offs; i < offs + len; i += blocksize, lblk_num++) {
+ for (i = offs; i < offs + len; i += du_size, index++) {
struct page *page = folio_page(folio, i >> PAGE_SHIFT);
- err = fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page,
- page, blocksize, i & ~PAGE_MASK,
- GFP_NOFS);
+ err = fscrypt_crypt_data_unit(ci, FS_DECRYPT, index, page,
+ page, du_size, i & ~PAGE_MASK,
+ GFP_NOFS);
if (err)
return err;
}
@@ -295,14 +308,19 @@ EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks);
* arbitrary page, not necessarily in the original pagecache page. The @inode
* and @lblk_num must be specified, as they can't be determined from @page.
*
+ * This is not compatible with fscrypt_operations::supports_subblock_data_units.
+ *
* Return: 0 on success; -errno on failure
*/
int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
unsigned int len, unsigned int offs,
u64 lblk_num)
{
- return fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, page,
- len, offs, GFP_NOFS);
+ if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
+ return -EOPNOTSUPP;
+ return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT,
+ lblk_num, page, page, len, offs,
+ GFP_NOFS);
}
EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
@@ -325,7 +343,7 @@ int fscrypt_initialize(struct super_block *sb)
return 0;
/* No need to allocate a bounce page pool if this FS won't use it. */
- if (sb->s_cop->flags & FS_CFLG_OWN_PAGES)
+ if (!sb->s_cop->needs_bounce_pages)
return 0;
mutex_lock(&fscrypt_init_mutex);
@@ -391,18 +409,19 @@ static int __init fscrypt_init(void)
if (!fscrypt_read_workqueue)
goto fail;
- fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT);
- if (!fscrypt_info_cachep)
+ fscrypt_inode_info_cachep = KMEM_CACHE(fscrypt_inode_info,
+ SLAB_RECLAIM_ACCOUNT);
+ if (!fscrypt_inode_info_cachep)
goto fail_free_queue;
err = fscrypt_init_keyring();
if (err)
- goto fail_free_info;
+ goto fail_free_inode_info;
return 0;
-fail_free_info:
- kmem_cache_destroy(fscrypt_info_cachep);
+fail_free_inode_info:
+ kmem_cache_destroy(fscrypt_inode_info_cachep);
fail_free_queue:
destroy_workqueue(fscrypt_read_workqueue);
fail:
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 6eae3f12ad50..7b3fc189593a 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -100,7 +100,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
{
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
- const struct fscrypt_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
union fscrypt_iv iv;
struct scatterlist sg;
@@ -157,7 +157,7 @@ static int fname_decrypt(const struct inode *inode,
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
- const struct fscrypt_info *ci = inode->i_crypt_info;
+ const struct fscrypt_inode_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
union fscrypt_iv iv;
int res;
@@ -568,7 +568,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name);
*/
u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name)
{
- const struct fscrypt_info *ci = dir->i_crypt_info;
+ const struct fscrypt_inode_info *ci = dir->i_crypt_info;
WARN_ON_ONCE(!ci->ci_dirhash_key_initialized);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 2d63da48635a..1892356cf924 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -47,7 +47,8 @@ struct fscrypt_context_v2 {
u8 contents_encryption_mode;
u8 filenames_encryption_mode;
u8 flags;
- u8 __reserved[4];
+ u8 log2_data_unit_size;
+ u8 __reserved[3];
u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
};
@@ -165,6 +166,26 @@ fscrypt_policy_flags(const union fscrypt_policy *policy)
BUG();
}
+static inline int
+fscrypt_policy_v2_du_bits(const struct fscrypt_policy_v2 *policy,
+ const struct inode *inode)
+{
+ return policy->log2_data_unit_size ?: inode->i_blkbits;
+}
+
+static inline int
+fscrypt_policy_du_bits(const union fscrypt_policy *policy,
+ const struct inode *inode)
+{
+ switch (policy->version) {
+ case FSCRYPT_POLICY_V1:
+ return inode->i_blkbits;
+ case FSCRYPT_POLICY_V2:
+ return fscrypt_policy_v2_du_bits(&policy->v2, inode);
+ }
+ BUG();
+}
+
/*
* For encrypted symlinks, the ciphertext length is stored at the beginning
* of the string in little-endian format.
@@ -189,18 +210,18 @@ struct fscrypt_prepared_key {
};
/*
- * fscrypt_info - the "encryption key" for an inode
+ * fscrypt_inode_info - the "encryption key" for an inode
*
* When an encrypted file's key is made available, an instance of this struct is
* allocated and stored in ->i_crypt_info. Once created, it remains until the
* inode is evicted.
*/
-struct fscrypt_info {
+struct fscrypt_inode_info {
/* The key in a form prepared for actual encryption/decryption */
struct fscrypt_prepared_key ci_enc_key;
- /* True if ci_enc_key should be freed when this fscrypt_info is freed */
+ /* True if ci_enc_key should be freed when this struct is freed */
bool ci_owns_key;
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
@@ -212,6 +233,16 @@ struct fscrypt_info {
#endif
/*
+ * log2 of the data unit size (granularity of contents encryption) of
+ * this file. This is computable from ci_policy and ci_inode but is
+ * cached here for efficiency. Only used for regular files.
+ */
+ u8 ci_data_unit_bits;
+
+ /* Cached value: log2 of number of data units per FS block */
+ u8 ci_data_units_per_block_bits;
+
+ /*
* Encryption mode used for this inode. It corresponds to either the
* contents or filenames encryption mode, depending on the inode type.
*/
@@ -263,12 +294,13 @@ typedef enum {
} fscrypt_direction_t;
/* crypto.c */
-extern struct kmem_cache *fscrypt_info_cachep;
+extern struct kmem_cache *fscrypt_inode_info_cachep;
int fscrypt_initialize(struct super_block *sb);
-int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
- u64 lblk_num, struct page *src_page,
- struct page *dest_page, unsigned int len,
- unsigned int offs, gfp_t gfp_flags);
+int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
+ fscrypt_direction_t rw, u64 index,
+ struct page *src_page, struct page *dest_page,
+ unsigned int len, unsigned int offs,
+ gfp_t gfp_flags);
struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
void __printf(3, 4) __cold
@@ -283,8 +315,8 @@ fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...);
union fscrypt_iv {
struct {
- /* logical block number within the file */
- __le64 lblk_num;
+ /* zero-based index of data unit within the file */
+ __le64 index;
/* per-file nonce; only set in DIRECT_KEY mode */
u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
@@ -293,8 +325,18 @@ union fscrypt_iv {
__le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)];
};
-void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
- const struct fscrypt_info *ci);
+void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
+ const struct fscrypt_inode_info *ci);
+
+/*
+ * Return the number of bits used by the maximum file data unit index that is
+ * possible on the given filesystem, using the given log2 data unit size.
+ */
+static inline int
+fscrypt_max_file_dun_bits(const struct super_block *sb, int du_bits)
+{
+ return fls64(sb->s_maxbytes - 1) - du_bits;
+}
/* fname.c */
bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
@@ -332,17 +374,17 @@ void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
/* inline_crypt.c */
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
-int fscrypt_select_encryption_impl(struct fscrypt_info *ci);
+int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci);
static inline bool
-fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
+fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
{
return ci->ci_inlinecrypt;
}
int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key,
- const struct fscrypt_info *ci);
+ const struct fscrypt_inode_info *ci);
void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
struct fscrypt_prepared_key *prep_key);
@@ -353,7 +395,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
*/
static inline bool
fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
/*
* The two smp_load_acquire()'s here pair with the smp_store_release()'s
@@ -370,13 +412,13 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
#else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
-static inline int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
+static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
{
return 0;
}
static inline bool
-fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
+fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
{
return false;
}
@@ -384,7 +426,7 @@ fscrypt_using_inline_encryption(const struct fscrypt_info *ci)
static inline int
fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
WARN_ON_ONCE(1);
return -EOPNOTSUPP;
@@ -398,7 +440,7 @@ fscrypt_destroy_inline_crypt_key(struct super_block *sb,
static inline bool
fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
return smp_load_acquire(&prep_key->tfm) != NULL;
}
@@ -433,8 +475,28 @@ struct fscrypt_master_key_secret {
* fscrypt_master_key - an in-use master key
*
* This represents a master encryption key which has been added to the
- * filesystem and can be used to "unlock" the encrypted files which were
- * encrypted with it.
+ * filesystem. There are three high-level states that a key can be in:
+ *
+ * FSCRYPT_KEY_STATUS_PRESENT
+ * Key is fully usable; it can be used to unlock inodes that are encrypted
+ * with it (this includes being able to create new inodes). ->mk_present
+ * indicates whether the key is in this state. ->mk_secret exists, the key
+ * is in the keyring, and ->mk_active_refs > 0 due to ->mk_present.
+ *
+ * FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED
+ * Removal of this key has been initiated, but some inodes that were
+ * unlocked with it are still in-use. Like ABSENT, ->mk_secret is wiped,
+ * and the key can no longer be used to unlock inodes. Unlike ABSENT, the
+ * key is still in the keyring; ->mk_decrypted_inodes is nonempty; and
+ * ->mk_active_refs > 0, being equal to the size of ->mk_decrypted_inodes.
+ *
+ * This state transitions to ABSENT if ->mk_decrypted_inodes becomes empty,
+ * or to PRESENT if FS_IOC_ADD_ENCRYPTION_KEY is called again for this key.
+ *
+ * FSCRYPT_KEY_STATUS_ABSENT
+ * Key is fully removed. The key is no longer in the keyring,
+ * ->mk_decrypted_inodes is empty, ->mk_active_refs == 0, ->mk_secret is
+ * wiped, and the key can no longer be used to unlock inodes.
*/
struct fscrypt_master_key {
@@ -444,7 +506,7 @@ struct fscrypt_master_key {
*/
struct hlist_node mk_node;
- /* Semaphore that protects ->mk_secret and ->mk_users */
+ /* Semaphore that protects ->mk_secret, ->mk_users, and ->mk_present */
struct rw_semaphore mk_sem;
/*
@@ -454,8 +516,8 @@ struct fscrypt_master_key {
* ->mk_direct_keys) that have been prepared continue to exist.
* A structural ref only guarantees that the struct continues to exist.
*
- * There is one active ref associated with ->mk_secret being present,
- * and one active ref for each inode in ->mk_decrypted_inodes.
+ * There is one active ref associated with ->mk_present being true, and
+ * one active ref for each inode in ->mk_decrypted_inodes.
*
* There is one structural ref associated with the active refcount being
* nonzero. Finding a key in the keyring also takes a structural ref,
@@ -467,17 +529,10 @@ struct fscrypt_master_key {
struct rcu_head mk_rcu_head;
/*
- * The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is
- * executed, this is wiped and no new inodes can be unlocked with this
- * key; however, there may still be inodes in ->mk_decrypted_inodes
- * which could not be evicted. As long as some inodes still remain,
- * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or
- * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again.
+ * The secret key material. Wiped as soon as it is no longer needed;
+ * for details, see the fscrypt_master_key struct comment.
*
- * While ->mk_secret is present, one ref in ->mk_active_refs is held.
- *
- * Locking: protected by ->mk_sem. The manipulation of ->mk_active_refs
- * associated with this field is protected by ->mk_sem as well.
+ * Locking: protected by ->mk_sem.
*/
struct fscrypt_master_key_secret mk_secret;
@@ -500,7 +555,7 @@ struct fscrypt_master_key {
*
* Locking: protected by ->mk_sem. (We don't just rely on the keyrings
* subsystem semaphore ->mk_users->sem, as we need support for atomic
- * search+insert along with proper synchronization with ->mk_secret.)
+ * search+insert along with proper synchronization with other fields.)
*/
struct key *mk_users;
@@ -523,20 +578,17 @@ struct fscrypt_master_key {
siphash_key_t mk_ino_hash_key;
bool mk_ino_hash_key_initialized;
-} __randomize_layout;
-
-static inline bool
-is_master_key_secret_present(const struct fscrypt_master_key_secret *secret)
-{
/*
- * The READ_ONCE() is only necessary for fscrypt_drop_inode().
- * fscrypt_drop_inode() runs in atomic context, so it can't take the key
- * semaphore and thus 'secret' can change concurrently which would be a
- * data race. But fscrypt_drop_inode() only need to know whether the
- * secret *was* present at the time of check, so READ_ONCE() suffices.
+ * Whether this key is in the "present" state, i.e. fully usable. For
+ * details, see the fscrypt_master_key struct comment.
+ *
+ * Locking: protected by ->mk_sem, but can be read locklessly using
+ * READ_ONCE(). Writers must use WRITE_ONCE() when concurrent readers
+ * are possible.
*/
- return READ_ONCE(secret->size) != 0;
-}
+ bool mk_present;
+
+} __randomize_layout;
static inline const char *master_key_spec_type(
const struct fscrypt_key_specifier *spec)
@@ -598,17 +650,18 @@ struct fscrypt_mode {
extern struct fscrypt_mode fscrypt_modes[];
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
- const u8 *raw_key, const struct fscrypt_info *ci);
+ const u8 *raw_key, const struct fscrypt_inode_info *ci);
void fscrypt_destroy_prepared_key(struct super_block *sb,
struct fscrypt_prepared_key *prep_key);
-int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
+int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
+ const u8 *raw_key);
-int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
+int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk);
-void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk);
int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported);
@@ -643,10 +696,11 @@ static inline int fscrypt_require_key(struct inode *inode)
void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
-int fscrypt_setup_v1_file_key(struct fscrypt_info *ci,
+int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci,
const u8 *raw_master_key);
-int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci);
+int fscrypt_setup_v1_file_key_via_subscribed_keyrings(
+ struct fscrypt_inode_info *ci);
/* policy.c */
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 6238dbcadcad..52504dd478d3 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr);
int fscrypt_prepare_setflags(struct inode *inode,
unsigned int oldflags, unsigned int flags)
{
- struct fscrypt_info *ci;
+ struct fscrypt_inode_info *ci;
struct fscrypt_master_key *mk;
int err;
@@ -187,7 +187,7 @@ int fscrypt_prepare_setflags(struct inode *inode,
return -EINVAL;
mk = ci->ci_master_key;
down_read(&mk->mk_sem);
- if (is_master_key_secret_present(&mk->mk_secret))
+ if (mk->mk_present)
err = fscrypt_derive_dirhash_key(ci, mk);
else
err = -ENOKEY;
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 8bfb3ce86476..b4002aea7cdb 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -39,11 +39,11 @@ static struct block_device **fscrypt_get_devices(struct super_block *sb,
return devs;
}
-static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
+static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_inode_info *ci)
{
- struct super_block *sb = ci->ci_inode->i_sb;
+ const struct super_block *sb = ci->ci_inode->i_sb;
unsigned int flags = fscrypt_policy_flags(&ci->ci_policy);
- int ino_bits = 64, lblk_bits = 64;
+ int dun_bits;
if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
return offsetofend(union fscrypt_iv, nonce);
@@ -54,10 +54,9 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)
return sizeof(__le32);
- /* Default case: IVs are just the file logical block number */
- if (sb->s_cop->get_ino_and_lblk_bits)
- sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits);
- return DIV_ROUND_UP(lblk_bits, 8);
+ /* Default case: IVs are just the file data unit index */
+ dun_bits = fscrypt_max_file_dun_bits(sb, ci->ci_data_unit_bits);
+ return DIV_ROUND_UP(dun_bits, 8);
}
/*
@@ -90,7 +89,7 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode,
}
/* Enable inline encryption for this file if supported. */
-int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
+int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
{
const struct inode *inode = ci->ci_inode;
struct super_block *sb = inode->i_sb;
@@ -129,7 +128,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
* crypto configuration that the file would use.
*/
crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode;
- crypto_cfg.data_unit_size = sb->s_blocksize;
+ crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits;
crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
devs = fscrypt_get_devices(sb, &num_devs);
@@ -152,7 +151,7 @@ out_free_devs:
int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
const struct inode *inode = ci->ci_inode;
struct super_block *sb = inode->i_sb;
@@ -168,7 +167,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
return -ENOMEM;
err = blk_crypto_init_key(blk_key, raw_key, crypto_mode,
- fscrypt_get_dun_bytes(ci), sb->s_blocksize);
+ fscrypt_get_dun_bytes(ci),
+ 1U << ci->ci_data_unit_bits);
if (err) {
fscrypt_err(inode, "error %d initializing blk-crypto key", err);
goto fail;
@@ -232,13 +232,15 @@ bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
}
EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto);
-static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num,
+static void fscrypt_generate_dun(const struct fscrypt_inode_info *ci,
+ u64 lblk_num,
u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE])
{
+ u64 index = lblk_num << ci->ci_data_units_per_block_bits;
union fscrypt_iv iv;
int i;
- fscrypt_generate_iv(&iv, lblk_num, ci);
+ fscrypt_generate_iv(&iv, index, ci);
BUILD_BUG_ON(FSCRYPT_MAX_IV_SIZE > BLK_CRYPTO_MAX_IV_SIZE);
memset(dun, 0, BLK_CRYPTO_MAX_IV_SIZE);
@@ -265,7 +267,7 @@ static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num,
void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
u64 first_lblk, gfp_t gfp_mask)
{
- const struct fscrypt_info *ci;
+ const struct fscrypt_inode_info *ci;
u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
if (!fscrypt_inode_uses_inline_crypto(inode))
@@ -456,7 +458,7 @@ EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
*/
u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
{
- const struct fscrypt_info *ci;
+ const struct fscrypt_inode_info *ci;
u32 dun;
if (!fscrypt_inode_uses_inline_crypto(inode))
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 7cbb1fd872ac..f34a9b0b9e92 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -99,10 +99,10 @@ void fscrypt_put_master_key_activeref(struct super_block *sb,
spin_unlock(&sb->s_master_keys->lock);
/*
- * ->mk_active_refs == 0 implies that ->mk_secret is not present and
- * that ->mk_decrypted_inodes is empty.
+ * ->mk_active_refs == 0 implies that ->mk_present is false and
+ * ->mk_decrypted_inodes is empty.
*/
- WARN_ON_ONCE(is_master_key_secret_present(&mk->mk_secret));
+ WARN_ON_ONCE(mk->mk_present);
WARN_ON_ONCE(!list_empty(&mk->mk_decrypted_inodes));
for (i = 0; i <= FSCRYPT_MODE_MAX; i++) {
@@ -121,6 +121,18 @@ void fscrypt_put_master_key_activeref(struct super_block *sb,
fscrypt_put_master_key(mk);
}
+/*
+ * This transitions the key state from present to incompletely removed, and then
+ * potentially to absent (depending on whether inodes remain).
+ */
+static void fscrypt_initiate_key_removal(struct super_block *sb,
+ struct fscrypt_master_key *mk)
+{
+ WRITE_ONCE(mk->mk_present, false);
+ wipe_master_key_secret(&mk->mk_secret);
+ fscrypt_put_master_key_activeref(sb, mk);
+}
+
static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
{
if (spec->__reserved)
@@ -234,14 +246,13 @@ void fscrypt_destroy_keyring(struct super_block *sb)
* evicted, every key remaining in the keyring should
* have an empty inode list, and should only still be in
* the keyring due to the single active ref associated
- * with ->mk_secret. There should be no structural refs
- * beyond the one associated with the active ref.
+ * with ->mk_present. There should be no structural
+ * refs beyond the one associated with the active ref.
*/
WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 1);
WARN_ON_ONCE(refcount_read(&mk->mk_struct_refs) != 1);
- WARN_ON_ONCE(!is_master_key_secret_present(&mk->mk_secret));
- wipe_master_key_secret(&mk->mk_secret);
- fscrypt_put_master_key_activeref(sb, mk);
+ WARN_ON_ONCE(!mk->mk_present);
+ fscrypt_initiate_key_removal(sb, mk);
}
}
kfree_sensitive(keyring);
@@ -439,7 +450,8 @@ static int add_new_master_key(struct super_block *sb,
}
move_master_key_secret(&mk->mk_secret, secret);
- refcount_set(&mk->mk_active_refs, 1); /* ->mk_secret is present */
+ mk->mk_present = true;
+ refcount_set(&mk->mk_active_refs, 1); /* ->mk_present is true */
spin_lock(&keyring->lock);
hlist_add_head_rcu(&mk->mk_node,
@@ -478,11 +490,18 @@ static int add_existing_master_key(struct fscrypt_master_key *mk,
return err;
}
- /* Re-add the secret if needed. */
- if (!is_master_key_secret_present(&mk->mk_secret)) {
- if (!refcount_inc_not_zero(&mk->mk_active_refs))
+ /* If the key is incompletely removed, make it present again. */
+ if (!mk->mk_present) {
+ if (!refcount_inc_not_zero(&mk->mk_active_refs)) {
+ /*
+ * Raced with the last active ref being dropped, so the
+ * key has become, or is about to become, "absent".
+ * Therefore, we need to allocate a new key struct.
+ */
return KEY_DEAD;
+ }
move_master_key_secret(&mk->mk_secret, secret);
+ WRITE_ONCE(mk->mk_present, true);
}
return 0;
@@ -506,8 +525,8 @@ static int do_add_master_key(struct super_block *sb,
err = add_new_master_key(sb, secret, mk_spec);
} else {
/*
- * Found the key in ->s_master_keys. Re-add the secret if
- * needed, and add the user to ->mk_users if needed.
+ * Found the key in ->s_master_keys. Add the user to ->mk_users
+ * if needed, and make the key "present" again if possible.
*/
down_write(&mk->mk_sem);
err = add_existing_master_key(mk, secret);
@@ -867,7 +886,7 @@ static void shrink_dcache_inode(struct inode *inode)
static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk)
{
- struct fscrypt_info *ci;
+ struct fscrypt_inode_info *ci;
struct inode *inode;
struct inode *toput_inode = NULL;
@@ -917,7 +936,7 @@ static int check_for_busy_inodes(struct super_block *sb,
/* select an example file to show for debugging purposes */
struct inode *inode =
list_first_entry(&mk->mk_decrypted_inodes,
- struct fscrypt_info,
+ struct fscrypt_inode_info,
ci_master_key_link)->ci_inode;
ino = inode->i_ino;
}
@@ -989,9 +1008,8 @@ static int try_to_lock_encrypted_files(struct super_block *sb,
*
* If all inodes were evicted, then we unlink the fscrypt_master_key from the
* keyring. Otherwise it remains in the keyring in the "incompletely removed"
- * state (without the actual secret key) where it tracks the list of remaining
- * inodes. Userspace can execute the ioctl again later to retry eviction, or
- * alternatively can re-add the secret key again.
+ * state where it tracks the list of remaining inodes. Userspace can execute
+ * the ioctl again later to retry eviction, or alternatively can re-add the key.
*
* For more details, see the "Removing keys" section of
* Documentation/filesystems/fscrypt.rst.
@@ -1053,11 +1071,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
}
}
- /* No user claims remaining. Go ahead and wipe the secret. */
+ /* No user claims remaining. Initiate removal of the key. */
err = -ENOKEY;
- if (is_master_key_secret_present(&mk->mk_secret)) {
- wipe_master_key_secret(&mk->mk_secret);
- fscrypt_put_master_key_activeref(sb, mk);
+ if (mk->mk_present) {
+ fscrypt_initiate_key_removal(sb, mk);
err = 0;
}
inodes_remain = refcount_read(&mk->mk_active_refs) > 0;
@@ -1074,9 +1091,9 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
}
/*
* We return 0 if we successfully did something: removed a claim to the
- * key, wiped the secret, or tried locking the files again. Users need
- * to check the informational status flags if they care whether the key
- * has been fully removed including all files locked.
+ * key, initiated removal of the key, or tried locking the files again.
+ * Users need to check the informational status flags if they care
+ * whether the key has been fully removed including all files locked.
*/
out_put_key:
fscrypt_put_master_key(mk);
@@ -1103,12 +1120,11 @@ EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key_all_users);
* Retrieve the status of an fscrypt master encryption key.
*
* We set ->status to indicate whether the key is absent, present, or
- * incompletely removed. "Incompletely removed" means that the master key
- * secret has been removed, but some files which had been unlocked with it are
- * still in use. This field allows applications to easily determine the state
- * of an encrypted directory without using a hack such as trying to open a
- * regular file in it (which can confuse the "incompletely removed" state with
- * absent or present).
+ * incompletely removed. (For an explanation of what these statuses mean and
+ * how they are represented internally, see struct fscrypt_master_key.) This
+ * field allows applications to easily determine the status of an encrypted
+ * directory without using a hack such as trying to open a regular file in it
+ * (which can confuse the "incompletely removed" status with absent or present).
*
* In addition, for v2 policy keys we allow applications to determine, via
* ->status_flags and ->user_count, whether the key has been added by the
@@ -1150,7 +1166,7 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
}
down_read(&mk->mk_sem);
- if (!is_master_key_secret_present(&mk->mk_secret)) {
+ if (!mk->mk_present) {
arg.status = refcount_read(&mk->mk_active_refs) > 0 ?
FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED :
FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */;
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 361f41ef46c7..d71f7c799e79 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -148,7 +148,7 @@ err_free_tfm:
* and IV generation method (@ci->ci_policy.flags).
*/
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
- const u8 *raw_key, const struct fscrypt_info *ci)
+ const u8 *raw_key, const struct fscrypt_inode_info *ci)
{
struct crypto_skcipher *tfm;
@@ -178,13 +178,14 @@ void fscrypt_destroy_prepared_key(struct super_block *sb,
}
/* Given a per-file encryption key, set up the file's crypto transform object */
-int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key)
+int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
+ const u8 *raw_key)
{
ci->ci_owns_key = true;
return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci);
}
-static int setup_per_mode_enc_key(struct fscrypt_info *ci,
+static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci,
struct fscrypt_master_key *mk,
struct fscrypt_prepared_key *keys,
u8 hkdf_context, bool include_fs_uuid)
@@ -265,7 +266,7 @@ static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk,
return 0;
}
-int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
+int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk)
{
int err;
@@ -279,7 +280,7 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
return 0;
}
-void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
const struct fscrypt_master_key *mk)
{
WARN_ON_ONCE(ci->ci_inode->i_ino == 0);
@@ -289,7 +290,7 @@ void fscrypt_hash_inode_number(struct fscrypt_info *ci,
&mk->mk_ino_hash_key);
}
-static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci,
+static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci,
struct fscrypt_master_key *mk)
{
int err;
@@ -329,7 +330,7 @@ unlock:
return 0;
}
-static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
+static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
struct fscrypt_master_key *mk,
bool need_dirhash_key)
{
@@ -404,7 +405,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
* still allow 512-bit master keys if the user chooses to use them, though.)
*/
static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
- const struct fscrypt_info *ci)
+ const struct fscrypt_inode_info *ci)
{
unsigned int min_keysize;
@@ -430,11 +431,12 @@ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
*
* If the master key is found in the filesystem-level keyring, then it is
* returned in *mk_ret with its semaphore read-locked. This is needed to ensure
- * that only one task links the fscrypt_info into ->mk_decrypted_inodes (as
- * multiple tasks may race to create an fscrypt_info for the same inode), and to
- * synchronize the master key being removed with a new inode starting to use it.
+ * that only one task links the fscrypt_inode_info into ->mk_decrypted_inodes
+ * (as multiple tasks may race to create an fscrypt_inode_info for the same
+ * inode), and to synchronize the master key being removed with a new inode
+ * starting to use it.
*/
-static int setup_file_encryption_key(struct fscrypt_info *ci,
+static int setup_file_encryption_key(struct fscrypt_inode_info *ci,
bool need_dirhash_key,
struct fscrypt_master_key **mk_ret)
{
@@ -484,8 +486,8 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
}
down_read(&mk->mk_sem);
- /* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */
- if (!is_master_key_secret_present(&mk->mk_secret)) {
+ if (!mk->mk_present) {
+ /* FS_IOC_REMOVE_ENCRYPTION_KEY has been executed on this key */
err = -ENOKEY;
goto out_release_key;
}
@@ -519,7 +521,7 @@ out_release_key:
return err;
}
-static void put_crypt_info(struct fscrypt_info *ci)
+static void put_crypt_info(struct fscrypt_inode_info *ci)
{
struct fscrypt_master_key *mk;
@@ -537,8 +539,8 @@ static void put_crypt_info(struct fscrypt_info *ci)
/*
* Remove this inode from the list of inodes that were unlocked
* with the master key. In addition, if we're removing the last
- * inode from a master key struct that already had its secret
- * removed, then complete the full removal of the struct.
+ * inode from an incompletely removed key, then complete the
+ * full removal of the key.
*/
spin_lock(&mk->mk_decrypted_inodes_lock);
list_del(&ci->ci_master_key_link);
@@ -546,7 +548,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk);
}
memzero_explicit(ci, sizeof(*ci));
- kmem_cache_free(fscrypt_info_cachep, ci);
+ kmem_cache_free(fscrypt_inode_info_cachep, ci);
}
static int
@@ -555,7 +557,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
bool need_dirhash_key)
{
- struct fscrypt_info *crypt_info;
+ struct fscrypt_inode_info *crypt_info;
struct fscrypt_mode *mode;
struct fscrypt_master_key *mk = NULL;
int res;
@@ -564,7 +566,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
if (res)
return res;
- crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_KERNEL);
+ crypt_info = kmem_cache_zalloc(fscrypt_inode_info_cachep, GFP_KERNEL);
if (!crypt_info)
return -ENOMEM;
@@ -580,6 +582,11 @@ fscrypt_setup_encryption_info(struct inode *inode,
WARN_ON_ONCE(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
crypt_info->ci_mode = mode;
+ crypt_info->ci_data_unit_bits =
+ fscrypt_policy_du_bits(&crypt_info->ci_policy, inode);
+ crypt_info->ci_data_units_per_block_bits =
+ inode->i_blkbits - crypt_info->ci_data_unit_bits;
+
res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk);
if (res)
goto out;
@@ -587,8 +594,8 @@ fscrypt_setup_encryption_info(struct inode *inode,
/*
* For existing inodes, multiple tasks may race to set ->i_crypt_info.
* So use cmpxchg_release(). This pairs with the smp_load_acquire() in
- * fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a
- * RELEASE barrier so that other tasks can ACQUIRE it.
+ * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with
+ * a RELEASE barrier so that other tasks can ACQUIRE it.
*/
if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) {
/*
@@ -735,8 +742,8 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode);
* fscrypt_put_encryption_info() - free most of an inode's fscrypt data
* @inode: an inode being evicted
*
- * Free the inode's fscrypt_info. Filesystems must call this when the inode is
- * being evicted. An RCU grace period need not have elapsed yet.
+ * Free the inode's fscrypt_inode_info. Filesystems must call this when the
+ * inode is being evicted. An RCU grace period need not have elapsed yet.
*/
void fscrypt_put_encryption_info(struct inode *inode)
{
@@ -773,7 +780,7 @@ EXPORT_SYMBOL(fscrypt_free_inode);
*/
int fscrypt_drop_inode(struct inode *inode)
{
- const struct fscrypt_info *ci = fscrypt_get_info(inode);
+ const struct fscrypt_inode_info *ci = fscrypt_get_inode_info(inode);
/*
* If ci is NULL, then the inode doesn't have an encryption key set up
@@ -794,13 +801,14 @@ int fscrypt_drop_inode(struct inode *inode)
return 0;
/*
- * Note: since we aren't holding the key semaphore, the result here can
+ * We can't take ->mk_sem here, since this runs in atomic context.
+ * Therefore, ->mk_present can change concurrently, and our result may
* immediately become outdated. But there's no correctness problem with
* unnecessarily evicting. Nor is there a correctness problem with not
* evicting while iput() is racing with the key being removed, since
* then the thread removing the key will either evict the inode itself
* or will correctly detect that it wasn't evicted due to the race.
*/
- return !is_master_key_secret_present(&ci->ci_master_key->mk_secret);
+ return !READ_ONCE(ci->ci_master_key->mk_present);
}
EXPORT_SYMBOL_GPL(fscrypt_drop_inode);
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index 75dabd9b27f9..a10710bc8123 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -178,7 +178,8 @@ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk)
*/
static struct fscrypt_direct_key *
find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
- const u8 *raw_key, const struct fscrypt_info *ci)
+ const u8 *raw_key,
+ const struct fscrypt_inode_info *ci)
{
unsigned long hash_key;
struct fscrypt_direct_key *dk;
@@ -218,7 +219,7 @@ find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
/* Prepare to encrypt directly using the master key in the given mode */
static struct fscrypt_direct_key *
-fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
+fscrypt_get_direct_key(const struct fscrypt_inode_info *ci, const u8 *raw_key)
{
struct fscrypt_direct_key *dk;
int err;
@@ -250,7 +251,7 @@ err_free_dk:
}
/* v1 policy, DIRECT_KEY: use the master key directly */
-static int setup_v1_file_key_direct(struct fscrypt_info *ci,
+static int setup_v1_file_key_direct(struct fscrypt_inode_info *ci,
const u8 *raw_master_key)
{
struct fscrypt_direct_key *dk;
@@ -264,7 +265,7 @@ static int setup_v1_file_key_direct(struct fscrypt_info *ci,
}
/* v1 policy, !DIRECT_KEY: derive the file's encryption key */
-static int setup_v1_file_key_derived(struct fscrypt_info *ci,
+static int setup_v1_file_key_derived(struct fscrypt_inode_info *ci,
const u8 *raw_master_key)
{
u8 *derived_key;
@@ -289,7 +290,8 @@ out:
return err;
}
-int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key)
+int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci,
+ const u8 *raw_master_key)
{
if (ci->ci_policy.v1.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
return setup_v1_file_key_direct(ci, raw_master_key);
@@ -297,8 +299,10 @@ int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key)
return setup_v1_file_key_derived(ci, raw_master_key);
}
-int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci)
+int
+fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_inode_info *ci)
{
+ const struct super_block *sb = ci->ci_inode->i_sb;
struct key *key;
const struct fscrypt_key *payload;
int err;
@@ -306,8 +310,8 @@ int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci)
key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX,
ci->ci_policy.v1.master_key_descriptor,
ci->ci_mode->keysize, &payload);
- if (key == ERR_PTR(-ENOKEY) && ci->ci_inode->i_sb->s_cop->key_prefix) {
- key = find_and_lock_process_key(ci->ci_inode->i_sb->s_cop->key_prefix,
+ if (key == ERR_PTR(-ENOKEY) && sb->s_cop->legacy_key_prefix) {
+ key = find_and_lock_process_key(sb->s_cop->legacy_key_prefix,
ci->ci_policy.v1.master_key_descriptor,
ci->ci_mode->keysize, &payload);
}
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index f4456ecb3f87..701259991277 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -118,12 +118,11 @@ static bool supported_direct_key_modes(const struct inode *inode,
}
static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
- const struct inode *inode,
- const char *type,
- int max_ino_bits, int max_lblk_bits)
+ const struct inode *inode)
{
+ const char *type = (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64)
+ ? "IV_INO_LBLK_64" : "IV_INO_LBLK_32";
struct super_block *sb = inode->i_sb;
- int ino_bits = 64, lblk_bits = 64;
/*
* IV_INO_LBLK_* exist only because of hardware limitations, and
@@ -150,17 +149,29 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
type, sb->s_id);
return false;
}
- if (sb->s_cop->get_ino_and_lblk_bits)
- sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits);
- if (ino_bits > max_ino_bits) {
+
+ /*
+ * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that inode numbers fit
+ * in 32 bits. In principle, IV_INO_LBLK_32 could support longer inode
+ * numbers because it hashes the inode number; however, currently the
+ * inode number is gotten from inode::i_ino which is 'unsigned long'.
+ * So for now the implementation limit is 32 bits.
+ */
+ if (!sb->s_cop->has_32bit_inodes) {
fscrypt_warn(inode,
"Can't use %s policy on filesystem '%s' because its inode numbers are too long",
type, sb->s_id);
return false;
}
- if (lblk_bits > max_lblk_bits) {
+
+ /*
+ * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that file data unit
+ * indices fit in 32 bits.
+ */
+ if (fscrypt_max_file_dun_bits(sb,
+ fscrypt_policy_v2_du_bits(policy, inode)) > 32) {
fscrypt_warn(inode,
- "Can't use %s policy on filesystem '%s' because its block numbers are too long",
+ "Can't use %s policy on filesystem '%s' because its maximum file size is too large",
type, sb->s_id);
return false;
}
@@ -233,25 +244,39 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
return false;
}
+ if (policy->log2_data_unit_size) {
+ if (!inode->i_sb->s_cop->supports_subblock_data_units) {
+ fscrypt_warn(inode,
+ "Filesystem does not support configuring crypto data unit size");
+ return false;
+ }
+ if (policy->log2_data_unit_size > inode->i_blkbits ||
+ policy->log2_data_unit_size < SECTOR_SHIFT /* 9 */) {
+ fscrypt_warn(inode,
+ "Unsupported log2_data_unit_size in encryption policy: %d",
+ policy->log2_data_unit_size);
+ return false;
+ }
+ if (policy->log2_data_unit_size != inode->i_blkbits &&
+ (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
+ /*
+ * Not safe to enable yet, as we need to ensure that DUN
+ * wraparound can only occur on a FS block boundary.
+ */
+ fscrypt_warn(inode,
+ "Sub-block data units not yet supported with IV_INO_LBLK_32");
+ return false;
+ }
+ }
+
if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) &&
!supported_direct_key_modes(inode, policy->contents_encryption_mode,
policy->filenames_encryption_mode))
return false;
- if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) &&
- !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64",
- 32, 32))
- return false;
-
- /*
- * IV_INO_LBLK_32 hashes the inode number, so in principle it can
- * support any ino_bits. However, currently the inode number is gotten
- * from inode::i_ino which is 'unsigned long'. So for now the
- * implementation limit is 32 bits.
- */
- if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) &&
- !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32",
- 32, 32))
+ if ((policy->flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 |
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) &&
+ !supported_iv_ino_lblk_policy(policy, inode))
return false;
if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) {
@@ -330,6 +355,7 @@ static int fscrypt_new_context(union fscrypt_context *ctx_u,
ctx->filenames_encryption_mode =
policy->filenames_encryption_mode;
ctx->flags = policy->flags;
+ ctx->log2_data_unit_size = policy->log2_data_unit_size;
memcpy(ctx->master_key_identifier,
policy->master_key_identifier,
sizeof(ctx->master_key_identifier));
@@ -390,6 +416,7 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
policy->filenames_encryption_mode =
ctx->filenames_encryption_mode;
policy->flags = ctx->flags;
+ policy->log2_data_unit_size = ctx->log2_data_unit_size;
memcpy(policy->__reserved, ctx->__reserved,
sizeof(policy->__reserved));
memcpy(policy->master_key_identifier,
@@ -405,11 +432,11 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
/* Retrieve an inode's encryption policy */
static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
{
- const struct fscrypt_info *ci;
+ const struct fscrypt_inode_info *ci;
union fscrypt_context ctx;
int ret;
- ci = fscrypt_get_info(inode);
+ ci = fscrypt_get_inode_info(inode);
if (ci) {
/* key available, use the cached policy */
*policy = ci->ci_policy;
@@ -647,7 +674,7 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
/*
* Both parent and child are encrypted, so verify they use the same
- * encryption policy. Compare the fscrypt_info structs if the keys are
+ * encryption policy. Compare the cached policies if the keys are
* available, otherwise retrieve and compare the fscrypt_contexts.
*
* Note that the fscrypt_context retrieval will be required frequently
@@ -717,7 +744,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
*/
int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
{
- struct fscrypt_info *ci = inode->i_crypt_info;
+ struct fscrypt_inode_info *ci = inode->i_crypt_info;
BUILD_BUG_ON(sizeof(union fscrypt_context) !=
FSCRYPT_SET_CONTEXT_MAX_SIZE);
@@ -742,7 +769,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);
*/
int fscrypt_set_context(struct inode *inode, void *fs_data)
{
- struct fscrypt_info *ci = inode->i_crypt_info;
+ struct fscrypt_inode_info *ci = inode->i_crypt_info;
union fscrypt_context ctx;
int ctxsize;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 83e57e9f9fa0..5d41765e0c77 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -72,7 +72,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
return inode;
}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 299c295a27a0..c830261aa883 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -338,7 +338,7 @@ static int mknod_ptmx(struct super_block *sb)
}
inode->i_ino = 2;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
mode = S_IFCHR|opts->ptmxmode;
init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
@@ -451,7 +451,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
if (!inode)
goto fail;
inode->i_ino = 1;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
@@ -560,7 +560,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
inode->i_ino = index + 3;
inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index));
sprintf(s, "%d", index);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index f2ed0c0266cb..c586c5db18b5 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -702,6 +702,6 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
loff_t offset);
-extern const struct xattr_handler *ecryptfs_xattr_handlers[];
+extern const struct xattr_handler * const ecryptfs_xattr_handlers[];
#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 992d9c7e64ae..a25dd3d20008 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1210,7 +1210,7 @@ static const struct xattr_handler ecryptfs_xattr_handler = {
.set = ecryptfs_xattr_set,
};
-const struct xattr_handler *ecryptfs_xattr_handlers[] = {
+const struct xattr_handler * const ecryptfs_xattr_handlers[] = {
&ecryptfs_xattr_handler,
NULL
};
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 59b52718a3a2..7e9961639802 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -51,7 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file,
} else {
inode_lock(inode);
i_size_write(inode, datasize + sizeof(attributes));
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
inode_unlock(inode);
}
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index db9231f0e77b..76dd3c7295d9 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -25,7 +25,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
if (inode) {
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_flags = is_removable ? 0 : S_IMMUTABLE;
switch (mode & S_IFMT) {
case S_IFREG:
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index e028fafa04f3..996271473609 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -32,10 +32,16 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 storage_space, remaining_space, max_variable_size;
efi_status_t status;
- status = efivar_query_variable_info(attr, &storage_space, &remaining_space,
- &max_variable_size);
- if (status != EFI_SUCCESS)
- return efi_status_to_err(status);
+ /* Some UEFI firmware does not implement QueryVariableInfo() */
+ storage_space = remaining_space = 0;
+ if (efi_rt_services_supported(EFI_RT_SUPPORTED_QUERY_VARIABLE_INFO)) {
+ status = efivar_query_variable_info(attr, &storage_space,
+ &remaining_space,
+ &max_variable_size);
+ if (status != EFI_SUCCESS && status != EFI_UNSUPPORTED)
+ pr_warn_ratelimited("query_variable_info() failed: 0x%lx\n",
+ status);
+ }
/*
* This is not a normal filesystem, so no point in pretending it has a block
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 3789d22ba501..7844ab24b813 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -103,10 +103,9 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
i_uid_write(inode, (uid_t)be16_to_cpu(efs_inode->di_uid));
i_gid_write(inode, (gid_t)be16_to_cpu(efs_inode->di_gid));
inode->i_size = be32_to_cpu(efs_inode->di_size);
- inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime);
- inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime);
+ inode_set_atime(inode, be32_to_cpu(efs_inode->di_atime), 0);
+ inode_set_mtime(inode, be32_to_cpu(efs_inode->di_mtime), 0);
inode_set_ctime(inode, be32_to_cpu(efs_inode->di_ctime), 0);
- inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
/* this is the number of blocks in the file */
if (inode->i_size == 0) {
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0c2c99c58b5e..f6a0a1748521 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -222,7 +222,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
up_read(&devs->rwsem);
return 0;
}
- map->m_bdev = dif->bdev;
+ map->m_bdev = dif->bdev_handle->bdev;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
map->m_fscache = dif->fscache;
@@ -240,7 +240,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
if (map->m_pa >= startoff &&
map->m_pa < startoff + length) {
map->m_pa -= startoff;
- map->m_bdev = dif->bdev;
+ map->m_bdev = dif->bdev_handle->bdev;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
map->m_fscache = dif->fscache;
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 73091fbe3ea4..dee10d22ada9 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -217,9 +217,12 @@ again:
strm->buf.out_size = min_t(u32, outlen,
PAGE_SIZE - pageofs);
outlen -= strm->buf.out_size;
- if (!rq->out[no] && rq->fillgaps) /* deduped */
+ if (!rq->out[no] && rq->fillgaps) { /* deduped */
rq->out[no] = erofs_allocpage(pagepool,
GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(rq->out[no],
+ Z_EROFS_SHORTLIVED_PAGE);
+ }
if (rq->out[no])
strm->buf.out = kmap(rq->out[no]) + pageofs;
pageofs = 0;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index edc8ec7581b8..b8ad05b4509d 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -175,7 +175,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->chunkbits = sb->s_blocksize_bits +
(vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
}
- inode->i_mtime = inode->i_atime = inode_get_ctime(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_get_ctime(inode)));
inode->i_flags &= ~S_DAX;
if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4ff88d0dd980..cf04e21bfda2 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -47,7 +47,7 @@ typedef u32 erofs_blk_t;
struct erofs_device_info {
char *path;
struct erofs_fscache *fscache;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
struct dax_device *dax_dev;
u64 dax_part_off;
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 44a24d573f1f..6fd04781fec5 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -227,7 +227,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_fscache *fscache;
struct erofs_deviceslot *dis;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
void *ptr;
ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP);
@@ -235,7 +235,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
return PTR_ERR(ptr);
dis = ptr + erofs_blkoff(sb, *pos);
- if (!dif->path) {
+ if (!sbi->devs->flatdev && !dif->path) {
if (!dis->tag[0]) {
erofs_err(sb, "empty device tag @ pos %llu", *pos);
return -EINVAL;
@@ -251,13 +251,13 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
return PTR_ERR(fscache);
dif->fscache = fscache;
} else if (!sbi->devs->flatdev) {
- bdev = blkdev_get_by_path(dif->path, BLK_OPEN_READ, sb->s_type,
- NULL);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
- dif->bdev = bdev;
- dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
- NULL, NULL);
+ bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ,
+ sb->s_type, NULL);
+ if (IS_ERR(bdev_handle))
+ return PTR_ERR(bdev_handle);
+ dif->bdev_handle = bdev_handle;
+ dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev,
+ &dif->dax_part_off, NULL, NULL);
}
dif->blocks = le32_to_cpu(dis->blocks);
@@ -806,8 +806,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
struct erofs_device_info *dif = ptr;
fs_put_dax(dif->dax_dev, NULL);
- if (dif->bdev)
- blkdev_put(dif->bdev, &erofs_fs_type);
+ if (dif->bdev_handle)
+ bdev_release(dif->bdev_handle);
erofs_fscache_unregister_cookie(dif->fscache);
dif->fscache = NULL;
kfree(dif->path);
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 09d341675e89..b58316b49a43 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -168,7 +168,7 @@ const struct xattr_handler __maybe_unused erofs_xattr_security_handler = {
};
#endif
-const struct xattr_handler *erofs_xattr_handlers[] = {
+const struct xattr_handler * const erofs_xattr_handlers[] = {
&erofs_xattr_user_handler,
&erofs_xattr_trusted_handler,
#ifdef CONFIG_EROFS_FS_SECURITY
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index f16283cb8c93..b246cd0e135e 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -23,7 +23,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx,
{
const struct xattr_handler *handler = NULL;
- static const struct xattr_handler *xattr_handler_map[] = {
+ static const struct xattr_handler * const xattr_handler_map[] = {
[EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
#ifdef CONFIG_EROFS_FS_POSIX_ACL
[EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
@@ -44,7 +44,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx,
return xattr_prefix(handler);
}
-extern const struct xattr_handler *erofs_xattr_handlers[];
+extern const struct xattr_handler * const erofs_xattr_handlers[];
int erofs_xattr_prefixes_init(struct super_block *sb);
void erofs_xattr_prefixes_cleanup(struct super_block *sb);
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index f55498e5c23d..f78b614f44dc 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -549,6 +549,7 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
u8 tz, __le16 time, __le16 date, u8 time_cs);
void exfat_truncate_atime(struct timespec64 *ts);
+void exfat_truncate_inode_atime(struct inode *inode);
void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
u8 *tz, __le16 *time, __le16 *date, u8 *time_cs);
u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type);
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 32395ef686a2..30ee2c8d36a5 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -22,7 +22,7 @@ static int exfat_cont_expand(struct inode *inode, loff_t size)
if (err)
return err;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
if (!IS_SYNC(inode))
@@ -290,10 +290,9 @@ int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (attr->ia_valid & ATTR_SIZE)
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- setattr_copy(&nop_mnt_idmap, inode, attr);
- exfat_truncate_atime(&inode->i_atime);
+ exfat_truncate_inode_atime(inode);
if (attr->ia_valid & ATTR_SIZE) {
error = exfat_block_truncate_page(inode, attr->ia_size);
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 13329baeafbc..a2185e6f0548 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -26,6 +26,7 @@ int __exfat_write_inode(struct inode *inode, int sync)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
bool is_dir = (ei->type == TYPE_DIR) ? true : false;
+ struct timespec64 ts;
if (inode->i_ino == EXFAT_ROOT_INO)
return 0;
@@ -55,16 +56,18 @@ int __exfat_write_inode(struct inode *inode, int sync)
&ep->dentry.file.create_time,
&ep->dentry.file.create_date,
&ep->dentry.file.create_time_cs);
- exfat_set_entry_time(sbi, &inode->i_mtime,
- &ep->dentry.file.modify_tz,
- &ep->dentry.file.modify_time,
- &ep->dentry.file.modify_date,
- &ep->dentry.file.modify_time_cs);
- exfat_set_entry_time(sbi, &inode->i_atime,
- &ep->dentry.file.access_tz,
- &ep->dentry.file.access_time,
- &ep->dentry.file.access_date,
- NULL);
+ exfat_set_entry_time(sbi, &ts,
+ &ep->dentry.file.modify_tz,
+ &ep->dentry.file.modify_time,
+ &ep->dentry.file.modify_date,
+ &ep->dentry.file.modify_time_cs);
+ inode_set_mtime_to_ts(inode, ts);
+ exfat_set_entry_time(sbi, &ts,
+ &ep->dentry.file.access_tz,
+ &ep->dentry.file.access_time,
+ &ep->dentry.file.access_date,
+ NULL);
+ inode_set_atime_to_ts(inode, ts);
/* File size should be zero if there is no cluster allocated */
on_disk_size = i_size_read(inode);
@@ -355,7 +358,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
if (to > i_size_read(inode)) {
truncate_pagecache(inode, i_size_read(inode));
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
exfat_truncate(inode);
}
}
@@ -398,7 +401,7 @@ static int exfat_write_end(struct file *file, struct address_space *mapping,
exfat_write_failed(mapping, pos+len);
if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ei->attr |= ATTR_ARCHIVE;
mark_inode_dirty(inode);
}
@@ -576,10 +579,10 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
exfat_save_attr(inode, info->attr);
inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9;
- inode->i_mtime = info->mtime;
+ inode_set_mtime_to_ts(inode, info->mtime);
inode_set_ctime_to_ts(inode, info->mtime);
ei->i_crtime = info->crtime;
- inode->i_atime = info->atime;
+ inode_set_atime_to_ts(inode, info->atime);
return 0;
}
diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c
index 2e1a1a6b1021..fa8459828046 100644
--- a/fs/exfat/misc.c
+++ b/fs/exfat/misc.c
@@ -126,6 +126,14 @@ void exfat_truncate_atime(struct timespec64 *ts)
ts->tv_nsec = 0;
}
+void exfat_truncate_inode_atime(struct inode *inode)
+{
+ struct timespec64 atime = inode_get_atime(inode);
+
+ exfat_truncate_atime(&atime);
+ inode_set_atime_to_ts(inode, atime);
+}
+
u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type)
{
int i;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 1b9f587f6cca..b92e46916dea 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -569,7 +569,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -582,8 +582,9 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ EXFAT_I(inode)->i_crtime = simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
+
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -816,16 +817,16 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
ei->dir.dir = DIR_DELETED;
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
- exfat_truncate_atime(&dir->i_atime);
+ simple_inode_init_ts(dir);
+ exfat_truncate_inode_atime(dir);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
mark_inode_dirty(dir);
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
exfat_unhash_inode(inode);
exfat_d_version_set(dentry, inode_query_iversion(dir));
unlock:
@@ -851,7 +852,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -865,8 +866,8 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ EXFAT_I(inode)->i_crtime = simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -977,8 +978,8 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
ei->dir.dir = DIR_DELETED;
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
- exfat_truncate_atime(&dir->i_atime);
+ simple_inode_init_ts(dir);
+ exfat_truncate_inode_atime(dir);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -986,8 +987,8 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
drop_nlink(dir);
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
exfat_unhash_inode(inode);
exfat_d_version_set(dentry, inode_query_iversion(dir));
unlock:
@@ -1312,7 +1313,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
inode_inc_iversion(new_dir);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
EXFAT_I(new_dir)->i_crtime = current_time(new_dir);
- exfat_truncate_atime(&new_dir->i_atime);
+ exfat_truncate_inode_atime(new_dir);
if (IS_DIRSYNC(new_dir))
exfat_sync_inode(new_dir);
else
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 2778bd9b631e..e919a68bf4a1 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -370,8 +370,8 @@ static int exfat_read_root(struct inode *inode)
ei->i_size_ondisk = i_size_read(inode);
exfat_save_attr(inode, ATTR_SUBDIR);
- inode->i_mtime = inode->i_atime = ei->i_crtime = inode_set_ctime_current(inode);
- exfat_truncate_atime(&inode->i_atime);
+ ei->i_crtime = simple_inode_init_ts(inode);
+ exfat_truncate_inode_atime(inode);
return 0;
}
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index b335f17f682f..c7900868171b 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -468,7 +468,7 @@ int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
ext2_set_de_type(de, inode);
ext2_commit_chunk(page, pos, len);
if (update_times)
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
return ext2_handle_dirsync(dir);
@@ -555,7 +555,7 @@ got_it:
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type (de, inode);
ext2_commit_chunk(page, pos, rec_len);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
err = ext2_handle_dirsync(dir);
@@ -606,7 +606,7 @@ int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page)
pde->rec_len = ext2_rec_len_to_disk(to - from);
dir->inode = 0;
ext2_commit_chunk(page, pos, to - from);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(inode);
return ext2_handle_dirsync(inode);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index c24d0de95a83..fdf63e9c6e7c 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -546,7 +546,7 @@ got:
inode->i_ino = ino;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_flags =
ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 314b415ee518..464faf6c217e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1291,7 +1291,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
__ext2_truncate_blocks(inode, newsize);
filemap_invalidate_unlock(inode->i_mapping);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (inode_needs_sync(inode)) {
sync_mapping_buffers(inode->i_mapping);
sync_inode_metadata(inode, 1);
@@ -1412,10 +1412,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
i_gid_write(inode, i_gid);
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le32_to_cpu(raw_inode->i_size);
- inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
+ inode_set_atime(inode, (signed)le32_to_cpu(raw_inode->i_atime), 0);
inode_set_ctime(inode, (signed)le32_to_cpu(raw_inode->i_ctime), 0);
- inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
- inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
+ inode_set_mtime(inode, (signed)le32_to_cpu(raw_inode->i_mtime), 0);
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
* This is needed because nfsd might try to access dead inodes
@@ -1544,9 +1543,9 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
}
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
raw_inode->i_size = cpu_to_le32(inode->i_size);
- raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
- raw_inode->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec);
- raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+ raw_inode->i_atime = cpu_to_le32(inode_get_atime_sec(inode));
+ raw_inode->i_ctime = cpu_to_le32(inode_get_ctime_sec(inode));
+ raw_inode->i_mtime = cpu_to_le32(inode_get_mtime_sec(inode));
raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index aaf3e3e88cb2..645ee6142f69 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1572,7 +1572,7 @@ out:
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
inode_inc_iversion(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
return len - towrite;
}
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 20f741184673..e849241ebb8f 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -98,7 +98,7 @@ static struct buffer_head *ext2_xattr_cache_find(struct inode *,
static void ext2_xattr_rehash(struct ext2_xattr_header *,
struct ext2_xattr_entry *);
-static const struct xattr_handler *ext2_xattr_handler_map[] = {
+static const struct xattr_handler * const ext2_xattr_handler_map[] = {
[EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
#ifdef CONFIG_EXT2_FS_POSIX_ACL
[EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
@@ -110,7 +110,7 @@ static const struct xattr_handler *ext2_xattr_handler_map[] = {
#endif
};
-const struct xattr_handler *ext2_xattr_handlers[] = {
+const struct xattr_handler * const ext2_xattr_handlers[] = {
&ext2_xattr_user_handler,
&ext2_xattr_trusted_handler,
#ifdef CONFIG_EXT2_FS_SECURITY
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 7925f596e8e2..6a4966949047 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -72,7 +72,7 @@ extern void ext2_xattr_delete_inode(struct inode *);
extern struct mb_cache *ext2_xattr_create_cache(void);
extern void ext2_xattr_destroy_cache(struct mb_cache *cache);
-extern const struct xattr_handler *ext2_xattr_handlers[];
+extern const struct xattr_handler * const ext2_xattr_handlers[];
# else /* CONFIG_EXT2_FS_XATTR */
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 453d4da5de52..7ae0b61258a7 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -232,19 +232,14 @@ static bool ext4_has_stable_inodes(struct super_block *sb)
return ext4_has_feature_stable_inodes(sb);
}
-static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
- int *ino_bits_ret, int *lblk_bits_ret)
-{
- *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
- *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
-}
-
const struct fscrypt_operations ext4_cryptops = {
- .key_prefix = "ext4:",
+ .needs_bounce_pages = 1,
+ .has_32bit_inodes = 1,
+ .supports_subblock_data_units = 1,
+ .legacy_key_prefix = "ext4:",
.get_context = ext4_get_context,
.set_context = ext4_set_context,
.get_dummy_policy = ext4_get_dummy_policy,
.empty_dir = ext4_empty_dir,
.has_stable_inodes = ext4_has_stable_inodes,
- .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
};
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9418359b1d9d..8da5fb680210 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -891,10 +891,13 @@ do { \
(raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \
} while (0)
-#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
- EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, (inode)->xtime)
+#define EXT4_INODE_SET_ATIME(inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode))
-#define EXT4_INODE_SET_CTIME(inode, raw_inode) \
+#define EXT4_INODE_SET_MTIME(inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode))
+
+#define EXT4_INODE_SET_CTIME(inode, raw_inode) \
EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode))
#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
@@ -910,9 +913,16 @@ do { \
.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \
})
-#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \
+#define EXT4_INODE_GET_ATIME(inode, raw_inode) \
+do { \
+ inode_set_atime_to_ts(inode, \
+ EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \
+} while (0)
+
+#define EXT4_INODE_GET_MTIME(inode, raw_inode) \
do { \
- (inode)->xtime = EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode); \
+ inode_set_mtime_to_ts(inode, \
+ EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \
} while (0)
#define EXT4_INODE_GET_CTIME(inode, raw_inode) \
@@ -1537,7 +1547,7 @@ struct ext4_sb_info {
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
- struct block_device *s_journal_bdev;
+ struct bdev_handle *s_journal_bdev_handle;
#ifdef CONFIG_QUOTA
/* Names of quota files with journalled quota */
char __rcu *s_qf_names[EXT4_MAXQUOTAS];
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 202c76996b62..4c4176ee1749 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4481,7 +4481,8 @@ retry:
if (epos > new_size)
epos = new_size;
if (ext4_update_inode_size(inode, epos) & 0x1)
- inode->i_mtime = inode_get_ctime(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_get_ctime(inode));
}
ret2 = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -4617,7 +4618,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
/* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags);
@@ -4642,7 +4643,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
goto out_mutex;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (new_size)
ext4_update_inode_size(inode, new_size);
ret = ext4_mark_inode_dirty(handle, inode);
@@ -5378,7 +5379,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -5488,7 +5489,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
EXT4_I(inode)->i_disksize += len;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
goto out_stop;
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index cdf9bfe10137..11e6f33677a2 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -576,8 +576,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
return true;
- if (EXT4_SB(sb)->s_journal_bdev &&
- fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev))
+ if (EXT4_SB(sb)->s_journal_bdev_handle &&
+ fm->fmr_device ==
+ new_encode_dev(EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev))
return true;
return false;
}
@@ -647,9 +648,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
memset(handlers, 0, sizeof(handlers));
handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
handlers[0].gfd_fn = ext4_getfsmap_datadev;
- if (EXT4_SB(sb)->s_journal_bdev) {
+ if (EXT4_SB(sb)->s_journal_bdev_handle) {
handlers[1].gfd_dev = new_encode_dev(
- EXT4_SB(sb)->s_journal_bdev->bd_dev);
+ EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev);
handlers[1].gfd_fn = ext4_getfsmap_logdev;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b65058d972f9..e9bbb1da2d0a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1250,8 +1250,8 @@ got:
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- ei->i_crtime = inode->i_mtime;
+ simple_inode_init_ts(inode);
+ ei->i_crtime = inode_get_mtime(inode);
memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_dir_start_lookup = 0;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 012d9259ff53..9a84a5f9fef4 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1037,7 +1037,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
return 1;
@@ -1991,7 +1991,7 @@ out:
ext4_orphan_del(handle, inode);
if (err == 0) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err = ext4_mark_inode_dirty(handle, inode);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4ce35f1c8b0a..08cb5c0e0d51 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4020,7 +4020,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret2))
ret = ret2;
@@ -4180,7 +4180,7 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(err2 && !err))
err = err2;
@@ -4284,8 +4284,8 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
EXT4_INODE_SET_CTIME(inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+ EXT4_INODE_SET_MTIME(inode, raw_inode);
+ EXT4_INODE_SET_ATIME(inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
@@ -4893,8 +4893,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
}
EXT4_INODE_GET_CTIME(inode, raw_inode);
- EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
+ EXT4_INODE_GET_ATIME(inode, raw_inode);
+ EXT4_INODE_GET_MTIME(inode, raw_inode);
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
@@ -5019,8 +5019,8 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
spin_lock(&ei->i_raw_lock);
EXT4_INODE_SET_CTIME(inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+ EXT4_INODE_SET_MTIME(inode, raw_inode);
+ EXT4_INODE_SET_ATIME(inode, raw_inode);
ext4_inode_csum_set(inode, raw_inode, ei);
spin_unlock(&ei->i_raw_lock);
trace_ext4_other_inode_update_time(inode, orig_ino);
@@ -5413,7 +5413,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
* update c/mtime in shrink case below
*/
if (!shrink)
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
if (shrink)
ext4_fc_track_range(handle, inode,
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0bfe2ce589e2..4f931f80cb34 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -312,13 +312,22 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
struct ext4_inode_info *ei1;
struct ext4_inode_info *ei2;
unsigned long tmp;
+ struct timespec64 ts1, ts2;
ei1 = EXT4_I(inode1);
ei2 = EXT4_I(inode2);
swap(inode1->i_version, inode2->i_version);
- swap(inode1->i_atime, inode2->i_atime);
- swap(inode1->i_mtime, inode2->i_mtime);
+
+ ts1 = inode_get_atime(inode1);
+ ts2 = inode_get_atime(inode2);
+ inode_set_atime_to_ts(inode1, ts2);
+ inode_set_atime_to_ts(inode2, ts1);
+
+ ts1 = inode_get_mtime(inode1);
+ ts2 = inode_get_mtime(inode2);
+ inode_set_mtime_to_ts(inode1, ts2);
+ inode_set_mtime_to_ts(inode2, ts1);
memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c91db9f57524..1e599305d85f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/backing-dev.h>
+#include <linux/freezer.h>
#include <trace/events/ext4.h>
/*
@@ -6906,6 +6907,21 @@ __acquires(bitlock)
return ret;
}
+static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
+ ext4_group_t grp)
+{
+ if (grp < ext4_get_groups_count(sb))
+ return EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+ return (ext4_blocks_count(EXT4_SB(sb)->s_es) -
+ ext4_group_first_block_no(sb, grp) - 1) >>
+ EXT4_CLUSTER_BITS(sb);
+}
+
+static bool ext4_trim_interrupted(void)
+{
+ return fatal_signal_pending(current) || freezing(current);
+}
+
static int ext4_try_to_trim_range(struct super_block *sb,
struct ext4_buddy *e4b, ext4_grpblk_t start,
ext4_grpblk_t max, ext4_grpblk_t minblocks)
@@ -6913,9 +6929,12 @@ __acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
{
ext4_grpblk_t next, count, free_count;
+ bool set_trimmed = false;
void *bitmap;
bitmap = e4b->bd_bitmap;
+ if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group))
+ set_trimmed = true;
start = max(e4b->bd_info->bb_first_free, start);
count = 0;
free_count = 0;
@@ -6930,16 +6949,14 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
int ret = ext4_trim_extent(sb, start, next - start, e4b);
if (ret && ret != -EOPNOTSUPP)
- break;
+ return count;
count += next - start;
}
free_count += next - start;
start = next + 1;
- if (fatal_signal_pending(current)) {
- count = -ERESTARTSYS;
- break;
- }
+ if (ext4_trim_interrupted())
+ return count;
if (need_resched()) {
ext4_unlock_group(sb, e4b->bd_group);
@@ -6951,6 +6968,9 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
break;
}
+ if (set_trimmed)
+ EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);
+
return count;
}
@@ -6961,7 +6981,6 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
* @start: first group block to examine
* @max: last group block to examine
* @minblocks: minimum extent block count
- * @set_trimmed: set the trimmed flag if at least one block is trimmed
*
* ext4_trim_all_free walks through group's block bitmap searching for free
* extents. When the free extent is found, mark it as used in group buddy
@@ -6971,7 +6990,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_grpblk_t start, ext4_grpblk_t max,
- ext4_grpblk_t minblocks, bool set_trimmed)
+ ext4_grpblk_t minblocks)
{
struct ext4_buddy e4b;
int ret;
@@ -6988,13 +7007,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_lock_group(sb, group);
if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
- minblocks < EXT4_SB(sb)->s_last_trim_minblks) {
+ minblocks < EXT4_SB(sb)->s_last_trim_minblks)
ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
- if (ret >= 0 && set_trimmed)
- EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
- } else {
+ else
ret = 0;
- }
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
@@ -7027,7 +7043,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
ext4_fsblk_t first_data_blk =
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
- bool whole_group, eof = false;
int ret = 0;
start = range->start >> sb->s_blocksize_bits;
@@ -7046,10 +7061,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
goto out;
}
- if (end >= max_blks - 1) {
+ if (end >= max_blks - 1)
end = max_blks - 1;
- eof = true;
- }
if (end <= first_data_blk)
goto out;
if (start < first_data_blk)
@@ -7063,9 +7076,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
/* end now represents the last cluster to discard in this group */
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
- whole_group = true;
for (group = first_group; group <= last_group; group++) {
+ if (ext4_trim_interrupted())
+ break;
grp = ext4_get_group_info(sb, group);
if (!grp)
continue;
@@ -7082,13 +7096,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
* change it for the last group, note that last_cluster is
* already computed earlier by ext4_get_group_no_and_offset()
*/
- if (group == last_group) {
+ if (group == last_group)
end = last_cluster;
- whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1;
- }
if (grp->bb_free >= minlen) {
cnt = ext4_trim_all_free(sb, group, first_cluster,
- end, minlen, whole_group);
+ end, minlen);
if (cnt < 0) {
ret = cnt;
break;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 41a6411c600b..057d74467293 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -343,17 +343,17 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
+ int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
#ifdef PARANOID
struct ext4_dir_entry *d, *top;
d = (struct ext4_dir_entry *)bh->b_data;
top = (struct ext4_dir_entry *)(bh->b_data +
- (EXT4_BLOCK_SIZE(inode->i_sb) -
- sizeof(struct ext4_dir_entry_tail)));
- while (d < top && d->rec_len)
+ (blocksize - sizeof(struct ext4_dir_entry_tail)));
+ while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize))
d = (struct ext4_dir_entry *)(((void *)d) +
- le16_to_cpu(d->rec_len));
+ ext4_rec_len_from_disk(d->rec_len, blocksize));
if (d != top)
return NULL;
@@ -364,7 +364,8 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
#endif
if (t->det_reserved_zero1 ||
- le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
+ (ext4_rec_len_from_disk(t->det_rec_len, blocksize) !=
+ sizeof(struct ext4_dir_entry_tail)) ||
t->det_reserved_zero2 ||
t->det_reserved_ft != EXT4_FT_DIR_CSUM)
return NULL;
@@ -445,13 +446,14 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
struct ext4_dir_entry *dp;
struct dx_root_info *root;
int count_offset;
+ int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
+ unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize);
- if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
+ if (rlen == blocksize)
count_offset = 8;
- else if (le16_to_cpu(dirent->rec_len) == 12) {
+ else if (rlen == 12) {
dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
- if (le16_to_cpu(dp->rec_len) !=
- EXT4_BLOCK_SIZE(inode->i_sb) - 12)
+ if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12)
return NULL;
root = (struct dx_root_info *)(((void *)dp + 12));
if (root->reserved_zero ||
@@ -1315,6 +1317,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh,
unsigned int buflen = bh->b_size;
char *base = bh->b_data;
struct dx_hash_info h = *hinfo;
+ int blocksize = EXT4_BLOCK_SIZE(dir->i_sb);
if (ext4_has_metadata_csum(dir->i_sb))
buflen -= sizeof(struct ext4_dir_entry_tail);
@@ -1335,11 +1338,12 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh,
map_tail--;
map_tail->hash = h.hash;
map_tail->offs = ((char *) de - base)>>2;
- map_tail->size = le16_to_cpu(de->rec_len);
+ map_tail->size = ext4_rec_len_from_disk(de->rec_len,
+ blocksize);
count++;
cond_resched();
}
- de = ext4_next_entry(de, dir->i_sb->s_blocksize);
+ de = ext4_next_entry(de, blocksize);
}
return count;
}
@@ -2203,7 +2207,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
err2 = ext4_mark_inode_dirty(handle, dir);
@@ -3198,7 +3202,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
* recovery. */
inode->i_size = 0;
ext4_orphan_add(handle, inode);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_set_ctime_current(inode);
retval = ext4_mark_inode_dirty(handle, inode);
if (retval)
@@ -3273,7 +3277,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto out_handle;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
@@ -3644,7 +3648,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
if (ext4_has_feature_filetype(ent->dir->i_sb))
ent->de->file_type = file_type;
inode_inc_iversion(ent->dir);
- ent->dir->i_mtime = inode_set_ctime_current(ent->dir);
+ inode_set_mtime_to_ts(ent->dir, inode_set_ctime_current(ent->dir));
retval = ext4_mark_inode_dirty(handle, ent->dir);
BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
if (!ent->inlined) {
@@ -3959,7 +3963,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
ext4_dec_count(new.inode);
inode_set_ctime_current(new.inode);
}
- old.dir->i_mtime = inode_set_ctime_current(old.dir);
+ inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir));
ext4_update_dx_flag(old.dir);
if (old.dir_bh) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 38217422f938..42a44990d99c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1351,14 +1351,14 @@ static void ext4_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
- if (sbi->s_journal_bdev) {
+ if (sbi->s_journal_bdev_handle) {
/*
* Invalidate the journal device's buffers. We don't want them
* floating about in memory - the physical journal device may
* hotswapped, and it breaks the `ro-after' testing code.
*/
- sync_blockdev(sbi->s_journal_bdev);
- invalidate_bdev(sbi->s_journal_bdev);
+ sync_blockdev(sbi->s_journal_bdev_handle->bdev);
+ invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
}
ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
@@ -4233,7 +4233,7 @@ int ext4_calculate_overhead(struct super_block *sb)
* Add the internal journal blocks whether the journal has been
* loaded or not
*/
- if (sbi->s_journal && !sbi->s_journal_bdev)
+ if (sbi->s_journal && !sbi->s_journal_bdev_handle)
overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
/* j_inum for internal journal is non-zero */
@@ -5670,9 +5670,9 @@ failed_mount:
#endif
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
brelse(sbi->s_sbh);
- if (sbi->s_journal_bdev) {
- invalidate_bdev(sbi->s_journal_bdev);
- blkdev_put(sbi->s_journal_bdev, sb);
+ if (sbi->s_journal_bdev_handle) {
+ invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
+ bdev_release(sbi->s_journal_bdev_handle);
}
out_fail:
invalidate_bdev(sb->s_bdev);
@@ -5842,12 +5842,13 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb,
return journal;
}
-static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
+static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb,
dev_t j_dev, ext4_fsblk_t *j_start,
ext4_fsblk_t *j_len)
{
struct buffer_head *bh;
struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
int hblock, blocksize;
ext4_fsblk_t sb_block;
unsigned long offset;
@@ -5856,16 +5857,17 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
/* see get_tree_bdev why this is needed and safe */
up_write(&sb->s_umount);
- bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb,
- &fs_holder_ops);
+ bdev_handle = bdev_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ sb, &fs_holder_ops);
down_write(&sb->s_umount);
- if (IS_ERR(bdev)) {
+ if (IS_ERR(bdev_handle)) {
ext4_msg(sb, KERN_ERR,
"failed to open journal device unknown-block(%u,%u) %ld",
- MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev));
- return ERR_CAST(bdev);
+ MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_handle));
+ return bdev_handle;
}
+ bdev = bdev_handle->bdev;
blocksize = sb->s_blocksize;
hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
@@ -5912,12 +5914,12 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
*j_start = sb_block + 1;
*j_len = ext4_blocks_count(es);
brelse(bh);
- return bdev;
+ return bdev_handle;
out_bh:
brelse(bh);
out_bdev:
- blkdev_put(bdev, sb);
+ bdev_release(bdev_handle);
return ERR_PTR(errno);
}
@@ -5927,14 +5929,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
journal_t *journal;
ext4_fsblk_t j_start;
ext4_fsblk_t j_len;
- struct block_device *journal_bdev;
+ struct bdev_handle *bdev_handle;
int errno = 0;
- journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
- if (IS_ERR(journal_bdev))
- return ERR_CAST(journal_bdev);
+ bdev_handle = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
+ if (IS_ERR(bdev_handle))
+ return ERR_CAST(bdev_handle);
- journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start,
+ journal = jbd2_journal_init_dev(bdev_handle->bdev, sb->s_bdev, j_start,
j_len, sb->s_blocksize);
if (IS_ERR(journal)) {
ext4_msg(sb, KERN_ERR, "failed to create device journal");
@@ -5949,14 +5951,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
goto out_journal;
}
journal->j_private = sb;
- EXT4_SB(sb)->s_journal_bdev = journal_bdev;
+ EXT4_SB(sb)->s_journal_bdev_handle = bdev_handle;
ext4_init_journal_params(sb, journal);
return journal;
out_journal:
jbd2_journal_destroy(journal);
out_bdev:
- blkdev_put(journal_bdev, sb);
+ bdev_release(bdev_handle);
return ERR_PTR(errno);
}
@@ -7127,7 +7129,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
}
EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
err = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
out_unlock:
@@ -7300,12 +7302,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
static void ext4_kill_sb(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL;
+ struct bdev_handle *handle = sbi ? sbi->s_journal_bdev_handle : NULL;
kill_block_super(sb);
- if (journal_bdev)
- blkdev_put(journal_bdev, sb);
+ if (handle)
+ bdev_release(handle);
}
static struct file_system_type ext4_fs_type = {
@@ -7314,7 +7316,7 @@ static struct file_system_type ext4_fs_type = {
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
.kill_sb = ext4_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ext4");
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 92ba28cebac6..82dc5e673d5c 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -98,7 +98,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = {
[EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler,
};
-const struct xattr_handler *ext4_xattr_handlers[] = {
+const struct xattr_handler * const ext4_xattr_handlers[] = {
&ext4_xattr_user_handler,
&ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_SECURITY
@@ -356,7 +356,7 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
{
- return ((u64) inode_get_ctime(ea_inode).tv_sec << 32) |
+ return ((u64) inode_get_ctime_sec(ea_inode) << 32) |
(u32) inode_peek_iversion_raw(ea_inode);
}
@@ -368,12 +368,12 @@ static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
{
- return (u32)ea_inode->i_atime.tv_sec;
+ return (u32) inode_get_atime_sec(ea_inode);
}
static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
{
- ea_inode->i_atime.tv_sec = hash;
+ inode_set_atime(ea_inode, hash, 0);
}
/*
@@ -418,7 +418,7 @@ free_bhs:
return ret;
}
-#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
+#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode)))
static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
u32 ea_inode_hash, struct inode **ea_inode)
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 824faf0b15a8..bd97c4aa8177 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -193,7 +193,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
extern void ext4_evict_ea_inode(struct inode *inode);
-extern const struct xattr_handler *ext4_xattr_handlers[];
+extern const struct xattr_handler * const ext4_xattr_handlers[];
extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 8aa29fe2e87b..042593aed1ec 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -455,7 +455,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
de->file_type = fs_umode_to_ftype(inode->i_mode);
set_page_dirty(page);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
f2fs_put_page(page, 1);
}
@@ -609,7 +609,7 @@ void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode,
f2fs_i_links_write(dir, true);
clear_inode_flag(inode, FI_NEW_INODE);
}
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
if (F2FS_I(dir)->i_current_depth != current_depth)
@@ -919,7 +919,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
}
f2fs_put_page(page, 1);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6d688e42d89c..9043cedfa12b 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1234,6 +1234,7 @@ struct f2fs_bio_info {
#define FDEV(i) (sbi->devs[i])
#define RDEV(i) (raw_super->devs[i])
struct f2fs_dev_info {
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
char path[MAX_PATH_LEN];
unsigned int total_segments;
@@ -3317,13 +3318,15 @@ static inline void clear_file(struct inode *inode, int type)
static inline bool f2fs_is_time_consistent(struct inode *inode)
{
- struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 ts = inode_get_atime(inode);
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &ts))
return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ctime))
+ ts = inode_get_ctime(inode);
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ts))
return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime))
+ ts = inode_get_mtime(inode);
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &ts))
return false;
return true;
}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ca5904129b16..dd99abbb7186 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -798,7 +798,7 @@ int f2fs_truncate(struct inode *inode)
if (err)
return err;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
f2fs_mark_inode_dirty_sync(inode, false);
return 0;
}
@@ -905,9 +905,9 @@ static void __setattr_copy(struct mnt_idmap *idmap,
i_uid_update(idmap, attr, inode);
i_gid_update(idmap, attr, inode);
if (ia_valid & ATTR_ATIME)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
if (ia_valid & ATTR_MTIME)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
if (ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
@@ -1012,7 +1012,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
return err;
spin_lock(&F2FS_I(inode)->i_size_lock);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
F2FS_I(inode)->last_disk_size = i_size_read(inode);
spin_unlock(&F2FS_I(inode)->i_size_lock);
}
@@ -1840,7 +1840,7 @@ static long f2fs_fallocate(struct file *file, int mode,
}
if (!ret) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
f2fs_mark_inode_dirty_sync(inode, false);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
@@ -2888,10 +2888,10 @@ out_src:
if (ret)
goto out_unlock;
- src->i_mtime = inode_set_ctime_current(src);
+ inode_set_mtime_to_ts(src, inode_set_ctime_current(src));
f2fs_mark_inode_dirty_sync(src, false);
if (src != dst) {
- dst->i_mtime = inode_set_ctime_current(dst);
+ inode_set_mtime_to_ts(dst, inode_set_ctime_current(dst));
f2fs_mark_inode_dirty_sync(dst, false);
}
f2fs_update_time(sbi, REQ_TIME);
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 2fe25619ccb5..ac00423f117b 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -699,7 +699,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
set_page_dirty(page);
f2fs_put_page(page, 1);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index cde243840abd..5779c7edd49b 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -386,9 +386,9 @@ static void init_idisk_time(struct inode *inode)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
- fi->i_disk_time[0] = inode->i_atime;
+ fi->i_disk_time[0] = inode_get_atime(inode);
fi->i_disk_time[1] = inode_get_ctime(inode);
- fi->i_disk_time[2] = inode->i_mtime;
+ fi->i_disk_time[2] = inode_get_mtime(inode);
}
static int do_read_inode(struct inode *inode)
@@ -417,12 +417,12 @@ static int do_read_inode(struct inode *inode)
inode->i_size = le64_to_cpu(ri->i_size);
inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1);
- inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime);
+ inode_set_atime(inode, le64_to_cpu(ri->i_atime),
+ le32_to_cpu(ri->i_atime_nsec));
inode_set_ctime(inode, le64_to_cpu(ri->i_ctime),
le32_to_cpu(ri->i_ctime_nsec));
- inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
- inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
+ inode_set_mtime(inode, le64_to_cpu(ri->i_mtime),
+ le32_to_cpu(ri->i_mtime_nsec));
inode->i_generation = le32_to_cpu(ri->i_generation);
if (S_ISDIR(inode->i_mode))
fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
@@ -698,12 +698,12 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
}
set_raw_inline(inode, ri);
- ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
- ri->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
- ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
- ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ri->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
+ ri->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ ri->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode));
+ ri->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
+ ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ ri->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
if (S_ISDIR(inode->i_mode))
ri->i_current_depth =
cpu_to_le32(F2FS_I(inode)->i_current_depth);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 193b22a2d6bf..d0053b0284d8 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -243,8 +243,8 @@ static struct inode *f2fs_new_inode(struct mnt_idmap *idmap,
inode->i_ino = ino;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- F2FS_I(inode)->i_crtime = inode->i_mtime;
+ simple_inode_init_ts(inode);
+ F2FS_I(inode)->i_crtime = inode_get_mtime(inode);
inode->i_generation = get_random_u32();
if (S_ISDIR(inode->i_mode))
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 7be60df277a5..b56d0f1078a7 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -320,12 +320,12 @@ static int recover_inode(struct inode *inode, struct page *page)
}
f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
- inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);
+ inode_set_atime(inode, le64_to_cpu(raw->i_atime),
+ le32_to_cpu(raw->i_atime_nsec));
inode_set_ctime(inode, le64_to_cpu(raw->i_ctime),
le32_to_cpu(raw->i_ctime_nsec));
- inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec);
- inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+ inode_set_mtime(inode, le64_to_cpu(raw->i_mtime),
+ le32_to_cpu(raw->i_mtime_nsec));
F2FS_I(inode)->i_advise = raw->i_advise;
F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a8c8232852bb..be17d77513d5 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1562,7 +1562,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
for (i = 0; i < sbi->s_ndevs; i++) {
if (i > 0)
- blkdev_put(FDEV(i).bdev, sbi->sb);
+ bdev_release(FDEV(i).bdev_handle);
#ifdef CONFIG_BLK_DEV_ZONED
kvfree(FDEV(i).blkz_seq);
#endif
@@ -2710,7 +2710,7 @@ retry:
if (len == towrite)
return err;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
f2fs_mark_inode_dirty_sync(inode, false);
return len - towrite;
}
@@ -3203,13 +3203,6 @@ static bool f2fs_has_stable_inodes(struct super_block *sb)
return true;
}
-static void f2fs_get_ino_and_lblk_bits(struct super_block *sb,
- int *ino_bits_ret, int *lblk_bits_ret)
-{
- *ino_bits_ret = 8 * sizeof(nid_t);
- *lblk_bits_ret = 8 * sizeof(block_t);
-}
-
static struct block_device **f2fs_get_devices(struct super_block *sb,
unsigned int *num_devs)
{
@@ -3231,13 +3224,15 @@ static struct block_device **f2fs_get_devices(struct super_block *sb,
}
static const struct fscrypt_operations f2fs_cryptops = {
- .key_prefix = "f2fs:",
+ .needs_bounce_pages = 1,
+ .has_32bit_inodes = 1,
+ .supports_subblock_data_units = 1,
+ .legacy_key_prefix = "f2fs:",
.get_context = f2fs_get_context,
.set_context = f2fs_set_context,
.get_dummy_policy = f2fs_get_dummy_policy,
.empty_dir = f2fs_empty_dir,
.has_stable_inodes = f2fs_has_stable_inodes,
- .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits,
.get_devices = f2fs_get_devices,
};
#endif
@@ -4198,7 +4193,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
for (i = 0; i < max_devices; i++) {
if (i == 0)
- FDEV(0).bdev = sbi->sb->s_bdev;
+ FDEV(0).bdev_handle = sbi->sb->s_bdev_handle;
else if (!RDEV(i).path[0])
break;
@@ -4218,13 +4213,14 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
FDEV(i).end_blk = FDEV(i).start_blk +
(FDEV(i).total_segments <<
sbi->log_blocks_per_seg) - 1;
- FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path,
- mode, sbi->sb, NULL);
+ FDEV(i).bdev_handle = bdev_open_by_path(
+ FDEV(i).path, mode, sbi->sb, NULL);
}
}
- if (IS_ERR(FDEV(i).bdev))
- return PTR_ERR(FDEV(i).bdev);
+ if (IS_ERR(FDEV(i).bdev_handle))
+ return PTR_ERR(FDEV(i).bdev_handle);
+ FDEV(i).bdev = FDEV(i).bdev_handle->bdev;
/* to release errored devices */
sbi->s_ndevs = i + 1;
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index a657284faee3..4314456854f6 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -189,7 +189,7 @@ const struct xattr_handler f2fs_xattr_security_handler = {
.set = f2fs_xattr_generic_set,
};
-static const struct xattr_handler *f2fs_xattr_handler_map[] = {
+static const struct xattr_handler * const f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
#ifdef CONFIG_F2FS_FS_POSIX_ACL
[F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
@@ -202,7 +202,7 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
};
-const struct xattr_handler *f2fs_xattr_handlers[] = {
+const struct xattr_handler * const f2fs_xattr_handlers[] = {
&f2fs_xattr_user_handler,
&f2fs_xattr_trusted_handler,
#ifdef CONFIG_F2FS_FS_SECURITY
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index b1811c392e6f..a005ffdcf717 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -125,7 +125,7 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler;
extern const struct xattr_handler f2fs_xattr_advise_handler;
extern const struct xattr_handler f2fs_xattr_security_handler;
-extern const struct xattr_handler *f2fs_xattr_handlers[];
+extern const struct xattr_handler * const f2fs_xattr_handlers[];
extern int f2fs_setxattr(struct inode *, int, const char *,
const void *, size_t, struct page *, int);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index cdd39b6020f3..1fac3dabf130 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -512,6 +512,7 @@ static int fat_validate_dir(struct inode *dir)
int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
{
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ struct timespec64 mtime;
int error;
MSDOS_I(inode)->i_pos = 0;
@@ -561,14 +562,18 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
& ~((loff_t)sbi->cluster_size - 1)) >> 9;
- fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
- inode_set_ctime_to_ts(inode, inode->i_mtime);
+ fat_time_fat2unix(sbi, &mtime, de->time, de->date, 0);
+ inode_set_mtime_to_ts(inode, mtime);
+ inode_set_ctime_to_ts(inode, mtime);
if (sbi->options.isvfat) {
- fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
+ struct timespec64 atime;
+
+ fat_time_fat2unix(sbi, &atime, 0, de->adate, 0);
+ inode_set_atime_to_ts(inode, atime);
fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime,
de->cdate, de->ctime_cs);
} else
- inode->i_atime = fat_truncate_atime(sbi, &inode->i_mtime);
+ inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, &mtime));
return 0;
}
@@ -849,6 +854,7 @@ static int __fat_write_inode(struct inode *inode, int wait)
struct msdos_sb_info *sbi = MSDOS_SB(sb);
struct buffer_head *bh;
struct msdos_dir_entry *raw_entry;
+ struct timespec64 mtime;
loff_t i_pos;
sector_t blocknr;
int err, offset;
@@ -882,12 +888,14 @@ retry:
raw_entry->size = cpu_to_le32(inode->i_size);
raw_entry->attr = fat_make_attrs(inode);
fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart);
- fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
+ mtime = inode_get_mtime(inode);
+ fat_time_unix2fat(sbi, &mtime, &raw_entry->time,
&raw_entry->date, NULL);
if (sbi->options.isvfat) {
+ struct timespec64 ts = inode_get_atime(inode);
__le16 atime;
- fat_time_unix2fat(sbi, &inode->i_atime, &atime,
- &raw_entry->adate, NULL);
+
+ fat_time_unix2fat(sbi, &ts, &atime, &raw_entry->adate, NULL);
fat_time_unix2fat(sbi, &MSDOS_I(inode)->i_crtime, &raw_entry->ctime,
&raw_entry->cdate, &raw_entry->ctime_cs);
}
@@ -1407,7 +1415,8 @@ static int fat_read_root(struct inode *inode)
MSDOS_I(inode)->mmu_private = inode->i_size;
fat_save_attrs(inode, ATTR_DIR);
- inode->i_mtime = inode->i_atime = inode_set_ctime(inode, 0, 0);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime(inode, 0, 0)));
set_nlink(inode, fat_subdirs(inode)+2);
return 0;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index f2304a1054aa..c7a2d27120ba 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -325,15 +325,15 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
}
if (flags & S_ATIME)
- inode->i_atime = fat_truncate_atime(sbi, now);
+ inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, now));
/*
* ctime and mtime share the same on-disk field, and should be
* identical in memory. all mtime updates will be applied to ctime,
* but ctime updates are ignored.
*/
if (flags & S_MTIME)
- inode->i_mtime = inode_set_ctime_to_ts(inode,
- fat_truncate_mtime(sbi, now));
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_to_ts(inode, fat_truncate_mtime(sbi, now)));
return 0;
}
diff --git a/fs/file.c b/fs/file.c
index 3e4a4dfa38fc..5fb0b146e79e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -604,6 +604,9 @@ void fd_install(unsigned int fd, struct file *file)
struct files_struct *files = current->files;
struct fdtable *fdt;
+ if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
+ return;
+
rcu_read_lock_sched();
if (unlikely(files->resize_in_progress)) {
@@ -853,8 +856,104 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
}
+static struct file *__get_file_rcu(struct file __rcu **f)
+{
+ struct file __rcu *file;
+ struct file __rcu *file_reloaded;
+ struct file __rcu *file_reloaded_cmp;
+
+ file = rcu_dereference_raw(*f);
+ if (!file)
+ return NULL;
+
+ if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+ return ERR_PTR(-EAGAIN);
+
+ file_reloaded = rcu_dereference_raw(*f);
+
+ /*
+ * Ensure that all accesses have a dependency on the load from
+ * rcu_dereference_raw() above so we get correct ordering
+ * between reuse/allocation and the pointer check below.
+ */
+ file_reloaded_cmp = file_reloaded;
+ OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
+
+ /*
+ * atomic_long_inc_not_zero() above provided a full memory
+ * barrier when we acquired a reference.
+ *
+ * This is paired with the write barrier from assigning to the
+ * __rcu protected file pointer so that if that pointer still
+ * matches the current file, we know we have successfully
+ * acquired a reference to the right file.
+ *
+ * If the pointers don't match the file has been reallocated by
+ * SLAB_TYPESAFE_BY_RCU.
+ */
+ if (file == file_reloaded_cmp)
+ return file_reloaded;
+
+ fput(file);
+ return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * get_file_rcu - try go get a reference to a file under rcu
+ * @f: the file to get a reference on
+ *
+ * This function tries to get a reference on @f carefully verifying that
+ * @f hasn't been reused.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: Returns @f with the reference count increased or NULL.
+ */
+struct file *get_file_rcu(struct file __rcu **f)
+{
+ for (;;) {
+ struct file __rcu *file;
+
+ file = __get_file_rcu(f);
+ if (unlikely(!file))
+ return NULL;
+
+ if (unlikely(IS_ERR(file)))
+ continue;
+
+ return file;
+ }
+}
+EXPORT_SYMBOL_GPL(get_file_rcu);
+
+/**
+ * get_file_active - try go get a reference to a file
+ * @f: the file to get a reference on
+ *
+ * In contast to get_file_rcu() the pointer itself isn't part of the
+ * reference counting.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: Returns @f with the reference count increased or NULL.
+ */
+struct file *get_file_active(struct file **f)
+{
+ struct file __rcu *file;
+
+ rcu_read_lock();
+ file = __get_file_rcu(f);
+ rcu_read_unlock();
+ if (IS_ERR(file))
+ file = NULL;
+ return file;
+}
+EXPORT_SYMBOL_GPL(get_file_active);
+
static inline struct file *__fget_files_rcu(struct files_struct *files,
- unsigned int fd, fmode_t mask)
+ unsigned int fd, fmode_t mask)
{
for (;;) {
struct file *file;
@@ -865,12 +964,6 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
return NULL;
fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
- file = rcu_dereference_raw(*fdentry);
- if (unlikely(!file))
- return NULL;
-
- if (unlikely(file->f_mode & mask))
- return NULL;
/*
* Ok, we have a file pointer. However, because we do
@@ -879,10 +972,15 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
*
* Such a race can take two forms:
*
- * (a) the file ref already went down to zero,
- * and get_file_rcu() fails. Just try again:
+ * (a) the file ref already went down to zero and the
+ * file hasn't been reused yet or the file count
+ * isn't zero but the file has already been reused.
*/
- if (unlikely(!get_file_rcu(file)))
+ file = __get_file_rcu(fdentry);
+ if (unlikely(!file))
+ return NULL;
+
+ if (unlikely(IS_ERR(file)))
continue;
/*
@@ -893,13 +991,21 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
*
* If so, we need to put our ref and try again.
*/
- if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
- unlikely(rcu_dereference_raw(*fdentry) != file)) {
+ if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
fput(file);
continue;
}
/*
+ * This isn't the file we're looking for or we're not
+ * allowed to get a reference to it.
+ */
+ if (unlikely(file->f_mode & mask)) {
+ fput(file);
+ return NULL;
+ }
+
+ /*
* Ok, we have a ref to the file, and checked that it
* still exists.
*/
@@ -948,7 +1054,14 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
return file;
}
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
+struct file *lookup_fdget_rcu(unsigned int fd)
+{
+ return __fget_files_rcu(current->files, fd, 0);
+
+}
+EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
+
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
@@ -957,13 +1070,13 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
task_lock(task);
files = task->files;
if (files)
- file = files_lookup_fd_rcu(files, fd);
+ file = __fget_files_rcu(files, fd, 0);
task_unlock(task);
return file;
}
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
@@ -974,7 +1087,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
files = task->files;
if (files) {
for (; fd < files_fdtable(files)->max_fds; fd++) {
- file = files_lookup_fd_rcu(files, fd);
+ file = __fget_files_rcu(files, fd, 0);
if (file)
break;
}
@@ -983,7 +1096,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
*ret_fd = fd;
return file;
}
-EXPORT_SYMBOL(task_lookup_next_fd_rcu);
+EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -1272,12 +1385,16 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
if (unlikely(newfd == oldfd)) { /* corner case */
struct files_struct *files = current->files;
+ struct file *f;
int retval = oldfd;
rcu_read_lock();
- if (!files_lookup_fd_rcu(files, oldfd))
+ f = __fget_files_rcu(files, oldfd, 0);
+ if (!f)
retval = -EBADF;
rcu_read_unlock();
+ if (f)
+ fput(f);
return retval;
}
return ksys_dup3(oldfd, newfd, 0);
diff --git a/fs/file_table.c b/fs/file_table.c
index ee21b3da9d08..fa92743ba6a9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -44,10 +44,10 @@ static struct kmem_cache *filp_cachep __read_mostly;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
-/* Container for backing file with optional real path */
+/* Container for backing file with optional user path */
struct backing_file {
struct file file;
- struct path real_path;
+ struct path user_path;
};
static inline struct backing_file *backing_file(struct file *f)
@@ -55,31 +55,36 @@ static inline struct backing_file *backing_file(struct file *f)
return container_of(f, struct backing_file, file);
}
-struct path *backing_file_real_path(struct file *f)
+struct path *backing_file_user_path(struct file *f)
{
- return &backing_file(f)->real_path;
+ return &backing_file(f)->user_path;
}
-EXPORT_SYMBOL_GPL(backing_file_real_path);
+EXPORT_SYMBOL_GPL(backing_file_user_path);
-static void file_free_rcu(struct rcu_head *head)
+static inline void file_free(struct file *f)
{
- struct file *f = container_of(head, struct file, f_rcuhead);
-
+ security_file_free(f);
+ if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+ percpu_counter_dec(&nr_files);
put_cred(f->f_cred);
- if (unlikely(f->f_mode & FMODE_BACKING))
+ if (unlikely(f->f_mode & FMODE_BACKING)) {
+ path_put(backing_file_user_path(f));
kfree(backing_file(f));
- else
+ } else {
kmem_cache_free(filp_cachep, f);
+ }
}
-static inline void file_free(struct file *f)
+void release_empty_file(struct file *f)
{
- security_file_free(f);
- if (unlikely(f->f_mode & FMODE_BACKING))
- path_put(backing_file_real_path(f));
- if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
- percpu_counter_dec(&nr_files);
- call_rcu(&f->f_rcuhead, file_free_rcu);
+ WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
+ if (atomic_long_dec_and_test(&f->f_count)) {
+ security_file_free(f);
+ put_cred(f->f_cred);
+ if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+ percpu_counter_dec(&nr_files);
+ kmem_cache_free(filp_cachep, f);
+ }
}
/*
@@ -164,7 +169,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
return error;
}
- atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
mutex_init(&f->f_pos_lock);
@@ -172,6 +176,12 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
f->f_mode = OPEN_FMODE(flags);
/* f->f_version: 0 */
+ /*
+ * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
+ * fget-rcu pattern users need to be able to handle spurious
+ * refcount bumps we should reinitialize the reused file first.
+ */
+ atomic_long_set(&f->f_count, 1);
return 0;
}
@@ -471,7 +481,8 @@ EXPORT_SYMBOL(__fput_sync);
void __init files_init(void)
{
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
+ SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index ac5d43b164b5..20600e9ea202 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -109,11 +109,9 @@ static inline void dip2vip_cpy(struct vxfs_sb_info *sbi,
set_nlink(inode, vip->vii_nlink);
inode->i_size = vip->vii_size;
- inode->i_atime.tv_sec = vip->vii_atime;
+ inode_set_atime(inode, vip->vii_atime, 0);
inode_set_ctime(inode, vip->vii_ctime, 0);
- inode->i_mtime.tv_sec = vip->vii_mtime;
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
+ inode_set_mtime(inode, vip->vii_mtime, 0);
inode->i_blocks = vip->vii_blocks;
inode->i_generation = vip->vii_gen;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 969ce991b0b0..1767493dffda 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -613,6 +613,24 @@ out_free:
kfree(isw);
}
+static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
+ struct list_head *list, int *nr)
+{
+ struct inode *inode;
+
+ list_for_each_entry(inode, list, i_io_list) {
+ if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+ continue;
+
+ isw->inodes[*nr] = inode;
+ (*nr)++;
+
+ if (*nr >= WB_MAX_INODES_PER_ISW - 1)
+ return true;
+ }
+ return false;
+}
+
/**
* cleanup_offline_cgwb - detach associated inodes
* @wb: target wb
@@ -625,7 +643,6 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
- struct inode *inode;
int nr;
bool restart = false;
@@ -647,17 +664,17 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
nr = 0;
spin_lock(&wb->list_lock);
- list_for_each_entry(inode, &wb->b_attached, i_io_list) {
- if (!inode_prepare_wbs_switch(inode, isw->new_wb))
- continue;
-
- isw->inodes[nr++] = inode;
-
- if (nr >= WB_MAX_INODES_PER_ISW - 1) {
- restart = true;
- break;
- }
- }
+ /*
+ * In addition to the inodes that have completed writeback, also switch
+ * cgwbs for those inodes only with dirty timestamps. Otherwise, those
+ * inodes won't be written back for a long time when lazytime is
+ * enabled, and thus pinning the dying cgwbs. It won't break the
+ * bandwidth restrictions, as writeback of inode metadata is not
+ * accounted for.
+ */
+ restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
+ if (!restart)
+ restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
spin_unlock(&wb->list_lock);
/* no attached inodes? bail out */
@@ -1535,10 +1552,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
if (wbc->pages_skipped) {
/*
- * writeback is not making progress due to locked
- * buffers. Skip this inode for now.
+ * Writeback is not making progress due to locked buffers.
+ * Skip this inode for now. Although having skipped pages
+ * is odd for clean inodes, it can happen for some
+ * filesystems so handle that gracefully.
*/
- redirty_tail_locked(inode, wb);
+ if (inode->i_state & I_DIRTY_ALL)
+ redirty_tail_locked(inode, wb);
+ else
+ inode_cgwb_move_to_attached(inode, wb);
return;
}
diff --git a/fs/fs_context.c b/fs/fs_context.c
index a0ad7a0c4680..98589aae5208 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -192,17 +192,19 @@ int vfs_parse_fs_string(struct fs_context *fc, const char *key,
EXPORT_SYMBOL(vfs_parse_fs_string);
/**
- * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
+ * vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data
* @fc: The superblock configuration to fill in.
* @data: The data to parse
+ * @sep: callback for separating next option
*
- * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be
- * called from the ->monolithic_mount_data() fs_context operation.
+ * Parse a blob of data that's in key[=val][,key[=val]]* form with a custom
+ * option separator callback.
*
* Returns 0 on success or the error returned by the ->parse_option() fs_context
* operation on failure.
*/
-int generic_parse_monolithic(struct fs_context *fc, void *data)
+int vfs_parse_monolithic_sep(struct fs_context *fc, void *data,
+ char *(*sep)(char **))
{
char *options = data, *key;
int ret = 0;
@@ -214,7 +216,7 @@ int generic_parse_monolithic(struct fs_context *fc, void *data)
if (ret)
return ret;
- while ((key = strsep(&options, ",")) != NULL) {
+ while ((key = sep(&options)) != NULL) {
if (*key) {
size_t v_len = 0;
char *value = strchr(key, '=');
@@ -233,6 +235,28 @@ int generic_parse_monolithic(struct fs_context *fc, void *data)
return ret;
}
+EXPORT_SYMBOL(vfs_parse_monolithic_sep);
+
+static char *vfs_parse_comma_sep(char **s)
+{
+ return strsep(s, ",");
+}
+
+/**
+ * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
+ * @fc: The superblock configuration to fill in.
+ * @data: The data to parse
+ *
+ * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be
+ * called from the ->monolithic_mount_data() fs_context operation.
+ *
+ * Returns 0 on success or the error returned by the ->parse_option() fs_context
+ * operation on failure.
+ */
+int generic_parse_monolithic(struct fs_context *fc, void *data)
+{
+ return vfs_parse_monolithic_sep(fc, data, vfs_parse_comma_sep);
+}
EXPORT_SYMBOL(generic_parse_monolithic);
/**
diff --git a/fs/fsopen.c b/fs/fsopen.c
index ce03f6521c88..6593ae518115 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -465,6 +465,7 @@ SYSCALL_DEFINE5(fsconfig,
param.file = fget(aux);
if (!param.file)
goto out_key;
+ param.dirfd = aux;
break;
default:
break;
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index ab62e4624256..284a35006462 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -235,7 +235,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
inode->i_mode = mode;
inode->i_uid = fc->user_id;
inode->i_gid = fc->group_id;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
/* setting ->i_op to NULL is not allowed */
if (iop)
inode->i_op = iop;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d707e6987da9..d19cbf34c634 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1812,12 +1812,12 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
memset(&outarg, 0, sizeof(outarg));
inarg.valid = FATTR_MTIME;
- inarg.mtime = inode->i_mtime.tv_sec;
- inarg.mtimensec = inode->i_mtime.tv_nsec;
+ inarg.mtime = inode_get_mtime_sec(inode);
+ inarg.mtimensec = inode_get_mtime_nsec(inode);
if (fm->fc->minor >= 23) {
inarg.valid |= FATTR_CTIME;
- inarg.ctime = inode_get_ctime(inode).tv_sec;
- inarg.ctimensec = inode_get_ctime(inode).tv_nsec;
+ inarg.ctime = inode_get_ctime_sec(inode);
+ inarg.ctimensec = inode_get_ctime_nsec(inode);
}
if (ff) {
inarg.valid |= FATTR_FH;
@@ -1956,7 +1956,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
/* the kernel maintains i_mtime locally */
if (trust_local_cmtime) {
if (attr->ia_valid & ATTR_MTIME)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
if (attr->ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
/* FIXME: clear I_DIRTY_SYNC? */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bf0b85d0b95c..6e6e721f421b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1284,7 +1284,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
size_t size);
ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size);
int fuse_removexattr(struct inode *inode, const char *name);
-extern const struct xattr_handler *fuse_xattr_handlers[];
+extern const struct xattr_handler * const fuse_xattr_handlers[];
struct posix_acl;
struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e4eb7cf26fb..caa8121ad99c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -188,12 +188,10 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1);
attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1);
- inode->i_atime.tv_sec = attr->atime;
- inode->i_atime.tv_nsec = attr->atimensec;
+ inode_set_atime(inode, attr->atime, attr->atimensec);
/* mtime from server may be stale due to local buffered write */
if (!(cache_mask & STATX_MTIME)) {
- inode->i_mtime.tv_sec = attr->mtime;
- inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode_set_mtime(inode, attr->mtime, attr->mtimensec);
}
if (!(cache_mask & STATX_CTIME)) {
inode_set_ctime(inode, attr->ctime, attr->ctimensec);
@@ -276,12 +274,12 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
attr->size = i_size_read(inode);
if (cache_mask & STATX_MTIME) {
- attr->mtime = inode->i_mtime.tv_sec;
- attr->mtimensec = inode->i_mtime.tv_nsec;
+ attr->mtime = inode_get_mtime_sec(inode);
+ attr->mtimensec = inode_get_mtime_nsec(inode);
}
if (cache_mask & STATX_CTIME) {
- attr->ctime = inode_get_ctime(inode).tv_sec;
- attr->ctimensec = inode_get_ctime(inode).tv_nsec;
+ attr->ctime = inode_get_ctime_sec(inode);
+ attr->ctimensec = inode_get_ctime_nsec(inode);
}
if ((attr_version != 0 && fi->attr_version > attr_version) ||
@@ -290,7 +288,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
return;
}
- old_mtime = inode->i_mtime;
+ old_mtime = inode_get_mtime(inode);
fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask);
oldsize = inode->i_size;
@@ -337,8 +335,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr,
{
inode->i_mode = attr->mode & S_IFMT;
inode->i_size = attr->size;
- inode->i_mtime.tv_sec = attr->mtime;
- inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode_set_mtime(inode, attr->mtime, attr->mtimensec);
inode_set_ctime(inode, attr->ctime, attr->ctimensec);
if (S_ISREG(inode->i_mode)) {
fuse_init_common(inode);
@@ -1423,17 +1420,19 @@ EXPORT_SYMBOL_GPL(fuse_dev_free);
static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
const struct fuse_inode *fi)
{
+ struct timespec64 atime = inode_get_atime(&fi->inode);
+ struct timespec64 mtime = inode_get_mtime(&fi->inode);
struct timespec64 ctime = inode_get_ctime(&fi->inode);
*attr = (struct fuse_attr){
.ino = fi->inode.i_ino,
.size = fi->inode.i_size,
.blocks = fi->inode.i_blocks,
- .atime = fi->inode.i_atime.tv_sec,
- .mtime = fi->inode.i_mtime.tv_sec,
+ .atime = atime.tv_sec,
+ .mtime = mtime.tv_sec,
.ctime = ctime.tv_sec,
- .atimensec = fi->inode.i_atime.tv_nsec,
- .mtimensec = fi->inode.i_mtime.tv_nsec,
+ .atimensec = atime.tv_nsec,
+ .mtimensec = mtime.tv_nsec,
.ctimensec = ctime.tv_nsec,
.mode = fi->inode.i_mode,
.nlink = fi->inode.i_nlink,
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 9e6d587b3e67..c66a54d6c7d3 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -476,7 +476,7 @@ retry_locked:
if (!fi->rdc.cached) {
/* Starting cache? Set cache mtime. */
if (!ctx->pos && !fi->rdc.size) {
- fi->rdc.mtime = inode->i_mtime;
+ fi->rdc.mtime = inode_get_mtime(inode);
fi->rdc.iversion = inode_query_iversion(inode);
}
spin_unlock(&fi->rdc.lock);
@@ -488,8 +488,10 @@ retry_locked:
* changed, and reset the cache if so.
*/
if (!ctx->pos) {
+ struct timespec64 mtime = inode_get_mtime(inode);
+
if (inode_peek_iversion(inode) != fi->rdc.iversion ||
- !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) {
+ !timespec64_equal(&fi->rdc.mtime, &mtime)) {
fuse_rdc_reset(inode);
goto retry_locked;
}
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 49c01559580f..5b423fdbb13f 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -209,7 +209,7 @@ static const struct xattr_handler fuse_xattr_handler = {
.set = fuse_xattr_set,
};
-const struct xattr_handler *fuse_xattr_handlers[] = {
+const struct xattr_handler * const fuse_xattr_handlers[] = {
&fuse_xattr_handler,
NULL
};
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index ef7017fb6951..011cd992e0e6 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1386,7 +1386,7 @@ static int trunc_start(struct inode *inode, u64 newsize)
ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
i_size_write(inode, newsize);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_dinode_out(ip, dibh->b_data);
if (journaled)
@@ -1583,7 +1583,7 @@ out_unlock:
/* Every transaction boundary, we rewrite the dinode
to keep its di_blocks current in case of failure. */
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -1949,7 +1949,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
gfs2_statfs_change(sdp, 0, +btotal, 0);
gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
ip->i_inode.i_gid);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
up_write(&ip->i_rw_mutex);
@@ -1992,7 +1992,7 @@ static int trunc_end(struct gfs2_inode *ip)
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
gfs2_ordered_del_inode(ip);
}
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
gfs2_trans_add_meta(ip->i_gl, dibh);
@@ -2093,7 +2093,7 @@ static int do_grow(struct inode *inode, u64 size)
goto do_end_trans;
truncate_setsize(inode, size);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1a2afa88f8be..61ddd03ea111 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -130,7 +130,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
if (ip->i_inode.i_size < offset + size)
i_size_write(&ip->i_inode, offset + size);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -227,7 +227,7 @@ out:
if (ip->i_inode.i_size < offset + copied)
i_size_write(&ip->i_inode, offset + copied);
- ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
+ inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
@@ -1825,7 +1825,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
da->bh = NULL;
brelse(bh);
ip->i_entries++;
- ip->i_inode.i_mtime = tv;
+ inode_set_mtime_to_ts(&ip->i_inode, tv);
if (S_ISDIR(nip->i_inode.i_mode))
inc_nlink(&ip->i_inode);
mark_inode_dirty(inode);
@@ -1911,7 +1911,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
if (!dip->i_entries)
gfs2_consist_inode(dip);
dip->i_entries--;
- dip->i_inode.i_mtime = tv;
+ inode_set_mtime_to_ts(&dip->i_inode, tv);
if (d_is_dir(dentry))
drop_nlink(&dip->i_inode);
mark_inode_dirty(&dip->i_inode);
@@ -1952,7 +1952,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
dent->de_type = cpu_to_be16(new_type);
brelse(bh);
- dip->i_inode.i_mtime = inode_set_ctime_current(&dip->i_inode);
+ inode_set_mtime_to_ts(&dip->i_inode, inode_set_ctime_current(&dip->i_inode));
mark_inode_dirty_sync(&dip->i_inode);
return 0;
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9cbf8d98489a..3772a5d9e85c 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2010,7 +2010,9 @@ static long gfs2_scan_glock_lru(int nr)
if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
if (!spin_trylock(&gl->gl_lockref.lock))
continue;
- if (!gl->gl_lockref.count) {
+ if (gl->gl_lockref.count <= 1 &&
+ (gl->gl_state == LM_ST_UNLOCKED ||
+ demote_ok(gl))) {
list_move(&gl->gl_lru, &dispose);
atomic_dec(&lru_count);
freed++;
@@ -2717,16 +2719,19 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
for(;; i->fd++) {
struct inode *inode;
- i->file = task_lookup_next_fd_rcu(i->task, &i->fd);
+ i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
if (!i->file) {
i->fd = 0;
break;
}
+
inode = file_inode(i->file);
- if (inode->i_sb != i->sb)
- continue;
- if (get_file_rcu(i->file))
+ if (inode->i_sb == i->sb)
break;
+
+ rcu_read_unlock();
+ fput(i->file);
+ rcu_read_lock();
}
rcu_read_unlock();
return i->file;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index d26759a98b10..e7d334c277a1 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -403,7 +403,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
const struct gfs2_dinode *str = buf;
- struct timespec64 atime;
+ struct timespec64 atime, iatime;
u16 height, depth;
umode_t mode = be32_to_cpu(str->di_mode);
struct inode *inode = &ip->i_inode;
@@ -433,10 +433,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
gfs2_set_inode_blocks(inode, be64_to_cpu(str->di_blocks));
atime.tv_sec = be64_to_cpu(str->di_atime);
atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
- if (timespec64_compare(&inode->i_atime, &atime) < 0)
- inode->i_atime = atime;
- inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
- inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
+ iatime = inode_get_atime(inode);
+ if (timespec64_compare(&iatime, &atime) < 0)
+ inode_set_atime_to_ts(inode, atime);
+ inode_set_mtime(inode, be64_to_cpu(str->di_mtime),
+ be32_to_cpu(str->di_mtime_nsec));
inode_set_ctime(inode, be64_to_cpu(str->di_ctime),
be32_to_cpu(str->di_ctime_nsec));
@@ -567,15 +568,16 @@ static void freeze_go_callback(struct gfs2_glock *gl, bool remote)
struct super_block *sb = sdp->sd_vfs;
if (!remote ||
- gl->gl_state != LM_ST_SHARED ||
+ (gl->gl_state != LM_ST_SHARED &&
+ gl->gl_state != LM_ST_UNLOCKED) ||
gl->gl_demote_state != LM_ST_UNLOCKED)
return;
/*
* Try to get an active super block reference to prevent racing with
- * unmount (see trylock_super()). But note that unmount isn't the only
- * place where a write lock on s_umount is taken, and we can fail here
- * because of things like remount as well.
+ * unmount (see super_trylock_shared()). But note that unmount isn't
+ * the only place where a write lock on s_umount is taken, and we can
+ * fail here because of things like remount as well.
*/
if (down_read_trylock(&sb->s_umount)) {
atomic_inc(&sb->s_active);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 0eac04507904..7fe77bc771e5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -185,8 +185,9 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags);
/* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */
- inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1);
- inode->i_atime.tv_nsec = 0;
+ inode_set_atime(inode,
+ 1LL << (8 * sizeof(inode_get_atime_sec(inode)) - 1),
+ 0);
glock_set_object(ip->i_gl, ip);
@@ -696,7 +697,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
set_nlink(inode, S_ISDIR(mode) ? 2 : 1);
inode->i_rdev = dev;
inode->i_size = size;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
munge_mode_uid_gid(dip, inode);
check_and_update_goal(dip);
ip->i_goal = dip->i_goal;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 171b2713d2e5..d9854aece15b 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -886,7 +886,7 @@ static int gfs2_adjust_quota(struct gfs2_sbd *sdp, loff_t loc,
size = loc + sizeof(struct gfs2_quota);
if (size > inode->i_size)
i_size_write(inode, size);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
set_bit(QDF_REFRESH, &qd->qd_flags);
}
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 21ada332d555..1429945215a0 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -50,7 +50,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (ret)
return ret;
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
+ sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
return 0;
ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap);
if (ret)
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 02d93da21b2b..52a878fa7139 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -410,9 +410,9 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_nlink = cpu_to_be32(inode->i_nlink);
str->di_size = cpu_to_be64(i_size_read(inode));
str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode));
- str->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
- str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
- str->di_ctime = cpu_to_be64(inode_get_ctime(inode).tv_sec);
+ str->di_atime = cpu_to_be64(inode_get_atime_sec(inode));
+ str->di_mtime = cpu_to_be64(inode_get_mtime_sec(inode));
+ str->di_ctime = cpu_to_be64(inode_get_ctime_sec(inode));
str->di_goal_meta = cpu_to_be64(ip->i_goal);
str->di_goal_data = cpu_to_be64(ip->i_goal);
@@ -427,9 +427,9 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_entries = cpu_to_be32(ip->i_entries);
str->di_eattr = cpu_to_be64(ip->i_eattr);
- str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
- str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
- str->di_ctime_nsec = cpu_to_be32(inode_get_ctime(inode).tv_nsec);
+ str->di_atime_nsec = cpu_to_be32(inode_get_atime_nsec(inode));
+ str->di_mtime_nsec = cpu_to_be32(inode_get_mtime_nsec(inode));
+ str->di_ctime_nsec = cpu_to_be32(inode_get_ctime_nsec(inode));
}
/**
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index ab9c83106932..b4ddf6244586 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -60,8 +60,8 @@ extern const struct export_operations gfs2_export_ops;
extern const struct super_operations gfs2_super_ops;
extern const struct dentry_operations gfs2_dops;
-extern const struct xattr_handler *gfs2_xattr_handlers_max[];
-extern const struct xattr_handler **gfs2_xattr_handlers_min;
+extern const struct xattr_handler * const gfs2_xattr_handlers_max[];
+extern const struct xattr_handler * const *gfs2_xattr_handlers_min;
#endif /* __SUPER_DOT_H__ */
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 4fea70c0fe3d..79d5c5559512 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1494,7 +1494,7 @@ static const struct xattr_handler gfs2_xattr_trusted_handler = {
.set = gfs2_xattr_set,
};
-const struct xattr_handler *gfs2_xattr_handlers_max[] = {
+const struct xattr_handler * const gfs2_xattr_handlers_max[] = {
/* GFS2_FS_FORMAT_MAX */
&gfs2_xattr_trusted_handler,
@@ -1504,4 +1504,4 @@ const struct xattr_handler *gfs2_xattr_handlers_max[] = {
NULL,
};
-const struct xattr_handler **gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1;
+const struct xattr_handler * const *gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1;
diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c
index 6341bb248247..f8395cdd1adf 100644
--- a/fs/hfs/attr.c
+++ b/fs/hfs/attr.c
@@ -146,7 +146,7 @@ static const struct xattr_handler hfs_type_handler = {
.set = hfs_xattr_set,
};
-const struct xattr_handler *hfs_xattr_handlers[] = {
+const struct xattr_handler * const hfs_xattr_handlers[] = {
&hfs_creator_handler,
&hfs_type_handler,
NULL
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 632c226a3972..d63880e7d9d6 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -133,7 +133,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i
goto err1;
dir->i_size++;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
hfs_find_exit(&fd);
return 0;
@@ -269,7 +269,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str)
}
dir->i_size--;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
res = 0;
out:
@@ -337,7 +337,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
if (err)
goto out;
dst_dir->i_size++;
- dst_dir->i_mtime = inode_set_ctime_current(dst_dir);
+ inode_set_mtime_to_ts(dst_dir, inode_set_ctime_current(dst_dir));
mark_inode_dirty(dst_dir);
/* finally remove the old entry */
@@ -349,7 +349,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
if (err)
goto out;
src_dir->i_size--;
- src_dir->i_mtime = inode_set_ctime_current(src_dir);
+ inode_set_mtime_to_ts(src_dir, inode_set_ctime_current(src_dir));
mark_inode_dirty(src_dir);
type = entry.type;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 49d02524e667..b5a6ad5df357 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -215,7 +215,7 @@ extern void hfs_evict_inode(struct inode *);
extern void hfs_delete_inode(struct inode *);
/* attr.c */
-extern const struct xattr_handler *hfs_xattr_handlers[];
+extern const struct xattr_handler * const hfs_xattr_handlers[];
/* mdb.c */
extern int hfs_mdb_get(struct super_block *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index ee349b72cfb3..a7bc4690a780 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -200,7 +200,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
HFS_I(inode)->flags = 0;
HFS_I(inode)->rsrc_inode = NULL;
HFS_I(inode)->fs_blocks = 0;
@@ -355,8 +355,8 @@ static int hfs_read_inode(struct inode *inode, void *data)
inode->i_mode |= S_IWUGO;
inode->i_mode &= ~hsb->s_file_umask;
inode->i_mode |= S_IFREG;
- inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode,
- hfs_m_to_utime(rec->file.MdDat));
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->file.MdDat))));
inode->i_op = &hfs_file_inode_operations;
inode->i_fop = &hfs_file_operations;
inode->i_mapping->a_ops = &hfs_aops;
@@ -366,8 +366,8 @@ static int hfs_read_inode(struct inode *inode, void *data)
inode->i_size = be16_to_cpu(rec->dir.Val) + 2;
HFS_I(inode)->fs_blocks = 0;
inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask);
- inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode,
- hfs_m_to_utime(rec->dir.MdDat));
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->dir.MdDat))));
inode->i_op = &hfs_dir_inode_operations;
inode->i_fop = &hfs_dir_operations;
break;
@@ -474,7 +474,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
be32_to_cpu(rec.dir.DirID) != inode->i_ino) {
}
- rec.dir.MdDat = hfs_u_to_mtime(inode->i_mtime);
+ rec.dir.MdDat = hfs_u_to_mtime(inode_get_mtime(inode));
rec.dir.Val = cpu_to_be16(inode->i_size - 2);
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
@@ -502,7 +502,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
else
rec.file.Flags |= HFS_FIL_LOCK;
hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen);
- rec.file.MdDat = hfs_u_to_mtime(inode->i_mtime);
+ rec.file.MdDat = hfs_u_to_mtime(inode_get_mtime(inode));
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
@@ -654,7 +654,7 @@ int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
truncate_setsize(inode, attr->ia_size);
hfs_file_truncate(inode);
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
setattr_copy(&nop_mnt_idmap, inode, attr);
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index dc27d418fbcd..76fa02e3835b 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -28,11 +28,13 @@ static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags)
/* fix up inode on a timezone change */
diff = sys_tz.tz_minuteswest * 60 - HFS_I(inode)->tz_secondswest;
if (diff) {
- struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 ts = inode_get_ctime(inode);
- inode_set_ctime(inode, ctime.tv_sec + diff, ctime.tv_nsec);
- inode->i_atime.tv_sec += diff;
- inode->i_mtime.tv_sec += diff;
+ inode_set_ctime(inode, ts.tv_sec + diff, ts.tv_nsec);
+ ts = inode_get_atime(inode);
+ inode_set_atime(inode, ts.tv_sec + diff, ts.tv_nsec);
+ ts = inode_get_mtime(inode);
+ inode_set_mtime(inode, ts.tv_sec + diff, ts.tv_nsec);
HFS_I(inode)->tz_secondswest += diff;
}
return 1;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index e71ae2537eaa..1995bafee839 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -312,7 +312,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
dir->i_size++;
if (S_ISDIR(inode->i_mode))
hfsplus_subfolders_inc(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
hfs_find_exit(&fd);
@@ -417,7 +417,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str)
dir->i_size--;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_dec(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) {
@@ -494,7 +494,7 @@ int hfsplus_rename_cat(u32 cnid,
dst_dir->i_size++;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_inc(dst_dir);
- dst_dir->i_mtime = inode_set_ctime_current(dst_dir);
+ inode_set_mtime_to_ts(dst_dir, inode_set_ctime_current(dst_dir));
/* finally remove the old entry */
err = hfsplus_cat_build_key(sb, src_fd.search_key,
@@ -511,7 +511,7 @@ int hfsplus_rename_cat(u32 cnid,
src_dir->i_size--;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_dec(src_dir);
- src_dir->i_mtime = inode_set_ctime_current(src_dir);
+ inode_set_mtime_to_ts(src_dir, inode_set_ctime_current(src_dir));
/* remove old thread entry */
hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c65c8c4b03dd..702a0663b1d8 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -267,7 +267,7 @@ static int hfsplus_setattr(struct mnt_idmap *idmap,
}
truncate_setsize(inode, attr->ia_size);
hfsplus_file_truncate(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
}
setattr_copy(&nop_mnt_idmap, inode, attr);
@@ -392,7 +392,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
inode->i_ino = sbi->next_cnid++;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
hip = HFSPLUS_I(inode);
INIT_LIST_HEAD(&hip->open_dir_list);
@@ -521,8 +521,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
hfsplus_get_perms(inode, &folder->permissions, 1);
set_nlink(inode, 1);
inode->i_size = 2 + be32_to_cpu(folder->valence);
- inode->i_atime = hfsp_mt2ut(folder->access_date);
- inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
+ inode_set_atime_to_ts(inode, hfsp_mt2ut(folder->access_date));
+ inode_set_mtime_to_ts(inode,
+ hfsp_mt2ut(folder->content_mod_date));
inode_set_ctime_to_ts(inode,
hfsp_mt2ut(folder->attribute_mod_date));
HFSPLUS_I(inode)->create_date = folder->create_date;
@@ -563,8 +564,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
init_special_inode(inode, inode->i_mode,
be32_to_cpu(file->permissions.dev));
}
- inode->i_atime = hfsp_mt2ut(file->access_date);
- inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
+ inode_set_atime_to_ts(inode, hfsp_mt2ut(file->access_date));
+ inode_set_mtime_to_ts(inode,
+ hfsp_mt2ut(file->content_mod_date));
inode_set_ctime_to_ts(inode,
hfsp_mt2ut(file->attribute_mod_date));
HFSPLUS_I(inode)->create_date = file->create_date;
@@ -609,8 +611,8 @@ int hfsplus_cat_write_inode(struct inode *inode)
sizeof(struct hfsplus_cat_folder));
/* simple node checks? */
hfsplus_cat_set_perms(inode, &folder->permissions);
- folder->access_date = hfsp_ut2mt(inode->i_atime);
- folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
+ folder->access_date = hfsp_ut2mt(inode_get_atime(inode));
+ folder->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode));
folder->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode));
folder->valence = cpu_to_be32(inode->i_size - 2);
if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
@@ -644,8 +646,8 @@ int hfsplus_cat_write_inode(struct inode *inode)
file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
else
file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
- file->access_date = hfsp_ut2mt(inode->i_atime);
- file->content_mod_date = hfsp_ut2mt(inode->i_mtime);
+ file->access_date = hfsp_ut2mt(inode_get_atime(inode));
+ file->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode));
file->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode));
hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_file));
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 58021e73c00b..9c9ff6b8c6f7 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -13,7 +13,7 @@
static int hfsplus_removexattr(struct inode *inode, const char *name);
-const struct xattr_handler *hfsplus_xattr_handlers[] = {
+const struct xattr_handler * const hfsplus_xattr_handlers[] = {
&hfsplus_xattr_osx_handler,
&hfsplus_xattr_user_handler,
&hfsplus_xattr_trusted_handler,
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index d14e362b3eba..15cc55e41410 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -17,7 +17,7 @@ extern const struct xattr_handler hfsplus_xattr_user_handler;
extern const struct xattr_handler hfsplus_xattr_trusted_handler;
extern const struct xattr_handler hfsplus_xattr_security_handler;
-extern const struct xattr_handler *hfsplus_xattr_handlers[];
+extern const struct xattr_handler * const hfsplus_xattr_handlers[];
int __hfsplus_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index dc5a5cea5fae..ea87f24c6c3f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -513,10 +513,14 @@ static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st)
set_nlink(ino, st->nlink);
i_uid_write(ino, st->uid);
i_gid_write(ino, st->gid);
- ino->i_atime =
- (struct timespec64){ st->atime.tv_sec, st->atime.tv_nsec };
- ino->i_mtime =
- (struct timespec64){ st->mtime.tv_sec, st->mtime.tv_nsec };
+ inode_set_atime_to_ts(ino, (struct timespec64){
+ st->atime.tv_sec,
+ st->atime.tv_nsec,
+ });
+ inode_set_mtime_to_ts(ino, (struct timespec64){
+ st->mtime.tv_sec,
+ st->mtime.tv_nsec,
+ });
inode_set_ctime(ino, st->ctime.tv_sec, st->ctime.tv_nsec);
ino->i_size = st->size;
ino->i_blocks = st->blocks;
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index f36566d61215..49dd585c2b17 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -277,14 +277,16 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in
* inode.
*/
- if (!inode_get_ctime(result).tv_sec) {
+ if (!inode_get_ctime_sec(result)) {
time64_t csec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date));
inode_set_ctime(result, csec ? csec : 1, 0);
- result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date));
- result->i_mtime.tv_nsec = 0;
- result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date));
- result->i_atime.tv_nsec = 0;
+ inode_set_mtime(result,
+ local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)),
+ 0);
+ inode_set_atime(result,
+ local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)),
+ 0);
hpfs_result->i_ea_size = le32_to_cpu(de->ea_size);
if (!hpfs_result->i_ea_mode && de->read_only)
result->i_mode &= ~0222;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 479166378bae..a59e8fa630db 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -37,8 +37,8 @@ void hpfs_init_inode(struct inode *i)
hpfs_inode->i_dirty = 0;
inode_set_ctime(i, 0, 0);
- i->i_mtime.tv_sec = i->i_mtime.tv_nsec = 0;
- i->i_atime.tv_sec = i->i_atime.tv_nsec = 0;
+ inode_set_mtime(i, 0, 0);
+ inode_set_atime(i, 0, 0);
}
void hpfs_read_inode(struct inode *i)
@@ -230,9 +230,9 @@ void hpfs_write_inode_nolock(struct inode *i)
}
hpfs_write_inode_ea(i, fnode);
if (de) {
- de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec));
- de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec));
- de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec));
+ de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_mtime_sec(i)));
+ de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_atime_sec(i)));
+ de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime_sec(i)));
de->read_only = !(i->i_mode & 0222);
de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size);
hpfs_mark_4buffers_dirty(&qbh);
@@ -240,9 +240,9 @@ void hpfs_write_inode_nolock(struct inode *i)
}
if (S_ISDIR(i->i_mode)) {
if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) {
- de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec));
- de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec));
- de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec));
+ de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_mtime_sec(i)));
+ de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_atime_sec(i)));
+ de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime_sec(i)));
de->read_only = !(i->i_mode & 0222);
de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0);
de->file_size = cpu_to_le32(0);
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index f4eb8d6f5989..9184b4584b01 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -12,10 +12,10 @@
static void hpfs_update_directory_times(struct inode *dir)
{
time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb));
- if (t == dir->i_mtime.tv_sec &&
- t == inode_get_ctime(dir).tv_sec)
+ if (t == inode_get_mtime_sec(dir) &&
+ t == inode_get_ctime_sec(dir))
return;
- dir->i_mtime = inode_set_ctime(dir, t, 0);
+ inode_set_mtime_to_ts(dir, inode_set_ctime(dir, t, 0));
hpfs_write_inode_nolock(dir);
}
@@ -58,8 +58,8 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
result->i_ino = fno;
hpfs_i(result)->i_parent_dir = dir->i_ino;
hpfs_i(result)->i_dno = dno;
- result->i_mtime = result->i_atime =
- inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+ inode_set_mtime_to_ts(result,
+ inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
hpfs_i(result)->i_ea_size = 0;
result->i_mode |= S_IFDIR;
result->i_op = &hpfs_dir_iops;
@@ -164,8 +164,8 @@ static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir,
result->i_fop = &hpfs_file_ops;
set_nlink(result, 1);
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_mtime = result->i_atime =
- inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+ inode_set_mtime_to_ts(result,
+ inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
hpfs_i(result)->i_ea_size = 0;
if (dee.read_only)
result->i_mode &= ~0222;
@@ -245,8 +245,8 @@ static int hpfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
hpfs_init_inode(result);
result->i_ino = fno;
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_mtime = result->i_atime =
- inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+ inode_set_mtime_to_ts(result,
+ inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
hpfs_i(result)->i_ea_size = 0;
result->i_uid = current_fsuid();
result->i_gid = current_fsgid();
@@ -319,8 +319,8 @@ static int hpfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
result->i_ino = fno;
hpfs_init_inode(result);
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_mtime = result->i_atime =
- inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
+ inode_set_mtime_to_ts(result,
+ inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0)));
hpfs_i(result)->i_ea_size = 0;
result->i_mode = S_IFLNK | 0777;
result->i_uid = current_fsuid();
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 758a51564124..6b0ba3c1efba 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -725,10 +725,12 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
if (!de)
hpfs_error(s, "unable to find root dir");
else {
- root->i_atime.tv_sec = local_to_gmt(s, le32_to_cpu(de->read_date));
- root->i_atime.tv_nsec = 0;
- root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date));
- root->i_mtime.tv_nsec = 0;
+ inode_set_atime(root,
+ local_to_gmt(s, le32_to_cpu(de->read_date)),
+ 0);
+ inode_set_mtime(root,
+ local_to_gmt(s, le32_to_cpu(de->write_date)),
+ 0);
inode_set_ctime(root,
local_to_gmt(s, le32_to_cpu(de->creation_date)),
0);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 316c4cebd3f3..da217eaba102 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -980,7 +980,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
inode->i_mode = S_IFDIR | ctx->mode;
inode->i_uid = ctx->uid;
inode->i_gid = ctx->gid;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &hugetlbfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
@@ -1024,7 +1024,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mapping->private_data = resv_map;
info->seals = F_SEAL_SEAL;
switch (mode & S_IFMT) {
@@ -1067,7 +1067,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
if (!inode)
return -ENOSPC;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
d_instantiate(dentry, inode);
dget(dentry);/* Extra count - pin the dentry in core */
return 0;
@@ -1099,7 +1099,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
if (!inode)
return -ENOSPC;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
d_tmpfile(file, inode);
return finish_open_simple(file, 0);
}
@@ -1121,7 +1121,7 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap,
} else
iput(inode);
}
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
return error;
}
diff --git a/fs/init.c b/fs/init.c
index 9684406a8416..e9387b6c4f30 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -153,8 +153,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
+ mode = mode_strip_umask(d_inode(path.dentry), mode);
error = security_path_mknod(&path, dentry, mode, dev);
if (!error)
error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode,
@@ -229,8 +228,7 @@ int __init init_mkdir(const char *pathname, umode_t mode)
dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
+ mode = mode_strip_umask(d_inode(path.dentry), mode);
error = security_path_mkdir(&path, dentry, mode);
if (!error)
error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
diff --git a/fs/inode.c b/fs/inode.c
index 35fd688168c5..4f8984b97df0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1837,27 +1837,29 @@ EXPORT_SYMBOL(bmap);
static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
struct timespec64 now)
{
- struct timespec64 ctime;
+ struct timespec64 atime, mtime, ctime;
if (!(mnt->mnt_flags & MNT_RELATIME))
return 1;
/*
* Is mtime younger than or equal to atime? If yes, update atime:
*/
- if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
+ atime = inode_get_atime(inode);
+ mtime = inode_get_mtime(inode);
+ if (timespec64_compare(&mtime, &atime) >= 0)
return 1;
/*
* Is ctime younger than or equal to atime? If yes, update atime:
*/
ctime = inode_get_ctime(inode);
- if (timespec64_compare(&ctime, &inode->i_atime) >= 0)
+ if (timespec64_compare(&ctime, &atime) >= 0)
return 1;
/*
* Is the previous atime value older than a day? If yes,
* update atime:
*/
- if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
+ if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60)
return 1;
/*
* Good, we can skip the atime update:
@@ -1888,12 +1890,13 @@ int inode_update_timestamps(struct inode *inode, int flags)
if (flags & (S_MTIME|S_CTIME|S_VERSION)) {
struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
now = inode_set_ctime_current(inode);
if (!timespec64_equal(&now, &ctime))
updated |= S_CTIME;
- if (!timespec64_equal(&now, &inode->i_mtime)) {
- inode->i_mtime = now;
+ if (!timespec64_equal(&now, &mtime)) {
+ inode_set_mtime_to_ts(inode, now);
updated |= S_MTIME;
}
if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated))
@@ -1903,8 +1906,10 @@ int inode_update_timestamps(struct inode *inode, int flags)
}
if (flags & S_ATIME) {
- if (!timespec64_equal(&now, &inode->i_atime)) {
- inode->i_atime = now;
+ struct timespec64 atime = inode_get_atime(inode);
+
+ if (!timespec64_equal(&now, &atime)) {
+ inode_set_atime_to_ts(inode, now);
updated |= S_ATIME;
}
}
@@ -1963,7 +1968,7 @@ EXPORT_SYMBOL(inode_update_time);
bool atime_needs_update(const struct path *path, struct inode *inode)
{
struct vfsmount *mnt = path->mnt;
- struct timespec64 now;
+ struct timespec64 now, atime;
if (inode->i_flags & S_NOATIME)
return false;
@@ -1989,7 +1994,8 @@ bool atime_needs_update(const struct path *path, struct inode *inode)
if (!relatime_need_update(mnt, inode, now))
return false;
- if (timespec64_equal(&inode->i_atime, &now))
+ atime = inode_get_atime(inode);
+ if (timespec64_equal(&atime, &now))
return false;
return true;
@@ -2006,7 +2012,7 @@ void touch_atime(const struct path *path)
if (!sb_start_write_trylock(inode->i_sb))
return;
- if (__mnt_want_write(mnt) != 0)
+ if (mnt_get_write_access(mnt) != 0)
goto skip_update;
/*
* File systems can error out when updating inodes if they need to
@@ -2018,7 +2024,7 @@ void touch_atime(const struct path *path)
* of the fs read only, e.g. subvolumes in Btrfs.
*/
inode_update_time(inode, S_ATIME);
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
skip_update:
sb_end_write(inode->i_sb);
}
@@ -2102,63 +2108,22 @@ int file_remove_privs(struct file *file)
}
EXPORT_SYMBOL(file_remove_privs);
-/**
- * current_mgtime - Return FS time (possibly fine-grained)
- * @inode: inode.
- *
- * Return the current time truncated to the time granularity supported by
- * the fs, as suitable for a ctime/mtime change. If the ctime is flagged
- * as having been QUERIED, get a fine-grained timestamp.
- */
-struct timespec64 current_mgtime(struct inode *inode)
-{
- struct timespec64 now, ctime;
- atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec;
- long nsec = atomic_long_read(pnsec);
-
- if (nsec & I_CTIME_QUERIED) {
- ktime_get_real_ts64(&now);
- return timestamp_truncate(now, inode);
- }
-
- ktime_get_coarse_real_ts64(&now);
- now = timestamp_truncate(now, inode);
-
- /*
- * If we've recently fetched a fine-grained timestamp
- * then the coarse-grained one may still be earlier than the
- * existing ctime. Just keep the existing value if so.
- */
- ctime = inode_get_ctime(inode);
- if (timespec64_compare(&ctime, &now) > 0)
- now = ctime;
-
- return now;
-}
-EXPORT_SYMBOL(current_mgtime);
-
-static struct timespec64 current_ctime(struct inode *inode)
-{
- if (is_mgtime(inode))
- return current_mgtime(inode);
- return current_time(inode);
-}
-
static int inode_needs_update_time(struct inode *inode)
{
int sync_it = 0;
- struct timespec64 now = current_ctime(inode);
- struct timespec64 ctime;
+ struct timespec64 now = current_time(inode);
+ struct timespec64 ts;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
- if (!timespec64_equal(&inode->i_mtime, &now))
+ ts = inode_get_mtime(inode);
+ if (!timespec64_equal(&ts, &now))
sync_it = S_MTIME;
- ctime = inode_get_ctime(inode);
- if (!timespec64_equal(&ctime, &now))
+ ts = inode_get_ctime(inode);
+ if (!timespec64_equal(&ts, &now))
sync_it |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@@ -2173,9 +2138,9 @@ static int __file_update_time(struct file *file, int sync_mode)
struct inode *inode = file_inode(file);
/* try to update time settings */
- if (!__mnt_want_write_file(file)) {
+ if (!mnt_get_write_access_file(file)) {
ret = inode_update_time(inode, sync_mode);
- __mnt_drop_write_file(file);
+ mnt_put_write_access_file(file);
}
return ret;
@@ -2578,43 +2543,9 @@ EXPORT_SYMBOL(current_time);
*/
struct timespec64 inode_set_ctime_current(struct inode *inode)
{
- struct timespec64 now;
- struct timespec64 ctime;
-
- ctime.tv_nsec = READ_ONCE(inode->__i_ctime.tv_nsec);
- if (!(ctime.tv_nsec & I_CTIME_QUERIED)) {
- now = current_time(inode);
+ struct timespec64 now = current_time(inode);
- /* Just copy it into place if it's not multigrain */
- if (!is_mgtime(inode)) {
- inode_set_ctime_to_ts(inode, now);
- return now;
- }
-
- /*
- * If we've recently updated with a fine-grained timestamp,
- * then the coarse-grained one may still be earlier than the
- * existing ctime. Just keep the existing value if so.
- */
- ctime.tv_sec = inode->__i_ctime.tv_sec;
- if (timespec64_compare(&ctime, &now) > 0)
- return ctime;
-
- /*
- * Ctime updates are usually protected by the inode_lock, but
- * we can still race with someone setting the QUERIED flag.
- * Try to swap the new nsec value into place. If it's changed
- * in the interim, then just go with a fine-grained timestamp.
- */
- if (cmpxchg(&inode->__i_ctime.tv_nsec, ctime.tv_nsec,
- now.tv_nsec) != ctime.tv_nsec)
- goto fine_grained;
- inode->__i_ctime.tv_sec = now.tv_sec;
- return now;
- }
-fine_grained:
- ktime_get_real_ts64(&now);
- inode_set_ctime_to_ts(inode, timestamp_truncate(now, inode));
+ inode_set_ctime(inode, now.tv_sec, now.tv_nsec);
return now;
}
EXPORT_SYMBOL(inode_set_ctime_current);
diff --git a/fs/internal.h b/fs/internal.h
index d64ae03998cc..58e43341aebf 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -73,8 +73,8 @@ extern int sb_prepare_remount_readonly(struct super_block *);
extern void __init mnt_init(void);
-extern int __mnt_want_write_file(struct file *);
-extern void __mnt_drop_write_file(struct file *);
+int mnt_get_write_access_file(struct file *file);
+void mnt_put_write_access_file(struct file *file);
extern void dissolve_on_fput(struct vfsmount *);
extern bool may_mount(void);
@@ -94,14 +94,22 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
+void release_empty_file(struct file *f);
+
+static inline void file_put_write_access(struct file *file)
+{
+ put_write_access(file->f_inode);
+ mnt_put_write_access(file->f_path.mnt);
+ if (unlikely(file->f_mode & FMODE_BACKING))
+ mnt_put_write_access(backing_file_user_path(file)->mnt);
+}
static inline void put_file_access(struct file *file)
{
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
i_readcount_dec(file->f_inode);
} else if (file->f_mode & FMODE_WRITER) {
- put_write_access(file->f_inode);
- __mnt_drop_write(file->f_path.mnt);
+ file_put_write_access(file);
}
}
@@ -130,9 +138,9 @@ static inline void sb_start_ro_state_change(struct super_block *sb)
* mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY
* cleared, it will see s_readonly_remount set.
* For RW->RO transition, the barrier pairs with the barrier in
- * __mnt_want_write() before the mnt_is_readonly() check. The barrier
- * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already
- * cleared, it will see s_readonly_remount set.
+ * mnt_get_write_access() before the mnt_is_readonly() check.
+ * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD
+ * already cleared, it will see s_readonly_remount set.
*/
smp_wmb();
}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index ae8673ce08b1..2bc0aa23fde3 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -640,11 +640,13 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t poff, plen;
/*
- * If the write completely overlaps the current folio, then
+ * If the write or zeroing completely overlaps the current folio, then
* entire folio will be dirtied so there is no need for
* per-block state tracking structures to be attached to this folio.
+ * For the unshare case, we must read in the ondisk contents because we
+ * are not changing pagecache contents.
*/
- if (pos <= folio_pos(folio) &&
+ if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) &&
pos + len >= folio_pos(folio) + folio_size(folio))
return 0;
@@ -879,8 +881,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
+ bytes = iov_iter_count(i);
+retry:
offset = pos & (chunk - 1);
- bytes = min(chunk - offset, iov_iter_count(i));
+ bytes = min(chunk - offset, bytes);
status = balance_dirty_pages_ratelimited_flags(mapping,
bdp_flags);
if (unlikely(status))
@@ -931,10 +935,12 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
* halfway through, might be a race with munmap,
* might be severe memory pressure.
*/
- if (copied)
- bytes = copied;
if (chunk > PAGE_SIZE)
chunk /= 2;
+ if (copied) {
+ bytes = copied;
+ goto retry;
+ }
} else {
pos += status;
written += status;
@@ -1047,7 +1053,7 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
/*
* Scan the data range passed to us for dirty page cache folios. If we find a
- * dirty folio, punch out the preceeding range and update the offset from which
+ * dirty folio, punch out the preceding range and update the offset from which
* the next punch will start from.
*
* We can punch out storage reservations under clean pages because they either
@@ -1261,7 +1267,6 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
- long status = 0;
loff_t written = 0;
/* don't bother with blocks that are not shared to start with */
@@ -1272,28 +1277,33 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
return length;
do {
- unsigned long offset = offset_in_page(pos);
- unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
struct folio *folio;
+ int status;
+ size_t offset;
+ size_t bytes = min_t(u64, SIZE_MAX, length);
status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
return status;
- if (iter->iomap.flags & IOMAP_F_STALE)
+ if (iomap->flags & IOMAP_F_STALE)
break;
- status = iomap_write_end(iter, pos, bytes, bytes, folio);
- if (WARN_ON_ONCE(status == 0))
+ offset = offset_in_folio(folio, pos);
+ if (bytes > folio_size(folio) - offset)
+ bytes = folio_size(folio) - offset;
+
+ bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+ if (WARN_ON_ONCE(bytes == 0))
return -EIO;
cond_resched();
- pos += status;
- written += status;
- length -= status;
+ pos += bytes;
+ written += bytes;
+ length -= bytes;
balance_dirty_pages_ratelimited(iter->inode->i_mapping);
- } while (length);
+ } while (length > 0);
return written;
}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 2ee21286ac8f..3e4d53e26f94 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1422,8 +1422,8 @@ static int isofs_read_inode(struct inode *inode, int relocated)
inode->i_ino, de->flags[-high_sierra]);
}
#endif
- inode->i_mtime = inode->i_atime =
- inode_set_ctime(inode, iso_date(de->date, high_sierra), 0);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime(inode, iso_date(de->date, high_sierra), 0)));
ei->i_first_extent = (isonum_733(de->extent) +
isonum_711(de->ext_attr_length));
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 348783a70f57..d6c17ad69dee 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -426,16 +426,14 @@ repeat:
0);
}
if (rr->u.TF.flags & TF_MODIFY) {
- inode->i_mtime.tv_sec =
- iso_date(rr->u.TF.times[cnt++].time,
- 0);
- inode->i_mtime.tv_nsec = 0;
+ inode_set_mtime(inode,
+ iso_date(rr->u.TF.times[cnt++].time, 0),
+ 0);
}
if (rr->u.TF.flags & TF_ACCESS) {
- inode->i_atime.tv_sec =
- iso_date(rr->u.TF.times[cnt++].time,
- 0);
- inode->i_atime.tv_nsec = 0;
+ inode_set_atime(inode,
+ iso_date(rr->u.TF.times[cnt++].time, 0),
+ 0);
}
if (rr->u.TF.flags & TF_ATTRIBUTES) {
inode_set_ctime(inode,
@@ -531,9 +529,9 @@ repeat:
inode->i_rdev = reloc->i_rdev;
inode->i_size = reloc->i_size;
inode->i_blocks = reloc->i_blocks;
- inode->i_atime = reloc->i_atime;
+ inode_set_atime_to_ts(inode, inode_get_atime(reloc));
inode_set_ctime_to_ts(inode, inode_get_ctime(reloc));
- inode->i_mtime = reloc->i_mtime;
+ inode_set_mtime_to_ts(inode, inode_get_mtime(reloc));
iput(reloc);
break;
#ifdef CONFIG_ZISOFS
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 1073259902a6..8d6f934c3d95 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -298,14 +298,12 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
- struct page *page = bh->b_page;
char *addr;
__u32 checksum;
- addr = kmap_atomic(page);
- checksum = crc32_be(crc32_sum,
- (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
- kunmap_atomic(addr);
+ addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
+ checksum = crc32_be(crc32_sum, addr, bh->b_size);
+ kunmap_local(addr);
return checksum;
}
@@ -322,7 +320,6 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
struct buffer_head *bh, __u32 sequence)
{
journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
- struct page *page = bh->b_page;
__u8 *addr;
__u32 csum32;
__be32 seq;
@@ -331,11 +328,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
return;
seq = cpu_to_be32(sequence);
- addr = kmap_atomic(page);
+ addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
- csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
- bh->b_size);
- kunmap_atomic(addr);
+ csum32 = jbd2_chksum(j, csum32, addr, bh->b_size);
+ kunmap_local(addr);
if (jbd2_has_feature_csum3(j))
tag3->t_checksum = cpu_to_be32(csum32);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 768fa05bcbed..30dec2bd2ecc 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1601,6 +1601,8 @@ static journal_t *journal_init_common(struct block_device *bdev,
err_cleanup:
percpu_counter_destroy(&journal->j_checkpoint_jh_count);
+ if (journal->j_chksum_driver)
+ crypto_free_shash(journal->j_chksum_driver);
kfree(journal->j_wbuf);
jbd2_journal_destroy_revoke(journal);
journal_fail_superblock(journal);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4d1fda1f7143..5f08b5fd105a 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -935,19 +935,15 @@ static void warn_dirty_buffer(struct buffer_head *bh)
/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
static void jbd2_freeze_jh_data(struct journal_head *jh)
{
- struct page *page;
- int offset;
char *source;
struct buffer_head *bh = jh2bh(jh);
J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
- page = bh->b_page;
- offset = offset_in_page(bh->b_data);
- source = kmap_atomic(page);
+ source = kmap_local_folio(bh->b_folio, bh_offset(bh));
/* Fire data frozen trigger just before we copy the data */
- jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers);
- memcpy(jh->b_frozen_data, source + offset, bh->b_size);
- kunmap_atomic(source);
+ jbd2_buffer_frozen_trigger(jh, source, jh->b_triggers);
+ memcpy(jh->b_frozen_data, source, bh->b_size);
+ kunmap_local(source);
/*
* Now that the frozen data is saved off, we need to store any matching
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 091ab0eaabbe..2b2938970da3 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -204,8 +204,8 @@ static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i,
if (ret)
goto fail;
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
- ITIME(je32_to_cpu(ri->ctime)));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(ri->ctime))));
jffs2_free_raw_inode(ri);
@@ -238,7 +238,8 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry)
if (dead_f->inocache)
set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink);
if (!ret)
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(now)));
return ret;
}
/***********************************************************************/
@@ -272,7 +273,8 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink);
mutex_unlock(&f->sem);
d_instantiate(dentry, d_inode(old_dentry));
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(now)));
ihold(d_inode(old_dentry));
}
return ret;
@@ -423,8 +425,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
- ITIME(je32_to_cpu(rd->mctime)));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime))));
jffs2_free_raw_dirent(rd);
@@ -568,8 +570,8 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
- ITIME(je32_to_cpu(rd->mctime)));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime))));
inc_nlink(dir_i);
jffs2_free_raw_dirent(rd);
@@ -610,7 +612,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name,
dentry->d_name.len, f, now);
if (!ret) {
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(now)));
clear_nlink(d_inode(dentry));
drop_nlink(dir_i);
}
@@ -746,8 +749,8 @@ static int jffs2_mknod (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
- ITIME(je32_to_cpu(rd->mctime)));
+ inode_set_mtime_to_ts(dir_i,
+ inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime))));
jffs2_free_raw_dirent(rd);
@@ -868,16 +871,18 @@ static int jffs2_rename (struct mnt_idmap *idmap,
* caller won't do it on its own since we are returning an error.
*/
d_invalidate(new_dentry);
- new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i,
- ITIME(now));
+ inode_set_mtime_to_ts(new_dir_i,
+ inode_set_ctime_to_ts(new_dir_i, ITIME(now)));
return ret;
}
if (d_is_dir(old_dentry))
drop_nlink(old_dir_i);
- old_dir_i->i_mtime = inode_set_ctime_to_ts(old_dir_i, ITIME(now));
- new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i, ITIME(now));
+ inode_set_mtime_to_ts(old_dir_i,
+ inode_set_ctime_to_ts(old_dir_i, ITIME(now)));
+ inode_set_mtime_to_ts(new_dir_i,
+ inode_set_ctime_to_ts(new_dir_i, ITIME(now)));
return 0;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 11c66793960e..62ea76da7fdf 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -317,8 +317,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
inode->i_size = pos + writtenlen;
inode->i_blocks = (inode->i_size + 511) >> 9;
- inode->i_mtime = inode_set_ctime_to_ts(inode,
- ITIME(je32_to_cpu(ri->ctime)));
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime))));
}
}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 0403efab4089..d175cccb7c55 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -113,8 +113,8 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
ri->isize = cpu_to_je32((ivalid & ATTR_SIZE)?iattr->ia_size:inode->i_size);
- ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode->i_atime));
- ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode->i_mtime));
+ ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode_get_atime(inode)));
+ ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode_get_mtime(inode)));
ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode_get_ctime(inode)));
ri->offset = cpu_to_je32(0);
@@ -147,9 +147,9 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
return PTR_ERR(new_metadata);
}
/* It worked. Update the inode */
- inode->i_atime = ITIME(je32_to_cpu(ri->atime));
+ inode_set_atime_to_ts(inode, ITIME(je32_to_cpu(ri->atime)));
inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime)));
- inode->i_mtime = ITIME(je32_to_cpu(ri->mtime));
+ inode_set_mtime_to_ts(inode, ITIME(je32_to_cpu(ri->mtime)));
inode->i_mode = jemode_to_cpu(ri->mode);
i_uid_write(inode, je16_to_cpu(ri->uid));
i_gid_write(inode, je16_to_cpu(ri->gid));
@@ -282,8 +282,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
i_uid_write(inode, je16_to_cpu(latest_node.uid));
i_gid_write(inode, je16_to_cpu(latest_node.gid));
inode->i_size = je32_to_cpu(latest_node.isize);
- inode->i_atime = ITIME(je32_to_cpu(latest_node.atime));
- inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime));
+ inode_set_atime_to_ts(inode, ITIME(je32_to_cpu(latest_node.atime)));
+ inode_set_mtime_to_ts(inode, ITIME(je32_to_cpu(latest_node.mtime)));
inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(latest_node.ctime)));
set_nlink(inode, f->inocache->pino_nlink);
@@ -386,8 +386,8 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
iattr.ia_mode = inode->i_mode;
iattr.ia_uid = inode->i_uid;
iattr.ia_gid = inode->i_gid;
- iattr.ia_atime = inode->i_atime;
- iattr.ia_mtime = inode->i_mtime;
+ iattr.ia_atime = inode_get_atime(inode);
+ iattr.ia_mtime = inode_get_mtime(inode);
iattr.ia_ctime = inode_get_ctime(inode);
jffs2_do_setattr(inode, &iattr);
@@ -475,8 +475,8 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
inode->i_mode = jemode_to_cpu(ri->mode);
i_gid_write(inode, je16_to_cpu(ri->gid));
i_uid_write(inode, je16_to_cpu(ri->uid));
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
- ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime));
+ simple_inode_init_ts(inode);
+ ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode_get_mtime(inode)));
inode->i_blocks = 0;
inode->i_size = 0;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 50727a1ff931..86ab014a349c 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -36,8 +36,8 @@ struct kvec;
#define JFFS2_NOW() JFFS2_CLAMP_TIME(ktime_get_real_seconds())
#define I_SEC(tv) JFFS2_CLAMP_TIME((tv).tv_sec)
#define JFFS2_F_I_CTIME(f) I_SEC(inode_get_ctime(OFNI_EDONI_2SFFJ(f)))
-#define JFFS2_F_I_MTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_mtime)
-#define JFFS2_F_I_ATIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_atime)
+#define JFFS2_F_I_MTIME(f) I_SEC(inode_get_mtime(OFNI_EDONI_2SFFJ(f)))
+#define JFFS2_F_I_ATIME(f) I_SEC(inode_get_atime(OFNI_EDONI_2SFFJ(f)))
#define sleep_on_spinunlock(wq, s) \
do { \
DECLARE_WAITQUEUE(__wait, current); \
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 3b6bdc9a49e1..00224f3a8d6e 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -920,7 +920,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
* do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
* is an implementation of setxattr handler on jffs2.
* -------------------------------------------------- */
-const struct xattr_handler *jffs2_xattr_handlers[] = {
+const struct xattr_handler * const jffs2_xattr_handlers[] = {
&jffs2_user_xattr_handler,
#ifdef CONFIG_JFFS2_FS_SECURITY
&jffs2_security_xattr_handler,
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 1b5030a3349d..7e7de093ec0a 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -94,7 +94,7 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname
extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
const char *buffer, size_t size, int flags);
-extern const struct xattr_handler *jffs2_xattr_handlers[];
+extern const struct xattr_handler * const jffs2_xattr_handlers[];
extern const struct xattr_handler jffs2_user_xattr_handler;
extern const struct xattr_handler jffs2_trusted_xattr_handler;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 920d58a1566b..1a6b5921d17a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -393,7 +393,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length)
break;
}
- ip->i_mtime = inode_set_ctime_current(ip);
+ inode_set_mtime_to_ts(ip, inode_set_ctime_current(ip));
mark_inode_dirty(ip);
txCommit(tid, 1, &ip, 0);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 923a58422c46..8e87264e56ce 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -3061,10 +3061,10 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
}
ip->i_size = le64_to_cpu(dip->di_size);
- ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec);
- ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec);
- ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec);
- ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec);
+ inode_set_atime(ip, le32_to_cpu(dip->di_atime.tv_sec),
+ le32_to_cpu(dip->di_atime.tv_nsec));
+ inode_set_mtime(ip, le32_to_cpu(dip->di_mtime.tv_sec),
+ le32_to_cpu(dip->di_mtime.tv_nsec));
inode_set_ctime(ip, le32_to_cpu(dip->di_ctime.tv_sec),
le32_to_cpu(dip->di_ctime.tv_nsec));
ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
@@ -3138,12 +3138,12 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip)
else /* Leave the original permissions alone */
dip->di_mode = cpu_to_le32(jfs_ip->mode2);
- dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec);
- dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec);
- dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime(ip).tv_sec);
- dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime(ip).tv_nsec);
- dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec);
- dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec);
+ dip->di_atime.tv_sec = cpu_to_le32(inode_get_atime_sec(ip));
+ dip->di_atime.tv_nsec = cpu_to_le32(inode_get_atime_nsec(ip));
+ dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime_sec(ip));
+ dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime_nsec(ip));
+ dip->di_mtime.tv_sec = cpu_to_le32(inode_get_mtime_sec(ip));
+ dip->di_mtime.tv_nsec = cpu_to_le32(inode_get_mtime_nsec(ip));
dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */
dip->di_acl = jfs_ip->acl; /* as are dxd's */
dip->di_ea = jfs_ip->ea;
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 87594efa7f7c..f10f295d1502 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -97,8 +97,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
jfs_inode->mode2 |= inode->i_mode;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- jfs_inode->otime = inode_get_ctime(inode).tv_sec;
+ simple_inode_init_ts(inode);
+ jfs_inode->otime = inode_get_ctime_sec(inode);
inode->i_generation = JFS_SBI(sb)->gengen++;
jfs_inode->cflag = 0;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index e855b8fde76c..cb6d1fda66a7 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1058,7 +1058,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync)
int lmLogOpen(struct super_block *sb)
{
int rc;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
struct jfs_log *log;
struct jfs_sb_info *sbi = JFS_SBI(sb);
@@ -1070,7 +1070,7 @@ int lmLogOpen(struct super_block *sb)
mutex_lock(&jfs_log_mutex);
list_for_each_entry(log, &jfs_external_logs, journal_list) {
- if (log->bdev->bd_dev == sbi->logdev) {
+ if (log->bdev_handle->bdev->bd_dev == sbi->logdev) {
if (!uuid_equal(&log->uuid, &sbi->loguuid)) {
jfs_warn("wrong uuid on JFS journal");
mutex_unlock(&jfs_log_mutex);
@@ -1100,14 +1100,14 @@ int lmLogOpen(struct super_block *sb)
* file systems to log may have n-to-1 relationship;
*/
- bdev = blkdev_get_by_dev(sbi->logdev, BLK_OPEN_READ | BLK_OPEN_WRITE,
- log, NULL);
- if (IS_ERR(bdev)) {
- rc = PTR_ERR(bdev);
+ bdev_handle = bdev_open_by_dev(sbi->logdev,
+ BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL);
+ if (IS_ERR(bdev_handle)) {
+ rc = PTR_ERR(bdev_handle);
goto free;
}
- log->bdev = bdev;
+ log->bdev_handle = bdev_handle;
uuid_copy(&log->uuid, &sbi->loguuid);
/*
@@ -1141,7 +1141,7 @@ journal_found:
lbmLogShutdown(log);
close: /* close external log device */
- blkdev_put(bdev, log);
+ bdev_release(bdev_handle);
free: /* free log descriptor */
mutex_unlock(&jfs_log_mutex);
@@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb)
init_waitqueue_head(&log->syncwait);
set_bit(log_INLINELOG, &log->flag);
- log->bdev = sb->s_bdev;
+ log->bdev_handle = sb->s_bdev_handle;
log->base = addressPXD(&JFS_SBI(sb)->logpxd);
log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >>
(L2LOGPSIZE - sb->s_blocksize_bits);
@@ -1436,7 +1436,7 @@ int lmLogClose(struct super_block *sb)
{
struct jfs_sb_info *sbi = JFS_SBI(sb);
struct jfs_log *log = sbi->log;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
int rc = 0;
jfs_info("lmLogClose: log:0x%p", log);
@@ -1482,10 +1482,10 @@ int lmLogClose(struct super_block *sb)
* external log as separate logical volume
*/
list_del(&log->journal_list);
- bdev = log->bdev;
+ bdev_handle = log->bdev_handle;
rc = lmLogShutdown(log);
- blkdev_put(bdev, log);
+ bdev_release(bdev_handle);
kfree(log);
@@ -1972,7 +1972,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bp->l_flag |= lbmREAD;
- bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS);
+ bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
__bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
@@ -2110,10 +2110,15 @@ static void lbmStartIO(struct lbuf * bp)
{
struct bio *bio;
struct jfs_log *log = bp->l_log;
+ struct block_device *bdev = NULL;
jfs_info("lbmStartIO");
- bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
+ if (!log->no_integrity)
+ bdev = log->bdev_handle->bdev;
+
+ bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC,
+ GFP_NOFS);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
__bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index 805877ce5020..84aa2d253907 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -356,7 +356,7 @@ struct jfs_log {
* before writing syncpt.
*/
struct list_head journal_list; /* Global list */
- struct block_device *bdev; /* 4: log lv pointer */
+ struct bdev_handle *bdev_handle; /* 4: log lv pointer */
int serial; /* 4: log mount serial number */
s64 base; /* @8: log extent address (inline log ) */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index b83aae56a1f2..415eb65a36ff 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -430,7 +430,8 @@ int updateSuper(struct super_block *sb, uint state)
if (state == FM_MOUNT) {
/* record log's dev_t and mount serial number */
- j_sb->s_logdev = cpu_to_le32(new_encode_dev(sbi->log->bdev->bd_dev));
+ j_sb->s_logdev = cpu_to_le32(
+ new_encode_dev(sbi->log->bdev_handle->bdev->bd_dev));
j_sb->s_logserial = cpu_to_le32(sbi->log->serial);
} else if (state == FM_CLEAN) {
/*
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index 0d33816d251d..ec67d8554d2c 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -46,7 +46,7 @@ extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *,
extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
-extern const struct xattr_handler *jfs_xattr_handlers[];
+extern const struct xattr_handler * const jfs_xattr_handlers[];
#ifdef CONFIG_JFS_SECURITY
extern int jfs_init_security(tid_t, struct inode *, struct inode *,
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 57d7a4300210..d68a4e6ac345 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -149,7 +149,7 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip,
mark_inode_dirty(ip);
- dip->i_mtime = inode_set_ctime_current(dip);
+ inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
mark_inode_dirty(dip);
@@ -284,7 +284,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
/* update parent directory inode */
inc_nlink(dip); /* for '..' from child directory */
- dip->i_mtime = inode_set_ctime_current(dip);
+ inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
mark_inode_dirty(dip);
rc = txCommit(tid, 2, &iplist[0], 0);
@@ -390,7 +390,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
/* update parent directory's link count corresponding
* to ".." entry of the target directory deleted
*/
- dip->i_mtime = inode_set_ctime_current(dip);
+ inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
inode_dec_link_count(dip);
/*
@@ -512,7 +512,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
ASSERT(ip->i_nlink);
- dip->i_mtime = inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip));
+ inode_set_mtime_to_ts(dip,
+ inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip)));
mark_inode_dirty(dip);
/* update target's inode */
@@ -828,7 +829,7 @@ static int jfs_link(struct dentry *old_dentry,
/* update object inode */
inc_nlink(ip); /* for new link */
inode_set_ctime_current(ip);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
ihold(ip);
@@ -1028,7 +1029,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip,
mark_inode_dirty(ip);
- dip->i_mtime = inode_set_ctime_current(dip);
+ inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip));
mark_inode_dirty(dip);
/*
* commit update of parent directory and link object
@@ -1271,7 +1272,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
inode_set_ctime_current(old_ip);
mark_inode_dirty(old_ip);
- new_dir->i_mtime = inode_set_ctime_current(new_dir);
+ inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir));
mark_inode_dirty(new_dir);
/* Build list of inodes modified by this transaction */
@@ -1283,7 +1284,8 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (old_dir != new_dir) {
iplist[ipcount++] = new_dir;
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ inode_set_mtime_to_ts(old_dir,
+ inode_set_ctime_current(old_dir));
mark_inode_dirty(old_dir);
}
@@ -1416,7 +1418,7 @@ static int jfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
mark_inode_dirty(ip);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2e2f7f6d36a0..966826c394ee 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -818,7 +818,7 @@ out:
}
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
inode_unlock(inode);
return len - towrite;
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 8577ad494e05..0fb7afac298e 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -985,7 +985,7 @@ static const struct xattr_handler jfs_trusted_xattr_handler = {
.set = jfs_xattr_set,
};
-const struct xattr_handler *jfs_xattr_handlers[] = {
+const struct xattr_handler * const jfs_xattr_handlers[] = {
&jfs_os2_xattr_handler,
&jfs_user_xattr_handler,
&jfs_security_xattr_handler,
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 922719a343a7..b83054da68b3 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -151,7 +151,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
{
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
static inline void set_inode_attr(struct inode *inode,
@@ -159,8 +159,8 @@ static inline void set_inode_attr(struct inode *inode,
{
inode->i_uid = attrs->ia_uid;
inode->i_gid = attrs->ia_gid;
- inode->i_atime = attrs->ia_atime;
- inode->i_mtime = attrs->ia_mtime;
+ inode_set_atime_to_ts(inode, attrs->ia_atime);
+ inode_set_mtime_to_ts(inode, attrs->ia_mtime);
inode_set_ctime_to_ts(inode, attrs->ia_ctime);
}
@@ -445,7 +445,7 @@ static const struct xattr_handler kernfs_user_xattr_handler = {
.set = kernfs_vfs_user_xattr_set,
};
-const struct xattr_handler *kernfs_xattr_handlers[] = {
+const struct xattr_handler * const kernfs_xattr_handlers[] = {
&kernfs_trusted_xattr_handler,
&kernfs_security_xattr_handler,
&kernfs_user_xattr_handler,
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index a9b854cdfdb5..237f2764b941 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -127,7 +127,7 @@ extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
/*
* inode.c
*/
-extern const struct xattr_handler *kernfs_xattr_handlers[];
+extern const struct xattr_handler * const kernfs_xattr_handlers[];
void kernfs_evict_inode(struct inode *inode);
int kernfs_iop_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask);
diff --git a/fs/libfs.c b/fs/libfs.c
index a4eb12757886..abe2b5a40ba1 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -541,7 +541,8 @@ void simple_recursive_removal(struct dentry *dentry,
dput(victim); // unpin it
}
if (victim == dentry) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
if (d_is_dir(dentry))
drop_nlink(inode);
inode_unlock(inode);
@@ -582,7 +583,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
*/
root->i_ino = 1;
root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
- root->i_atime = root->i_mtime = inode_set_ctime_current(root);
+ simple_inode_init_ts(root);
s->s_root = d_make_root(root);
if (!s->s_root)
return -ENOMEM;
@@ -638,8 +639,8 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
{
struct inode *inode = d_inode(old_dentry);
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
inc_nlink(inode);
ihold(inode);
dget(dentry);
@@ -673,8 +674,8 @@ int simple_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
drop_nlink(inode);
dput(dentry);
return 0;
@@ -709,9 +710,10 @@ void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
{
struct inode *newino = d_inode(new_dentry);
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
if (new_dir != old_dir)
- new_dir->i_mtime = inode_set_ctime_current(new_dir);
+ inode_set_mtime_to_ts(new_dir,
+ inode_set_ctime_current(new_dir));
inode_set_ctime_current(d_inode(old_dentry));
if (newino)
inode_set_ctime_current(newino);
@@ -926,7 +928,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
*/
inode->i_ino = 1;
inode->i_mode = S_IFDIR | 0755;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
set_nlink(inode, 2);
@@ -952,7 +954,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
goto out;
}
inode->i_mode = S_IFREG | files->mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_fop = files->ops;
inode->i_ino = i;
d_add(dentry, inode);
@@ -1520,7 +1522,7 @@ struct inode *alloc_anon_inode(struct super_block *s)
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
return inode;
}
EXPORT_SYMBOL(alloc_anon_inode);
@@ -1903,6 +1905,7 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
* We don't know how much we wrote, so just return the number of
* bytes which were direct-written
*/
+ iocb->ki_pos -= buffered_written;
if (direct_written)
return direct_written;
return err;
@@ -1911,3 +1914,20 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
return direct_written + buffered_written;
}
EXPORT_SYMBOL_GPL(direct_write_fallback);
+
+/**
+ * simple_inode_init_ts - initialize the timestamps for a new inode
+ * @inode: inode to be initialized
+ *
+ * When a new inode is created, most filesystems set the timestamps to the
+ * current time. Add a helper to do this.
+ */
+struct timespec64 simple_inode_init_ts(struct inode *inode)
+{
+ struct timespec64 ts = inode_set_ctime_current(inode);
+
+ inode_set_atime_to_ts(inode, ts);
+ inode_set_mtime_to_ts(inode, ts);
+ return ts;
+}
+EXPORT_SYMBOL(simple_inode_init_ts);
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 6579948070a4..81be07c1d3d1 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -24,7 +24,6 @@
#include <linux/uio.h>
#include <linux/smp.h>
#include <linux/mutex.h>
-#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/inetdevice.h>
@@ -135,11 +134,11 @@ lockd(void *vrqstp)
* The main request loop. We don't terminate until the last
* NFS mount or NFS daemon has gone away.
*/
- while (!kthread_should_stop()) {
+ while (!svc_thread_should_stop(rqstp)) {
/* update sv_maxconn if it has changed */
rqstp->rq_server->sv_maxconn = nlm_max_connections;
- nlmsvc_retry_blocked();
+ nlmsvc_retry_blocked(rqstp);
svc_recv(rqstp);
}
if (nlmsvc_ops)
@@ -373,7 +372,9 @@ static void lockd_put(void)
unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
#endif
+ svc_get(nlmsvc_serv);
svc_set_num_threads(nlmsvc_serv, NULL, 0);
+ svc_put(nlmsvc_serv);
timer_delete_sync(&nlmsvc_retry);
nlmsvc_serv = NULL;
dprintk("lockd_down: service destroyed\n");
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 43aeba9de55c..2dc10900ad1c 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -30,7 +30,6 @@
#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/nlm.h>
#include <linux/lockd/lockd.h>
-#include <linux/kthread.h>
#include <linux/exportfs.h>
#define NLMDBG_FACILITY NLMDBG_SVCLOCK
@@ -481,9 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_host *host, struct nlm_lock *lock, int wait,
struct nlm_cookie *cookie, int reclaim)
{
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
struct inode *inode = nlmsvc_file_inode(file);
-#endif
struct nlm_block *block = NULL;
int error;
int mode;
@@ -497,7 +494,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
(long long)lock->fl.fl_end,
wait);
- if (nlmsvc_file_file(file)->f_op->lock) {
+ if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) {
async_block = wait;
wait = 0;
}
@@ -543,6 +540,25 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
+ spin_lock(&nlm_blocked_lock);
+ /*
+ * If this is a lock request for an already pending
+ * lock request we return nlm_lck_blocked without calling
+ * vfs_lock_file() again. Otherwise we have two pending
+ * requests on the underlaying ->lock() implementation but
+ * only one nlm_block to being granted by lm_grant().
+ */
+ if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) &&
+ !list_empty(&block->b_list)) {
+ spin_unlock(&nlm_blocked_lock);
+ ret = nlm_lck_blocked;
+ goto out;
+ }
+
+ /* Append to list of blocked */
+ nlmsvc_insert_block_locked(block, NLM_NEVER);
+ spin_unlock(&nlm_blocked_lock);
+
if (!wait)
lock->fl.fl_flags &= ~FL_SLEEP;
mode = lock_to_openmode(&lock->fl);
@@ -552,16 +568,12 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
dprintk("lockd: vfs_lock_file returned %d\n", error);
switch (error) {
case 0:
+ nlmsvc_remove_block(block);
ret = nlm_granted;
goto out;
case -EAGAIN:
- /*
- * If this is a blocking request for an
- * already pending lock request then we need
- * to put it back on lockd's block list
- */
- if (wait)
- break;
+ if (!wait)
+ nlmsvc_remove_block(block);
ret = async_block ? nlm_lck_blocked : nlm_lck_denied;
goto out;
case FILE_LOCK_DEFERRED:
@@ -572,17 +584,16 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
ret = nlmsvc_defer_lock_rqst(rqstp, block);
goto out;
case -EDEADLK:
+ nlmsvc_remove_block(block);
ret = nlm_deadlock;
goto out;
default: /* includes ENOLCK */
+ nlmsvc_remove_block(block);
ret = nlm_lck_denied_nolocks;
goto out;
}
ret = nlm_lck_blocked;
-
- /* Append to list of blocked */
- nlmsvc_insert_block(block, NLM_NEVER);
out:
mutex_unlock(&file->f_mutex);
nlmsvc_release_block(block);
@@ -1020,13 +1031,13 @@ retry_deferred_block(struct nlm_block *block)
* be retransmitted.
*/
void
-nlmsvc_retry_blocked(void)
+nlmsvc_retry_blocked(struct svc_rqst *rqstp)
{
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
struct nlm_block *block;
spin_lock(&nlm_blocked_lock);
- while (!list_empty(&nlm_blocked) && !kthread_should_stop()) {
+ while (!list_empty(&nlm_blocked) && !svc_thread_should_stop(rqstp)) {
block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
if (block->b_when == NLM_NEVER)
diff --git a/fs/locks.c b/fs/locks.c
index 76ad05f8070a..d4e49a990a8d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2264,11 +2264,13 @@ out:
* To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
* locks, the ->lock() interface may return asynchronously, before the lock has
* been granted or denied by the underlying filesystem, if (and only if)
- * lm_grant is set. Callers expecting ->lock() to return asynchronously
- * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
- * the request is for a blocking lock. When ->lock() does return asynchronously,
- * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock
- * request completes.
+ * lm_grant is set. Additionally EXPORT_OP_ASYNC_LOCK in export_operations
+ * flags need to be set.
+ *
+ * Callers expecting ->lock() to return asynchronously will only use F_SETLK,
+ * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
+ * blocking lock. When ->lock() does return asynchronously, it must return
+ * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes.
* If the request is for non-blocking lock the file system should return
* FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
* with the result. If the request timed out the callback routine will return a
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 25c08fbfcb9d..7da66ca184f4 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -251,7 +251,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode)
}
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = j;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_blocks = 0;
memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u));
insert_inode_hash(inode);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 20f23e6e58ad..62c313fc9a49 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -281,7 +281,7 @@ got_it:
de->inode = inode->i_ino;
}
dir_commit_chunk(page, pos, sbi->s_dirsize);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
err = minix_handle_dirsync(dir);
out_put:
@@ -313,7 +313,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
else
de->inode = 0;
dir_commit_chunk(page, pos, len);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
return minix_handle_dirsync(inode);
}
@@ -436,7 +436,7 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page,
else
de->inode = inode->i_ino;
dir_commit_chunk(page, pos, sbi->s_dirsize);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
return minix_handle_dirsync(dir);
}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index df575473c1cc..f8af6c3ae336 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -501,7 +501,8 @@ static struct inode *V1_minix_iget(struct inode *inode)
i_gid_write(inode, raw_inode->i_gid);
set_nlink(inode, raw_inode->i_nlinks);
inode->i_size = raw_inode->i_size;
- inode->i_mtime = inode->i_atime = inode_set_ctime(inode, raw_inode->i_time, 0);
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime(inode, raw_inode->i_time, 0)));
inode->i_blocks = 0;
for (i = 0; i < 9; i++)
minix_inode->u.i1_data[i] = raw_inode->i_zone[i];
@@ -538,11 +539,9 @@ static struct inode *V2_minix_iget(struct inode *inode)
i_gid_write(inode, raw_inode->i_gid);
set_nlink(inode, raw_inode->i_nlinks);
inode->i_size = raw_inode->i_size;
- inode->i_mtime.tv_sec = raw_inode->i_mtime;
- inode->i_atime.tv_sec = raw_inode->i_atime;
+ inode_set_mtime(inode, raw_inode->i_mtime, 0);
+ inode_set_atime(inode, raw_inode->i_atime, 0);
inode_set_ctime(inode, raw_inode->i_ctime, 0);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_nsec = 0;
inode->i_blocks = 0;
for (i = 0; i < 10; i++)
minix_inode->u.i2_data[i] = raw_inode->i_zone[i];
@@ -589,7 +588,7 @@ static struct buffer_head * V1_minix_update_inode(struct inode * inode)
raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode));
raw_inode->i_nlinks = inode->i_nlink;
raw_inode->i_size = inode->i_size;
- raw_inode->i_time = inode->i_mtime.tv_sec;
+ raw_inode->i_time = inode_get_mtime_sec(inode);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev);
else for (i = 0; i < 9; i++)
@@ -616,9 +615,9 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode));
raw_inode->i_nlinks = inode->i_nlink;
raw_inode->i_size = inode->i_size;
- raw_inode->i_mtime = inode->i_mtime.tv_sec;
- raw_inode->i_atime = inode->i_atime.tv_sec;
- raw_inode->i_ctime = inode_get_ctime(inode).tv_sec;
+ raw_inode->i_mtime = inode_get_mtime_sec(inode);
+ raw_inode->i_atime = inode_get_atime_sec(inode);
+ raw_inode->i_ctime = inode_get_ctime_sec(inode);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev);
else for (i = 0; i < 10; i++)
diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c
index ce18ae37c29d..dad131e30c05 100644
--- a/fs/minix/itree_common.c
+++ b/fs/minix/itree_common.c
@@ -350,7 +350,7 @@ do_indirects:
}
first_whole++;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
}
diff --git a/fs/namei.c b/fs/namei.c
index 567ee547492b..71c13b2990b4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -188,7 +188,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
}
}
- result->refcnt = 1;
+ atomic_set(&result->refcnt, 1);
/* The empty path is special. */
if (unlikely(!len)) {
if (empty)
@@ -249,7 +249,7 @@ getname_kernel(const char * filename)
memcpy((char *)result->name, filename, len);
result->uptr = NULL;
result->aname = NULL;
- result->refcnt = 1;
+ atomic_set(&result->refcnt, 1);
audit_getname(result);
return result;
@@ -261,9 +261,10 @@ void putname(struct filename *name)
if (IS_ERR(name))
return;
- BUG_ON(name->refcnt <= 0);
+ if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
+ return;
- if (--name->refcnt > 0)
+ if (!atomic_dec_and_test(&name->refcnt))
return;
if (name->name != name->iname) {
@@ -3104,25 +3105,6 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
EXPORT_SYMBOL(unlock_rename);
/**
- * mode_strip_umask - handle vfs umask stripping
- * @dir: parent directory of the new inode
- * @mode: mode of the new inode to be created in @dir
- *
- * Umask stripping depends on whether or not the filesystem supports POSIX
- * ACLs. If the filesystem doesn't support it umask stripping is done directly
- * in here. If the filesystem does support POSIX ACLs umask stripping is
- * deferred until the filesystem calls posix_acl_create().
- *
- * Returns: mode
- */
-static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode)
-{
- if (!IS_POSIXACL(dir))
- mode &= ~current_umask();
- return mode;
-}
-
-/**
* vfs_prepare_mode - prepare the mode to be used for a new inode
* @idmap: idmap of the mount the inode was found from
* @dir: parent directory of the new inode
@@ -3535,7 +3517,8 @@ static const char *open_last_lookups(struct nameidata *nd,
if (likely(dentry))
goto finish_lookup;
- BUG_ON(nd->flags & LOOKUP_RCU);
+ if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
+ return ERR_PTR(-ECHILD);
} else {
/* create side of things */
if (nd->flags & LOOKUP_RCU) {
@@ -3802,7 +3785,10 @@ static struct file *path_openat(struct nameidata *nd,
WARN_ON(1);
error = -EINVAL;
}
- fput(file);
+ if (unlikely(file->f_mode & FMODE_OPENED))
+ fput(file);
+ else
+ release_empty_file(file);
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
@@ -4386,11 +4372,9 @@ retry_deleg:
if (!IS_ERR(dentry)) {
/* Why not before? Because we want correct error value */
- if (last.name[last.len])
+ if (last.name[last.len] || d_is_negative(dentry))
goto slashes;
inode = dentry->d_inode;
- if (d_is_negative(dentry))
- goto slashes;
ihold(inode);
error = security_path_unlink(&path, dentry);
if (error)
diff --git a/fs/namespace.c b/fs/namespace.c
index e157efc54023..6bde71735efa 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -330,16 +330,16 @@ static int mnt_is_readonly(struct vfsmount *mnt)
* can determine when writes are able to occur to a filesystem.
*/
/**
- * __mnt_want_write - get write access to a mount without freeze protection
+ * mnt_get_write_access - get write access to a mount without freeze protection
* @m: the mount on which to take a write
*
* This tells the low-level filesystem that a write is about to be performed to
* it, and makes sure that writes are allowed (mnt it read-write) before
* returning success. This operation does not protect against filesystem being
- * frozen. When the write operation is finished, __mnt_drop_write() must be
+ * frozen. When the write operation is finished, mnt_put_write_access() must be
* called. This is effectively a refcount.
*/
-int __mnt_want_write(struct vfsmount *m)
+int mnt_get_write_access(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
int ret = 0;
@@ -386,6 +386,7 @@ int __mnt_want_write(struct vfsmount *m)
return ret;
}
+EXPORT_SYMBOL_GPL(mnt_get_write_access);
/**
* mnt_want_write - get write access to a mount
@@ -401,7 +402,7 @@ int mnt_want_write(struct vfsmount *m)
int ret;
sb_start_write(m->mnt_sb);
- ret = __mnt_want_write(m);
+ ret = mnt_get_write_access(m);
if (ret)
sb_end_write(m->mnt_sb);
return ret;
@@ -409,15 +410,15 @@ int mnt_want_write(struct vfsmount *m)
EXPORT_SYMBOL_GPL(mnt_want_write);
/**
- * __mnt_want_write_file - get write access to a file's mount
+ * mnt_get_write_access_file - get write access to a file's mount
* @file: the file who's mount on which to take a write
*
- * This is like __mnt_want_write, but if the file is already open for writing it
+ * This is like mnt_get_write_access, but if @file is already open for write it
* skips incrementing mnt_writers (since the open file already has a reference)
* and instead only does the check for emergency r/o remounts. This must be
- * paired with __mnt_drop_write_file.
+ * paired with mnt_put_write_access_file.
*/
-int __mnt_want_write_file(struct file *file)
+int mnt_get_write_access_file(struct file *file)
{
if (file->f_mode & FMODE_WRITER) {
/*
@@ -428,7 +429,7 @@ int __mnt_want_write_file(struct file *file)
return -EROFS;
return 0;
}
- return __mnt_want_write(file->f_path.mnt);
+ return mnt_get_write_access(file->f_path.mnt);
}
/**
@@ -445,7 +446,7 @@ int mnt_want_write_file(struct file *file)
int ret;
sb_start_write(file_inode(file)->i_sb);
- ret = __mnt_want_write_file(file);
+ ret = mnt_get_write_access_file(file);
if (ret)
sb_end_write(file_inode(file)->i_sb);
return ret;
@@ -453,19 +454,20 @@ int mnt_want_write_file(struct file *file)
EXPORT_SYMBOL_GPL(mnt_want_write_file);
/**
- * __mnt_drop_write - give up write access to a mount
+ * mnt_put_write_access - give up write access to a mount
* @mnt: the mount on which to give up write access
*
* Tells the low-level filesystem that we are done
* performing writes to it. Must be matched with
- * __mnt_want_write() call above.
+ * mnt_get_write_access() call above.
*/
-void __mnt_drop_write(struct vfsmount *mnt)
+void mnt_put_write_access(struct vfsmount *mnt)
{
preempt_disable();
mnt_dec_writers(real_mount(mnt));
preempt_enable();
}
+EXPORT_SYMBOL_GPL(mnt_put_write_access);
/**
* mnt_drop_write - give up write access to a mount
@@ -477,20 +479,20 @@ void __mnt_drop_write(struct vfsmount *mnt)
*/
void mnt_drop_write(struct vfsmount *mnt)
{
- __mnt_drop_write(mnt);
+ mnt_put_write_access(mnt);
sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
-void __mnt_drop_write_file(struct file *file)
+void mnt_put_write_access_file(struct file *file)
{
if (!(file->f_mode & FMODE_WRITER))
- __mnt_drop_write(file->f_path.mnt);
+ mnt_put_write_access(file->f_path.mnt);
}
void mnt_drop_write_file(struct file *file)
{
- __mnt_drop_write_file(file);
+ mnt_put_write_access_file(file);
sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);
@@ -1344,9 +1346,9 @@ void mntput(struct vfsmount *mnt)
{
if (mnt) {
struct mount *m = real_mount(mnt);
- /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+ /* avoid cacheline pingpong */
if (unlikely(m->mnt_expiry_mark))
- m->mnt_expiry_mark = 0;
+ WRITE_ONCE(m->mnt_expiry_mark, 0);
mntput_no_expire(m);
}
}
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 3404707ddbe7..2cd3ccf4c439 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -47,12 +47,14 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
xas_for_each(&xas, folio, last_page) {
loff_t pg_end;
bool pg_failed = false;
+ bool folio_started;
if (xas_retry(&xas, folio))
continue;
pg_end = folio_pos(folio) + folio_size(folio) - 1;
+ folio_started = false;
for (;;) {
loff_t sreq_end;
@@ -60,8 +62,10 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
pg_failed = true;
break;
}
- if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
+ if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
folio_start_fscache(folio);
+ folio_started = true;
+ }
pg_failed |= subreq_failed;
sreq_end = subreq->start + subreq->len - 1;
if (pg_end < sreq_end)
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 716bc75e9ed2..b4294a8aa2d4 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -108,7 +108,7 @@ struct pnfs_block_dev {
struct pnfs_block_dev *children;
u64 chunk_size;
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
u64 disk_offset;
u64 pr_key;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 65cbb5607a5f..f318a05a80e1 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -25,17 +25,17 @@ bl_free_device(struct pnfs_block_dev *dev)
} else {
if (dev->pr_registered) {
const struct pr_ops *ops =
- dev->bdev->bd_disk->fops->pr_ops;
+ dev->bdev_handle->bdev->bd_disk->fops->pr_ops;
int error;
- error = ops->pr_register(dev->bdev, dev->pr_key, 0,
- false);
+ error = ops->pr_register(dev->bdev_handle->bdev,
+ dev->pr_key, 0, false);
if (error)
pr_err("failed to unregister PR key.\n");
}
- if (dev->bdev)
- blkdev_put(dev->bdev, NULL);
+ if (dev->bdev_handle)
+ bdev_release(dev->bdev_handle);
}
}
@@ -169,7 +169,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
map->start = dev->start;
map->len = dev->len;
map->disk_offset = dev->disk_offset;
- map->bdev = dev->bdev;
+ map->bdev = dev->bdev_handle->bdev;
return true;
}
@@ -236,28 +236,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
dev_t dev;
dev = bl_resolve_deviceid(server, v, gfp_mask);
if (!dev)
return -EIO;
- bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL,
- NULL);
- if (IS_ERR(bdev)) {
+ bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ NULL, NULL);
+ if (IS_ERR(bdev_handle)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
- MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
- return PTR_ERR(bdev);
+ MAJOR(dev), MINOR(dev), PTR_ERR(bdev_handle));
+ return PTR_ERR(bdev_handle);
}
- d->bdev = bdev;
-
-
- d->len = bdev_nr_bytes(d->bdev);
+ d->bdev_handle = bdev_handle;
+ d->len = bdev_nr_bytes(bdev_handle->bdev);
d->map = bl_map_simple;
printk(KERN_INFO "pNFS: using block device %s\n",
- d->bdev->bd_disk->disk_name);
+ bdev_handle->bdev->bd_disk->disk_name);
return 0;
}
@@ -302,10 +300,10 @@ bl_validate_designator(struct pnfs_block_volume *v)
}
}
-static struct block_device *
+static struct bdev_handle *
bl_open_path(struct pnfs_block_volume *v, const char *prefix)
{
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
const char *devname;
devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN",
@@ -313,15 +311,15 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix)
if (!devname)
return ERR_PTR(-ENOMEM);
- bdev = blkdev_get_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL,
- NULL);
- if (IS_ERR(bdev)) {
+ bdev_handle = bdev_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ NULL, NULL);
+ if (IS_ERR(bdev_handle)) {
pr_warn("pNFS: failed to open device %s (%ld)\n",
- devname, PTR_ERR(bdev));
+ devname, PTR_ERR(bdev_handle));
}
kfree(devname);
- return bdev;
+ return bdev_handle;
}
static int
@@ -329,7 +327,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
- struct block_device *bdev;
+ struct bdev_handle *bdev_handle;
const struct pr_ops *ops;
int error;
@@ -342,32 +340,32 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
* On other distributions like Debian, the default SCSI by-id path will
* point to the dm-multipath device if one exists.
*/
- bdev = bl_open_path(v, "dm-uuid-mpath-0x");
- if (IS_ERR(bdev))
- bdev = bl_open_path(v, "wwn-0x");
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
- d->bdev = bdev;
-
- d->len = bdev_nr_bytes(d->bdev);
+ bdev_handle = bl_open_path(v, "dm-uuid-mpath-0x");
+ if (IS_ERR(bdev_handle))
+ bdev_handle = bl_open_path(v, "wwn-0x");
+ if (IS_ERR(bdev_handle))
+ return PTR_ERR(bdev_handle);
+ d->bdev_handle = bdev_handle;
+
+ d->len = bdev_nr_bytes(d->bdev_handle->bdev);
d->map = bl_map_simple;
d->pr_key = v->scsi.pr_key;
pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
- d->bdev->bd_disk->disk_name, d->pr_key);
+ d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key);
- ops = d->bdev->bd_disk->fops->pr_ops;
+ ops = d->bdev_handle->bdev->bd_disk->fops->pr_ops;
if (!ops) {
pr_err("pNFS: block device %s does not support reservations.",
- d->bdev->bd_disk->disk_name);
+ d->bdev_handle->bdev->bd_disk->disk_name);
error = -EINVAL;
goto out_blkdev_put;
}
- error = ops->pr_register(d->bdev, 0, d->pr_key, true);
+ error = ops->pr_register(d->bdev_handle->bdev, 0, d->pr_key, true);
if (error) {
pr_err("pNFS: failed to register key for block device %s.",
- d->bdev->bd_disk->disk_name);
+ d->bdev_handle->bdev->bd_disk->disk_name);
goto out_blkdev_put;
}
@@ -375,7 +373,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return 0;
out_blkdev_put:
- blkdev_put(d->bdev, NULL);
+ bdev_release(d->bdev_handle);
return error;
}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 466ebf1d41b2..4ffa1f469e90 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -78,7 +78,7 @@ nfs4_callback_svc(void *vrqstp)
set_freezable();
- while (!kthread_freezable_should_stop(NULL))
+ while (!svc_thread_should_stop(rqstp))
svc_recv(rqstp);
svc_exit_thread(rqstp);
@@ -86,45 +86,6 @@ nfs4_callback_svc(void *vrqstp)
}
#if defined(CONFIG_NFS_V4_1)
-/*
- * The callback service for NFSv4.1 callbacks
- */
-static int
-nfs41_callback_svc(void *vrqstp)
-{
- struct svc_rqst *rqstp = vrqstp;
- struct svc_serv *serv = rqstp->rq_server;
- struct rpc_rqst *req;
- int error;
- DEFINE_WAIT(wq);
-
- set_freezable();
-
- while (!kthread_freezable_should_stop(NULL)) {
- prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_IDLE);
- spin_lock_bh(&serv->sv_cb_lock);
- if (!list_empty(&serv->sv_cb_list)) {
- req = list_first_entry(&serv->sv_cb_list,
- struct rpc_rqst, rq_bc_list);
- list_del(&req->rq_bc_list);
- spin_unlock_bh(&serv->sv_cb_lock);
- finish_wait(&serv->sv_cb_waitq, &wq);
- dprintk("Invoking bc_svc_process()\n");
- error = bc_svc_process(serv, req, rqstp);
- dprintk("bc_svc_process() returned w/ error code= %d\n",
- error);
- } else {
- spin_unlock_bh(&serv->sv_cb_lock);
- if (!kthread_should_stop())
- schedule();
- finish_wait(&serv->sv_cb_waitq, &wq);
- }
- }
-
- svc_exit_thread(rqstp);
- return 0;
-}
-
static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
struct svc_serv *serv)
{
@@ -237,10 +198,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
cb_info->users);
threadfn = nfs4_callback_svc;
-#if defined(CONFIG_NFS_V4_1)
- if (minorversion)
- threadfn = nfs41_callback_svc;
-#else
+#if !defined(CONFIG_NFS_V4_1)
if (minorversion)
return ERR_PTR(-ENOTSUPP);
#endif
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 6bed1394d748..96a4923080ae 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -60,7 +60,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp,
if (nfs_have_writebacks(inode))
res->change_attr++;
res->ctime = inode_get_ctime(inode);
- res->mtime = inode->i_mtime;
+ res->mtime = inode_get_mtime(inode);
res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) &
args->bitmap[0];
res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) &
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 47d892a1d363..f6c74f424691 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -93,12 +93,10 @@ nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
dreq->max_count = dreq_len;
if (dreq->count > dreq_len)
dreq->count = dreq_len;
-
- if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
- dreq->error = hdr->error;
- else /* Clear outstanding error if this is EOF */
- dreq->error = 0;
}
+
+ if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error)
+ dreq->error = hdr->error;
}
static void
@@ -120,6 +118,18 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq,
dreq->count = dreq_len;
}
+static void nfs_direct_truncate_request(struct nfs_direct_req *dreq,
+ struct nfs_page *req)
+{
+ loff_t offs = req_offset(req);
+ size_t req_start = (size_t)(offs - dreq->io_start);
+
+ if (req_start < dreq->max_count)
+ dreq->max_count = req_start;
+ if (req_start < dreq->count)
+ dreq->count = req_start;
+}
+
/**
* nfs_swap_rw - NFS address space operation for swap I/O
* @iocb: target I/O control block
@@ -488,7 +498,9 @@ static void nfs_direct_add_page_head(struct list_head *list,
kref_get(&head->wb_kref);
}
-static void nfs_direct_join_group(struct list_head *list, struct inode *inode)
+static void nfs_direct_join_group(struct list_head *list,
+ struct nfs_commit_info *cinfo,
+ struct inode *inode)
{
struct nfs_page *req, *subreq;
@@ -510,7 +522,7 @@ static void nfs_direct_join_group(struct list_head *list, struct inode *inode)
nfs_release_request(subreq);
}
} while ((subreq = subreq->wb_this_page) != req);
- nfs_join_page_group(req, inode);
+ nfs_join_page_group(req, cinfo, inode);
}
}
@@ -528,20 +540,15 @@ nfs_direct_write_scan_commit_list(struct inode *inode,
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
struct nfs_pageio_descriptor desc;
- struct nfs_page *req, *tmp;
+ struct nfs_page *req;
LIST_HEAD(reqs);
struct nfs_commit_info cinfo;
- LIST_HEAD(failed);
nfs_init_cinfo_from_dreq(&cinfo, dreq);
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
- nfs_direct_join_group(&reqs, dreq->inode);
+ nfs_direct_join_group(&reqs, &cinfo, dreq->inode);
- dreq->count = 0;
- dreq->max_count = 0;
- list_for_each_entry(req, &reqs, wb_list)
- dreq->max_count += req->wb_bytes;
nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
get_dreq(dreq);
@@ -549,27 +556,40 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
- list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
+ while (!list_empty(&reqs)) {
+ req = nfs_list_entry(reqs.next);
/* Bump the transmission count */
req->wb_nio++;
if (!nfs_pageio_add_request(&desc, req)) {
- nfs_list_move_request(req, &failed);
- spin_lock(&cinfo.inode->i_lock);
- dreq->flags = 0;
- if (desc.pg_error < 0)
+ spin_lock(&dreq->lock);
+ if (dreq->error < 0) {
+ desc.pg_error = dreq->error;
+ } else if (desc.pg_error != -EAGAIN) {
+ dreq->flags = 0;
+ if (!desc.pg_error)
+ desc.pg_error = -EIO;
dreq->error = desc.pg_error;
- else
- dreq->error = -EIO;
- spin_unlock(&cinfo.inode->i_lock);
+ } else
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ break;
}
nfs_release_request(req);
}
nfs_pageio_complete(&desc);
- while (!list_empty(&failed)) {
- req = nfs_list_entry(failed.next);
+ while (!list_empty(&reqs)) {
+ req = nfs_list_entry(reqs.next);
nfs_list_remove_request(req);
nfs_unlock_and_release_request(req);
+ if (desc.pg_error == -EAGAIN) {
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ } else {
+ spin_lock(&dreq->lock);
+ nfs_direct_truncate_request(dreq, req);
+ spin_unlock(&dreq->lock);
+ nfs_release_request(req);
+ }
}
if (put_dreq(dreq))
@@ -589,8 +609,6 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
if (status < 0) {
/* Errors in commit are fatal */
dreq->error = status;
- dreq->max_count = 0;
- dreq->count = 0;
dreq->flags = NFS_ODIRECT_DONE;
} else {
status = dreq->error;
@@ -601,7 +619,12 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- if (status >= 0 && !nfs_write_match_verf(verf, req)) {
+ if (status < 0) {
+ spin_lock(&dreq->lock);
+ nfs_direct_truncate_request(dreq, req);
+ spin_unlock(&dreq->lock);
+ nfs_release_request(req);
+ } else if (!nfs_write_match_verf(verf, req)) {
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
/*
* Despite the reboot, the write was successful,
@@ -609,7 +632,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
*/
req->wb_nio = 0;
nfs_mark_request_commit(req, NULL, &cinfo, 0);
- } else /* Error or match */
+ } else
nfs_release_request(req);
nfs_unlock_and_release_request(req);
}
@@ -662,6 +685,7 @@ static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
while (!list_empty(&reqs)) {
req = nfs_list_entry(reqs.next);
nfs_list_remove_request(req);
+ nfs_direct_truncate_request(dreq, req);
nfs_release_request(req);
nfs_unlock_and_release_request(req);
}
@@ -711,7 +735,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
}
nfs_direct_count_bytes(dreq, hdr);
- if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) {
+ if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) &&
+ !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
if (!dreq->flags)
dreq->flags = NFS_ODIRECT_DO_COMMIT;
flags = dreq->flags;
@@ -755,18 +780,23 @@ static void nfs_write_sync_pgio_error(struct list_head *head, int error)
static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
{
struct nfs_direct_req *dreq = hdr->dreq;
+ struct nfs_page *req;
+ struct nfs_commit_info cinfo;
trace_nfs_direct_write_reschedule_io(dreq);
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
spin_lock(&dreq->lock);
- if (dreq->error == 0) {
+ if (dreq->error == 0)
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
- /* fake unstable write to let common nfs resend pages */
- hdr->verf.committed = NFS_UNSTABLE;
- hdr->good_bytes = hdr->args.offset + hdr->args.count -
- hdr->io_start;
- }
+ set_bit(NFS_IOHDR_REDO, &hdr->flags);
spin_unlock(&dreq->lock);
+ while (!list_empty(&hdr->pages)) {
+ req = nfs_list_entry(hdr->pages.next);
+ nfs_list_remove_request(req);
+ nfs_unlock_request(req);
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ }
}
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
@@ -794,9 +824,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
{
struct nfs_pageio_descriptor desc;
struct inode *inode = dreq->inode;
+ struct nfs_commit_info cinfo;
ssize_t result = 0;
size_t requested_bytes = 0;
size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
+ bool defer = false;
trace_nfs_direct_write_schedule_iovec(dreq);
@@ -837,17 +869,37 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
break;
}
- nfs_lock_request(req);
- if (!nfs_pageio_add_request(&desc, req)) {
- result = desc.pg_error;
- nfs_unlock_and_release_request(req);
- break;
- }
pgbase = 0;
bytes -= req_len;
requested_bytes += req_len;
pos += req_len;
dreq->bytes_left -= req_len;
+
+ if (defer) {
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ continue;
+ }
+
+ nfs_lock_request(req);
+ if (nfs_pageio_add_request(&desc, req))
+ continue;
+
+ /* Exit on hard errors */
+ if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) {
+ result = desc.pg_error;
+ nfs_unlock_and_release_request(req);
+ break;
+ }
+
+ /* If the error is soft, defer remaining requests */
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+ spin_lock(&dreq->lock);
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ nfs_unlock_request(req);
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ desc.pg_error = 0;
+ defer = true;
}
nfs_direct_release_pages(pagevec, npages);
kvfree(pagevec);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 7deb3cd76abe..ef817a0475ff 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1235,6 +1235,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
case -EPFNOSUPPORT:
case -EPROTONOSUPPORT:
case -EOPNOTSUPP:
+ case -EINVAL:
case -ECONNREFUSED:
case -ECONNRESET:
case -EHOSTDOWN:
@@ -2519,9 +2520,9 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
return i;
}
-static int
-ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
+static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
{
+ struct pnfs_layout_hdr *lo;
struct nfs4_flexfile_layout *ff_layout;
const int dev_count = PNFS_LAYOUTSTATS_MAXDEV;
@@ -2532,11 +2533,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
return -ENOMEM;
spin_lock(&args->inode->i_lock);
- ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
- args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
- &args->devinfo[0],
- dev_count,
- NFS4_FF_OP_LAYOUTSTATS);
+ lo = NFS_I(args->inode)->layout;
+ if (lo && pnfs_layout_is_valid(lo)) {
+ ff_layout = FF_LAYOUT_FROM_HDR(lo);
+ args->num_dev = ff_layout_mirror_prepare_stats(
+ &ff_layout->generic_hdr, &args->devinfo[0], dev_count,
+ NFS4_FF_OP_LAYOUTSTATS);
+ } else
+ args->num_dev = 0;
spin_unlock(&args->inode->i_lock);
if (!args->num_dev) {
kfree(args->devinfo);
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 2dc64454492b..5407ab8c8783 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -114,8 +114,8 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *
struct inode *inode)
{
memset(auxdata, 0, sizeof(*auxdata));
- auxdata->mtime_sec = inode->i_mtime.tv_sec;
- auxdata->mtime_nsec = inode->i_mtime.tv_nsec;
+ auxdata->mtime_sec = inode_get_mtime(inode).tv_sec;
+ auxdata->mtime_nsec = inode_get_mtime(inode).tv_nsec;
auxdata->ctime_sec = inode_get_ctime(inode).tv_sec;
auxdata->ctime_nsec = inode_get_ctime(inode).tv_nsec;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e21c073158e5..ebb8d60e1152 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -512,8 +512,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
} else
init_special_inode(inode, inode->i_mode, fattr->rdev);
- memset(&inode->i_atime, 0, sizeof(inode->i_atime));
- memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
+ inode_set_atime(inode, 0, 0);
+ inode_set_mtime(inode, 0, 0);
inode_set_ctime(inode, 0, 0);
inode_set_iversion_raw(inode, 0);
inode->i_size = 0;
@@ -527,11 +527,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
nfsi->read_cache_jiffies = fattr->time_start;
nfsi->attr_gencount = fattr->gencount;
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
- inode->i_atime = fattr->atime;
+ inode_set_atime_to_ts(inode, fattr->atime);
else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
- inode->i_mtime = fattr->mtime;
+ inode_set_mtime_to_ts(inode, fattr->mtime);
else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
@@ -742,9 +742,9 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME
| NFS_INO_INVALID_CTIME);
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
- inode->i_atime = fattr->atime;
+ inode_set_atime_to_ts(inode, fattr->atime);
else if (attr->ia_valid & ATTR_ATIME_SET)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
@@ -758,9 +758,9 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME
| NFS_INO_INVALID_CTIME);
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
- inode->i_mtime = fattr->mtime;
+ inode_set_mtime_to_ts(inode, fattr->mtime);
else if (attr->ia_valid & ATTR_MTIME_SET)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
@@ -1451,11 +1451,11 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode_set_ctime_to_ts(inode, fattr->ctime);
}
- ts = inode->i_mtime;
+ ts = inode_get_mtime(inode);
if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
&& (fattr->valid & NFS_ATTR_FATTR_MTIME)
&& timespec64_equal(&ts, &fattr->pre_mtime)) {
- inode->i_mtime = fattr->mtime;
+ inode_set_mtime_to_ts(inode, fattr->mtime);
}
if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
@@ -1506,7 +1506,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr))
invalid |= NFS_INO_INVALID_CHANGE;
- ts = inode->i_mtime;
+ ts = inode_get_mtime(inode);
if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime))
invalid |= NFS_INO_INVALID_MTIME;
@@ -1534,7 +1534,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
invalid |= NFS_INO_INVALID_NLINK;
- ts = inode->i_atime;
+ ts = inode_get_atime(inode);
if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime))
invalid |= NFS_INO_INVALID_ATIME;
@@ -2002,7 +2002,7 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa
}
if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
(fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
- fattr->pre_mtime = inode->i_mtime;
+ fattr->pre_mtime = inode_get_mtime(inode);
fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
}
if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
@@ -2184,7 +2184,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
- inode->i_mtime = fattr->mtime;
+ inode_set_mtime_to_ts(inode, fattr->mtime);
else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_MTIME;
@@ -2220,7 +2220,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
save_cache_validity & NFS_INO_INVALID_SIZE;
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
- inode->i_atime = fattr->atime;
+ inode_set_atime_to_ts(inode, fattr->atime);
else if (fattr_supported & NFS_ATTR_FATTR_ATIME)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_ATIME;
diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h
index 5ba00610aede..0d3ce0460e35 100644
--- a/fs/nfs/nfs.h
+++ b/fs/nfs/nfs.h
@@ -18,7 +18,7 @@ struct nfs_subversion {
const struct rpc_version *rpc_vers; /* NFS version information */
const struct nfs_rpc_ops *rpc_ops; /* NFS operations */
const struct super_operations *sops; /* NFS Super operations */
- const struct xattr_handler **xattr; /* NFS xattr handlers */
+ const struct xattr_handler * const *xattr; /* NFS xattr handlers */
struct list_head list; /* List of NFS versions */
};
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 063e00aff87e..28704f924612 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -81,7 +81,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
if (status == 0) {
if (nfs_should_remove_suid(inode)) {
spin_lock(&inode->i_lock);
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
+ nfs_set_cache_invalid(inode,
+ NFS_INO_REVAL_FORCED | NFS_INO_INVALID_MODE);
spin_unlock(&inode->i_lock);
}
status = nfs_post_op_update_inode_force_wcc(inode,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 47c5c1f86d66..827d00e2f094 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -315,7 +315,7 @@ extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *,
struct nfs_fh *,
struct nfs_fattr *);
extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
-extern const struct xattr_handler *nfs4_xattr_handlers[];
+extern const struct xattr_handler * const nfs4_xattr_handlers[];
extern int nfs4_set_rw_stateid(nfs4_stateid *stateid,
const struct nfs_open_context *ctx,
const struct nfs_lock_context *l_ctx,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 27fb25567ce7..11e3a285594c 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -417,6 +417,8 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old)
.net = old->cl_net,
.servername = old->cl_hostname,
};
+ int max_connect = test_bit(NFS_CS_PNFS, &clp->cl_flags) ?
+ clp->cl_max_connect : old->cl_max_connect;
if (clp->cl_proto != old->cl_proto)
return;
@@ -430,7 +432,7 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old)
xprt_args.addrlen = clp_salen;
rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args,
- rpc_clnt_test_and_add_xprt, NULL);
+ rpc_clnt_test_and_add_xprt, &max_connect);
}
/**
@@ -1010,6 +1012,8 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
__set_bit(NFS_CS_DS, &cl_init.init_flags);
+ __set_bit(NFS_CS_PNFS, &cl_init.init_flags);
+ cl_init.max_connect = NFS_MAX_TRANSPORTS;
/*
* Set an authflavor equual to the MDS value. Use the MDS nfs_client
* cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 794343790ea8..a654d7234f51 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2703,8 +2703,12 @@ static int _nfs4_proc_open(struct nfs4_opendata *data,
return status;
}
if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) {
+ struct nfs_fh *fh = &o_res->fh;
+
nfs4_sequence_free_slot(&o_res->seq_res);
- nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, NULL);
+ if (o_arg->claim == NFS4_OPEN_CLAIM_FH)
+ fh = NFS_FH(d_inode(data->dentry));
+ nfs4_proc_getattr(server, fh, o_res->f_attr, NULL);
}
return 0;
}
@@ -8866,8 +8870,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cre
/* Save the EXCHANGE_ID verifier session trunk tests */
memcpy(clp->cl_confirm.data, argp->verifier.data,
sizeof(clp->cl_confirm.data));
- if (resp->flags & EXCHGID4_FLAG_USE_PNFS_DS)
- set_bit(NFS_CS_DS, &clp->cl_flags);
out:
trace_nfs4_exchange_id(clp, status);
rpc_put_task(task);
@@ -10618,7 +10620,9 @@ static void nfs4_disable_swap(struct inode *inode)
*/
struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
- nfs4_schedule_state_manager(clp);
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+ wake_up_var(&clp->cl_state);
}
static const struct inode_operations nfs4_dir_inode_operations = {
@@ -10733,7 +10737,7 @@ static const struct xattr_handler nfs4_xattr_nfs4_user_handler = {
};
#endif
-const struct xattr_handler *nfs4_xattr_handlers[] = {
+const struct xattr_handler * const nfs4_xattr_handlers[] = {
&nfs4_xattr_nfs4_acl_handler,
#if defined(CONFIG_NFS_V4_1)
&nfs4_xattr_nfs4_dacl_handler,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e079987af4a3..9a5d911a7edc 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1209,16 +1209,26 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
{
struct task_struct *task;
char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
+ struct rpc_clnt *clnt = clp->cl_rpcclient;
+ bool swapon = false;
- if (clp->cl_rpcclient->cl_shutdown)
+ if (clnt->cl_shutdown)
return;
set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
- if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) {
- wake_up_var(&clp->cl_state);
- return;
+
+ if (atomic_read(&clnt->cl_swapper)) {
+ swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE,
+ &clp->cl_state);
+ if (!swapon) {
+ wake_up_var(&clp->cl_state);
+ return;
+ }
}
- set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
+
+ if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+ return;
+
__module_get(THIS_MODULE);
refcount_inc(&clp->cl_count);
@@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
__func__, PTR_ERR(task));
if (!nfs_client_init_is_complete(clp))
nfs_mark_client_ready(clp, PTR_ERR(task));
+ if (swapon)
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
nfs4_clear_state_manager_bit(clp);
- clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
nfs_put_client(clp);
module_put(THIS_MODULE);
}
@@ -2703,6 +2714,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
+ if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING,
+ &clp->cl_state)) {
+ memflags = memalloc_nofs_save();
+ continue;
+ }
+
if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) {
if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
nfs_client_return_marked_delegations(clp);
@@ -2741,22 +2759,25 @@ static int nfs4_run_state_manager(void *ptr)
allow_signal(SIGKILL);
again:
- set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
nfs4_state_manager(clp);
- if (atomic_read(&cl->cl_swapper)) {
+
+ if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) &&
+ !test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) {
wait_var_event_interruptible(&clp->cl_state,
test_bit(NFS4CLNT_RUN_MANAGER,
&clp->cl_state));
- if (atomic_read(&cl->cl_swapper) &&
- test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state))
+ if (!atomic_read(&cl->cl_swapper))
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+ if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state))
goto again;
/* Either no longer a swapper, or were signalled */
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
}
- clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
- !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state))
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state))
goto again;
nfs_put_client(clp);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 306cba0b9e69..84343aefbbd6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2634,31 +2634,44 @@ pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo,
return mode == 0;
}
-static int
-pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data)
+static int pnfs_layout_return_unused_byserver(struct nfs_server *server,
+ void *data)
{
const struct pnfs_layout_range *range = data;
+ const struct cred *cred;
struct pnfs_layout_hdr *lo;
struct inode *inode;
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode;
+
restart:
rcu_read_lock();
list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
- if (!pnfs_layout_can_be_returned(lo) ||
+ inode = lo->plh_inode;
+ if (!inode || !pnfs_layout_can_be_returned(lo) ||
test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
continue;
- inode = lo->plh_inode;
spin_lock(&inode->i_lock);
- if (!pnfs_should_return_unused_layout(lo, range)) {
+ if (!lo->plh_inode ||
+ !pnfs_should_return_unused_layout(lo, range)) {
spin_unlock(&inode->i_lock);
continue;
}
+ pnfs_get_layout_hdr(lo);
+ pnfs_set_plh_return_info(lo, range->iomode, 0);
+ if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
+ range, 0) != 0 ||
+ !pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode)) {
+ spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
+ pnfs_put_layout_hdr(lo);
+ cond_resched();
+ goto restart;
+ }
spin_unlock(&inode->i_lock);
- inode = pnfs_grab_inode_layout_hdr(lo);
- if (!inode)
- continue;
rcu_read_unlock();
- pnfs_mark_layout_for_return(inode, range);
- iput(inode);
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
+ pnfs_put_layout_hdr(lo);
cond_resched();
goto restart;
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0d6473cb00cb..9b1cfca8112a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1071,7 +1071,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
sb->s_export_op = &nfs_export_ops;
break;
case 4:
- sb->s_flags |= SB_POSIXACL;
+ sb->s_iflags |= SB_I_NOUMASK;
sb->s_time_gran = 1;
sb->s_time_min = S64_MIN;
sb->s_time_max = S64_MAX;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f4cca8f00c0c..9d82d50ce0b1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -59,7 +59,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
static const struct nfs_rw_ops nfs_rw_write_ops;
static void nfs_inode_remove_request(struct nfs_page *req);
-static void nfs_clear_request_commit(struct nfs_page *req);
+static void nfs_clear_request_commit(struct nfs_commit_info *cinfo,
+ struct nfs_page *req);
static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
struct inode *inode);
static struct nfs_page *
@@ -502,8 +503,8 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
* the (former) group. All subrequests are removed from any write or commit
* lists, unlinked from the group and destroyed.
*/
-void
-nfs_join_page_group(struct nfs_page *head, struct inode *inode)
+void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo,
+ struct inode *inode)
{
struct nfs_page *subreq;
struct nfs_page *destroy_list = NULL;
@@ -533,7 +534,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode)
* Commit list removal accounting is done after locks are dropped */
subreq = head;
do {
- nfs_clear_request_commit(subreq);
+ nfs_clear_request_commit(cinfo, subreq);
subreq = subreq->wb_this_page;
} while (subreq != head);
@@ -566,8 +567,10 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
{
struct inode *inode = folio_file_mapping(folio)->host;
struct nfs_page *head;
+ struct nfs_commit_info cinfo;
int ret;
+ nfs_init_cinfo_from_inode(&cinfo, inode);
/*
* A reference is taken only on the head request which acts as a
* reference to the whole page group - the group will not be destroyed
@@ -584,7 +587,7 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
return ERR_PTR(ret);
}
- nfs_join_page_group(head, inode);
+ nfs_join_page_group(head, &cinfo, inode);
return head;
}
@@ -785,6 +788,8 @@ static void nfs_inode_add_request(struct nfs_page *req)
*/
static void nfs_inode_remove_request(struct nfs_page *req)
{
+ struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req));
+
if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
struct folio *folio = nfs_page_to_folio(req->wb_head);
struct address_space *mapping = folio_file_mapping(folio);
@@ -799,8 +804,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
}
if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
+ atomic_long_dec(&nfsi->nrequests);
nfs_release_request(req);
- atomic_long_dec(&NFS_I(nfs_page_to_inode(req))->nrequests);
}
}
@@ -955,18 +960,16 @@ static void nfs_folio_clear_commit(struct folio *folio)
}
/* Called holding the request lock on @req */
-static void
-nfs_clear_request_commit(struct nfs_page *req)
+static void nfs_clear_request_commit(struct nfs_commit_info *cinfo,
+ struct nfs_page *req)
{
if (test_bit(PG_CLEAN, &req->wb_flags)) {
struct nfs_open_context *ctx = nfs_req_openctx(req);
struct inode *inode = d_inode(ctx->dentry);
- struct nfs_commit_info cinfo;
- nfs_init_cinfo_from_inode(&cinfo, inode);
mutex_lock(&NFS_I(inode)->commit_mutex);
- if (!pnfs_clear_request_commit(req, &cinfo)) {
- nfs_request_remove_commit_list(req, &cinfo);
+ if (!pnfs_clear_request_commit(req, cinfo)) {
+ nfs_request_remove_commit_list(req, cinfo);
}
mutex_unlock(&NFS_I(inode)->commit_mutex);
nfs_folio_clear_commit(nfs_page_to_folio(req));
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 6fffc8f03f74..b8736a82e57c 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -12,7 +12,8 @@ nfsd-y += trace.o
nfsd-y += nfssvc.o nfsctl.o nfsfh.o vfs.o \
export.o auth.o lockd.o nfscache.o \
- stats.o filecache.o nfs3proc.o nfs3xdr.o
+ stats.o filecache.o nfs3proc.o nfs3xdr.o \
+ netlink.o
nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 01d7fd108cf3..46fd74d91ea9 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -117,12 +117,13 @@ static __be32
nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
struct iomap *iomaps, int nr_iomaps)
{
+ struct timespec64 mtime = inode_get_mtime(inode);
loff_t new_size = lcp->lc_last_wr + 1;
struct iattr iattr = { .ia_valid = 0 };
int error;
if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
- timespec64_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
+ timespec64_compare(&lcp->lc_mtime, &mtime) < 0)
lcp->lc_mtime = current_time(inode);
iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 1ed2f691ebb9..ce78f74715ee 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -16,9 +16,9 @@
__be32
nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
- struct nfsd4_layoutget *lgp)
+ const struct nfsd4_layoutget *lgp)
{
- struct pnfs_block_extent *b = lgp->lg_content;
+ const struct pnfs_block_extent *b = lgp->lg_content;
int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
__be32 *p;
@@ -77,7 +77,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
__be32
nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdp)
+ const struct nfsd4_getdeviceinfo *gdp)
{
struct pnfs_block_deviceaddr *dev = gdp->gd_device;
int len = sizeof(__be32), ret, i;
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index bc5166bfe46b..b0361e8aa9a7 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -51,9 +51,9 @@ struct pnfs_block_deviceaddr {
};
__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdp);
+ const struct nfsd4_getdeviceinfo *gdp);
__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
- struct nfsd4_layoutget *lgp);
+ const struct nfsd4_layoutget *lgp);
int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
u32 block_size);
int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 11a0eaa2f914..b7da17e53007 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -339,12 +339,16 @@ static int export_stats_init(struct export_stats *stats)
static void export_stats_reset(struct export_stats *stats)
{
- nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM);
+ if (stats)
+ nfsd_percpu_counters_reset(stats->counter,
+ EXP_STATS_COUNTERS_NUM);
}
static void export_stats_destroy(struct export_stats *stats)
{
- nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM);
+ if (stats)
+ nfsd_percpu_counters_destroy(stats->counter,
+ EXP_STATS_COUNTERS_NUM);
}
static void svc_export_put(struct kref *ref)
@@ -353,7 +357,8 @@ static void svc_export_put(struct kref *ref)
path_put(&exp->ex_path);
auth_domain_put(exp->ex_client);
nfsd4_fslocs_free(&exp->ex_fslocs);
- export_stats_destroy(&exp->ex_stats);
+ export_stats_destroy(exp->ex_stats);
+ kfree(exp->ex_stats);
kfree(exp->ex_uuid);
kfree_rcu(exp, ex_rcu);
}
@@ -767,13 +772,15 @@ static int svc_export_show(struct seq_file *m,
seq_putc(m, '\t');
seq_escape(m, exp->ex_client->name, " \t\n\\");
if (export_stats) {
- seq_printf(m, "\t%lld\n", exp->ex_stats.start_time);
+ struct percpu_counter *counter = exp->ex_stats->counter;
+
+ seq_printf(m, "\t%lld\n", exp->ex_stats->start_time);
seq_printf(m, "\tfh_stale: %lld\n",
- percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_FH_STALE]));
+ percpu_counter_sum_positive(&counter[EXP_STATS_FH_STALE]));
seq_printf(m, "\tio_read: %lld\n",
- percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_READ]));
+ percpu_counter_sum_positive(&counter[EXP_STATS_IO_READ]));
seq_printf(m, "\tio_write: %lld\n",
- percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_WRITE]));
+ percpu_counter_sum_positive(&counter[EXP_STATS_IO_WRITE]));
seq_putc(m, '\n');
return 0;
}
@@ -819,7 +826,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
new->ex_layout_types = 0;
new->ex_uuid = NULL;
new->cd = item->cd;
- export_stats_reset(&new->ex_stats);
+ export_stats_reset(new->ex_stats);
}
static void export_update(struct cache_head *cnew, struct cache_head *citem)
@@ -856,7 +863,14 @@ static struct cache_head *svc_export_alloc(void)
if (!i)
return NULL;
- if (export_stats_init(&i->ex_stats)) {
+ i->ex_stats = kmalloc(sizeof(*(i->ex_stats)), GFP_KERNEL);
+ if (!i->ex_stats) {
+ kfree(i);
+ return NULL;
+ }
+
+ if (export_stats_init(i->ex_stats)) {
+ kfree(i->ex_stats);
kfree(i);
return NULL;
}
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 2df8ae25aad3..ca9dc230ae3d 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -64,10 +64,10 @@ struct svc_export {
struct cache_head h;
struct auth_domain * ex_client;
int ex_flags;
+ int ex_fsid;
struct path ex_path;
kuid_t ex_anon_uid;
kgid_t ex_anon_gid;
- int ex_fsid;
unsigned char * ex_uuid; /* 16 byte fsid */
struct nfsd4_fs_locations ex_fslocs;
uint32_t ex_nflavors;
@@ -76,8 +76,8 @@ struct svc_export {
struct nfsd4_deviceid_map *ex_devid_map;
struct cache_detail *cd;
struct rcu_head ex_rcu;
- struct export_stats ex_stats;
unsigned long ex_xprtsec_modes;
+ struct export_stats *ex_stats;
};
/* an "export key" (expkey) maps a filehandlefragement to an
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ee9c923192e0..07bf219f9ae4 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -989,22 +989,21 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
struct net *net = SVC_NET(rqstp);
struct nfsd_file *new, *nf;
- const struct cred *cred;
+ bool stale_retry = true;
bool open_retry = true;
struct inode *inode;
__be32 status;
int ret;
+retry:
status = fh_verify(rqstp, fhp, S_IFREG,
may_flags|NFSD_MAY_OWNER_OVERRIDE);
if (status != nfs_ok)
return status;
inode = d_inode(fhp->fh_dentry);
- cred = get_current_cred();
-retry:
rcu_read_lock();
- nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc);
+ nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc);
rcu_read_unlock();
if (nf) {
@@ -1026,7 +1025,7 @@ retry:
rcu_read_lock();
spin_lock(&inode->i_lock);
- nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc);
+ nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc);
if (unlikely(nf)) {
spin_unlock(&inode->i_lock);
rcu_read_unlock();
@@ -1058,6 +1057,7 @@ wait_for_construction:
goto construction_err;
}
open_retry = false;
+ fh_put(fhp);
goto retry;
}
this_cpu_inc(nfsd_file_cache_hits);
@@ -1074,7 +1074,6 @@ out:
nfsd_file_check_write_error(nf);
*pnf = nf;
}
- put_cred(cred);
trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status);
return status;
@@ -1088,8 +1087,20 @@ open_file:
status = nfs_ok;
trace_nfsd_file_opened(nf, status);
} else {
- status = nfsd_open_verified(rqstp, fhp, may_flags,
- &nf->nf_file);
+ ret = nfsd_open_verified(rqstp, fhp, may_flags,
+ &nf->nf_file);
+ if (ret == -EOPENSTALE && stale_retry) {
+ stale_retry = false;
+ nfsd_file_unhash(nf);
+ clear_and_wake_up_bit(NFSD_FILE_PENDING,
+ &nf->nf_flags);
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
+ nf = NULL;
+ fh_put(fhp);
+ goto retry;
+ }
+ status = nfserrno(ret);
trace_nfsd_file_open(nf, status);
}
} else
diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c
index bb205328e043..aeb71c10ff1b 100644
--- a/fs/nfsd/flexfilelayoutxdr.c
+++ b/fs/nfsd/flexfilelayoutxdr.c
@@ -17,9 +17,9 @@ struct ff_idmap {
__be32
nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
- struct nfsd4_layoutget *lgp)
+ const struct nfsd4_layoutget *lgp)
{
- struct pnfs_ff_layout *fl = lgp->lg_content;
+ const struct pnfs_ff_layout *fl = lgp->lg_content;
int len, mirror_len, ds_len, fh_len;
__be32 *p;
@@ -77,7 +77,7 @@ nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
__be32
nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdp)
+ const struct nfsd4_getdeviceinfo *gdp)
{
struct pnfs_ff_device_addr *da = gdp->gd_device;
int len;
diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h
index 8e195aeca023..6d5a1066a903 100644
--- a/fs/nfsd/flexfilelayoutxdr.h
+++ b/fs/nfsd/flexfilelayoutxdr.h
@@ -43,8 +43,8 @@ struct pnfs_ff_layout {
};
__be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdp);
+ const struct nfsd4_getdeviceinfo *gdp);
__be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
- struct nfsd4_layoutget *lgp);
+ const struct nfsd4_layoutget *lgp);
#endif /* _NFSD_FLEXFILELAYOUTXDR_H */
diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c
new file mode 100644
index 000000000000..0e1d635ec5f9
--- /dev/null
+++ b/fs/nfsd/netlink.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/nfsd.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "netlink.h"
+
+#include <uapi/linux/nfsd_netlink.h>
+
+/* Ops table for nfsd */
+static const struct genl_split_ops nfsd_nl_ops[] = {
+ {
+ .cmd = NFSD_CMD_RPC_STATUS_GET,
+ .start = nfsd_nl_rpc_status_get_start,
+ .dumpit = nfsd_nl_rpc_status_get_dumpit,
+ .done = nfsd_nl_rpc_status_get_done,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+};
+
+struct genl_family nfsd_nl_family __ro_after_init = {
+ .name = NFSD_FAMILY_NAME,
+ .version = NFSD_FAMILY_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = nfsd_nl_ops,
+ .n_split_ops = ARRAY_SIZE(nfsd_nl_ops),
+};
diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h
new file mode 100644
index 000000000000..d83dd6bdee92
--- /dev/null
+++ b/fs/nfsd/netlink.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/nfsd.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_NFSD_GEN_H
+#define _LINUX_NFSD_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/nfsd_netlink.h>
+
+int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb);
+int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb);
+
+int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+
+extern struct genl_family nfsd_nl_family;
+
+#endif /* _LINUX_NFSD_GEN_H */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 268ef57751c4..b78eceebd945 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -171,7 +171,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
* + 1 (xdr opaque byte count) = 26
*/
resp->count = argp->count;
- svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
+ svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3) << 2) +
+ resp->count + 4);
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
@@ -194,7 +195,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
SVCFH_fmt(&argp->fh),
argp->len,
(unsigned long long) argp->offset,
- argp->stable? " stable" : "");
+ argp->stable ? " stable" : "");
resp->status = nfserr_fbig;
if (argp->offset > (u64)OFFSET_MAX ||
@@ -294,8 +295,8 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
status = nfserr_exist;
break;
case NFS3_CREATE_EXCLUSIVE:
- if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
- d_inode(child)->i_atime.tv_sec == v_atime &&
+ if (inode_get_mtime_sec(d_inode(child)) == v_mtime &&
+ inode_get_atime_sec(d_inode(child)) == v_atime &&
d_inode(child)->i_size == 0) {
break;
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4039ffcf90ba..92bc109dabe6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -84,7 +84,21 @@ static void encode_uint32(struct xdr_stream *xdr, u32 n)
static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap,
size_t len)
{
- WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0);
+ xdr_stream_encode_uint32_array(xdr, bitmap, len);
+}
+
+static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs4_cb_fattr *fattr)
+{
+ fattr->ncf_cb_change = 0;
+ fattr->ncf_cb_fsize = 0;
+ if (bitmap[0] & FATTR4_WORD0_CHANGE)
+ if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_change) < 0)
+ return -NFSERR_BAD_XDR;
+ if (bitmap[0] & FATTR4_WORD0_SIZE)
+ if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_fsize) < 0)
+ return -NFSERR_BAD_XDR;
+ return 0;
}
/*
@@ -358,6 +372,30 @@ encode_cb_recallany4args(struct xdr_stream *xdr,
}
/*
+ * CB_GETATTR4args
+ * struct CB_GETATTR4args {
+ * nfs_fh4 fh;
+ * bitmap4 attr_request;
+ * };
+ *
+ * The size and change attributes are the only one
+ * guaranteed to be serviced by the client.
+ */
+static void
+encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
+ struct nfs4_cb_fattr *fattr)
+{
+ struct nfs4_delegation *dp =
+ container_of(fattr, struct nfs4_delegation, dl_cb_fattr);
+ struct knfsd_fh *fh = &dp->dl_stid.sc_file->fi_fhandle;
+
+ encode_nfs_cb_opnum4(xdr, OP_CB_GETATTR);
+ encode_nfs_fh4(xdr, fh);
+ encode_bitmap4(xdr, fattr->ncf_cb_bmap, ARRAY_SIZE(fattr->ncf_cb_bmap));
+ hdr->nops++;
+}
+
+/*
* CB_SEQUENCE4args
*
* struct CB_SEQUENCE4args {
@@ -493,6 +531,26 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
}
/*
+ * 20.1. Operation 3: CB_GETATTR - Get Attributes
+ */
+static void nfs4_xdr_enc_cb_getattr(struct rpc_rqst *req,
+ struct xdr_stream *xdr, const void *data)
+{
+ const struct nfsd4_callback *cb = data;
+ struct nfs4_cb_fattr *ncf =
+ container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+ struct nfs4_cb_compound_hdr hdr = {
+ .ident = cb->cb_clp->cl_cb_ident,
+ .minorversion = cb->cb_clp->cl_minorversion,
+ };
+
+ encode_cb_compound4args(xdr, &hdr);
+ encode_cb_sequence4args(xdr, cb, &hdr);
+ encode_cb_getattr4args(xdr, &hdr, ncf);
+ encode_cb_nops(&hdr);
+}
+
+/*
* 20.2. Operation 4: CB_RECALL - Recall a Delegation
*/
static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -548,6 +606,42 @@ static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
}
/*
+ * 20.1. Operation 3: CB_GETATTR - Get Attributes
+ */
+static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfsd4_callback *cb = data;
+ struct nfs4_cb_compound_hdr hdr;
+ int status;
+ u32 bitmap[3] = {0};
+ u32 attrlen;
+ struct nfs4_cb_fattr *ncf =
+ container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+
+ status = decode_cb_compound4res(xdr, &hdr);
+ if (unlikely(status))
+ return status;
+
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status || cb->cb_seq_status))
+ return status;
+
+ status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status);
+ if (status)
+ return status;
+ if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0)
+ return -NFSERR_BAD_XDR;
+ if (xdr_stream_decode_u32(xdr, &attrlen) < 0)
+ return -NFSERR_BAD_XDR;
+ if (attrlen > (sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize)))
+ return -NFSERR_BAD_XDR;
+ status = decode_cb_fattr4(xdr, bitmap, ncf);
+ return status;
+}
+
+/*
* 20.2. Operation 4: CB_RECALL - Recall a Delegation
*/
static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
@@ -855,6 +949,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = {
PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock),
PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload),
PROC(CB_RECALL_ANY, COMPOUND, cb_recall_any, cb_recall_any),
+ PROC(CB_GETATTR, COMPOUND, cb_getattr, cb_getattr),
};
static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)];
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index e8a80052cb1b..5e8096bc5eaa 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -515,11 +515,11 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
if (!list_empty(&ls->ls_layouts)) {
if (found)
nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid);
- lrp->lrs_present = 1;
+ lrp->lrs_present = true;
} else {
trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid);
nfs4_unhash_stid(&ls->ls_stid);
- lrp->lrs_present = 0;
+ lrp->lrs_present = false;
}
spin_unlock(&ls->ls_lock);
@@ -539,7 +539,7 @@ nfsd4_return_client_layouts(struct svc_rqst *rqstp,
struct nfs4_layout *lp, *t;
LIST_HEAD(reaplist);
- lrp->lrs_present = 0;
+ lrp->lrs_present = false;
spin_lock(&clp->cl_lock);
list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5ca748309c26..6f2d4aa4970d 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -322,8 +322,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
status = nfserr_exist;
break;
case NFS4_CREATE_EXCLUSIVE:
- if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
- d_inode(child)->i_atime.tv_sec == v_atime &&
+ if (inode_get_mtime_sec(d_inode(child)) == v_mtime &&
+ inode_get_atime_sec(d_inode(child)) == v_atime &&
d_inode(child)->i_size == 0) {
open->op_created = true;
break; /* subtle */
@@ -331,8 +331,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
status = nfserr_exist;
break;
case NFS4_CREATE_EXCLUSIVE4_1:
- if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
- d_inode(child)->i_atime.tv_sec == v_atime &&
+ if (inode_get_mtime_sec(d_inode(child)) == v_mtime &&
+ inode_get_atime_sec(d_inode(child)) == v_atime &&
d_inode(child)->i_size == 0) {
open->op_created = true;
goto set_attr; /* subtle */
@@ -1058,8 +1058,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
rename->rn_tname, rename->rn_tnamelen);
if (status)
return status;
- set_change_info(&rename->rn_sinfo, &cstate->current_fh);
- set_change_info(&rename->rn_tinfo, &cstate->save_fh);
+ set_change_info(&rename->rn_sinfo, &cstate->save_fh);
+ set_change_info(&rename->rn_tinfo, &cstate->current_fh);
return nfs_ok;
}
@@ -1329,7 +1329,8 @@ extern void nfs_sb_deactive(struct super_block *sb);
* setup a work entry in the ssc delayed unmount list.
*/
static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr,
- struct nfsd4_ssc_umount_item **nsui)
+ struct nfsd4_ssc_umount_item **nsui,
+ struct svc_rqst *rqstp)
{
struct nfsd4_ssc_umount_item *ni = NULL;
struct nfsd4_ssc_umount_item *work = NULL;
@@ -1351,7 +1352,7 @@ try_again:
spin_unlock(&nn->nfsd_ssc_lock);
/* allow 20secs for mount/unmount for now - revisit */
- if (kthread_should_stop() ||
+ if (svc_thread_should_stop(rqstp) ||
(schedule_timeout(20*HZ) == 0)) {
finish_wait(&nn->nfsd_ssc_waitq, &wait);
kfree(work);
@@ -1467,7 +1468,7 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
goto out_free_rawdata;
snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
- status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui);
+ status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui, rqstp);
if (status)
goto out_free_devname;
if ((*nsui)->nsui_vfsmount)
@@ -1642,6 +1643,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy,
if (bytes_total == 0)
bytes_total = ULLONG_MAX;
do {
+ /* Only async copies can be stopped here */
if (kthread_should_stop())
break;
bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos,
@@ -1760,6 +1762,7 @@ static int nfsd4_do_async_copy(void *data)
struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
__be32 nfserr;
+ trace_nfsd_copy_do_async(copy);
if (nfsd4_ssc_is_inter(copy)) {
struct file *filp;
@@ -1798,21 +1801,27 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfsd4_copy *async_copy = NULL;
+ copy->cp_clp = cstate->clp;
if (nfsd4_ssc_is_inter(copy)) {
+ trace_nfsd_copy_inter(copy);
if (!inter_copy_offload_enable || nfsd4_copy_is_sync(copy)) {
status = nfserr_notsupp;
goto out;
}
status = nfsd4_setup_inter_ssc(rqstp, cstate, copy);
- if (status)
+ if (status) {
+ trace_nfsd_copy_done(copy, status);
return nfserr_offload_denied;
+ }
} else {
+ trace_nfsd_copy_intra(copy);
status = nfsd4_setup_intra_ssc(rqstp, cstate, copy);
- if (status)
+ if (status) {
+ trace_nfsd_copy_done(copy, status);
return status;
+ }
}
- copy->cp_clp = cstate->clp;
memcpy(&copy->fh, &cstate->current_fh.fh_handle,
sizeof(struct knfsd_fh));
if (nfsd4_copy_is_async(copy)) {
@@ -1847,6 +1856,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
copy->nf_dst->nf_file, true);
}
out:
+ trace_nfsd_copy_done(copy, status);
release_copy_files(copy);
return status;
out_err:
@@ -1929,8 +1939,8 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
return status;
- cn->cpn_sec = nn->nfsd4_lease;
- cn->cpn_nsec = 0;
+ cn->cpn_lease_time.tv_sec = nn->nfsd4_lease;
+ cn->cpn_lease_time.tv_nsec = 0;
status = nfserrno(-ENOMEM);
cps = nfs4_alloc_init_cpntf_state(nn, stid);
@@ -2347,10 +2357,10 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
mutex_unlock(&ls->ls_mutex);
if (new_size > i_size_read(inode)) {
- lcp->lc_size_chg = 1;
+ lcp->lc_size_chg = true;
lcp->lc_newsize = new_size;
} else {
- lcp->lc_size_chg = 0;
+ lcp->lc_size_chg = false;
}
nfserr = ops->proc_layoutcommit(inode, lcp);
@@ -3200,6 +3210,7 @@ static const struct nfsd4_operation nfsd4_ops[] = {
},
[OP_LOCK] = {
.op_func = nfsd4_lock,
+ .op_release = nfsd4_lock_release,
.op_flags = OP_MODIFIES_SOMETHING |
OP_NONTRIVIAL_ERROR_ENCODE,
.op_name = "OP_LOCK",
@@ -3208,6 +3219,7 @@ static const struct nfsd4_operation nfsd4_ops[] = {
},
[OP_LOCKT] = {
.op_func = nfsd4_lockt,
+ .op_release = nfsd4_lockt_release,
.op_flags = OP_NONTRIVIAL_ERROR_ENCODE,
.op_name = "OP_LOCKT",
.op_rsize_bop = nfsd4_lock_rsize,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8534693eb6a4..65fd5510323a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -59,7 +59,7 @@
#define NFSDDBG_FACILITY NFSDDBG_PROC
-#define all_ones {{~0,~0},~0}
+#define all_ones {{ ~0, ~0}, ~0}
static const stateid_t one_stateid = {
.si_generation = ~0,
.si_opaque = all_ones,
@@ -127,6 +127,7 @@ static void free_session(struct nfsd4_session *);
static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops;
static struct workqueue_struct *laundry_wq;
@@ -297,7 +298,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
nbl = find_blocked_lock(lo, fh, nn);
if (!nbl) {
- nbl= kmalloc(sizeof(*nbl), GFP_KERNEL);
+ nbl = kmalloc(sizeof(*nbl), GFP_KERNEL);
if (nbl) {
INIT_LIST_HEAD(&nbl->nbl_list);
INIT_LIST_HEAD(&nbl->nbl_lru);
@@ -1159,6 +1160,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
struct nfs4_clnt_odstate *odstate, u32 dl_type)
{
struct nfs4_delegation *dp;
+ struct nfs4_stid *stid;
long n;
dprintk("NFSD alloc_init_deleg\n");
@@ -1167,9 +1169,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
goto out_dec;
if (delegation_blocked(&fp->fi_fhandle))
goto out_dec;
- dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg));
- if (dp == NULL)
+ stid = nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg);
+ if (stid == NULL)
goto out_dec;
+ dp = delegstateid(stid);
/*
* delegation seqid's are never incremented. The 4.1 special
@@ -1187,6 +1190,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
dp->dl_recalled = false;
nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
&nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
+ nfsd4_init_cb(&dp->dl_cb_fattr.ncf_getattr, dp->dl_stid.sc_client,
+ &nfsd4_cb_getattr_ops, NFSPROC4_CLNT_CB_GETATTR);
+ dp->dl_cb_fattr.ncf_file_modified = false;
+ dp->dl_cb_fattr.ncf_cb_bmap[0] = FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE;
get_nfs4_file(fp);
dp->dl_stid.sc_file = fp;
return dp;
@@ -2894,11 +2901,56 @@ nfsd4_cb_recall_any_release(struct nfsd4_callback *cb)
spin_unlock(&nn->client_lock);
}
+static int
+nfsd4_cb_getattr_done(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+ struct nfs4_cb_fattr *ncf =
+ container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+
+ ncf->ncf_cb_status = task->tk_status;
+ switch (task->tk_status) {
+ case -NFS4ERR_DELAY:
+ rpc_delay(task, 2 * HZ);
+ return 0;
+ default:
+ return 1;
+ }
+}
+
+static void
+nfsd4_cb_getattr_release(struct nfsd4_callback *cb)
+{
+ struct nfs4_cb_fattr *ncf =
+ container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+ struct nfs4_delegation *dp =
+ container_of(ncf, struct nfs4_delegation, dl_cb_fattr);
+
+ nfs4_put_stid(&dp->dl_stid);
+ clear_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags);
+ wake_up_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY);
+}
+
static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = {
.done = nfsd4_cb_recall_any_done,
.release = nfsd4_cb_recall_any_release,
};
+static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops = {
+ .done = nfsd4_cb_getattr_done,
+ .release = nfsd4_cb_getattr_release,
+};
+
+void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf)
+{
+ struct nfs4_delegation *dp =
+ container_of(ncf, struct nfs4_delegation, dl_cb_fattr);
+
+ if (test_and_set_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags))
+ return;
+ refcount_inc(&dp->dl_stid.sc_count);
+ nfsd4_run_cb(&ncf->ncf_getattr);
+}
+
static struct nfs4_client *create_client(struct xdr_netobj name,
struct svc_rqst *rqstp, nfs4_verifier *verf)
{
@@ -5634,13 +5686,15 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
struct svc_fh *parent = NULL;
int cb_up;
int status = 0;
+ struct kstat stat;
+ struct path path;
cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
- open->op_recall = 0;
+ open->op_recall = false;
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_PREVIOUS:
if (!cb_up)
- open->op_recall = 1;
+ open->op_recall = true;
break;
case NFS4_OPEN_CLAIM_NULL:
parent = currentfh;
@@ -5671,6 +5725,18 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) {
open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE;
trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid);
+ path.mnt = currentfh->fh_export->ex_path.mnt;
+ path.dentry = currentfh->fh_dentry;
+ if (vfs_getattr(&path, &stat,
+ (STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE),
+ AT_STATX_SYNC_AS_STAT)) {
+ nfs4_put_stid(&dp->dl_stid);
+ destroy_delegation(dp);
+ goto out_no_deleg;
+ }
+ dp->dl_cb_fattr.ncf_cur_fsize = stat.size;
+ dp->dl_cb_fattr.ncf_initial_cinfo =
+ nfsd4_change_attribute(&stat, d_inode(currentfh->fh_dentry));
} else {
open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid);
@@ -5682,7 +5748,7 @@ out_no_deleg:
if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) {
dprintk("NFSD: WARNING: refusing delegation reclaim\n");
- open->op_recall = 1;
+ open->op_recall = true;
}
/* 4.1 client asking for a delegation? */
@@ -7487,6 +7553,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_blocked_lock *nbl = NULL;
struct file_lock *file_lock = NULL;
struct file_lock *conflock = NULL;
+ struct super_block *sb;
__be32 status = 0;
int lkflg;
int err;
@@ -7508,6 +7575,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("NFSD: nfsd4_lock: permission denied!\n");
return status;
}
+ sb = cstate->current_fh.fh_dentry->d_sb;
if (lock->lk_is_new) {
if (nfsd4_has_session(cstate))
@@ -7559,7 +7627,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fp = lock_stp->st_stid.sc_file;
switch (lock->lk_type) {
case NFS4_READW_LT:
- if (nfsd4_has_session(cstate))
+ if (nfsd4_has_session(cstate) ||
+ exportfs_lock_op_is_async(sb->s_export_op))
fl_flags |= FL_SLEEP;
fallthrough;
case NFS4_READ_LT:
@@ -7571,7 +7640,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fl_type = F_RDLCK;
break;
case NFS4_WRITEW_LT:
- if (nfsd4_has_session(cstate))
+ if (nfsd4_has_session(cstate) ||
+ exportfs_lock_op_is_async(sb->s_export_op))
fl_flags |= FL_SLEEP;
fallthrough;
case NFS4_WRITE_LT:
@@ -7599,7 +7669,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* for file locks), so don't attempt blocking lock notifications
* on those filesystems:
*/
- if (nf->nf_file->f_op->lock)
+ if (!exportfs_lock_op_is_async(sb->s_export_op))
fl_flags &= ~FL_SLEEP;
nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
@@ -7705,6 +7775,14 @@ out:
return status;
}
+void nfsd4_lock_release(union nfsd4_op_u *u)
+{
+ struct nfsd4_lock *lock = &u->lock;
+ struct nfsd4_lock_denied *deny = &lock->lk_denied;
+
+ kfree(deny->ld_owner.data);
+}
+
/*
* The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
* so we do a temporary open here just to get an open file to pass to
@@ -7810,6 +7888,14 @@ out:
return status;
}
+void nfsd4_lockt_release(union nfsd4_op_u *u)
+{
+ struct nfsd4_lockt *lockt = &u->lockt;
+ struct nfsd4_lock_denied *deny = &lockt->lt_denied;
+
+ kfree(deny->ld_owner.data);
+}
+
__be32
nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
@@ -8403,6 +8489,8 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
* nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
* @rqstp: RPC transaction context
* @inode: file to be checked for a conflict
+ * @modified: return true if file was modified
+ * @size: new size of file if modified is true
*
* This function is called when there is a conflict between a write
* delegation and a change/size GETATTR from another client. The server
@@ -8411,21 +8499,23 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
* delegation before replying to the GETATTR. See RFC 8881 section
* 18.7.4.
*
- * The current implementation does not support CB_GETATTR yet. However
- * this can avoid recalling the delegation could be added in follow up
- * work.
- *
* Returns 0 if there is no conflict; otherwise an nfs_stat
* code is returned.
*/
__be32
-nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
+nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode,
+ bool *modified, u64 *size)
{
- __be32 status;
struct file_lock_context *ctx;
- struct file_lock *fl;
struct nfs4_delegation *dp;
+ struct nfs4_cb_fattr *ncf;
+ struct file_lock *fl;
+ struct iattr attrs;
+ __be32 status;
+ might_sleep();
+
+ *modified = false;
ctx = locks_inode_context(inode);
if (!ctx)
return 0;
@@ -8452,10 +8542,34 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
break_lease:
spin_unlock(&ctx->flc_lock);
nfsd_stats_wdeleg_getattr_inc();
- status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
- if (status != nfserr_jukebox ||
- !nfsd_wait_for_delegreturn(rqstp, inode))
- return status;
+
+ dp = fl->fl_owner;
+ ncf = &dp->dl_cb_fattr;
+ nfs4_cb_getattr(&dp->dl_cb_fattr);
+ wait_on_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY, TASK_INTERRUPTIBLE);
+ if (ncf->ncf_cb_status) {
+ status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
+ if (status != nfserr_jukebox ||
+ !nfsd_wait_for_delegreturn(rqstp, inode))
+ return status;
+ }
+ if (!ncf->ncf_file_modified &&
+ (ncf->ncf_initial_cinfo != ncf->ncf_cb_change ||
+ ncf->ncf_cur_fsize != ncf->ncf_cb_fsize))
+ ncf->ncf_file_modified = true;
+ if (ncf->ncf_file_modified) {
+ /*
+ * The server would not update the file's metadata
+ * with the client's modified size.
+ */
+ attrs.ia_mtime = attrs.ia_ctime = current_time(inode);
+ attrs.ia_valid = ATTR_MTIME | ATTR_CTIME;
+ setattr_copy(&nop_mnt_idmap, inode, &attrs);
+ mark_inode_dirty(inode);
+ ncf->ncf_cur_fsize = ncf->ncf_cb_fsize;
+ *size = ncf->ncf_cur_fsize;
+ *modified = true;
+ }
return 0;
}
break;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2e40c74d2f72..ec4ed6206df1 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2530,66 +2530,62 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
return true;
}
-static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
- struct svc_export *exp)
+static __be32 nfsd4_encode_nfs_fh4(struct xdr_stream *xdr,
+ struct knfsd_fh *fh_handle)
{
- if (exp->ex_flags & NFSEXP_V4ROOT) {
- *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
- *p++ = 0;
- } else
- p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
- return p;
+ return nfsd4_encode_opaque(xdr, fh_handle->fh_raw, fh_handle->fh_size);
}
+/* This is a frequently-encoded type; open-coded for speed */
static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr,
- struct timespec64 *tv)
+ const struct timespec64 *tv)
{
__be32 *p;
p = xdr_reserve_space(xdr, XDR_UNIT * 3);
if (!p)
return nfserr_resource;
-
- p = xdr_encode_hyper(p, (s64)tv->tv_sec);
+ p = xdr_encode_hyper(p, tv->tv_sec);
*p = cpu_to_be32(tv->tv_nsec);
return nfs_ok;
}
-/*
- * ctime (in NFSv4, time_metadata) is not writeable, and the client
- * doesn't really care what resolution could theoretically be stored by
- * the filesystem.
- *
- * The client cares how close together changes can be while still
- * guaranteeing ctime changes. For most filesystems (which have
- * timestamps with nanosecond fields) that is limited by the resolution
- * of the time returned from current_time() (which I'm assuming to be
- * 1/HZ).
- */
-static __be32 *encode_time_delta(__be32 *p, struct inode *inode)
+static __be32 nfsd4_encode_specdata4(struct xdr_stream *xdr,
+ unsigned int major, unsigned int minor)
{
- struct timespec64 ts;
- u32 ns;
+ __be32 status;
- ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran);
- ts = ns_to_timespec64(ns);
+ status = nfsd4_encode_uint32_t(xdr, major);
+ if (status != nfs_ok)
+ return status;
+ return nfsd4_encode_uint32_t(xdr, minor);
+}
- p = xdr_encode_hyper(p, ts.tv_sec);
- *p++ = cpu_to_be32(ts.tv_nsec);
+static __be32
+nfsd4_encode_change_info4(struct xdr_stream *xdr, const struct nfsd4_change_info *c)
+{
+ __be32 status;
- return p;
+ status = nfsd4_encode_bool(xdr, c->atomic);
+ if (status != nfs_ok)
+ return status;
+ status = nfsd4_encode_changeid4(xdr, c->before_change);
+ if (status != nfs_ok)
+ return status;
+ return nfsd4_encode_changeid4(xdr, c->after_change);
}
-static __be32
-nfsd4_encode_change_info4(struct xdr_stream *xdr, struct nfsd4_change_info *c)
+static __be32 nfsd4_encode_netaddr4(struct xdr_stream *xdr,
+ const struct nfs42_netaddr *addr)
{
- if (xdr_stream_encode_bool(xdr, c->atomic) < 0)
- return nfserr_resource;
- if (xdr_stream_encode_u64(xdr, c->before_change) < 0)
- return nfserr_resource;
- if (xdr_stream_encode_u64(xdr, c->after_change) < 0)
- return nfserr_resource;
- return nfs_ok;
+ __be32 status;
+
+ /* na_r_netid */
+ status = nfsd4_encode_opaque(xdr, addr->netid, addr->netid_len);
+ if (status != nfs_ok)
+ return status;
+ /* na_r_addr */
+ return nfsd4_encode_opaque(xdr, addr->addr, addr->addr_len);
}
/* Encode as an array of strings the string given with components
@@ -2661,9 +2657,6 @@ static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep,
return nfsd4_encode_components_esc(xdr, sep, components, 0, 0);
}
-/*
- * encode a location element of a fs_locations structure
- */
static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr,
struct nfsd4_fs_location *location)
{
@@ -2676,15 +2669,12 @@ static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr,
status = nfsd4_encode_components(xdr, '/', location->path);
if (status)
return status;
- return 0;
+ return nfs_ok;
}
-/*
- * Encode a path in RFC3530 'pathname4' format
- */
-static __be32 nfsd4_encode_path(struct xdr_stream *xdr,
- const struct path *root,
- const struct path *path)
+static __be32 nfsd4_encode_pathname4(struct xdr_stream *xdr,
+ const struct path *root,
+ const struct path *path)
{
struct path cur = *path;
__be32 *p;
@@ -2752,89 +2742,59 @@ out_free:
return err;
}
-static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr,
- struct svc_rqst *rqstp, const struct path *path)
+static __be32 nfsd4_encode_fs_locations4(struct xdr_stream *xdr,
+ struct svc_rqst *rqstp,
+ struct svc_export *exp)
{
+ struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
struct svc_export *exp_ps;
- __be32 res;
+ unsigned int i;
+ __be32 status;
+ /* fs_root */
exp_ps = rqst_find_fsidzero_export(rqstp);
if (IS_ERR(exp_ps))
return nfserrno(PTR_ERR(exp_ps));
- res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path);
+ status = nfsd4_encode_pathname4(xdr, &exp_ps->ex_path, &exp->ex_path);
exp_put(exp_ps);
- return res;
-}
-
-/*
- * encode a fs_locations structure
- */
-static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr,
- struct svc_rqst *rqstp, struct svc_export *exp)
-{
- __be32 status;
- int i;
- __be32 *p;
- struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
-
- status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path);
- if (status)
+ if (status != nfs_ok)
return status;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+
+ /* locations<> */
+ if (xdr_stream_encode_u32(xdr, fslocs->locations_count) != XDR_UNIT)
return nfserr_resource;
- *p++ = cpu_to_be32(fslocs->locations_count);
- for (i=0; i<fslocs->locations_count; i++) {
+ for (i = 0; i < fslocs->locations_count; i++) {
status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]);
- if (status)
+ if (status != nfs_ok)
return status;
}
- return 0;
-}
-static u32 nfs4_file_type(umode_t mode)
-{
- switch (mode & S_IFMT) {
- case S_IFIFO: return NF4FIFO;
- case S_IFCHR: return NF4CHR;
- case S_IFDIR: return NF4DIR;
- case S_IFBLK: return NF4BLK;
- case S_IFLNK: return NF4LNK;
- case S_IFREG: return NF4REG;
- case S_IFSOCK: return NF4SOCK;
- default: return NF4BAD;
- }
+ return nfs_ok;
}
-static inline __be32
-nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp,
- struct nfs4_ace *ace)
+static __be32 nfsd4_encode_nfsace4(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ struct nfs4_ace *ace)
{
+ __be32 status;
+
+ /* type */
+ status = nfsd4_encode_acetype4(xdr, ace->type);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* flag */
+ status = nfsd4_encode_aceflag4(xdr, ace->flag);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* access mask */
+ status = nfsd4_encode_acemask4(xdr, ace->access_mask & NFS4_ACE_MASK_ALL);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* who */
if (ace->whotype != NFS4_ACL_WHO_NAMED)
return nfs4_acl_write_who(xdr, ace->whotype);
- else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
+ if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
return nfsd4_encode_group(xdr, rqstp, ace->who_gid);
- else
- return nfsd4_encode_user(xdr, rqstp, ace->who_uid);
-}
-
-static inline __be32
-nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types)
-{
- __be32 *p;
- unsigned long i = hweight_long(layout_types);
-
- p = xdr_reserve_space(xdr, 4 + 4 * i);
- if (!p)
- return nfserr_resource;
-
- *p++ = cpu_to_be32(i);
-
- for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i)
- if (layout_types & (1 << i))
- *p++ = cpu_to_be32(i);
-
- return 0;
+ return nfsd4_encode_user(xdr, rqstp, ace->who_uid);
}
#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
@@ -2906,12 +2866,12 @@ static int nfsd4_get_mounted_on_ino(struct svc_export *exp, u64 *pino)
}
static __be32
-nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
+nfsd4_encode_bitmap4(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
{
__be32 *p;
if (bmval2) {
- p = xdr_reserve_space(xdr, 16);
+ p = xdr_reserve_space(xdr, XDR_UNIT * 4);
if (!p)
goto out_resource;
*p++ = cpu_to_be32(3);
@@ -2919,94 +2879,687 @@ nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
*p++ = cpu_to_be32(bmval1);
*p++ = cpu_to_be32(bmval2);
} else if (bmval1) {
- p = xdr_reserve_space(xdr, 12);
+ p = xdr_reserve_space(xdr, XDR_UNIT * 3);
if (!p)
goto out_resource;
*p++ = cpu_to_be32(2);
*p++ = cpu_to_be32(bmval0);
*p++ = cpu_to_be32(bmval1);
} else {
- p = xdr_reserve_space(xdr, 8);
+ p = xdr_reserve_space(xdr, XDR_UNIT * 2);
if (!p)
goto out_resource;
*p++ = cpu_to_be32(1);
*p++ = cpu_to_be32(bmval0);
}
- return 0;
+ return nfs_ok;
out_resource:
return nfserr_resource;
}
+struct nfsd4_fattr_args {
+ struct svc_rqst *rqstp;
+ struct svc_fh *fhp;
+ struct svc_export *exp;
+ struct dentry *dentry;
+ struct kstat stat;
+ struct kstatfs statfs;
+ struct nfs4_acl *acl;
+ u64 size;
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ void *context;
+ int contextlen;
+#endif
+ u32 rdattr_err;
+ bool contextsupport;
+ bool ignore_crossmnt;
+};
+
+typedef __be32(*nfsd4_enc_attr)(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args);
+
+static __be32 nfsd4_encode_fattr4__noop(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4__true(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_bool(xdr, true);
+}
+
+static __be32 nfsd4_encode_fattr4__false(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_bool(xdr, false);
+}
+
+static __be32 nfsd4_encode_fattr4_supported_attrs(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct nfsd4_compoundres *resp = args->rqstp->rq_resp;
+ u32 minorversion = resp->cstate.minorversion;
+ u32 supp[3];
+
+ memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
+ if (!IS_POSIXACL(d_inode(args->dentry)))
+ supp[0] &= ~FATTR4_WORD0_ACL;
+ if (!args->contextsupport)
+ supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+
+ return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]);
+}
+
+static __be32 nfsd4_encode_fattr4_type(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT);
+ if (!p)
+ return nfserr_resource;
+
+ switch (args->stat.mode & S_IFMT) {
+ case S_IFIFO:
+ *p = cpu_to_be32(NF4FIFO);
+ break;
+ case S_IFCHR:
+ *p = cpu_to_be32(NF4CHR);
+ break;
+ case S_IFDIR:
+ *p = cpu_to_be32(NF4DIR);
+ break;
+ case S_IFBLK:
+ *p = cpu_to_be32(NF4BLK);
+ break;
+ case S_IFLNK:
+ *p = cpu_to_be32(NF4LNK);
+ break;
+ case S_IFREG:
+ *p = cpu_to_be32(NF4REG);
+ break;
+ case S_IFSOCK:
+ *p = cpu_to_be32(NF4SOCK);
+ break;
+ default:
+ return nfserr_serverfault;
+ }
+
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_fh_expire_type(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u32 mask;
+
+ mask = NFS4_FH_PERSISTENT;
+ if (!(args->exp->ex_flags & NFSEXP_NOSUBTREECHECK))
+ mask |= NFS4_FH_VOL_RENAME;
+ return nfsd4_encode_uint32_t(xdr, mask);
+}
+
+static __be32 nfsd4_encode_fattr4_change(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ const struct svc_export *exp = args->exp;
+ u64 c;
+
+ if (unlikely(exp->ex_flags & NFSEXP_V4ROOT)) {
+ u32 flush_time = convert_to_wallclock(exp->cd->flush_time);
+
+ if (xdr_stream_encode_u32(xdr, flush_time) != XDR_UNIT)
+ return nfserr_resource;
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
+ }
+
+ c = nfsd4_change_attribute(&args->stat, d_inode(args->dentry));
+ return nfsd4_encode_changeid4(xdr, c);
+}
+
+static __be32 nfsd4_encode_fattr4_size(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->size);
+}
+
+static __be32 nfsd4_encode_fattr4_fsid(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT * 2 + XDR_UNIT * 2);
+ if (!p)
+ return nfserr_resource;
+
+ if (unlikely(args->exp->ex_fslocs.migrated)) {
+ p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR);
+ xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR);
+ return nfs_ok;
+ }
+ switch (fsid_source(args->fhp)) {
+ case FSIDSOURCE_FSID:
+ p = xdr_encode_hyper(p, (u64)args->exp->ex_fsid);
+ xdr_encode_hyper(p, (u64)0);
+ break;
+ case FSIDSOURCE_DEV:
+ *p++ = xdr_zero;
+ *p++ = cpu_to_be32(MAJOR(args->stat.dev));
+ *p++ = xdr_zero;
+ *p = cpu_to_be32(MINOR(args->stat.dev));
+ break;
+ case FSIDSOURCE_UUID:
+ xdr_encode_opaque_fixed(p, args->exp->ex_uuid, EX_UUID_LEN);
+ break;
+ }
+
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_lease_time(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct nfsd_net *nn = net_generic(SVC_NET(args->rqstp), nfsd_net_id);
+
+ return nfsd4_encode_nfs_lease4(xdr, nn->nfsd4_lease);
+}
+
+static __be32 nfsd4_encode_fattr4_rdattr_error(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, args->rdattr_err);
+}
+
+static __be32 nfsd4_encode_fattr4_aclsupport(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u32 mask;
+
+ mask = 0;
+ if (IS_POSIXACL(d_inode(args->dentry)))
+ mask = ACL4_SUPPORT_ALLOW_ACL | ACL4_SUPPORT_DENY_ACL;
+ return nfsd4_encode_uint32_t(xdr, mask);
+}
+
+static __be32 nfsd4_encode_fattr4_acl(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct nfs4_acl *acl = args->acl;
+ struct nfs4_ace *ace;
+ __be32 status;
+
+ /* nfsace4<> */
+ if (!acl) {
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ } else {
+ if (xdr_stream_encode_u32(xdr, acl->naces) != XDR_UNIT)
+ return nfserr_resource;
+ for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
+ status = nfsd4_encode_nfsace4(xdr, args->rqstp, ace);
+ if (status != nfs_ok)
+ return status;
+ }
+ }
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_filehandle(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfs_fh4(xdr, &args->fhp->fh_handle);
+}
+
+static __be32 nfsd4_encode_fattr4_fileid(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->stat.ino);
+}
+
+static __be32 nfsd4_encode_fattr4_files_avail(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->statfs.f_ffree);
+}
+
+static __be32 nfsd4_encode_fattr4_files_free(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->statfs.f_ffree);
+}
+
+static __be32 nfsd4_encode_fattr4_files_total(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, args->statfs.f_files);
+}
+
+static __be32 nfsd4_encode_fattr4_fs_locations(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_fs_locations4(xdr, args->rqstp, args->exp);
+}
+
+static __be32 nfsd4_encode_fattr4_maxfilesize(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct super_block *sb = args->exp->ex_path.mnt->mnt_sb;
+
+ return nfsd4_encode_uint64_t(xdr, sb->s_maxbytes);
+}
+
+static __be32 nfsd4_encode_fattr4_maxlink(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, 255);
+}
+
+static __be32 nfsd4_encode_fattr4_maxname(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, args->statfs.f_namelen);
+}
+
+static __be32 nfsd4_encode_fattr4_maxread(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, svc_max_payload(args->rqstp));
+}
+
+static __be32 nfsd4_encode_fattr4_maxwrite(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, svc_max_payload(args->rqstp));
+}
+
+static __be32 nfsd4_encode_fattr4_mode(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_mode4(xdr, args->stat.mode & S_IALLUGO);
+}
+
+static __be32 nfsd4_encode_fattr4_numlinks(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, args->stat.nlink);
+}
+
+static __be32 nfsd4_encode_fattr4_owner(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_user(xdr, args->rqstp, args->stat.uid);
+}
+
+static __be32 nfsd4_encode_fattr4_owner_group(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_group(xdr, args->rqstp, args->stat.gid);
+}
+
+static __be32 nfsd4_encode_fattr4_rawdev(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_specdata4(xdr, MAJOR(args->stat.rdev),
+ MINOR(args->stat.rdev));
+}
+
+static __be32 nfsd4_encode_fattr4_space_avail(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u64 avail = (u64)args->statfs.f_bavail * (u64)args->statfs.f_bsize;
+
+ return nfsd4_encode_uint64_t(xdr, avail);
+}
+
+static __be32 nfsd4_encode_fattr4_space_free(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u64 free = (u64)args->statfs.f_bfree * (u64)args->statfs.f_bsize;
+
+ return nfsd4_encode_uint64_t(xdr, free);
+}
+
+static __be32 nfsd4_encode_fattr4_space_total(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u64 total = (u64)args->statfs.f_blocks * (u64)args->statfs.f_bsize;
+
+ return nfsd4_encode_uint64_t(xdr, total);
+}
+
+static __be32 nfsd4_encode_fattr4_space_used(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint64_t(xdr, (u64)args->stat.blocks << 9);
+}
+
+static __be32 nfsd4_encode_fattr4_time_access(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfstime4(xdr, &args->stat.atime);
+}
+
+static __be32 nfsd4_encode_fattr4_time_create(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfstime4(xdr, &args->stat.btime);
+}
+
+/*
+ * ctime (in NFSv4, time_metadata) is not writeable, and the client
+ * doesn't really care what resolution could theoretically be stored by
+ * the filesystem.
+ *
+ * The client cares how close together changes can be while still
+ * guaranteeing ctime changes. For most filesystems (which have
+ * timestamps with nanosecond fields) that is limited by the resolution
+ * of the time returned from current_time() (which I'm assuming to be
+ * 1/HZ).
+ */
+static __be32 nfsd4_encode_fattr4_time_delta(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ const struct inode *inode = d_inode(args->dentry);
+ u32 ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran);
+ struct timespec64 ts = ns_to_timespec64(ns);
+
+ return nfsd4_encode_nfstime4(xdr, &ts);
+}
+
+static __be32 nfsd4_encode_fattr4_time_metadata(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfstime4(xdr, &args->stat.ctime);
+}
+
+static __be32 nfsd4_encode_fattr4_time_modify(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_nfstime4(xdr, &args->stat.mtime);
+}
+
+static __be32 nfsd4_encode_fattr4_mounted_on_fileid(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ u64 ino;
+ int err;
+
+ if (!args->ignore_crossmnt &&
+ args->dentry == args->exp->ex_path.mnt->mnt_root) {
+ err = nfsd4_get_mounted_on_ino(args->exp, &ino);
+ if (err)
+ return nfserrno(err);
+ } else
+ ino = args->stat.ino;
+
+ return nfsd4_encode_uint64_t(xdr, ino);
+}
+
+#ifdef CONFIG_NFSD_PNFS
+
+static __be32 nfsd4_encode_fattr4_fs_layout_types(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ unsigned long mask = args->exp->ex_layout_types;
+ int i;
+
+ /* Hamming weight of @mask is the number of layout types to return */
+ if (xdr_stream_encode_u32(xdr, hweight_long(mask)) != XDR_UNIT)
+ return nfserr_resource;
+ for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i)
+ if (mask & BIT(i)) {
+ /* layouttype4 */
+ if (xdr_stream_encode_u32(xdr, i) != XDR_UNIT)
+ return nfserr_resource;
+ }
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_layout_types(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ unsigned long mask = args->exp->ex_layout_types;
+ int i;
+
+ /* Hamming weight of @mask is the number of layout types to return */
+ if (xdr_stream_encode_u32(xdr, hweight_long(mask)) != XDR_UNIT)
+ return nfserr_resource;
+ for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i)
+ if (mask & BIT(i)) {
+ /* layouttype4 */
+ if (xdr_stream_encode_u32(xdr, i) != XDR_UNIT)
+ return nfserr_resource;
+ }
+ return nfs_ok;
+}
+
+static __be32 nfsd4_encode_fattr4_layout_blksize(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_uint32_t(xdr, args->stat.blksize);
+}
+
+#endif
+
+static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct nfsd4_compoundres *resp = args->rqstp->rq_resp;
+ u32 supp[3];
+
+ memcpy(supp, nfsd_suppattrs[resp->cstate.minorversion], sizeof(supp));
+ supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
+ supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
+ supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;
+
+ return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]);
+}
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ return nfsd4_encode_security_label(xdr, args->rqstp,
+ args->context, args->contextlen);
+}
+#endif
+
+static __be32 nfsd4_encode_fattr4_xattr_support(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ int err = xattr_supports_user_prefix(d_inode(args->dentry));
+
+ return nfsd4_encode_bool(xdr, err == 0);
+}
+
+static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
+ [FATTR4_SUPPORTED_ATTRS] = nfsd4_encode_fattr4_supported_attrs,
+ [FATTR4_TYPE] = nfsd4_encode_fattr4_type,
+ [FATTR4_FH_EXPIRE_TYPE] = nfsd4_encode_fattr4_fh_expire_type,
+ [FATTR4_CHANGE] = nfsd4_encode_fattr4_change,
+ [FATTR4_SIZE] = nfsd4_encode_fattr4_size,
+ [FATTR4_LINK_SUPPORT] = nfsd4_encode_fattr4__true,
+ [FATTR4_SYMLINK_SUPPORT] = nfsd4_encode_fattr4__true,
+ [FATTR4_NAMED_ATTR] = nfsd4_encode_fattr4__false,
+ [FATTR4_FSID] = nfsd4_encode_fattr4_fsid,
+ [FATTR4_UNIQUE_HANDLES] = nfsd4_encode_fattr4__true,
+ [FATTR4_LEASE_TIME] = nfsd4_encode_fattr4_lease_time,
+ [FATTR4_RDATTR_ERROR] = nfsd4_encode_fattr4_rdattr_error,
+ [FATTR4_ACL] = nfsd4_encode_fattr4_acl,
+ [FATTR4_ACLSUPPORT] = nfsd4_encode_fattr4_aclsupport,
+ [FATTR4_ARCHIVE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CANSETTIME] = nfsd4_encode_fattr4__true,
+ [FATTR4_CASE_INSENSITIVE] = nfsd4_encode_fattr4__false,
+ [FATTR4_CASE_PRESERVING] = nfsd4_encode_fattr4__true,
+ [FATTR4_CHOWN_RESTRICTED] = nfsd4_encode_fattr4__true,
+ [FATTR4_FILEHANDLE] = nfsd4_encode_fattr4_filehandle,
+ [FATTR4_FILEID] = nfsd4_encode_fattr4_fileid,
+ [FATTR4_FILES_AVAIL] = nfsd4_encode_fattr4_files_avail,
+ [FATTR4_FILES_FREE] = nfsd4_encode_fattr4_files_free,
+ [FATTR4_FILES_TOTAL] = nfsd4_encode_fattr4_files_total,
+ [FATTR4_FS_LOCATIONS] = nfsd4_encode_fattr4_fs_locations,
+ [FATTR4_HIDDEN] = nfsd4_encode_fattr4__noop,
+ [FATTR4_HOMOGENEOUS] = nfsd4_encode_fattr4__true,
+ [FATTR4_MAXFILESIZE] = nfsd4_encode_fattr4_maxfilesize,
+ [FATTR4_MAXLINK] = nfsd4_encode_fattr4_maxlink,
+ [FATTR4_MAXNAME] = nfsd4_encode_fattr4_maxname,
+ [FATTR4_MAXREAD] = nfsd4_encode_fattr4_maxread,
+ [FATTR4_MAXWRITE] = nfsd4_encode_fattr4_maxwrite,
+ [FATTR4_MIMETYPE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_MODE] = nfsd4_encode_fattr4_mode,
+ [FATTR4_NO_TRUNC] = nfsd4_encode_fattr4__true,
+ [FATTR4_NUMLINKS] = nfsd4_encode_fattr4_numlinks,
+ [FATTR4_OWNER] = nfsd4_encode_fattr4_owner,
+ [FATTR4_OWNER_GROUP] = nfsd4_encode_fattr4_owner_group,
+ [FATTR4_QUOTA_AVAIL_HARD] = nfsd4_encode_fattr4__noop,
+ [FATTR4_QUOTA_AVAIL_SOFT] = nfsd4_encode_fattr4__noop,
+ [FATTR4_QUOTA_USED] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RAWDEV] = nfsd4_encode_fattr4_rawdev,
+ [FATTR4_SPACE_AVAIL] = nfsd4_encode_fattr4_space_avail,
+ [FATTR4_SPACE_FREE] = nfsd4_encode_fattr4_space_free,
+ [FATTR4_SPACE_TOTAL] = nfsd4_encode_fattr4_space_total,
+ [FATTR4_SPACE_USED] = nfsd4_encode_fattr4_space_used,
+ [FATTR4_SYSTEM] = nfsd4_encode_fattr4__noop,
+ [FATTR4_TIME_ACCESS] = nfsd4_encode_fattr4_time_access,
+ [FATTR4_TIME_ACCESS_SET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_TIME_BACKUP] = nfsd4_encode_fattr4__noop,
+ [FATTR4_TIME_CREATE] = nfsd4_encode_fattr4_time_create,
+ [FATTR4_TIME_DELTA] = nfsd4_encode_fattr4_time_delta,
+ [FATTR4_TIME_METADATA] = nfsd4_encode_fattr4_time_metadata,
+ [FATTR4_TIME_MODIFY] = nfsd4_encode_fattr4_time_modify,
+ [FATTR4_TIME_MODIFY_SET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_MOUNTED_ON_FILEID] = nfsd4_encode_fattr4_mounted_on_fileid,
+ [FATTR4_DIR_NOTIF_DELAY] = nfsd4_encode_fattr4__noop,
+ [FATTR4_DIRENT_NOTIF_DELAY] = nfsd4_encode_fattr4__noop,
+ [FATTR4_DACL] = nfsd4_encode_fattr4__noop,
+ [FATTR4_SACL] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CHANGE_POLICY] = nfsd4_encode_fattr4__noop,
+ [FATTR4_FS_STATUS] = nfsd4_encode_fattr4__noop,
+
+#ifdef CONFIG_NFSD_PNFS
+ [FATTR4_FS_LAYOUT_TYPES] = nfsd4_encode_fattr4_fs_layout_types,
+ [FATTR4_LAYOUT_HINT] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_TYPES] = nfsd4_encode_fattr4_layout_types,
+ [FATTR4_LAYOUT_BLKSIZE] = nfsd4_encode_fattr4_layout_blksize,
+ [FATTR4_LAYOUT_ALIGNMENT] = nfsd4_encode_fattr4__noop,
+#else
+ [FATTR4_FS_LAYOUT_TYPES] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_HINT] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_TYPES] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_BLKSIZE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_LAYOUT_ALIGNMENT] = nfsd4_encode_fattr4__noop,
+#endif
+
+ [FATTR4_FS_LOCATIONS_INFO] = nfsd4_encode_fattr4__noop,
+ [FATTR4_MDSTHRESHOLD] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTION_GET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTION_SET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTEVT_GET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTEVT_SET] = nfsd4_encode_fattr4__noop,
+ [FATTR4_RETENTION_HOLD] = nfsd4_encode_fattr4__noop,
+ [FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop,
+ [FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat,
+ [FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop,
+
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ [FATTR4_SEC_LABEL] = nfsd4_encode_fattr4_sec_label,
+#else
+ [FATTR4_SEC_LABEL] = nfsd4_encode_fattr4__noop,
+#endif
+
+ [FATTR4_MODE_UMASK] = nfsd4_encode_fattr4__noop,
+ [FATTR4_XATTR_SUPPORT] = nfsd4_encode_fattr4_xattr_support,
+};
+
/*
* Note: @fhp can be NULL; in this case, we might have to compose the filehandle
* ourselves.
*/
static __be32
-nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
- struct svc_export *exp,
- struct dentry *dentry, u32 *bmval,
- struct svc_rqst *rqstp, int ignore_crossmnt)
+nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+ struct svc_fh *fhp, struct svc_export *exp,
+ struct dentry *dentry, const u32 *bmval,
+ int ignore_crossmnt)
{
- u32 bmval0 = bmval[0];
- u32 bmval1 = bmval[1];
- u32 bmval2 = bmval[2];
- struct kstat stat;
+ struct nfsd4_fattr_args args;
struct svc_fh *tempfh = NULL;
- struct kstatfs statfs;
- __be32 *p, *attrlen_p;
int starting_len = xdr->buf->len;
+ __be32 *attrlen_p, status;
int attrlen_offset;
- u32 dummy;
- u64 dummy64;
- u32 rdattr_err = 0;
- __be32 status;
int err;
- struct nfs4_acl *acl = NULL;
-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
- void *context = NULL;
- int contextlen;
-#endif
- bool contextsupport = false;
struct nfsd4_compoundres *resp = rqstp->rq_resp;
u32 minorversion = resp->cstate.minorversion;
struct path path = {
.mnt = exp->ex_path.mnt,
.dentry = dentry,
};
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ union {
+ u32 attrmask[3];
+ unsigned long mask[2];
+ } u;
+ bool file_modified;
+ unsigned long bit;
+ u64 size = 0;
+
+ WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1);
+ WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval));
+
+ args.rqstp = rqstp;
+ args.exp = exp;
+ args.dentry = dentry;
+ args.ignore_crossmnt = (ignore_crossmnt != 0);
- BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
- BUG_ON(!nfsd_attrs_supported(minorversion, bmval));
+ /*
+ * Make a local copy of the attribute bitmap that can be modified.
+ */
+ memset(&u, 0, sizeof(u));
+ u.attrmask[0] = bmval[0];
+ u.attrmask[1] = bmval[1];
+ u.attrmask[2] = bmval[2];
+ args.rdattr_err = 0;
if (exp->ex_fslocs.migrated) {
- status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err);
+ status = fattr_handle_absent_fs(&u.attrmask[0], &u.attrmask[1],
+ &u.attrmask[2], &args.rdattr_err);
if (status)
goto out;
}
- if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
- status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry));
+ args.size = 0;
+ if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
+ status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry),
+ &file_modified, &size);
if (status)
goto out;
}
- err = vfs_getattr(&path, &stat,
+ err = vfs_getattr(&path, &args.stat,
STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE,
AT_STATX_SYNC_AS_STAT);
if (err)
goto out_nfserr;
- if (!(stat.result_mask & STATX_BTIME))
+ args.size = file_modified ? size : args.stat.size;
+
+ if (!(args.stat.result_mask & STATX_BTIME))
/* underlying FS does not offer btime so we can't share it */
- bmval1 &= ~FATTR4_WORD1_TIME_CREATE;
- if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
+ u.attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
+ if ((u.attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
- (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
+ (u.attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
FATTR4_WORD1_SPACE_TOTAL))) {
- err = vfs_statfs(&path, &statfs);
+ err = vfs_statfs(&path, &args.statfs);
if (err)
goto out_nfserr;
}
- if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) {
+ if ((u.attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) &&
+ !fhp) {
tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
status = nfserr_jukebox;
if (!tempfh)
@@ -3015,12 +3568,15 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
status = fh_compose(tempfh, exp, dentry, NULL);
if (status)
goto out;
- fhp = tempfh;
- }
- if (bmval0 & FATTR4_WORD0_ACL) {
- err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
+ args.fhp = tempfh;
+ } else
+ args.fhp = fhp;
+
+ args.acl = NULL;
+ if (u.attrmask[0] & FATTR4_WORD0_ACL) {
+ err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl);
if (err == -EOPNOTSUPP)
- bmval0 &= ~FATTR4_WORD0_ACL;
+ u.attrmask[0] &= ~FATTR4_WORD0_ACL;
else if (err == -EINVAL) {
status = nfserr_attrnotsupp;
goto out;
@@ -3028,452 +3584,53 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
goto out_nfserr;
}
+ args.contextsupport = false;
+
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
- if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) ||
- bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
+ args.context = NULL;
+ if ((u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) ||
+ u.attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
if (exp->ex_flags & NFSEXP_SECURITY_LABEL)
err = security_inode_getsecctx(d_inode(dentry),
- &context, &contextlen);
+ &args.context, &args.contextlen);
else
err = -EOPNOTSUPP;
- contextsupport = (err == 0);
- if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+ args.contextsupport = (err == 0);
+ if (u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) {
if (err == -EOPNOTSUPP)
- bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+ u.attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
else if (err)
goto out_nfserr;
}
}
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
- status = nfsd4_encode_bitmap(xdr, bmval0, bmval1, bmval2);
+ /* attrmask */
+ status = nfsd4_encode_bitmap4(xdr, u.attrmask[0],
+ u.attrmask[1], u.attrmask[2]);
if (status)
goto out;
+ /* attr_vals */
attrlen_offset = xdr->buf->len;
attrlen_p = xdr_reserve_space(xdr, XDR_UNIT);
if (!attrlen_p)
goto out_resource;
-
- if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
- u32 supp[3];
-
- memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
-
- if (!IS_POSIXACL(dentry->d_inode))
- supp[0] &= ~FATTR4_WORD0_ACL;
- if (!contextsupport)
- supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
- if (!supp[2]) {
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(2);
- *p++ = cpu_to_be32(supp[0]);
- *p++ = cpu_to_be32(supp[1]);
- } else {
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(supp[0]);
- *p++ = cpu_to_be32(supp[1]);
- *p++ = cpu_to_be32(supp[2]);
- }
- }
- if (bmval0 & FATTR4_WORD0_TYPE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- dummy = nfs4_file_type(stat.mode);
- if (dummy == NF4BAD) {
- status = nfserr_serverfault;
+ for_each_set_bit(bit, (const unsigned long *)&u.mask,
+ ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) {
+ status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args);
+ if (status != nfs_ok)
goto out;
- }
- *p++ = cpu_to_be32(dummy);
- }
- if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- if (exp->ex_flags & NFSEXP_NOSUBTREECHECK)
- *p++ = cpu_to_be32(NFS4_FH_PERSISTENT);
- else
- *p++ = cpu_to_be32(NFS4_FH_PERSISTENT|
- NFS4_FH_VOL_RENAME);
}
- if (bmval0 & FATTR4_WORD0_CHANGE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = encode_change(p, &stat, d_inode(dentry), exp);
- }
- if (bmval0 & FATTR4_WORD0_SIZE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, stat.size);
- }
- if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_NAMED_ATTR) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(0);
- }
- if (bmval0 & FATTR4_WORD0_FSID) {
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- goto out_resource;
- if (exp->ex_fslocs.migrated) {
- p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR);
- p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR);
- } else switch(fsid_source(fhp)) {
- case FSIDSOURCE_FSID:
- p = xdr_encode_hyper(p, (u64)exp->ex_fsid);
- p = xdr_encode_hyper(p, (u64)0);
- break;
- case FSIDSOURCE_DEV:
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(MAJOR(stat.dev));
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(MINOR(stat.dev));
- break;
- case FSIDSOURCE_UUID:
- p = xdr_encode_opaque_fixed(p, exp->ex_uuid,
- EX_UUID_LEN);
- break;
- }
- }
- if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(0);
- }
- if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(nn->nfsd4_lease);
- }
- if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(rdattr_err);
- }
- if (bmval0 & FATTR4_WORD0_ACL) {
- struct nfs4_ace *ace;
-
- if (acl == NULL) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
-
- *p++ = cpu_to_be32(0);
- goto out_acl;
- }
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(acl->naces);
-
- for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
- p = xdr_reserve_space(xdr, 4*3);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(ace->type);
- *p++ = cpu_to_be32(ace->flag);
- *p++ = cpu_to_be32(ace->access_mask &
- NFS4_ACE_MASK_ALL);
- status = nfsd4_encode_aclname(xdr, rqstp, ace);
- if (status)
- goto out;
- }
- }
-out_acl:
- if (bmval0 & FATTR4_WORD0_ACLSUPPORT) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(IS_POSIXACL(dentry->d_inode) ?
- ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0);
- }
- if (bmval0 & FATTR4_WORD0_CANSETTIME) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(0);
- }
- if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_FILEHANDLE) {
- p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4);
- if (!p)
- goto out_resource;
- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw,
- fhp->fh_handle.fh_size);
- }
- if (bmval0 & FATTR4_WORD0_FILEID) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, stat.ino);
- }
- if (bmval0 & FATTR4_WORD0_FILES_AVAIL) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
- }
- if (bmval0 & FATTR4_WORD0_FILES_FREE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
- }
- if (bmval0 & FATTR4_WORD0_FILES_TOTAL) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) statfs.f_files);
- }
- if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
- status = nfsd4_encode_fs_locations(xdr, rqstp, exp);
- if (status)
- goto out;
- }
- if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval0 & FATTR4_WORD0_MAXFILESIZE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes);
- }
- if (bmval0 & FATTR4_WORD0_MAXLINK) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(255);
- }
- if (bmval0 & FATTR4_WORD0_MAXNAME) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(statfs.f_namelen);
- }
- if (bmval0 & FATTR4_WORD0_MAXREAD) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
- }
- if (bmval0 & FATTR4_WORD0_MAXWRITE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
- }
- if (bmval1 & FATTR4_WORD1_MODE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(stat.mode & S_IALLUGO);
- }
- if (bmval1 & FATTR4_WORD1_NO_TRUNC) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- }
- if (bmval1 & FATTR4_WORD1_NUMLINKS) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(stat.nlink);
- }
- if (bmval1 & FATTR4_WORD1_OWNER) {
- status = nfsd4_encode_user(xdr, rqstp, stat.uid);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_OWNER_GROUP) {
- status = nfsd4_encode_group(xdr, rqstp, stat.gid);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_RAWDEV) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32((u32) MAJOR(stat.rdev));
- *p++ = cpu_to_be32((u32) MINOR(stat.rdev));
- }
- if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize;
- p = xdr_encode_hyper(p, dummy64);
- }
- if (bmval1 & FATTR4_WORD1_SPACE_FREE) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize;
- p = xdr_encode_hyper(p, dummy64);
- }
- if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize;
- p = xdr_encode_hyper(p, dummy64);
- }
- if (bmval1 & FATTR4_WORD1_SPACE_USED) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- dummy64 = (u64)stat.blocks << 9;
- p = xdr_encode_hyper(p, dummy64);
- }
- if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
- status = nfsd4_encode_nfstime4(xdr, &stat.atime);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_TIME_CREATE) {
- status = nfsd4_encode_nfstime4(xdr, &stat.btime);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- goto out_resource;
- p = encode_time_delta(p, d_inode(dentry));
- }
- if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
- status = nfsd4_encode_nfstime4(xdr, &stat.ctime);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
- status = nfsd4_encode_nfstime4(xdr, &stat.mtime);
- if (status)
- goto out;
- }
- if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
- u64 ino = stat.ino;
-
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- /*
- * Get ino of mountpoint in parent filesystem, if not ignoring
- * crossmount and this is the root of a cross-mounted
- * filesystem.
- */
- if (ignore_crossmnt == 0 &&
- dentry == exp->ex_path.mnt->mnt_root) {
- err = nfsd4_get_mounted_on_ino(exp, &ino);
- if (err)
- goto out_nfserr;
- }
- p = xdr_encode_hyper(p, ino);
- }
-#ifdef CONFIG_NFSD_PNFS
- if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
- status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
- if (status)
- goto out;
- }
-
- if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) {
- status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
- if (status)
- goto out;
- }
-
- if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(stat.blksize);
- }
-#endif /* CONFIG_NFSD_PNFS */
- if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
- u32 supp[3];
-
- memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
- supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
- supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
- supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;
-
- status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]);
- if (status)
- goto out;
- }
-
-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
- if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
- status = nfsd4_encode_security_label(xdr, rqstp, context,
- contextlen);
- if (status)
- goto out;
- }
-#endif
-
- if (bmval2 & FATTR4_WORD2_XATTR_SUPPORT) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- err = xattr_supports_user_prefix(d_inode(dentry));
- *p++ = cpu_to_be32(err == 0);
- }
-
*attrlen_p = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT);
status = nfs_ok;
out:
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
- if (context)
- security_release_secctx(context, contextlen);
+ if (args.context)
+ security_release_secctx(args.context, args.contextlen);
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
- kfree(acl);
+ kfree(args.acl);
if (tempfh) {
fh_put(tempfh);
kfree(tempfh);
@@ -3514,12 +3671,28 @@ __be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
__be32 ret;
svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2);
- ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp,
- ignore_crossmnt);
+ ret = nfsd4_encode_fattr4(rqstp, &xdr, fhp, exp, dentry, bmval,
+ ignore_crossmnt);
*p = xdr.p;
return ret;
}
+/*
+ * The buffer space for this field was reserved during a previous
+ * call to nfsd4_encode_entry4().
+ */
+static void nfsd4_encode_entry4_nfs_cookie4(const struct nfsd4_readdir *readdir,
+ u64 offset)
+{
+ __be64 cookie = cpu_to_be64(offset);
+ struct xdr_stream *xdr = readdir->xdr;
+
+ if (!readdir->cookie_offset)
+ return;
+ write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, &cookie,
+ sizeof(cookie));
+}
+
static inline int attributes_need_mount(u32 *bmval)
{
if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME))
@@ -3530,8 +3703,8 @@ static inline int attributes_need_mount(u32 *bmval)
}
static __be32
-nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
- const char *name, int namlen)
+nfsd4_encode_entry4_fattr(struct nfsd4_readdir *cd, const char *name,
+ int namlen)
{
struct svc_export *exp = cd->rd_fhp->fh_export;
struct dentry *dentry;
@@ -3574,33 +3747,34 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
}
out_encode:
- nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval,
- cd->rd_rqstp, ignore_crossmnt);
+ nfserr = nfsd4_encode_fattr4(cd->rd_rqstp, cd->xdr, NULL, exp, dentry,
+ cd->rd_bmval, ignore_crossmnt);
out_put:
dput(dentry);
exp_put(exp);
return nfserr;
}
-static __be32 *
-nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr)
+static __be32
+nfsd4_encode_entry4_rdattr_error(struct xdr_stream *xdr, __be32 nfserr)
{
- __be32 *p;
-
- p = xdr_reserve_space(xdr, 20);
- if (!p)
- return NULL;
- *p++ = htonl(2);
- *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */
- *p++ = htonl(0); /* bmval1 */
+ __be32 status;
- *p++ = htonl(4); /* attribute length */
- *p++ = nfserr; /* no htonl */
- return p;
+ /* attrmask */
+ status = nfsd4_encode_bitmap4(xdr, FATTR4_WORD0_RDATTR_ERROR, 0, 0);
+ if (status != nfs_ok)
+ return status;
+ /* attr_vals */
+ if (xdr_stream_encode_u32(xdr, XDR_UNIT) != XDR_UNIT)
+ return nfserr_resource;
+ /* rdattr_error */
+ if (xdr_stream_encode_be32(xdr, nfserr) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
}
static int
-nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
+nfsd4_encode_entry4(void *ccdv, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct readdir_cd *ccd = ccdv;
@@ -3611,8 +3785,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
u32 name_and_cookie;
int entry_bytes;
__be32 nfserr = nfserr_toosmall;
- __be64 wire_offset;
- __be32 *p;
/* In nfsv4, "." and ".." never make it onto the wire.. */
if (name && isdotent(name, namlen)) {
@@ -3620,24 +3792,19 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
return 0;
}
- if (cd->cookie_offset) {
- wire_offset = cpu_to_be64(offset);
- write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset,
- &wire_offset, 8);
- }
+ /* Encode the previous entry's cookie value */
+ nfsd4_encode_entry4_nfs_cookie4(cd, offset);
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ if (xdr_stream_encode_item_present(xdr) != XDR_UNIT)
goto fail;
- *p++ = xdr_one; /* mark entry present */
+
+ /* Reserve send buffer space for this entry's cookie value. */
cookie_offset = xdr->buf->len;
- p = xdr_reserve_space(xdr, 3*4 + namlen);
- if (!p)
+ if (nfsd4_encode_nfs_cookie4(xdr, OFFSET_MAX) != nfs_ok)
goto fail;
- p = xdr_encode_hyper(p, OFFSET_MAX); /* offset of next entry */
- p = xdr_encode_array(p, name, namlen); /* name length & name */
-
- nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen);
+ if (nfsd4_encode_component4(xdr, name, namlen) != nfs_ok)
+ goto fail;
+ nfserr = nfsd4_encode_entry4_fattr(cd, name, namlen);
switch (nfserr) {
case nfs_ok:
break;
@@ -3668,8 +3835,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
*/
if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR))
goto fail;
- p = nfsd4_encode_rdattr_error(xdr, nfserr);
- if (p == NULL) {
+ if (nfsd4_encode_entry4_rdattr_error(xdr, nfserr)) {
nfserr = nfserr_toosmall;
goto fail;
}
@@ -3727,18 +3893,26 @@ nfsd4_encode_clientid4(struct xdr_stream *xdr, const clientid_t *clientid)
return nfs_ok;
}
+/* This is a frequently-encoded item; open-coded for speed */
static __be32
-nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
+nfsd4_encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid)
{
__be32 *p;
- p = xdr_reserve_space(xdr, sizeof(stateid_t));
+ p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE);
if (!p)
return nfserr_resource;
*p++ = cpu_to_be32(sid->si_generation);
- p = xdr_encode_opaque_fixed(p, &sid->si_opaque,
- sizeof(stateid_opaque_t));
- return 0;
+ memcpy(p, &sid->si_opaque, sizeof(sid->si_opaque));
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_sessionid4(struct xdr_stream *xdr,
+ const struct nfs4_sessionid *sessionid)
+{
+ return nfsd4_encode_opaque_fixed(xdr, sessionid->data,
+ NFS4_MAX_SESSIONID_LEN);
}
static __be32
@@ -3747,14 +3921,14 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_access *access = &u->access;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
+ __be32 status;
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(access->ac_supported);
- *p++ = cpu_to_be32(access->ac_resp_access);
- return 0;
+ /* supported */
+ status = nfsd4_encode_uint32_t(xdr, access->ac_supported);
+ if (status != nfs_ok)
+ return status;
+ /* access */
+ return nfsd4_encode_uint32_t(xdr, access->ac_resp_access);
}
static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr,
@@ -3762,17 +3936,16 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
{
struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8);
- if (!p)
+ /* bctsr_sessid */
+ nfserr = nfsd4_encode_sessionid4(xdr, &bcts->sessionid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* bctsr_dir */
+ if (xdr_stream_encode_u32(xdr, bcts->dir) != XDR_UNIT)
return nfserr_resource;
- p = xdr_encode_opaque_fixed(p, bcts->sessionid.data,
- NFS4_MAX_SESSIONID_LEN);
- *p++ = cpu_to_be32(bcts->dir);
- /* Upshifting from TCP to RDMA is not supported */
- *p++ = cpu_to_be32(0);
- return 0;
+ /* bctsr_use_conn_in_rdma_mode */
+ return nfsd4_encode_bool(xdr, false);
}
static __be32
@@ -3782,7 +3955,8 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_close *close = &u->close;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_stateid(xdr, &close->cl_stateid);
+ /* open_stateid */
+ return nfsd4_encode_stateid4(xdr, &close->cl_stateid);
}
@@ -3802,11 +3976,13 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_create *create = &u->create;
struct xdr_stream *xdr = resp->xdr;
+ /* cinfo */
nfserr = nfsd4_encode_change_info4(xdr, &create->cr_cinfo);
if (nfserr)
return nfserr;
- return nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
- create->cr_bmval[1], create->cr_bmval[2]);
+ /* attrset */
+ return nfsd4_encode_bitmap4(xdr, create->cr_bmval[0],
+ create->cr_bmval[1], create->cr_bmval[2]);
}
static __be32
@@ -3817,65 +3993,56 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr,
struct svc_fh *fhp = getattr->ga_fhp;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry,
- getattr->ga_bmval, resp->rqstp, 0);
+ /* obj_attributes */
+ return nfsd4_encode_fattr4(resp->rqstp, xdr, fhp, fhp->fh_export,
+ fhp->fh_dentry, getattr->ga_bmval, 0);
}
static __be32
nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
- struct svc_fh **fhpp = &u->getfh;
struct xdr_stream *xdr = resp->xdr;
- struct svc_fh *fhp = *fhpp;
- unsigned int len;
- __be32 *p;
+ struct svc_fh *fhp = u->getfh;
- len = fhp->fh_handle.fh_size;
- p = xdr_reserve_space(xdr, len + 4);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw, len);
- return 0;
+ /* object */
+ return nfsd4_encode_nfs_fh4(xdr, &fhp->fh_handle);
}
-/*
-* Including all fields other than the name, a LOCK4denied structure requires
-* 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes.
-*/
static __be32
-nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld)
+nfsd4_encode_lock_owner4(struct xdr_stream *xdr, const clientid_t *clientid,
+ const struct xdr_netobj *owner)
{
- struct xdr_netobj *conf = &ld->ld_owner;
- __be32 *p;
+ __be32 status;
-again:
- p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len));
- if (!p) {
- /*
- * Don't fail to return the result just because we can't
- * return the conflicting open:
- */
- if (conf->len) {
- kfree(conf->data);
- conf->len = 0;
- conf->data = NULL;
- goto again;
- }
+ /* clientid */
+ status = nfsd4_encode_clientid4(xdr, clientid);
+ if (status != nfs_ok)
+ return status;
+ /* owner */
+ return nfsd4_encode_opaque(xdr, owner->data, owner->len);
+}
+
+static __be32
+nfsd4_encode_lock4denied(struct xdr_stream *xdr,
+ const struct nfsd4_lock_denied *ld)
+{
+ __be32 status;
+
+ /* offset */
+ status = nfsd4_encode_offset4(xdr, ld->ld_start);
+ if (status != nfs_ok)
+ return status;
+ /* length */
+ status = nfsd4_encode_length4(xdr, ld->ld_length);
+ if (status != nfs_ok)
+ return status;
+ /* locktype */
+ if (xdr_stream_encode_u32(xdr, ld->ld_type) != XDR_UNIT)
return nfserr_resource;
- }
- p = xdr_encode_hyper(p, ld->ld_start);
- p = xdr_encode_hyper(p, ld->ld_length);
- *p++ = cpu_to_be32(ld->ld_type);
- if (conf->len) {
- p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8);
- p = xdr_encode_opaque(p, conf->data, conf->len);
- kfree(conf->data);
- } else { /* non - nfsv4 lock in conflict, no clientid nor owner */
- p = xdr_encode_hyper(p, (u64)0); /* clientid */
- *p++ = cpu_to_be32(0); /* length of owner name */
- }
- return nfserr_denied;
+ /* owner */
+ return nfsd4_encode_lock_owner4(xdr, &ld->ld_clientid,
+ &ld->ld_owner);
}
static __be32
@@ -3884,13 +4051,21 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_lock *lock = &u->lock;
struct xdr_stream *xdr = resp->xdr;
+ __be32 status;
- if (!nfserr)
- nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid);
- else if (nfserr == nfserr_denied)
- nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied);
-
- return nfserr;
+ switch (nfserr) {
+ case nfs_ok:
+ /* resok4 */
+ status = nfsd4_encode_stateid4(xdr, &lock->lk_resp_stateid);
+ break;
+ case nfserr_denied:
+ /* denied */
+ status = nfsd4_encode_lock4denied(xdr, &lock->lk_denied);
+ break;
+ default:
+ return nfserr;
+ }
+ return status != nfs_ok ? status : nfserr;
}
static __be32
@@ -3899,9 +4074,14 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_lockt *lockt = &u->lockt;
struct xdr_stream *xdr = resp->xdr;
+ __be32 status;
- if (nfserr == nfserr_denied)
- nfsd4_encode_lock_denied(xdr, &lockt->lt_denied);
+ if (nfserr == nfserr_denied) {
+ /* denied */
+ status = nfsd4_encode_lock4denied(xdr, &lockt->lt_denied);
+ if (status != nfs_ok)
+ return status;
+ }
return nfserr;
}
@@ -3912,7 +4092,8 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_locku *locku = &u->locku;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_stateid(xdr, &locku->lu_stateid);
+ /* lock_stateid */
+ return nfsd4_encode_stateid4(xdr, &locku->lu_stateid);
}
@@ -3926,104 +4107,159 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfsd4_encode_change_info4(xdr, &link->li_cinfo);
}
+/*
+ * This implementation does not yet support returning an ACE in an
+ * OPEN that offers a delegation.
+ */
+static __be32
+nfsd4_encode_open_nfsace4(struct xdr_stream *xdr)
+{
+ __be32 status;
+
+ /* type */
+ status = nfsd4_encode_acetype4(xdr, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* flag */
+ status = nfsd4_encode_aceflag4(xdr, 0);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* access mask */
+ status = nfsd4_encode_acemask4(xdr, 0);
+ if (status != nfs_ok)
+ return nfserr_resource;
+ /* who - empty for now */
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
+}
static __be32
-nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+nfsd4_encode_open_read_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open)
{
- struct nfsd4_open *open = &u->open;
- struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
+ __be32 status;
- nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
- if (nfserr)
- return nfserr;
- nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo);
- if (nfserr)
- return nfserr;
- if (xdr_stream_encode_u32(xdr, open->op_rflags) < 0)
+ /* stateid */
+ status = nfsd4_encode_stateid4(xdr, &open->op_delegate_stateid);
+ if (status != nfs_ok)
+ return status;
+ /* recall */
+ status = nfsd4_encode_bool(xdr, open->op_recall);
+ if (status != nfs_ok)
+ return status;
+ /* permissions */
+ return nfsd4_encode_open_nfsace4(xdr);
+}
+
+static __be32
+nfsd4_encode_nfs_space_limit4(struct xdr_stream *xdr, u64 filesize)
+{
+ /* limitby */
+ if (xdr_stream_encode_u32(xdr, NFS4_LIMIT_SIZE) != XDR_UNIT)
return nfserr_resource;
+ /* filesize */
+ return nfsd4_encode_uint64_t(xdr, filesize);
+}
- nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1],
- open->op_bmval[2]);
- if (nfserr)
- return nfserr;
+static __be32
+nfsd4_encode_open_write_delegation4(struct xdr_stream *xdr,
+ struct nfsd4_open *open)
+{
+ __be32 status;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ /* stateid */
+ status = nfsd4_encode_stateid4(xdr, &open->op_delegate_stateid);
+ if (status != nfs_ok)
+ return status;
+ /* recall */
+ status = nfsd4_encode_bool(xdr, open->op_recall);
+ if (status != nfs_ok)
+ return status;
+ /* space_limit */
+ status = nfsd4_encode_nfs_space_limit4(xdr, 0);
+ if (status != nfs_ok)
+ return status;
+ return nfsd4_encode_open_nfsace4(xdr);
+}
+
+static __be32
+nfsd4_encode_open_none_delegation4(struct xdr_stream *xdr,
+ struct nfsd4_open *open)
+{
+ __be32 status = nfs_ok;
+
+ /* ond_why */
+ if (xdr_stream_encode_u32(xdr, open->op_why_no_deleg) != XDR_UNIT)
return nfserr_resource;
+ switch (open->op_why_no_deleg) {
+ case WND4_CONTENTION:
+ /* ond_server_will_push_deleg */
+ status = nfsd4_encode_bool(xdr, false);
+ break;
+ case WND4_RESOURCE:
+ /* ond_server_will_signal_avail */
+ status = nfsd4_encode_bool(xdr, false);
+ }
+ return status;
+}
+
+static __be32
+nfsd4_encode_open_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open)
+{
+ __be32 status;
- *p++ = cpu_to_be32(open->op_delegate_type);
+ /* delegation_type */
+ if (xdr_stream_encode_u32(xdr, open->op_delegate_type) != XDR_UNIT)
+ return nfserr_resource;
switch (open->op_delegate_type) {
case NFS4_OPEN_DELEGATE_NONE:
+ status = nfs_ok;
break;
case NFS4_OPEN_DELEGATE_READ:
- nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
- if (nfserr)
- return nfserr;
- p = xdr_reserve_space(xdr, 20);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(open->op_recall);
-
- /*
- * TODO: ACE's in delegations
- */
- *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */
+ /* read */
+ status = nfsd4_encode_open_read_delegation4(xdr, open);
break;
case NFS4_OPEN_DELEGATE_WRITE:
- nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
- if (nfserr)
- return nfserr;
-
- p = xdr_reserve_space(xdr, XDR_UNIT * 8);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(open->op_recall);
-
- /*
- * Always flush on close
- *
- * TODO: space_limit's in delegations
- */
- *p++ = cpu_to_be32(NFS4_LIMIT_SIZE);
- *p++ = xdr_zero;
- *p++ = xdr_zero;
-
- /*
- * TODO: ACE's in delegations
- */
- *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */
+ /* write */
+ status = nfsd4_encode_open_write_delegation4(xdr, open);
break;
- case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */
- switch (open->op_why_no_deleg) {
- case WND4_CONTENTION:
- case WND4_RESOURCE:
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(open->op_why_no_deleg);
- /* deleg signaling not supported yet: */
- *p++ = cpu_to_be32(0);
- break;
- default:
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(open->op_why_no_deleg);
- }
+ case NFS4_OPEN_DELEGATE_NONE_EXT:
+ /* od_whynone */
+ status = nfsd4_encode_open_none_delegation4(xdr, open);
break;
default:
- BUG();
+ status = nfserr_serverfault;
}
- /* XXX save filehandle here */
- return 0;
+
+ return status;
+}
+
+static __be32
+nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_open *open = &u->open;
+ struct xdr_stream *xdr = resp->xdr;
+
+ /* stateid */
+ nfserr = nfsd4_encode_stateid4(xdr, &open->op_stateid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* cinfo */
+ nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* rflags */
+ nfserr = nfsd4_encode_uint32_t(xdr, open->op_rflags);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* attrset */
+ nfserr = nfsd4_encode_bitmap4(xdr, open->op_bmval[0],
+ open->op_bmval[1], open->op_bmval[2]);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* delegation */
+ return nfsd4_encode_open_delegation4(xdr, open);
}
static __be32
@@ -4033,7 +4269,8 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_open_confirm *oc = &u->open_confirm;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid);
+ /* open_stateid */
+ return nfsd4_encode_stateid4(xdr, &oc->oc_resp_stateid);
}
static __be32
@@ -4043,7 +4280,8 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_open_downgrade *od = &u->open_downgrade;
struct xdr_stream *xdr = resp->xdr;
- return nfsd4_encode_stateid(xdr, &od->od_stateid);
+ /* open_stateid */
+ return nfsd4_encode_stateid4(xdr, &od->od_stateid);
}
/*
@@ -4113,6 +4351,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
struct file *file, unsigned long maxcount)
{
struct xdr_stream *xdr = resp->xdr;
+ unsigned int base = xdr->buf->page_len & ~PAGE_MASK;
unsigned int starting_len = xdr->buf->len;
__be32 zero = xdr_zero;
__be32 nfserr;
@@ -4121,8 +4360,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
return nfserr_resource;
nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file,
- read->rd_offset, &maxcount,
- xdr->buf->page_len & ~PAGE_MASK,
+ read->rd_offset, &maxcount, base,
&read->rd_eof);
read->rd_length = maxcount;
if (nfserr)
@@ -4227,90 +4465,83 @@ out_err:
return nfserr;
}
-static __be32
-nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+static __be32 nfsd4_encode_dirlist4(struct xdr_stream *xdr,
+ struct nfsd4_readdir *readdir,
+ u32 max_payload)
{
- struct nfsd4_readdir *readdir = &u->readdir;
- int maxcount;
- int bytes_left;
+ int bytes_left, maxcount, starting_len = xdr->buf->len;
loff_t offset;
- __be64 wire_offset;
- struct xdr_stream *xdr = resp->xdr;
- int starting_len = xdr->buf->len;
- __be32 *p;
-
- nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf);
- if (nfserr != nfs_ok)
- return nfserr;
+ __be32 status;
/*
* Number of bytes left for directory entries allowing for the
- * final 8 bytes of the readdir and a following failed op:
+ * final 8 bytes of the readdir and a following failed op.
*/
- bytes_left = xdr->buf->buflen - xdr->buf->len
- - COMPOUND_ERR_SLACK_SPACE - 8;
- if (bytes_left < 0) {
- nfserr = nfserr_resource;
- goto err_no_verf;
- }
- maxcount = svc_max_payload(resp->rqstp);
- maxcount = min_t(u32, readdir->rd_maxcount, maxcount);
+ bytes_left = xdr->buf->buflen - xdr->buf->len -
+ COMPOUND_ERR_SLACK_SPACE - XDR_UNIT * 2;
+ if (bytes_left < 0)
+ return nfserr_resource;
+ maxcount = min_t(u32, readdir->rd_maxcount, max_payload);
+
/*
- * Note the rfc defines rd_maxcount as the size of the
- * READDIR4resok structure, which includes the verifier above
- * and the 8 bytes encoded at the end of this function:
+ * The RFC defines rd_maxcount as the size of the
+ * READDIR4resok structure, which includes the verifier
+ * and the 8 bytes encoded at the end of this function.
*/
- if (maxcount < 16) {
- nfserr = nfserr_toosmall;
- goto err_no_verf;
- }
- maxcount = min_t(int, maxcount-16, bytes_left);
+ if (maxcount < XDR_UNIT * 4)
+ return nfserr_toosmall;
+ maxcount = min_t(int, maxcount - XDR_UNIT * 4, bytes_left);
- /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */
+ /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0 */
if (!readdir->rd_dircount)
- readdir->rd_dircount = svc_max_payload(resp->rqstp);
+ readdir->rd_dircount = max_payload;
+ /* *entries */
readdir->xdr = xdr;
readdir->rd_maxcount = maxcount;
readdir->common.err = 0;
readdir->cookie_offset = 0;
-
offset = readdir->rd_cookie;
- nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp,
- &offset,
- &readdir->common, nfsd4_encode_dirent);
- if (nfserr == nfs_ok &&
- readdir->common.err == nfserr_toosmall &&
- xdr->buf->len == starting_len + 8) {
- /* nothing encoded; which limit did we hit?: */
- if (maxcount - 16 < bytes_left)
- /* It was the fault of rd_maxcount: */
- nfserr = nfserr_toosmall;
- else
- /* We ran out of buffer space: */
- nfserr = nfserr_resource;
+ status = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, &offset,
+ &readdir->common, nfsd4_encode_entry4);
+ if (status)
+ return status;
+ if (readdir->common.err == nfserr_toosmall &&
+ xdr->buf->len == starting_len) {
+ /* No entries were encoded. Which limit did we hit? */
+ if (maxcount - XDR_UNIT * 4 < bytes_left)
+ /* It was the fault of rd_maxcount */
+ return nfserr_toosmall;
+ /* We ran out of buffer space */
+ return nfserr_resource;
}
- if (nfserr)
- goto err_no_verf;
+ /* Encode the final entry's cookie value */
+ nfsd4_encode_entry4_nfs_cookie4(readdir, offset);
+ /* No entries follow */
+ if (xdr_stream_encode_item_absent(xdr) != XDR_UNIT)
+ return nfserr_resource;
- if (readdir->cookie_offset) {
- wire_offset = cpu_to_be64(offset);
- write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset,
- &wire_offset, 8);
- }
+ /* eof */
+ return nfsd4_encode_bool(xdr, readdir->common.err == nfserr_eof);
+}
- p = xdr_reserve_space(xdr, 8);
- if (!p) {
- WARN_ON_ONCE(1);
- goto err_no_verf;
- }
- *p++ = 0; /* no more entries */
- *p++ = htonl(readdir->common.err == nfserr_eof);
+static __be32
+nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_readdir *readdir = &u->readdir;
+ struct xdr_stream *xdr = resp->xdr;
+ int starting_len = xdr->buf->len;
- return 0;
-err_no_verf:
- xdr_truncate_encode(xdr, starting_len);
+ /* cookieverf */
+ nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf);
+ if (nfserr != nfs_ok)
+ return nfserr;
+
+ /* reply */
+ nfserr = nfsd4_encode_dirlist4(xdr, readdir, svc_max_payload(resp->rqstp));
+ if (nfserr != nfs_ok)
+ xdr_truncate_encode(xdr, starting_len);
return nfserr;
}
@@ -4338,13 +4569,34 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr,
}
static __be32
+nfsd4_encode_rpcsec_gss_info(struct xdr_stream *xdr,
+ struct rpcsec_gss_info *info)
+{
+ __be32 status;
+
+ /* oid */
+ if (xdr_stream_encode_opaque(xdr, info->oid.data, info->oid.len) < 0)
+ return nfserr_resource;
+ /* qop */
+ status = nfsd4_encode_qop4(xdr, info->qop);
+ if (status != nfs_ok)
+ return status;
+ /* service */
+ if (xdr_stream_encode_u32(xdr, info->service) != XDR_UNIT)
+ return nfserr_resource;
+
+ return nfs_ok;
+}
+
+static __be32
nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
{
u32 i, nflavs, supported;
struct exp_flavor_info *flavs;
struct exp_flavor_info def_flavs[2];
- __be32 *p, *flavorsp;
static bool report = true;
+ __be32 *flavorsp;
+ __be32 status;
if (exp->ex_nflavors) {
flavs = exp->ex_flavors;
@@ -4367,10 +4619,9 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
}
supported = 0;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ flavorsp = xdr_reserve_space(xdr, XDR_UNIT);
+ if (!flavorsp)
return nfserr_resource;
- flavorsp = p++; /* to be backfilled later */
for (i = 0; i < nflavs; i++) {
rpc_authflavor_t pf = flavs[i].pseudoflavor;
@@ -4378,20 +4629,22 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
if (rpcauth_get_gssinfo(pf, &info) == 0) {
supported++;
- p = xdr_reserve_space(xdr, 4 + 4 +
- XDR_LEN(info.oid.len) + 4 + 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(RPC_AUTH_GSS);
- p = xdr_encode_opaque(p, info.oid.data, info.oid.len);
- *p++ = cpu_to_be32(info.qop);
- *p++ = cpu_to_be32(info.service);
+
+ /* flavor */
+ status = nfsd4_encode_uint32_t(xdr, RPC_AUTH_GSS);
+ if (status != nfs_ok)
+ return status;
+ /* flavor_info */
+ status = nfsd4_encode_rpcsec_gss_info(xdr, &info);
+ if (status != nfs_ok)
+ return status;
} else if (pf < RPC_AUTH_MAXFLAVOR) {
supported++;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(pf);
+
+ /* flavor */
+ status = nfsd4_encode_uint32_t(xdr, pf);
+ if (status != nfs_ok)
+ return status;
} else {
if (report)
pr_warn("NFS: SECINFO: security flavor %u "
@@ -4401,7 +4654,7 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
if (nflavs != supported)
report = false;
- *flavorsp = htonl(supported);
+ *flavorsp = cpu_to_be32(supported);
return 0;
}
@@ -4425,34 +4678,25 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp);
}
-/*
- * The SETATTR encode routine is special -- it always encodes a bitmap,
- * regardless of the error status.
- */
static __be32
nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_setattr *setattr = &u->setattr;
- struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
+ __be32 status;
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- return nfserr_resource;
- if (nfserr) {
- *p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0);
- *p++ = cpu_to_be32(0);
- }
- else {
- *p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(setattr->sa_bmval[0]);
- *p++ = cpu_to_be32(setattr->sa_bmval[1]);
- *p++ = cpu_to_be32(setattr->sa_bmval[2]);
+ switch (nfserr) {
+ case nfs_ok:
+ /* attrsset */
+ status = nfsd4_encode_bitmap4(resp->xdr, setattr->sa_bmval[0],
+ setattr->sa_bmval[1],
+ setattr->sa_bmval[2]);
+ break;
+ default:
+ /* attrsset */
+ status = nfsd4_encode_bitmap4(resp->xdr, 0, 0, 0);
}
- return nfserr;
+ return status != nfs_ok ? status : nfserr;
}
static __be32
@@ -4488,86 +4732,148 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_write *write = &u->write;
+ struct xdr_stream *xdr = resp->xdr;
- if (xdr_stream_encode_u32(resp->xdr, write->wr_bytes_written) < 0)
- return nfserr_resource;
- if (xdr_stream_encode_u32(resp->xdr, write->wr_how_written) < 0)
+ /* count */
+ nfserr = nfsd4_encode_count4(xdr, write->wr_bytes_written);
+ if (nfserr)
+ return nfserr;
+ /* committed */
+ if (xdr_stream_encode_u32(xdr, write->wr_how_written) != XDR_UNIT)
return nfserr_resource;
- return nfsd4_encode_verifier4(resp->xdr, &write->wr_verifier);
+ /* writeverf */
+ return nfsd4_encode_verifier4(xdr, &write->wr_verifier);
}
static __be32
-nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+nfsd4_encode_state_protect_ops4(struct xdr_stream *xdr,
+ struct nfsd4_exchange_id *exid)
{
- struct nfsd4_exchange_id *exid = &u->exchange_id;
- struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- char *major_id;
- char *server_scope;
- int major_id_sz;
- int server_scope_sz;
- uint64_t minor_id = 0;
- struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id);
+ __be32 status;
- major_id = nn->nfsd_name;
- major_id_sz = strlen(nn->nfsd_name);
- server_scope = nn->nfsd_name;
- server_scope_sz = strlen(nn->nfsd_name);
+ /* spo_must_enforce */
+ status = nfsd4_encode_bitmap4(xdr, exid->spo_must_enforce[0],
+ exid->spo_must_enforce[1],
+ exid->spo_must_enforce[2]);
+ if (status != nfs_ok)
+ return status;
+ /* spo_must_allow */
+ return nfsd4_encode_bitmap4(xdr, exid->spo_must_allow[0],
+ exid->spo_must_allow[1],
+ exid->spo_must_allow[2]);
+}
- if (nfsd4_encode_clientid4(xdr, &exid->clientid) != nfs_ok)
- return nfserr_resource;
- if (xdr_stream_encode_u32(xdr, exid->seqid) < 0)
- return nfserr_resource;
- if (xdr_stream_encode_u32(xdr, exid->flags) < 0)
- return nfserr_resource;
+static __be32
+nfsd4_encode_state_protect4_r(struct xdr_stream *xdr, struct nfsd4_exchange_id *exid)
+{
+ __be32 status;
- if (xdr_stream_encode_u32(xdr, exid->spa_how) < 0)
+ if (xdr_stream_encode_u32(xdr, exid->spa_how) != XDR_UNIT)
return nfserr_resource;
switch (exid->spa_how) {
case SP4_NONE:
+ status = nfs_ok;
break;
case SP4_MACH_CRED:
- /* spo_must_enforce bitmap: */
- nfserr = nfsd4_encode_bitmap(xdr,
- exid->spo_must_enforce[0],
- exid->spo_must_enforce[1],
- exid->spo_must_enforce[2]);
- if (nfserr)
- return nfserr;
- /* spo_must_allow bitmap: */
- nfserr = nfsd4_encode_bitmap(xdr,
- exid->spo_must_allow[0],
- exid->spo_must_allow[1],
- exid->spo_must_allow[2]);
- if (nfserr)
- return nfserr;
+ /* spr_mach_ops */
+ status = nfsd4_encode_state_protect_ops4(xdr, exid);
break;
default:
- WARN_ON_ONCE(1);
+ status = nfserr_serverfault;
}
+ return status;
+}
- p = xdr_reserve_space(xdr,
- 8 /* so_minor_id */ +
- 4 /* so_major_id.len */ +
- (XDR_QUADLEN(major_id_sz) * 4) +
- 4 /* eir_server_scope.len */ +
- (XDR_QUADLEN(server_scope_sz) * 4) +
- 4 /* eir_server_impl_id.count (0) */);
- if (!p)
+static __be32
+nfsd4_encode_server_owner4(struct xdr_stream *xdr, struct svc_rqst *rqstp)
+{
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ __be32 status;
+
+ /* so_minor_id */
+ status = nfsd4_encode_uint64_t(xdr, 0);
+ if (status != nfs_ok)
+ return status;
+ /* so_major_id */
+ return nfsd4_encode_opaque(xdr, nn->nfsd_name, strlen(nn->nfsd_name));
+}
+
+static __be32
+nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
+{
+ struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id);
+ struct nfsd4_exchange_id *exid = &u->exchange_id;
+ struct xdr_stream *xdr = resp->xdr;
+
+ /* eir_clientid */
+ nfserr = nfsd4_encode_clientid4(xdr, &exid->clientid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_sequenceid */
+ nfserr = nfsd4_encode_sequenceid4(xdr, exid->seqid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_flags */
+ nfserr = nfsd4_encode_uint32_t(xdr, exid->flags);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_state_protect */
+ nfserr = nfsd4_encode_state_protect4_r(xdr, exid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_server_owner */
+ nfserr = nfsd4_encode_server_owner4(xdr, resp->rqstp);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_server_scope */
+ nfserr = nfsd4_encode_opaque(xdr, nn->nfsd_name,
+ strlen(nn->nfsd_name));
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* eir_server_impl_id<1> */
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
return nfserr_resource;
- /* The server_owner struct */
- p = xdr_encode_hyper(p, minor_id); /* Minor id */
- /* major id */
- p = xdr_encode_opaque(p, major_id, major_id_sz);
+ return nfs_ok;
+}
- /* Server scope */
- p = xdr_encode_opaque(p, server_scope, server_scope_sz);
+static __be32
+nfsd4_encode_channel_attrs4(struct xdr_stream *xdr,
+ const struct nfsd4_channel_attrs *attrs)
+{
+ __be32 status;
- /* Implementation id */
- *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */
- return 0;
+ /* ca_headerpadsize */
+ status = nfsd4_encode_count4(xdr, 0);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxrequestsize */
+ status = nfsd4_encode_count4(xdr, attrs->maxreq_sz);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxresponsesize */
+ status = nfsd4_encode_count4(xdr, attrs->maxresp_sz);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxresponsesize_cached */
+ status = nfsd4_encode_count4(xdr, attrs->maxresp_cached);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxoperations */
+ status = nfsd4_encode_count4(xdr, attrs->maxops);
+ if (status != nfs_ok)
+ return status;
+ /* ca_maxrequests */
+ status = nfsd4_encode_count4(xdr, attrs->maxreqs);
+ if (status != nfs_ok)
+ return status;
+ /* ca_rdma_ird<1> */
+ if (xdr_stream_encode_u32(xdr, attrs->nr_rdma_attrs) != XDR_UNIT)
+ return nfserr_resource;
+ if (attrs->nr_rdma_attrs)
+ return nfsd4_encode_uint32_t(xdr, attrs->rdma_attrs);
+ return nfs_ok;
}
static __be32
@@ -4576,52 +4882,25 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_create_session *sess = &u->create_session;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
-
- p = xdr_reserve_space(xdr, 24);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_opaque_fixed(p, sess->sessionid.data,
- NFS4_MAX_SESSIONID_LEN);
- *p++ = cpu_to_be32(sess->seqid);
- *p++ = cpu_to_be32(sess->flags);
- p = xdr_reserve_space(xdr, 28);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(0); /* headerpadsz */
- *p++ = cpu_to_be32(sess->fore_channel.maxreq_sz);
- *p++ = cpu_to_be32(sess->fore_channel.maxresp_sz);
- *p++ = cpu_to_be32(sess->fore_channel.maxresp_cached);
- *p++ = cpu_to_be32(sess->fore_channel.maxops);
- *p++ = cpu_to_be32(sess->fore_channel.maxreqs);
- *p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs);
-
- if (sess->fore_channel.nr_rdma_attrs) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(sess->fore_channel.rdma_attrs);
- }
-
- p = xdr_reserve_space(xdr, 28);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(0); /* headerpadsz */
- *p++ = cpu_to_be32(sess->back_channel.maxreq_sz);
- *p++ = cpu_to_be32(sess->back_channel.maxresp_sz);
- *p++ = cpu_to_be32(sess->back_channel.maxresp_cached);
- *p++ = cpu_to_be32(sess->back_channel.maxops);
- *p++ = cpu_to_be32(sess->back_channel.maxreqs);
- *p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs);
-
- if (sess->back_channel.nr_rdma_attrs) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(sess->back_channel.rdma_attrs);
- }
- return 0;
+ /* csr_sessionid */
+ nfserr = nfsd4_encode_sessionid4(xdr, &sess->sessionid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* csr_sequence */
+ nfserr = nfsd4_encode_sequenceid4(xdr, sess->seqid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* csr_flags */
+ nfserr = nfsd4_encode_uint32_t(xdr, sess->flags);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* csr_fore_chan_attrs */
+ nfserr = nfsd4_encode_channel_attrs4(xdr, &sess->fore_channel);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* csr_back_chan_attrs */
+ return nfsd4_encode_channel_attrs4(xdr, &sess->back_channel);
}
static __be32
@@ -4630,22 +4909,35 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_sequence *seq = &u->sequence;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_opaque_fixed(p, seq->sessionid.data,
- NFS4_MAX_SESSIONID_LEN);
- *p++ = cpu_to_be32(seq->seqid);
- *p++ = cpu_to_be32(seq->slotid);
+ /* sr_sessionid */
+ nfserr = nfsd4_encode_sessionid4(xdr, &seq->sessionid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_sequenceid */
+ nfserr = nfsd4_encode_sequenceid4(xdr, seq->seqid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_slotid */
+ nfserr = nfsd4_encode_slotid4(xdr, seq->slotid);
+ if (nfserr != nfs_ok)
+ return nfserr;
/* Note slotid's are numbered from zero: */
- *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */
- *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */
- *p++ = cpu_to_be32(seq->status_flags);
+ /* sr_highest_slotid */
+ nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_target_highest_slotid */
+ nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_status_flags */
+ nfserr = nfsd4_encode_uint32_t(xdr, seq->status_flags);
+ if (nfserr != nfs_ok)
+ return nfserr;
resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */
- return 0;
+ return nfs_ok;
}
static __be32
@@ -4653,125 +4945,132 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_test_stateid *test_stateid = &u->test_stateid;
- struct xdr_stream *xdr = resp->xdr;
struct nfsd4_test_stateid_id *stateid, *next;
- __be32 *p;
+ struct xdr_stream *xdr = resp->xdr;
- p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids));
- if (!p)
+ /* tsr_status_codes<> */
+ if (xdr_stream_encode_u32(xdr, test_stateid->ts_num_ids) != XDR_UNIT)
return nfserr_resource;
- *p++ = htonl(test_stateid->ts_num_ids);
-
- list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) {
- *p++ = stateid->ts_id_status;
+ list_for_each_entry_safe(stateid, next,
+ &test_stateid->ts_stateid_list, ts_id_list) {
+ if (xdr_stream_encode_be32(xdr, stateid->ts_id_status) != XDR_UNIT)
+ return nfserr_resource;
}
-
- return 0;
+ return nfs_ok;
}
#ifdef CONFIG_NFSD_PNFS
static __be32
-nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+nfsd4_encode_device_addr4(struct xdr_stream *xdr,
+ const struct nfsd4_getdeviceinfo *gdev)
{
- struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo;
- struct xdr_stream *xdr = resp->xdr;
+ u32 needed_len, starting_len = xdr->buf->len;
const struct nfsd4_layout_ops *ops;
- u32 starting_len = xdr->buf->len, needed_len;
- __be32 *p;
+ __be32 status;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ /* da_layout_type */
+ if (xdr_stream_encode_u32(xdr, gdev->gd_layout_type) != XDR_UNIT)
return nfserr_resource;
-
- *p++ = cpu_to_be32(gdev->gd_layout_type);
-
+ /* da_addr_body */
ops = nfsd4_layout_ops[gdev->gd_layout_type];
- nfserr = ops->encode_getdeviceinfo(xdr, gdev);
- if (nfserr) {
+ status = ops->encode_getdeviceinfo(xdr, gdev);
+ if (status != nfs_ok) {
/*
- * We don't bother to burden the layout drivers with
- * enforcing gd_maxcount, just tell the client to
- * come back with a bigger buffer if it's not enough.
+ * Don't burden the layout drivers with enforcing
+ * gd_maxcount. Just tell the client to come back
+ * with a bigger buffer if it's not enough.
*/
- if (xdr->buf->len + 4 > gdev->gd_maxcount)
+ if (xdr->buf->len + XDR_UNIT > gdev->gd_maxcount)
goto toosmall;
- return nfserr;
+ return status;
}
- if (gdev->gd_notify_types) {
- p = xdr_reserve_space(xdr, 4 + 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(1); /* bitmap length */
- *p++ = cpu_to_be32(gdev->gd_notify_types);
- } else {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = 0;
- }
+ return nfs_ok;
- return 0;
toosmall:
- dprintk("%s: maxcount too small\n", __func__);
- needed_len = xdr->buf->len + 4 /* notifications */;
+ needed_len = xdr->buf->len + XDR_UNIT; /* notifications */
xdr_truncate_encode(xdr, starting_len);
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(needed_len);
+
+ status = nfsd4_encode_count4(xdr, needed_len);
+ if (status != nfs_ok)
+ return status;
return nfserr_toosmall;
}
static __be32
-nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
- struct nfsd4_layoutget *lgp = &u->layoutget;
+ struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo;
struct xdr_stream *xdr = resp->xdr;
- const struct nfsd4_layout_ops *ops;
- __be32 *p;
-
- p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(1); /* we always set return-on-close */
- *p++ = cpu_to_be32(lgp->lg_sid.si_generation);
- p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
- sizeof(stateid_opaque_t));
+ /* gdir_device_addr */
+ nfserr = nfsd4_encode_device_addr4(xdr, gdev);
+ if (nfserr)
+ return nfserr;
+ /* gdir_notification */
+ return nfsd4_encode_bitmap4(xdr, gdev->gd_notify_types, 0, 0);
+}
- *p++ = cpu_to_be32(1); /* we always return a single layout */
- p = xdr_encode_hyper(p, lgp->lg_seg.offset);
- p = xdr_encode_hyper(p, lgp->lg_seg.length);
- *p++ = cpu_to_be32(lgp->lg_seg.iomode);
- *p++ = cpu_to_be32(lgp->lg_layout_type);
+static __be32
+nfsd4_encode_layout4(struct xdr_stream *xdr, const struct nfsd4_layoutget *lgp)
+{
+ const struct nfsd4_layout_ops *ops = nfsd4_layout_ops[lgp->lg_layout_type];
+ __be32 status;
- ops = nfsd4_layout_ops[lgp->lg_layout_type];
+ /* lo_offset */
+ status = nfsd4_encode_offset4(xdr, lgp->lg_seg.offset);
+ if (status != nfs_ok)
+ return status;
+ /* lo_length */
+ status = nfsd4_encode_length4(xdr, lgp->lg_seg.length);
+ if (status != nfs_ok)
+ return status;
+ /* lo_iomode */
+ if (xdr_stream_encode_u32(xdr, lgp->lg_seg.iomode) != XDR_UNIT)
+ return nfserr_resource;
+ /* lo_content */
+ if (xdr_stream_encode_u32(xdr, lgp->lg_layout_type) != XDR_UNIT)
+ return nfserr_resource;
return ops->encode_layoutget(xdr, lgp);
}
static __be32
+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
+{
+ struct nfsd4_layoutget *lgp = &u->layoutget;
+ struct xdr_stream *xdr = resp->xdr;
+
+ /* logr_return_on_close */
+ nfserr = nfsd4_encode_bool(xdr, true);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* logr_stateid */
+ nfserr = nfsd4_encode_stateid4(xdr, &lgp->lg_sid);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* logr_layout<> */
+ if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT)
+ return nfserr_resource;
+ return nfsd4_encode_layout4(xdr, lgp);
+}
+
+static __be32
nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_layoutcommit *lcp = &u->layoutcommit;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(lcp->lc_size_chg);
- if (lcp->lc_size_chg) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_hyper(p, lcp->lc_newsize);
- }
-
- return 0;
+ /* ns_sizechanged */
+ nfserr = nfsd4_encode_bool(xdr, lcp->lc_size_chg);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ if (lcp->lc_size_chg)
+ /* ns_size */
+ return nfsd4_encode_length4(xdr, lcp->lc_newsize);
+ return nfs_ok;
}
static __be32
@@ -4780,103 +5079,108 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_layoutreturn *lrp = &u->layoutreturn;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(lrp->lrs_present);
+ /* lrs_present */
+ nfserr = nfsd4_encode_bool(xdr, lrp->lrs_present);
+ if (nfserr != nfs_ok)
+ return nfserr;
if (lrp->lrs_present)
- return nfsd4_encode_stateid(xdr, &lrp->lr_sid);
- return 0;
+ /* lrs_stateid */
+ return nfsd4_encode_stateid4(xdr, &lrp->lr_sid);
+ return nfs_ok;
}
#endif /* CONFIG_NFSD_PNFS */
static __be32
-nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
- struct nfsd42_write_res *write, bool sync)
+nfsd4_encode_write_response4(struct xdr_stream *xdr,
+ const struct nfsd4_copy *copy)
{
- __be32 *p;
- p = xdr_reserve_space(resp->xdr, 4);
- if (!p)
- return nfserr_resource;
+ const struct nfsd42_write_res *write = &copy->cp_res;
+ u32 count = nfsd4_copy_is_sync(copy) ? 0 : 1;
+ __be32 status;
- if (sync)
- *p++ = cpu_to_be32(0);
- else {
- __be32 nfserr;
- *p++ = cpu_to_be32(1);
- nfserr = nfsd4_encode_stateid(resp->xdr, &write->cb_stateid);
- if (nfserr)
- return nfserr;
+ /* wr_callback_id<1> */
+ if (xdr_stream_encode_u32(xdr, count) != XDR_UNIT)
+ return nfserr_resource;
+ if (count) {
+ status = nfsd4_encode_stateid4(xdr, &write->cb_stateid);
+ if (status != nfs_ok)
+ return status;
}
- p = xdr_reserve_space(resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
- if (!p)
+
+ /* wr_count */
+ status = nfsd4_encode_length4(xdr, write->wr_bytes_written);
+ if (status != nfs_ok)
+ return status;
+ /* wr_committed */
+ if (xdr_stream_encode_u32(xdr, write->wr_stable_how) != XDR_UNIT)
return nfserr_resource;
+ /* wr_writeverf */
+ return nfsd4_encode_verifier4(xdr, &write->wr_verifier);
+}
- p = xdr_encode_hyper(p, write->wr_bytes_written);
- *p++ = cpu_to_be32(write->wr_stable_how);
- p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
- NFS4_VERIFIER_SIZE);
- return nfs_ok;
+static __be32 nfsd4_encode_copy_requirements4(struct xdr_stream *xdr,
+ const struct nfsd4_copy *copy)
+{
+ __be32 status;
+
+ /* cr_consecutive */
+ status = nfsd4_encode_bool(xdr, true);
+ if (status != nfs_ok)
+ return status;
+ /* cr_synchronous */
+ return nfsd4_encode_bool(xdr, nfsd4_copy_is_sync(copy));
}
static __be32
-nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns)
+nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
- struct xdr_stream *xdr = resp->xdr;
- struct nfs42_netaddr *addr;
- __be32 *p;
+ struct nfsd4_copy *copy = &u->copy;
- p = xdr_reserve_space(xdr, 4);
- *p++ = cpu_to_be32(ns->nl4_type);
+ nfserr = nfsd4_encode_write_response4(resp->xdr, copy);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ return nfsd4_encode_copy_requirements4(resp->xdr, copy);
+}
+static __be32
+nfsd4_encode_netloc4(struct xdr_stream *xdr, const struct nl4_server *ns)
+{
+ __be32 status;
+
+ if (xdr_stream_encode_u32(xdr, ns->nl4_type) != XDR_UNIT)
+ return nfserr_resource;
switch (ns->nl4_type) {
case NL4_NETADDR:
- addr = &ns->u.nl4_addr;
-
- /* netid_len, netid, uaddr_len, uaddr (port included
- * in RPCBIND_MAXUADDRLEN)
- */
- p = xdr_reserve_space(xdr,
- 4 /* netid len */ +
- (XDR_QUADLEN(addr->netid_len) * 4) +
- 4 /* uaddr len */ +
- (XDR_QUADLEN(addr->addr_len) * 4));
- if (!p)
- return nfserr_resource;
-
- *p++ = cpu_to_be32(addr->netid_len);
- p = xdr_encode_opaque_fixed(p, addr->netid,
- addr->netid_len);
- *p++ = cpu_to_be32(addr->addr_len);
- p = xdr_encode_opaque_fixed(p, addr->addr,
- addr->addr_len);
+ /* nl_addr */
+ status = nfsd4_encode_netaddr4(xdr, &ns->u.nl4_addr);
break;
default:
- WARN_ON_ONCE(ns->nl4_type != NL4_NETADDR);
- return nfserr_inval;
+ status = nfserr_serverfault;
}
-
- return 0;
+ return status;
}
static __be32
-nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
+nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
- struct nfsd4_copy *copy = &u->copy;
- __be32 *p;
+ struct nfsd4_copy_notify *cn = &u->copy_notify;
+ struct xdr_stream *xdr = resp->xdr;
- nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
- nfsd4_copy_is_sync(copy));
+ /* cnr_lease_time */
+ nfserr = nfsd4_encode_nfstime4(xdr, &cn->cpn_lease_time);
if (nfserr)
return nfserr;
-
- p = xdr_reserve_space(resp->xdr, 4 + 4);
- *p++ = xdr_one; /* cr_consecutive */
- *p = nfsd4_copy_is_sync(copy) ? xdr_one : xdr_zero;
- return 0;
+ /* cnr_stateid */
+ nfserr = nfsd4_encode_stateid4(xdr, &cn->cpn_cnr_stateid);
+ if (nfserr)
+ return nfserr;
+ /* cnr_source_server<> */
+ if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT)
+ return nfserr_resource;
+ return nfsd4_encode_netloc4(xdr, cn->cpn_src);
}
static __be32
@@ -4885,14 +5189,15 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
{
struct nfsd4_offload_status *os = &u->offload_status;
struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
- p = xdr_reserve_space(xdr, 8 + 4);
- if (!p)
+ /* osr_count */
+ nfserr = nfsd4_encode_length4(xdr, os->count);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* osr_complete<1> */
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
return nfserr_resource;
- p = xdr_encode_hyper(p, os->count);
- *p++ = cpu_to_be32(0);
- return nfserr;
+ return nfs_ok;
}
static __be32
@@ -4970,53 +5275,18 @@ out:
}
static __be32
-nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
- union nfsd4_op_u *u)
-{
- struct nfsd4_copy_notify *cn = &u->copy_notify;
- struct xdr_stream *xdr = resp->xdr;
- __be32 *p;
-
- if (nfserr)
- return nfserr;
-
- /* 8 sec, 4 nsec */
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- return nfserr_resource;
-
- /* cnr_lease_time */
- p = xdr_encode_hyper(p, cn->cpn_sec);
- *p++ = cpu_to_be32(cn->cpn_nsec);
-
- /* cnr_stateid */
- nfserr = nfsd4_encode_stateid(xdr, &cn->cpn_cnr_stateid);
- if (nfserr)
- return nfserr;
-
- /* cnr_src.nl_nsvr */
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
-
- *p++ = cpu_to_be32(1);
-
- nfserr = nfsd42_encode_nl4_server(resp, cn->cpn_src);
- return nfserr;
-}
-
-static __be32
nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
struct nfsd4_seek *seek = &u->seek;
- __be32 *p;
-
- p = xdr_reserve_space(resp->xdr, 4 + 8);
- *p++ = cpu_to_be32(seek->seek_eof);
- p = xdr_encode_hyper(p, seek->seek_pos);
+ struct xdr_stream *xdr = resp->xdr;
- return 0;
+ /* sr_eof */
+ nfserr = nfsd4_encode_bool(xdr, seek->seek_eof);
+ if (nfserr != nfs_ok)
+ return nfserr;
+ /* sr_offset */
+ return nfsd4_encode_offset4(xdr, seek->seek_pos);
}
static __be32
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7ed02fb88a36..3e15b72f421d 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -26,6 +26,7 @@
#include "pnfs.h"
#include "filecache.h"
#include "trace.h"
+#include "netlink.h"
/*
* We have a single directory with several nodes in it.
@@ -1132,7 +1133,7 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode)
/* Following advice from simple_fill_super documentation: */
inode->i_ino = iunique(sb, NFSD_MaxReserved);
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
switch (mode & S_IFMT) {
case S_IFDIR:
inode->i_fop = &simple_dir_operations;
@@ -1496,6 +1497,203 @@ static int create_proc_exports_entry(void)
unsigned int nfsd_net_id;
/**
+ * nfsd_nl_rpc_status_get_start - Prepare rpc_status_get dumpit
+ * @cb: netlink metadata and command arguments
+ *
+ * Return values:
+ * %0: The rpc_status_get command may proceed
+ * %-ENODEV: There is no NFSD running in this namespace
+ */
+int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb)
+{
+ struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id);
+ int ret = -ENODEV;
+
+ mutex_lock(&nfsd_mutex);
+ if (nn->nfsd_serv) {
+ svc_get(nn->nfsd_serv);
+ ret = 0;
+ }
+ mutex_unlock(&nfsd_mutex);
+
+ return ret;
+}
+
+static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nfsd_genl_rqstp *rqstp)
+{
+ void *hdr;
+ u32 i;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &nfsd_nl_family, 0, NFSD_CMD_RPC_STATUS_GET);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, rqstp->rq_xid) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, rqstp->rq_flags) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, rqstp->rq_prog) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, rqstp->rq_proc) ||
+ nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, rqstp->rq_vers) ||
+ nla_put_s64(skb, NFSD_A_RPC_STATUS_SERVICE_TIME,
+ ktime_to_us(rqstp->rq_stime),
+ NFSD_A_RPC_STATUS_PAD))
+ return -ENOBUFS;
+
+ switch (rqstp->rq_saddr.sa_family) {
+ case AF_INET: {
+ const struct sockaddr_in *s_in, *d_in;
+
+ s_in = (const struct sockaddr_in *)&rqstp->rq_saddr;
+ d_in = (const struct sockaddr_in *)&rqstp->rq_daddr;
+ if (nla_put_in_addr(skb, NFSD_A_RPC_STATUS_SADDR4,
+ s_in->sin_addr.s_addr) ||
+ nla_put_in_addr(skb, NFSD_A_RPC_STATUS_DADDR4,
+ d_in->sin_addr.s_addr) ||
+ nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT,
+ s_in->sin_port) ||
+ nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT,
+ d_in->sin_port))
+ return -ENOBUFS;
+ break;
+ }
+ case AF_INET6: {
+ const struct sockaddr_in6 *s_in, *d_in;
+
+ s_in = (const struct sockaddr_in6 *)&rqstp->rq_saddr;
+ d_in = (const struct sockaddr_in6 *)&rqstp->rq_daddr;
+ if (nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_SADDR6,
+ &s_in->sin6_addr) ||
+ nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_DADDR6,
+ &d_in->sin6_addr) ||
+ nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT,
+ s_in->sin6_port) ||
+ nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT,
+ d_in->sin6_port))
+ return -ENOBUFS;
+ break;
+ }
+ }
+
+ for (i = 0; i < rqstp->rq_opcnt; i++)
+ if (nla_put_u32(skb, NFSD_A_RPC_STATUS_COMPOUND_OPS,
+ rqstp->rq_opnum[i]))
+ return -ENOBUFS;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+}
+
+/**
+ * nfsd_nl_rpc_status_get_dumpit - Handle rpc_status_get dumpit
+ * @skb: reply buffer
+ * @cb: netlink metadata and command arguments
+ *
+ * Returns the size of the reply or a negative errno.
+ */
+int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id);
+ int i, ret, rqstp_index = 0;
+
+ rcu_read_lock();
+
+ for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) {
+ struct svc_rqst *rqstp;
+
+ if (i < cb->args[0]) /* already consumed */
+ continue;
+
+ rqstp_index = 0;
+ list_for_each_entry_rcu(rqstp,
+ &nn->nfsd_serv->sv_pools[i].sp_all_threads,
+ rq_all) {
+ struct nfsd_genl_rqstp genl_rqstp;
+ unsigned int status_counter;
+
+ if (rqstp_index++ < cb->args[1]) /* already consumed */
+ continue;
+ /*
+ * Acquire rq_status_counter before parsing the rqst
+ * fields. rq_status_counter is set to an odd value in
+ * order to notify the consumers the rqstp fields are
+ * meaningful.
+ */
+ status_counter =
+ smp_load_acquire(&rqstp->rq_status_counter);
+ if (!(status_counter & 1))
+ continue;
+
+ genl_rqstp.rq_xid = rqstp->rq_xid;
+ genl_rqstp.rq_flags = rqstp->rq_flags;
+ genl_rqstp.rq_vers = rqstp->rq_vers;
+ genl_rqstp.rq_prog = rqstp->rq_prog;
+ genl_rqstp.rq_proc = rqstp->rq_proc;
+ genl_rqstp.rq_stime = rqstp->rq_stime;
+ genl_rqstp.rq_opcnt = 0;
+ memcpy(&genl_rqstp.rq_daddr, svc_daddr(rqstp),
+ sizeof(struct sockaddr));
+ memcpy(&genl_rqstp.rq_saddr, svc_addr(rqstp),
+ sizeof(struct sockaddr));
+
+#ifdef CONFIG_NFSD_V4
+ if (rqstp->rq_vers == NFS4_VERSION &&
+ rqstp->rq_proc == NFSPROC4_COMPOUND) {
+ /* NFSv4 compound */
+ struct nfsd4_compoundargs *args;
+ int j;
+
+ args = rqstp->rq_argp;
+ genl_rqstp.rq_opcnt = args->opcnt;
+ for (j = 0; j < genl_rqstp.rq_opcnt; j++)
+ genl_rqstp.rq_opnum[j] =
+ args->ops[j].opnum;
+ }
+#endif /* CONFIG_NFSD_V4 */
+
+ /*
+ * Acquire rq_status_counter before reporting the rqst
+ * fields to the user.
+ */
+ if (smp_load_acquire(&rqstp->rq_status_counter) !=
+ status_counter)
+ continue;
+
+ ret = nfsd_genl_rpc_status_compose_msg(skb, cb,
+ &genl_rqstp);
+ if (ret)
+ goto out;
+ }
+ }
+
+ cb->args[0] = i;
+ cb->args[1] = rqstp_index;
+ ret = skb->len;
+out:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/**
+ * nfsd_nl_rpc_status_get_done - rpc_status_get dumpit post-processing
+ * @cb: netlink metadata and command arguments
+ *
+ * Return values:
+ * %0: Success
+ */
+int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb)
+{
+ mutex_lock(&nfsd_mutex);
+ nfsd_put(sock_net(cb->skb->sk));
+ mutex_unlock(&nfsd_mutex);
+
+ return 0;
+}
+
+/**
* nfsd_net_init - Prepare the nfsd_net portion of a new net namespace
* @net: a freshly-created network namespace
*
@@ -1589,6 +1787,10 @@ static int __init init_nfsd(void)
retval = register_filesystem(&nfsd_fs_type);
if (retval)
goto out_free_all;
+ retval = genl_register_family(&nfsd_nl_family);
+ if (retval)
+ goto out_free_all;
+
return 0;
out_free_all:
nfsd4_destroy_laundry_wq();
@@ -1613,6 +1815,7 @@ out_free_slabs:
static void __exit exit_nfsd(void)
{
+ genl_unregister_family(&nfsd_nl_family);
unregister_filesystem(&nfsd_fs_type);
nfsd4_destroy_laundry_wq();
unregister_cld_notifier();
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 11c14faa6c67..f5ff42f41ee7 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -62,6 +62,23 @@ struct readdir_cd {
__be32 err; /* 0, nfserr, or nfserr_eof */
};
+/* Maximum number of operations per session compound */
+#define NFSD_MAX_OPS_PER_COMPOUND 50
+
+struct nfsd_genl_rqstp {
+ struct sockaddr rq_daddr;
+ struct sockaddr rq_saddr;
+ unsigned long rq_flags;
+ ktime_t rq_stime;
+ __be32 rq_xid;
+ u32 rq_vers;
+ u32 rq_prog;
+ u32 rq_proc;
+
+ /* NFSv4 compound */
+ u32 rq_opcnt;
+ u32 rq_opnum[NFSD_MAX_OPS_PER_COMPOUND];
+};
extern struct svc_program nfsd_program;
extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4;
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 355bf0db3235..dbfa0ac13564 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -771,7 +771,7 @@ enum fsid_source fsid_source(const struct svc_fh *fhp)
* assume that the new change attr is always logged to stable storage in some
* fashion before the results can be seen.
*/
-u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode)
+u64 nfsd4_change_attribute(const struct kstat *stat, const struct inode *inode)
{
u64 chattr;
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 40426f899e76..6ebdf7ea27bf 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -293,7 +293,8 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
fhp->fh_pre_saved = false;
}
-u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode);
+u64 nfsd4_change_attribute(const struct kstat *stat,
+ const struct inode *inode);
__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp);
__be32 fh_fill_post_attrs(struct svc_fh *fhp);
__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 1582af33e204..d6122bb2d167 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -572,7 +572,6 @@ static void nfsd_last_thread(struct net *net)
return;
nfsd_shutdown_net(net);
- pr_info("nfsd: last server has exited, flushing export cache\n");
nfsd_export_flush(net);
}
@@ -713,14 +712,13 @@ int nfsd_nrpools(struct net *net)
int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
{
- int i = 0;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct svc_serv *serv = nn->nfsd_serv;
+ int i;
- if (nn->nfsd_serv != NULL) {
- for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++)
- nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads;
- }
-
+ if (serv)
+ for (i = 0; i < serv->sv_nrpools && i < n; i++)
+ nthreads[i] = atomic_read(&serv->sv_pools[i].sp_nrthreads);
return 0;
}
@@ -787,7 +785,6 @@ int
nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
{
int error;
- bool nfsd_up_before;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct svc_serv *serv;
@@ -807,8 +804,6 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
error = nfsd_create_serv(net);
if (error)
goto out;
-
- nfsd_up_before = nn->nfsd_net_up;
serv = nn->nfsd_serv;
error = nfsd_startup_net(net, cred);
@@ -816,17 +811,15 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
goto out_put;
error = svc_set_num_threads(serv, NULL, nrservs);
if (error)
- goto out_shutdown;
+ goto out_put;
error = serv->sv_nrthreads;
- if (error == 0)
- nfsd_last_thread(net);
-out_shutdown:
- if (error < 0 && !nfsd_up_before)
- nfsd_shutdown_net(net);
out_put:
/* Threads now hold service active */
if (xchg(&nn->keep_active, 0))
svc_put(serv);
+
+ if (serv->sv_nrthreads == 0)
+ nfsd_last_thread(net);
svc_put(serv);
out:
mutex_unlock(&nfsd_mutex);
@@ -957,7 +950,7 @@ nfsd(void *vrqstp)
/*
* The main request loop
*/
- while (!kthread_should_stop()) {
+ while (!svc_thread_should_stop(rqstp)) {
/* Update sv_maxconn if it has changed */
rqstp->rq_server->sv_maxconn = nn->max_connections;
@@ -998,6 +991,15 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
goto out_decode_err;
+ /*
+ * Release rq_status_counter setting it to an odd value after the rpc
+ * request has been properly parsed. rq_status_counter is used to
+ * notify the consumers if the rqstp fields are stable
+ * (rq_status_counter is odd) or not meaningful (rq_status_counter
+ * is even).
+ */
+ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1);
+
rp = NULL;
switch (nfsd_cache_lookup(rqstp, &rp)) {
case RC_DOIT:
@@ -1015,6 +1017,12 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
goto out_encode_err;
+ /*
+ * Release rq_status_counter setting it to an even value after the rpc
+ * request has been properly processed.
+ */
+ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1);
+
nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
out_cached_reply:
return 1;
@@ -1082,11 +1090,12 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
int nfsd_pool_stats_release(struct inode *inode, struct file *file)
{
+ struct seq_file *seq = file->private_data;
+ struct svc_serv *serv = seq->private;
int ret = seq_release(inode, file);
- struct net *net = inode->i_sb->s_fs_info;
mutex_lock(&nfsd_mutex);
- nfsd_put(net);
+ svc_put(serv);
mutex_unlock(&nfsd_mutex);
return ret;
}
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index 4f4282d4eeca..de1e0dfed06a 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -27,12 +27,12 @@ struct nfsd4_layout_ops {
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdevp);
__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
- struct nfsd4_getdeviceinfo *gdevp);
+ const struct nfsd4_getdeviceinfo *gdevp);
__be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
struct nfsd4_layoutget *lgp);
- __be32 (*encode_layoutget)(struct xdr_stream *,
- struct nfsd4_layoutget *lgp);
+ __be32 (*encode_layoutget)(struct xdr_stream *xdr,
+ const struct nfsd4_layoutget *lgp);
__be32 (*proc_layoutcommit)(struct inode *inode,
struct nfsd4_layoutcommit *lcp);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index cbddcf484dba..f96eaa8e9413 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -117,6 +117,24 @@ struct nfs4_cpntf_state {
time64_t cpntf_time; /* last time stateid used */
};
+struct nfs4_cb_fattr {
+ struct nfsd4_callback ncf_getattr;
+ u32 ncf_cb_status;
+ u32 ncf_cb_bmap[1];
+
+ /* from CB_GETATTR reply */
+ u64 ncf_cb_change;
+ u64 ncf_cb_fsize;
+
+ unsigned long ncf_cb_flags;
+ bool ncf_file_modified;
+ u64 ncf_initial_cinfo;
+ u64 ncf_cur_fsize;
+};
+
+/* bits for ncf_cb_flags */
+#define CB_GETATTR_BUSY 0
+
/*
* Represents a delegation stateid. The nfs4_client holds references to these
* and they are put when it is being destroyed or when the delegation is
@@ -150,6 +168,9 @@ struct nfs4_delegation {
int dl_retries;
struct nfsd4_callback dl_recall;
bool dl_recalled;
+
+ /* for CB_GETATTR */
+ struct nfs4_cb_fattr dl_cb_fattr;
};
#define cb_to_delegation(cb) \
@@ -174,8 +195,6 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
/* Maximum number of slots per session. 160 is useful for long haul TCP */
#define NFSD_MAX_SLOTS_PER_SESSION 160
-/* Maximum number of operations per session compound */
-#define NFSD_MAX_OPS_PER_COMPOUND 50
/* Maximum session per slot cache size */
#define NFSD_SLOT_CACHE_SIZE 2048
/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
@@ -642,6 +661,7 @@ enum nfsd4_cb_op {
NFSPROC4_CLNT_CB_SEQUENCE,
NFSPROC4_CLNT_CB_NOTIFY_LOCK,
NFSPROC4_CLNT_CB_RECALL_ANY,
+ NFSPROC4_CLNT_CB_GETATTR,
};
/* Returns true iff a is later than b: */
@@ -734,5 +754,6 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
}
extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
- struct inode *inode);
+ struct inode *inode, bool *file_modified, u64 *size);
+extern void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf);
#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 63797635e1c3..12d79f5d4eb1 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -60,7 +60,7 @@ static int nfsd_show(struct seq_file *seq, void *v)
#ifdef CONFIG_NFSD_V4
/* Show count for individual nfsv4 operations */
/* Writing operation numbers 0 1 2 also for maintaining uniformity */
- seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1);
+ seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1);
for (i = 0; i <= LAST_NFS4_OP; i++) {
seq_printf(seq, " %lld",
percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)]));
@@ -76,7 +76,7 @@ static int nfsd_show(struct seq_file *seq, void *v)
DEFINE_PROC_SHOW_ATTRIBUTE(nfsd);
-int nfsd_percpu_counters_init(struct percpu_counter counters[], int num)
+int nfsd_percpu_counters_init(struct percpu_counter *counters, int num)
{
int i, err = 0;
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index cf5524e7ca06..14f50c660b61 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -37,9 +37,9 @@ extern struct nfsd_stats nfsdstats;
extern struct svc_stat nfsd_svcstats;
-int nfsd_percpu_counters_init(struct percpu_counter counters[], int num);
-void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num);
-void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num);
+int nfsd_percpu_counters_init(struct percpu_counter *counters, int num);
+void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num);
+void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num);
int nfsd_stat_init(void);
void nfsd_stat_shutdown(void);
@@ -61,22 +61,22 @@ static inline void nfsd_stats_rc_nocache_inc(void)
static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp)
{
percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]);
- if (exp)
- percpu_counter_inc(&exp->ex_stats.counter[EXP_STATS_FH_STALE]);
+ if (exp && exp->ex_stats)
+ percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]);
}
static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount)
{
percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount);
- if (exp)
- percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_READ], amount);
+ if (exp && exp->ex_stats)
+ percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount);
}
static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount)
{
percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount);
- if (exp)
- percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_WRITE], amount);
+ if (exp && exp->ex_stats)
+ percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount);
}
static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn)
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 803904348871..fbc0ccb40424 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1863,6 +1863,93 @@ TRACE_EVENT(nfsd_end_grace,
)
);
+DECLARE_EVENT_CLASS(nfsd_copy_class,
+ TP_PROTO(
+ const struct nfsd4_copy *copy
+ ),
+ TP_ARGS(copy),
+ TP_STRUCT__entry(
+ __field(bool, intra)
+ __field(bool, async)
+ __field(u32, src_cl_boot)
+ __field(u32, src_cl_id)
+ __field(u32, src_so_id)
+ __field(u32, src_si_generation)
+ __field(u32, dst_cl_boot)
+ __field(u32, dst_cl_id)
+ __field(u32, dst_so_id)
+ __field(u32, dst_si_generation)
+ __field(u64, src_cp_pos)
+ __field(u64, dst_cp_pos)
+ __field(u64, cp_count)
+ __sockaddr(addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ const stateid_t *src_stp = &copy->cp_src_stateid;
+ const stateid_t *dst_stp = &copy->cp_dst_stateid;
+
+ __entry->intra = test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
+ __entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+ __entry->src_cl_boot = src_stp->si_opaque.so_clid.cl_boot;
+ __entry->src_cl_id = src_stp->si_opaque.so_clid.cl_id;
+ __entry->src_so_id = src_stp->si_opaque.so_id;
+ __entry->src_si_generation = src_stp->si_generation;
+ __entry->dst_cl_boot = dst_stp->si_opaque.so_clid.cl_boot;
+ __entry->dst_cl_id = dst_stp->si_opaque.so_clid.cl_id;
+ __entry->dst_so_id = dst_stp->si_opaque.so_id;
+ __entry->dst_si_generation = dst_stp->si_generation;
+ __entry->src_cp_pos = copy->cp_src_pos;
+ __entry->dst_cp_pos = copy->cp_dst_pos;
+ __entry->cp_count = copy->cp_count;
+ __assign_sockaddr(addr, &copy->cp_clp->cl_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("client=%pISpc intra=%d async=%d "
+ "src_stateid[si_generation:0x%x cl_boot:0x%x cl_id:0x%x so_id:0x%x] "
+ "dst_stateid[si_generation:0x%x cl_boot:0x%x cl_id:0x%x so_id:0x%x] "
+ "cp_src_pos=%llu cp_dst_pos=%llu cp_count=%llu",
+ __get_sockaddr(addr), __entry->intra, __entry->async,
+ __entry->src_si_generation, __entry->src_cl_boot,
+ __entry->src_cl_id, __entry->src_so_id,
+ __entry->dst_si_generation, __entry->dst_cl_boot,
+ __entry->dst_cl_id, __entry->dst_so_id,
+ __entry->src_cp_pos, __entry->dst_cp_pos, __entry->cp_count
+ )
+);
+
+#define DEFINE_COPY_EVENT(name) \
+DEFINE_EVENT(nfsd_copy_class, nfsd_copy_##name, \
+ TP_PROTO(const struct nfsd4_copy *copy), \
+ TP_ARGS(copy))
+
+DEFINE_COPY_EVENT(inter);
+DEFINE_COPY_EVENT(intra);
+DEFINE_COPY_EVENT(do_async);
+
+TRACE_EVENT(nfsd_copy_done,
+ TP_PROTO(
+ const struct nfsd4_copy *copy,
+ __be32 status
+ ),
+ TP_ARGS(copy, status),
+ TP_STRUCT__entry(
+ __field(int, status)
+ __field(bool, intra)
+ __field(bool, async)
+ __sockaddr(addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ __entry->status = be32_to_cpu(status);
+ __entry->intra = test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
+ __entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+ __assign_sockaddr(addr, &copy->cp_clp->cl_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("addr=%pISpc status=%d intra=%d async=%d ",
+ __get_sockaddr(addr), __entry->status, __entry->intra, __entry->async
+ )
+);
+
#endif /* _NFSD_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 48260cf68fde..fbbea7498f02 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -337,6 +337,24 @@ out:
return err;
}
+static void
+commit_reset_write_verifier(struct nfsd_net *nn, struct svc_rqst *rqstp,
+ int err)
+{
+ switch (err) {
+ case -EAGAIN:
+ case -ESTALE:
+ /*
+ * Neither of these are the result of a problem with
+ * durable storage, so avoid a write verifier reset.
+ */
+ break;
+ default:
+ nfsd_reset_write_verifier(nn);
+ trace_nfsd_writeverf_reset(nn, rqstp, err);
+ }
+}
+
/*
* Commit metadata changes to stable storage.
*/
@@ -520,7 +538,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
nfsd_sanitize_attrs(inode, iap);
- if (check_guard && guardtime != inode_get_ctime(inode).tv_sec)
+ if (check_guard && guardtime != inode_get_ctime_sec(inode))
return nfserr_notsync;
/*
@@ -647,8 +665,7 @@ __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
&nfsd4_get_cstate(rqstp)->current_fh,
dst_pos,
count, status);
- nfsd_reset_write_verifier(nn);
- trace_nfsd_writeverf_reset(nn, rqstp, status);
+ commit_reset_write_verifier(nn, rqstp, status);
ret = nfserrno(status);
}
}
@@ -823,7 +840,7 @@ int nfsd_open_break_lease(struct inode *inode, int access)
* and additional flags.
* N.B. After this call fhp needs an fh_put
*/
-static __be32
+static int
__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
int may_flags, struct file **filp)
{
@@ -831,14 +848,12 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
struct inode *inode;
struct file *file;
int flags = O_RDONLY|O_LARGEFILE;
- __be32 err;
- int host_err = 0;
+ int host_err = -EPERM;
path.mnt = fhp->fh_export->ex_path.mnt;
path.dentry = fhp->fh_dentry;
inode = d_inode(path.dentry);
- err = nfserr_perm;
if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE))
goto out;
@@ -847,7 +862,7 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
host_err = nfsd_open_break_lease(inode, may_flags);
if (host_err) /* NOMEM or WOULDBLOCK */
- goto out_nfserr;
+ goto out;
if (may_flags & NFSD_MAY_WRITE) {
if (may_flags & NFSD_MAY_READ)
@@ -859,13 +874,13 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
file = dentry_open(&path, flags, current_cred());
if (IS_ERR(file)) {
host_err = PTR_ERR(file);
- goto out_nfserr;
+ goto out;
}
host_err = ima_file_check(file, may_flags);
if (host_err) {
fput(file);
- goto out_nfserr;
+ goto out;
}
if (may_flags & NFSD_MAY_64BIT_COOKIE)
@@ -874,10 +889,8 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
file->f_mode |= FMODE_32BITHASH;
*filp = file;
-out_nfserr:
- err = nfserrno(host_err);
out:
- return err;
+ return host_err;
}
__be32
@@ -885,6 +898,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
int may_flags, struct file **filp)
{
__be32 err;
+ int host_err;
bool retried = false;
validate_process_creds();
@@ -904,12 +918,13 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
retry:
err = fh_verify(rqstp, fhp, type, may_flags);
if (!err) {
- err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
- if (err == nfserr_stale && !retried) {
+ host_err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
+ if (host_err == -EOPENSTALE && !retried) {
retried = true;
fh_put(fhp);
goto retry;
}
+ err = nfserrno(host_err);
}
validate_process_creds();
return err;
@@ -922,13 +937,13 @@ retry:
* @may_flags: internal permission flags
* @filp: OUT: open "struct file *"
*
- * Returns an nfsstat value in network byte order.
+ * Returns zero on success, or a negative errno value.
*/
-__be32
+int
nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
struct file **filp)
{
- __be32 err;
+ int err;
validate_process_creds();
err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
@@ -1172,8 +1187,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
host_err = vfs_iter_write(file, &iter, &pos, flags);
file_end_write(file);
if (host_err < 0) {
- nfsd_reset_write_verifier(nn);
- trace_nfsd_writeverf_reset(nn, rqstp, host_err);
+ commit_reset_write_verifier(nn, rqstp, host_err);
goto out_nfserr;
}
*cnt = host_err;
@@ -1185,10 +1199,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
if (stable && use_wgather) {
host_err = wait_for_concurrent_writes(file);
- if (host_err < 0) {
- nfsd_reset_write_verifier(nn);
- trace_nfsd_writeverf_reset(nn, rqstp, host_err);
- }
+ if (host_err < 0)
+ commit_reset_write_verifier(nn, rqstp, host_err);
}
out_nfserr:
@@ -1331,8 +1343,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
err = nfserr_notsupp;
break;
default:
- nfsd_reset_write_verifier(nn);
- trace_nfsd_writeverf_reset(nn, rqstp, err2);
+ commit_reset_write_verifier(nn, rqstp, err2);
err = nfserrno(err2);
}
} else
@@ -1788,6 +1799,12 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
goto out;
+ err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
+ if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
+ goto out;
+ if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
+ goto out;
+
retry:
host_err = fh_want_write(ffhp);
if (host_err) {
@@ -1823,12 +1840,6 @@ retry:
if (ndentry == trap)
goto out_dput_new;
- host_err = -EXDEV;
- if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
- goto out_dput_new;
- if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
- goto out_dput_new;
-
if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) &&
nfsd_has_cached_files(ndentry)) {
close_cached = true;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index a6890ea7b765..e3c29596f4df 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -104,8 +104,8 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
int nfsd_open_break_lease(struct inode *, int);
__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
int, struct file **);
-__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *,
- int, struct file **);
+int nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ int may_flags, struct file **filp);
__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
unsigned long *count,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 9d918a79dc16..80e859dc84d8 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -50,6 +50,134 @@
#define HAS_CSTATE_FLAG(c, f) ((c)->sid_flags & (f))
#define CLEAR_CSTATE_FLAG(c, f) ((c)->sid_flags &= ~(f))
+/**
+ * nfsd4_encode_bool - Encode an XDR bool type result
+ * @xdr: target XDR stream
+ * @val: boolean value to encode
+ *
+ * Return values:
+ * %nfs_ok: @val encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_bool(struct xdr_stream *xdr, bool val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ *p = val ? xdr_one : xdr_zero;
+ return nfs_ok;
+}
+
+/**
+ * nfsd4_encode_uint32_t - Encode an XDR uint32_t type result
+ * @xdr: target XDR stream
+ * @val: integer value to encode
+ *
+ * Return values:
+ * %nfs_ok: @val encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_uint32_t(struct xdr_stream *xdr, u32 val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ *p = cpu_to_be32(val);
+ return nfs_ok;
+}
+
+#define nfsd4_encode_aceflag4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_acemask4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_acetype4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_count4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_mode4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_nfs_lease4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_qop4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_sequenceid4(x, v) nfsd4_encode_uint32_t(x, v)
+#define nfsd4_encode_slotid4(x, v) nfsd4_encode_uint32_t(x, v)
+
+/**
+ * nfsd4_encode_uint64_t - Encode an XDR uint64_t type result
+ * @xdr: target XDR stream
+ * @val: integer value to encode
+ *
+ * Return values:
+ * %nfs_ok: @val encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_uint64_t(struct xdr_stream *xdr, u64 val)
+{
+ __be32 *p = xdr_reserve_space(xdr, XDR_UNIT * 2);
+
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ put_unaligned_be64(val, p);
+ return nfs_ok;
+}
+
+#define nfsd4_encode_changeid4(x, v) nfsd4_encode_uint64_t(x, v)
+#define nfsd4_encode_nfs_cookie4(x, v) nfsd4_encode_uint64_t(x, v)
+#define nfsd4_encode_length4(x, v) nfsd4_encode_uint64_t(x, v)
+#define nfsd4_encode_offset4(x, v) nfsd4_encode_uint64_t(x, v)
+
+/**
+ * nfsd4_encode_opaque_fixed - Encode a fixed-length XDR opaque type result
+ * @xdr: target XDR stream
+ * @data: pointer to data
+ * @size: length of data in bytes
+ *
+ * Return values:
+ * %nfs_ok: @data encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_opaque_fixed(struct xdr_stream *xdr, const void *data,
+ size_t size)
+{
+ __be32 *p = xdr_reserve_space(xdr, xdr_align_size(size));
+ size_t pad = xdr_pad_size(size);
+
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ memcpy(p, data, size);
+ if (pad)
+ memset((char *)p + size, 0, pad);
+ return nfs_ok;
+}
+
+/**
+ * nfsd4_encode_opaque - Encode a variable-length XDR opaque type result
+ * @xdr: target XDR stream
+ * @data: pointer to data
+ * @size: length of data in bytes
+ *
+ * Return values:
+ * %nfs_ok: @data encoded; @xdr advanced to next position
+ * %nfserr_resource: stream buffer space exhausted
+ */
+static __always_inline __be32
+nfsd4_encode_opaque(struct xdr_stream *xdr, const void *data, size_t size)
+{
+ size_t pad = xdr_pad_size(size);
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT + xdr_align_size(size));
+ if (unlikely(p == NULL))
+ return nfserr_resource;
+ *p++ = cpu_to_be32(size);
+ memcpy(p, data, size);
+ if (pad)
+ memset((char *)p + size, 0, pad);
+ return nfs_ok;
+}
+
+#define nfsd4_encode_component4(x, d, s) nfsd4_encode_opaque(x, d, s)
+
struct nfsd4_compound_state {
struct svc_fh current_fh;
struct svc_fh save_fh;
@@ -170,12 +298,8 @@ struct nfsd4_lock {
} v;
/* response */
- union {
- struct {
- stateid_t stateid;
- } ok;
- struct nfsd4_lock_denied denied;
- } u;
+ stateid_t lk_resp_stateid;
+ struct nfsd4_lock_denied lk_denied;
};
#define lk_new_open_seqid v.new.open_seqid
#define lk_new_open_stateid v.new.open_stateid
@@ -185,20 +309,15 @@ struct nfsd4_lock {
#define lk_old_lock_stateid v.old.lock_stateid
#define lk_old_lock_seqid v.old.lock_seqid
-#define lk_resp_stateid u.ok.stateid
-#define lk_denied u.denied
-
-
struct nfsd4_lockt {
u32 lt_type;
clientid_t lt_clientid;
struct xdr_netobj lt_owner;
u64 lt_offset;
u64 lt_length;
- struct nfsd4_lock_denied lt_denied;
+ struct nfsd4_lock_denied lt_denied;
};
-
struct nfsd4_locku {
u32 lu_type;
u32 lu_seqid;
@@ -267,9 +386,9 @@ struct nfsd4_open {
u32 op_deleg_want; /* request */
stateid_t op_stateid; /* response */
__be32 op_xdr_error; /* see nfsd4_open_omfg() */
- u32 op_recall; /* recall */
struct nfsd4_change_info op_cinfo; /* response */
u32 op_rflags; /* response */
+ bool op_recall; /* response */
bool op_truncate; /* used during processing */
bool op_created; /* used during processing */
struct nfs4_openowner *op_openowner; /* used during processing */
@@ -496,7 +615,7 @@ struct nfsd4_layoutcommit {
u32 lc_layout_type; /* request */
u32 lc_up_len; /* layout length */
void *lc_up_layout; /* decoded by callback */
- u32 lc_size_chg; /* boolean for response */
+ bool lc_size_chg; /* response */
u64 lc_newsize; /* response */
};
@@ -508,7 +627,7 @@ struct nfsd4_layoutreturn {
u32 lrf_body_len; /* request */
void *lrf_body; /* request */
stateid_t lr_sid; /* request/response */
- u32 lrs_present; /* response */
+ bool lrs_present; /* response */
};
struct nfsd4_fallocate {
@@ -626,8 +745,7 @@ struct nfsd4_copy_notify {
/* response */
stateid_t cpn_cnr_stateid;
- u64 cpn_sec;
- u32 cpn_nsec;
+ struct timespec64 cpn_lease_time;
struct nl4_server *cpn_src;
};
@@ -820,8 +938,10 @@ extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, union nfsd4_op_u *u);
extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
union nfsd4_op_u *u);
+extern void nfsd4_lock_release(union nfsd4_op_u *u);
extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
union nfsd4_op_u *u);
+extern void nfsd4_lockt_release(union nfsd4_op_u *u);
extern __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
union nfsd4_op_u *u);
extern __be32
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index 0d39af1b00a0..e8b00309c449 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -54,3 +54,21 @@
#define NFS4_dec_cb_recall_any_sz (cb_compound_dec_hdr_sz + \
cb_sequence_dec_sz + \
op_dec_sz)
+
+/*
+ * 1: CB_GETATTR opcode (32-bit)
+ * N: file_handle
+ * 1: number of entry in attribute array (32-bit)
+ * 1: entry 0 in attribute array (32-bit)
+ */
+#define NFS4_enc_cb_getattr_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ 1 + enc_nfs4_fh_sz + 1 + 1)
+/*
+ * 4: fattr_bitmap_maxsz
+ * 1: attribute array len
+ * 2: change attr (64-bit)
+ * 2: size (64-bit)
+ */
+#define NFS4_dec_cb_getattr_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + 4 + 1 + 2 + 2 + op_dec_sz)
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index bce734b68f08..de2073c47651 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -429,7 +429,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
nilfs_set_de_type(de, inode);
nilfs_commit_chunk(page, mapping, from, to);
nilfs_put_page(page);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
/*
@@ -519,7 +519,7 @@ got_it:
de->inode = cpu_to_le64(inode->i_ino);
nilfs_set_de_type(de, inode);
nilfs_commit_chunk(page, page->mapping, from, to);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
nilfs_mark_inode_dirty(dir);
/* OFFSET_CACHE */
out_put:
@@ -567,7 +567,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
pde->rec_len = nilfs_rec_len_to_disk(to - from);
dir->inode = 0;
nilfs_commit_chunk(page, mapping, from, to);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
out:
nilfs_put_page(page);
return err;
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 48fe71d309cb..8beb2730929d 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -73,10 +73,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn);
- if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
- brelse(bh);
+ if (unlikely(err)) /* -EIO, -ENOMEM, -ENOENT */
goto failed;
- }
}
lock_buffer(bh);
@@ -102,6 +100,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
failed:
unlock_page(bh->b_page);
put_page(bh->b_page);
+ if (unlikely(err))
+ brelse(bh);
return err;
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 1a8bd5993476..f861f3a0bf5c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -366,7 +366,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
atomic64_inc(&root->inodes_count);
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = ino;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
err = nilfs_bmap_read(ii->i_bmap, NULL);
@@ -449,12 +449,12 @@ int nilfs_read_inode_common(struct inode *inode,
i_gid_write(inode, le32_to_cpu(raw_inode->i_gid));
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le64_to_cpu(raw_inode->i_size);
- inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
+ inode_set_atime(inode, le64_to_cpu(raw_inode->i_mtime),
+ le32_to_cpu(raw_inode->i_mtime_nsec));
inode_set_ctime(inode, le64_to_cpu(raw_inode->i_ctime),
le32_to_cpu(raw_inode->i_ctime_nsec));
- inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
- inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+ inode_set_mtime(inode, le64_to_cpu(raw_inode->i_mtime),
+ le32_to_cpu(raw_inode->i_mtime_nsec));
if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode))
return -EIO; /* this inode is for metadata and corrupted */
if (inode->i_nlink == 0)
@@ -768,10 +768,10 @@ void nilfs_write_inode_common(struct inode *inode,
raw_inode->i_gid = cpu_to_le32(i_gid_read(inode));
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
raw_inode->i_size = cpu_to_le64(inode->i_size);
- raw_inode->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
- raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
- raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ raw_inode->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ raw_inode->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode));
+ raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ raw_inode->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
raw_inode->i_flags = cpu_to_le32(ii->i_flags);
@@ -875,7 +875,7 @@ void nilfs_truncate(struct inode *inode)
nilfs_truncate_bmap(ii, blkoff);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index ebdcc25df0f7..869b016014d2 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -265,7 +265,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
struct dnotify_struct *dn;
struct inode *inode;
fl_owner_t id = current->files;
- struct file *f;
+ struct file *f = NULL;
int destroy = 0, error = 0;
__u32 mask;
@@ -345,7 +345,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
}
rcu_read_lock();
- f = lookup_fd_rcu(fd);
+ f = lookup_fdget_rcu(fd);
rcu_read_unlock();
/* if (f != filp) means that we lost a race and another task/thread
@@ -392,6 +392,8 @@ out_err:
fsnotify_put_mark(new_fsn_mark);
if (dn)
kmem_cache_free(dnotify_struct_cache, dn);
+ if (f)
+ fput(f);
return error;
}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index f69c451018e3..62fe0b679e58 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1585,16 +1585,25 @@ static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
}
/* Check if filesystem can encode a unique fid */
-static int fanotify_test_fid(struct dentry *dentry)
+static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
{
+ unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
+ const struct export_operations *nop = dentry->d_sb->s_export_op;
+
+ /*
+ * We need to make sure that the filesystem supports encoding of
+ * file handles so user can use name_to_handle_at() to compare fids
+ * reported with events to the file handle of watched objects.
+ */
+ if (!nop)
+ return -EOPNOTSUPP;
+
/*
- * We need to make sure that the file system supports at least
- * encoding a file handle so user can use name_to_handle_at() to
- * compare fid returned with event to the file handle of watched
- * objects. However, even the relaxed AT_HANDLE_FID flag requires
- * at least empty export_operations for ecoding unique file ids.
+ * For sb/mount mark, we also need to make sure that the filesystem
+ * supports decoding file handles, so user has a way to map back the
+ * reported fids to filesystem objects.
*/
- if (!dentry->d_sb->s_export_op)
+ if (mark_type != FAN_MARK_INODE && !nop->fh_to_dentry)
return -EOPNOTSUPP;
return 0;
@@ -1812,7 +1821,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (ret)
goto path_put_and_out;
- ret = fanotify_test_fid(path.dentry);
+ ret = fanotify_test_fid(path.dentry, flags);
if (ret)
goto path_put_and_out;
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 647a22433bd8..9a4b228d42fa 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -84,7 +84,7 @@ slow:
return -ENOMEM;
}
inode->i_ino = ns->inum;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_flags |= S_IMMUTABLE;
inode->i_mode = S_IFREG | S_IRUGO;
inode->i_fop = &ns_file_operations;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 99ac6ea277c4..aba1e22db4e9 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -648,7 +648,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
* mtime is the last change of the data within the file. Not changed
* when only metadata is changed, e.g. a rename doesn't affect mtime.
*/
- vi->i_mtime = ntfs2utc(si->last_data_change_time);
+ inode_set_mtime_to_ts(vi, ntfs2utc(si->last_data_change_time));
/*
* ctime is the last change of the metadata of the file. This obviously
* always changes, when mtime is changed. ctime can be changed on its
@@ -659,7 +659,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
* Last access to the data within the file. Not changed during a rename
* for example but changed whenever the file is written to.
*/
- vi->i_atime = ntfs2utc(si->last_access_time);
+ inode_set_atime_to_ts(vi, ntfs2utc(si->last_access_time));
/* Find the attribute list attribute if present. */
ntfs_attr_reinit_search_ctx(ctx);
@@ -1217,9 +1217,9 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
vi->i_uid = base_vi->i_uid;
vi->i_gid = base_vi->i_gid;
set_nlink(vi, base_vi->i_nlink);
- vi->i_mtime = base_vi->i_mtime;
+ inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
- vi->i_atime = base_vi->i_atime;
+ inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
vi->i_generation = ni->seq_no = base_ni->seq_no;
/* Set inode type to zero but preserve permissions. */
@@ -1483,9 +1483,9 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
vi->i_uid = base_vi->i_uid;
vi->i_gid = base_vi->i_gid;
set_nlink(vi, base_vi->i_nlink);
- vi->i_mtime = base_vi->i_mtime;
+ inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
- vi->i_atime = base_vi->i_atime;
+ inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
vi->i_generation = ni->seq_no = base_ni->seq_no;
/* Set inode type to zero but preserve permissions. */
vi->i_mode = base_vi->i_mode & ~S_IFMT;
@@ -2805,13 +2805,14 @@ done:
if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
struct timespec64 now = current_time(VFS_I(base_ni));
struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni));
+ struct timespec64 mtime = inode_get_mtime(VFS_I(base_ni));
int sync_it = 0;
- if (!timespec64_equal(&VFS_I(base_ni)->i_mtime, &now) ||
+ if (!timespec64_equal(&mtime, &now) ||
!timespec64_equal(&ctime, &now))
sync_it = 1;
inode_set_ctime_to_ts(VFS_I(base_ni), now);
- VFS_I(base_ni)->i_mtime = now;
+ inode_set_mtime_to_ts(VFS_I(base_ni), now);
if (sync_it)
mark_inode_dirty_sync(VFS_I(base_ni));
@@ -2925,9 +2926,9 @@ int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
}
if (ia_valid & ATTR_ATIME)
- vi->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(vi, attr->ia_atime);
if (ia_valid & ATTR_MTIME)
- vi->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(vi, attr->ia_mtime);
if (ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(vi, attr->ia_ctime);
mark_inode_dirty(vi);
@@ -2996,7 +2997,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
le16_to_cpu(ctx->attr->data.resident.value_offset));
/* Update the access times if they have changed. */
- nt = utc2ntfs(vi->i_mtime);
+ nt = utc2ntfs(inode_get_mtime(vi));
if (si->last_data_change_time != nt) {
ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
"new = 0x%llx", vi->i_ino, (long long)
@@ -3014,7 +3015,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
si->last_mft_change_time = nt;
modified = true;
}
- nt = utc2ntfs(vi->i_atime);
+ nt = utc2ntfs(inode_get_atime(vi));
if (si->last_access_time != nt) {
ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
"new = 0x%llx", vi->i_ino,
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index ad1a8f72da22..6fd1dc4b08c8 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2682,7 +2682,7 @@ mft_rec_already_initialized:
vi->i_mode &= ~S_IWUGO;
/* Set the inode times to the current time. */
- vi->i_atime = vi->i_mtime = inode_set_ctime_current(vi);
+ simple_inode_init_ts(vi);
/*
* Set the file size to 0, the ntfs inode sizes are set to 0 by
* the call to ntfs_init_big_inode() below.
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index a9d82bbb4729..63f70259edc0 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -1106,10 +1106,10 @@ repack:
}
}
- /*
+ /*
* The code below may require additional cluster (to extend attribute list)
- * and / or one MFT record
- * It is too complex to undo operations if -ENOSPC occurs deep inside
+ * and / or one MFT record
+ * It is too complex to undo operations if -ENOSPC occurs deep inside
* in 'ni_insert_nonresident'.
* Return in advance -ENOSPC here if there are no free cluster and no free MFT.
*/
@@ -1736,10 +1736,8 @@ repack:
le_b = NULL;
attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL,
0, NULL, &mi_b);
- if (!attr_b) {
- err = -ENOENT;
- goto out;
- }
+ if (!attr_b)
+ return -ENOENT;
attr = attr_b;
le = le_b;
diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c
index 42631b31adf1..7c01735d1219 100644
--- a/fs/ntfs3/attrlist.c
+++ b/fs/ntfs3/attrlist.c
@@ -52,7 +52,8 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr)
if (!attr->non_res) {
lsize = le32_to_cpu(attr->res.data_size);
- le = kmalloc(al_aligned(lsize), GFP_NOFS | __GFP_NOWARN);
+ /* attr is resident: lsize < record_size (1K or 4K) */
+ le = kvmalloc(al_aligned(lsize), GFP_KERNEL);
if (!le) {
err = -ENOMEM;
goto out;
@@ -80,7 +81,17 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr)
if (err < 0)
goto out;
- le = kmalloc(al_aligned(lsize), GFP_NOFS | __GFP_NOWARN);
+ /* attr is nonresident.
+ * The worst case:
+ * 1T (2^40) extremely fragmented file.
+ * cluster = 4K (2^12) => 2^28 fragments
+ * 2^9 fragments per one record => 2^19 records
+ * 2^5 bytes of ATTR_LIST_ENTRY per one record => 2^24 bytes.
+ *
+ * the result is 16M bytes per attribute list.
+ * Use kvmalloc to allocate in range [several Kbytes - dozen Mbytes]
+ */
+ le = kvmalloc(al_aligned(lsize), GFP_KERNEL);
if (!le) {
err = -ENOMEM;
goto out;
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index 107e808e06ea..63f14a0232f6 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -125,6 +125,7 @@ void wnd_close(struct wnd_bitmap *wnd)
struct rb_node *node, *next;
kfree(wnd->free_bits);
+ wnd->free_bits = NULL;
run_close(&wnd->run);
node = rb_first(&wnd->start_tree);
@@ -659,7 +660,8 @@ int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits)
wnd->bits_last = wbits;
wnd->free_bits =
- kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN);
+ kvmalloc_array(wnd->nwnd, sizeof(u16), GFP_KERNEL | __GFP_ZERO);
+
if (!wnd->free_bits)
return -ENOMEM;
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index 063a6654199b..ec0566b322d5 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -309,7 +309,11 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
return 0;
}
- dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
+ /* NTFS: symlinks are "dir + reparse" or "file + reparse" */
+ if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT)
+ dt_type = DT_LNK;
+ else
+ dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type);
}
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 962f12ce6c0a..ad4a70b5d432 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -342,7 +342,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count,
err = 0;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
if (IS_SYNC(inode)) {
@@ -400,7 +400,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size)
ni_unlock(ni);
ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (!IS_DIRSYNC(inode)) {
dirty = 1;
} else {
@@ -642,7 +642,7 @@ out:
filemap_invalidate_unlock(mapping);
if (!err) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
}
@@ -745,8 +745,8 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
}
static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len, unsigned int flags)
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
{
struct inode *inode = in->f_mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 2b85cb10f0be..3df2d9e34b91 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -2148,7 +2148,7 @@ out1:
for (i = 0; i < pages_per_frame; i++) {
pg = pages[i];
- if (i == idx)
+ if (i == idx || !pg)
continue;
unlock_page(pg);
put_page(pg);
@@ -3208,6 +3208,12 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup,
if (!fname || !memcmp(&fname->dup, dup, sizeof(fname->dup)))
continue;
+ /* Check simple case when parent inode equals current inode. */
+ if (ino_get(&fname->home) == ni->vfs_inode.i_ino) {
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ continue;
+ }
+
/* ntfs_iget5 may sleep. */
dir = ntfs_iget5(sb, &fname->home, NULL);
if (IS_ERR(dir)) {
@@ -3265,7 +3271,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
if (is_rec_inuse(ni->mi.mrec) &&
!(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) {
bool modified = false;
- struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 ts;
/* Update times in standard attribute. */
std = ni_std(ni);
@@ -3275,19 +3281,22 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
}
/* Update the access times if they have changed. */
- dup.m_time = kernel2nt(&inode->i_mtime);
+ ts = inode_get_mtime(inode);
+ dup.m_time = kernel2nt(&ts);
if (std->m_time != dup.m_time) {
std->m_time = dup.m_time;
modified = true;
}
- dup.c_time = kernel2nt(&ctime);
+ ts = inode_get_mtime(inode);
+ dup.c_time = kernel2nt(&ts);
if (std->c_time != dup.c_time) {
std->c_time = dup.c_time;
modified = true;
}
- dup.a_time = kernel2nt(&inode->i_atime);
+ ts = inode_get_atime(inode);
+ dup.a_time = kernel2nt(&ts);
if (std->a_time != dup.a_time) {
std->a_time = dup.a_time;
modified = true;
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index 12f28cdf5c83..98ccb6650858 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -2168,8 +2168,10 @@ file_is_valid:
if (!page) {
page = kmalloc(log->page_size, GFP_NOFS);
- if (!page)
- return -ENOMEM;
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
}
/*
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 33afee0f5559..fbfe21dbb425 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -983,18 +983,11 @@ out:
if (err)
return err;
- mark_inode_dirty(&ni->vfs_inode);
+ mark_inode_dirty_sync(&ni->vfs_inode);
/* verify(!ntfs_update_mftmirr()); */
- /*
- * If we used wait=1, sync_inode_metadata waits for the io for the
- * inode to finish. It hangs when media is removed.
- * So wait=0 is sent down to sync_inode_metadata
- * and filemap_fdatawrite is used for the data blocks.
- */
- err = sync_inode_metadata(&ni->vfs_inode, 0);
- if (!err)
- err = filemap_fdatawrite(ni->vfs_inode.i_mapping);
+ /* write mft record on disk. */
+ err = _ni_write_inode(&ni->vfs_inode, 1);
return err;
}
@@ -2461,10 +2454,12 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim)
{
CLST end, i, zone_len, zlen;
struct wnd_bitmap *wnd = &sbi->used.bitmap;
+ bool dirty = false;
down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
if (!wnd_is_used(wnd, lcn, len)) {
- ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ /* mark volume as dirty out of wnd->rw_lock */
+ dirty = true;
end = lcn + len;
len = 0;
@@ -2518,6 +2513,8 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim)
out:
up_write(&wnd->rw_lock);
+ if (dirty)
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
}
/*
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 124c6e822623..cf92b2433f7a 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -729,6 +729,9 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx,
u32 total = le32_to_cpu(hdr->total);
u16 offs[128];
+ if (unlikely(!cmp))
+ return NULL;
+
fill_table:
if (end > total)
return NULL;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index eb2ed0701495..5e3d71374918 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -44,7 +44,7 @@ static struct inode *ntfs_read_mft(struct inode *inode,
u64 t64;
struct MFT_REC *rec;
struct runs_tree *run;
- struct timespec64 ctime;
+ struct timespec64 ts;
inode->i_op = NULL;
/* Setup 'uid' and 'gid' */
@@ -169,10 +169,12 @@ next_attr:
#ifdef STATX_BTIME
nt2kernel(std5->cr_time, &ni->i_crtime);
#endif
- nt2kernel(std5->a_time, &inode->i_atime);
- ctime = inode_get_ctime(inode);
- nt2kernel(std5->c_time, &ctime);
- nt2kernel(std5->m_time, &inode->i_mtime);
+ nt2kernel(std5->a_time, &ts);
+ inode_set_atime_to_ts(inode, ts);
+ nt2kernel(std5->c_time, &ts);
+ inode_set_ctime_to_ts(inode, ts);
+ nt2kernel(std5->m_time, &ts);
+ inode_set_mtime_to_ts(inode, ts);
ni->std_fa = std5->fa;
@@ -960,7 +962,8 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
if (err >= 0) {
if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
dirty = true;
}
@@ -1660,8 +1663,11 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
/* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */
- inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode, ni->i_crtime);
- dir->i_mtime = inode_set_ctime_to_ts(dir, ni->i_crtime);
+ inode_set_atime_to_ts(inode, ni->i_crtime);
+ inode_set_ctime_to_ts(inode, ni->i_crtime);
+ inode_set_mtime_to_ts(inode, ni->i_crtime);
+ inode_set_mtime_to_ts(dir, ni->i_crtime);
+ inode_set_ctime_to_ts(dir, ni->i_crtime);
mark_inode_dirty(dir);
mark_inode_dirty(inode);
@@ -1767,7 +1773,7 @@ int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry)
if (!err) {
drop_nlink(inode);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
if (inode->i_nlink)
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index ad430d50bd79..ee3093be5170 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -156,8 +156,8 @@ static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de)
err = ntfs_link_inode(inode, de);
if (!err) {
- dir->i_mtime = inode_set_ctime_to_ts(inode,
- inode_set_ctime_current(dir));
+ inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(inode);
mark_inode_dirty(dir);
d_instantiate(de, inode);
@@ -373,7 +373,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry,
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
if (IS_POSIXACL(dir)) {
- /*
+ /*
* Load in cache current acl to avoid ni_lock(dir):
* ntfs_create_inode -> ntfs_init_acl -> posix_acl_create ->
* ntfs_get_acl -> ntfs_get_acl_ex -> ni_lock
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 98b76d1b09e7..86aecbb01a92 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -847,7 +847,7 @@ struct OBJECT_ID {
// Birth Volume Id is the Object Id of the Volume on.
// which the Object Id was allocated. It never changes.
struct GUID BirthVolumeId; //0x10:
-
+
// Birth Object Id is the first Object Id that was
// ever assigned to this MFT Record. I.e. If the Object Id
// is changed for some reason, this field will reflect the
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 629403ede6e5..f6706143d14b 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -42,9 +42,11 @@ enum utf16_endian;
#define MINUS_ONE_T ((size_t)(-1))
/* Biggest MFT / smallest cluster */
#define MAXIMUM_BYTES_PER_MFT 4096
+#define MAXIMUM_SHIFT_BYTES_PER_MFT 12
#define NTFS_BLOCKS_PER_MFT_RECORD (MAXIMUM_BYTES_PER_MFT / 512)
#define MAXIMUM_BYTES_PER_INDEX 4096
+#define MAXIMUM_SHIFT_BYTES_PER_INDEX 12
#define NTFS_BLOCKS_PER_INODE (MAXIMUM_BYTES_PER_INDEX / 512)
/* NTFS specific error code when fixup failed. */
@@ -495,8 +497,6 @@ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask, u32 flags);
int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr);
-void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn,
- CLST len);
int ntfs_file_open(struct inode *inode, struct file *file);
int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
@@ -872,7 +872,7 @@ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode,
int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry);
ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-extern const struct xattr_handler *ntfs_xattr_handlers[];
+extern const struct xattr_handler * const ntfs_xattr_handlers[];
int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size);
void ntfs_get_wsl_perm(struct inode *inode);
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index c12ebffc94da..53629b1f65e9 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -189,12 +189,19 @@ out:
return err;
}
+/*
+ * mi_enum_attr - start/continue attributes enumeration in record.
+ *
+ * NOTE: mi->mrec - memory of size sbi->record_size
+ * here we sure that mi->mrec->total == sbi->record_size (see mi_read)
+ */
struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
{
const struct MFT_REC *rec = mi->mrec;
u32 used = le32_to_cpu(rec->used);
- u32 t32, off, asize;
+ u32 t32, off, asize, prev_type;
u16 t16;
+ u64 data_size, alloc_size, tot_size;
if (!attr) {
u32 total = le32_to_cpu(rec->total);
@@ -213,6 +220,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
if (!is_rec_inuse(rec))
return NULL;
+ prev_type = 0;
attr = Add2Ptr(rec, off);
} else {
/* Check if input attr inside record. */
@@ -226,11 +234,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
return NULL;
}
- if (off + asize < off) {
- /* Overflow check. */
+ /* Overflow check. */
+ if (off + asize < off)
return NULL;
- }
+ prev_type = le32_to_cpu(attr->type);
attr = Add2Ptr(attr, asize);
off += asize;
}
@@ -250,7 +258,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
/* 0x100 is last known attribute for now. */
t32 = le32_to_cpu(attr->type);
- if ((t32 & 0xf) || (t32 > 0x100))
+ if (!t32 || (t32 & 0xf) || (t32 > 0x100))
+ return NULL;
+
+ /* attributes in record must be ordered by type */
+ if (t32 < prev_type)
return NULL;
/* Check overflow and boundary. */
@@ -259,16 +271,15 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
/* Check size of attribute. */
if (!attr->non_res) {
+ /* Check resident fields. */
if (asize < SIZEOF_RESIDENT)
return NULL;
t16 = le16_to_cpu(attr->res.data_off);
-
if (t16 > asize)
return NULL;
- t32 = le32_to_cpu(attr->res.data_size);
- if (t16 + t32 > asize)
+ if (t16 + le32_to_cpu(attr->res.data_size) > asize)
return NULL;
t32 = sizeof(short) * attr->name_len;
@@ -278,21 +289,52 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
return attr;
}
- /* Check some nonresident fields. */
- if (attr->name_len &&
- le16_to_cpu(attr->name_off) + sizeof(short) * attr->name_len >
- le16_to_cpu(attr->nres.run_off)) {
+ /* Check nonresident fields. */
+ if (attr->non_res != 1)
+ return NULL;
+
+ t16 = le16_to_cpu(attr->nres.run_off);
+ if (t16 > asize)
+ return NULL;
+
+ t32 = sizeof(short) * attr->name_len;
+ if (t32 && le16_to_cpu(attr->name_off) + t32 > t16)
+ return NULL;
+
+ /* Check start/end vcn. */
+ if (le64_to_cpu(attr->nres.svcn) > le64_to_cpu(attr->nres.evcn) + 1)
return NULL;
- }
- if (attr->nres.svcn || !is_attr_ext(attr)) {
+ data_size = le64_to_cpu(attr->nres.data_size);
+ if (le64_to_cpu(attr->nres.valid_size) > data_size)
+ return NULL;
+
+ alloc_size = le64_to_cpu(attr->nres.alloc_size);
+ if (data_size > alloc_size)
+ return NULL;
+
+ t32 = mi->sbi->cluster_mask;
+ if (alloc_size & t32)
+ return NULL;
+
+ if (!attr->nres.svcn && is_attr_ext(attr)) {
+ /* First segment of sparse/compressed attribute */
+ if (asize + 8 < SIZEOF_NONRESIDENT_EX)
+ return NULL;
+
+ tot_size = le64_to_cpu(attr->nres.total_size);
+ if (tot_size & t32)
+ return NULL;
+
+ if (tot_size > alloc_size)
+ return NULL;
+ } else {
if (asize + 8 < SIZEOF_NONRESIDENT)
return NULL;
if (attr->nres.c_unit)
return NULL;
- } else if (asize + 8 < SIZEOF_NONRESIDENT_EX)
- return NULL;
+ }
return attr;
}
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index cfec5e0c7f66..f763e3256ccc 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -453,15 +453,23 @@ static struct proc_dir_entry *proc_info_root;
* ntfs3.1
* cluster size
* number of clusters
+ * total number of mft records
+ * number of used mft records ~= number of files + folders
+ * real state of ntfs "dirty"/"clean"
+ * current state of ntfs "dirty"/"clean"
*/
static int ntfs3_volinfo(struct seq_file *m, void *o)
{
struct super_block *sb = m->private;
struct ntfs_sb_info *sbi = sb->s_fs_info;
- seq_printf(m, "ntfs%d.%d\n%u\n%zu\n", sbi->volume.major_ver,
- sbi->volume.minor_ver, sbi->cluster_size,
- sbi->used.bitmap.nbits);
+ seq_printf(m, "ntfs%d.%d\n%u\n%zu\n\%zu\n%zu\n%s\n%s\n",
+ sbi->volume.major_ver, sbi->volume.minor_ver,
+ sbi->cluster_size, sbi->used.bitmap.nbits,
+ sbi->mft.bitmap.nbits,
+ sbi->mft.bitmap.nbits - wnd_zeroes(&sbi->mft.bitmap),
+ sbi->volume.real_dirty ? "dirty" : "clean",
+ (sbi->volume.flags & VOLUME_FLAG_DIRTY) ? "dirty" : "clean");
return 0;
}
@@ -488,9 +496,13 @@ static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer,
{
int err;
struct super_block *sb = pde_data(file_inode(file));
- struct ntfs_sb_info *sbi = sb->s_fs_info;
ssize_t ret = count;
- u8 *label = kmalloc(count, GFP_NOFS);
+ u8 *label;
+
+ if (sb_rdonly(sb))
+ return -EROFS;
+
+ label = kmalloc(count, GFP_NOFS);
if (!label)
return -ENOMEM;
@@ -502,7 +514,7 @@ static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer,
while (ret > 0 && label[ret - 1] == '\n')
ret -= 1;
- err = ntfs_set_label(sbi, label, ret);
+ err = ntfs_set_label(sb->s_fs_info, label, ret);
if (err < 0) {
ntfs_err(sb, "failed (%d) to write label", err);
@@ -576,20 +588,30 @@ static noinline void ntfs3_put_sbi(struct ntfs_sb_info *sbi)
wnd_close(&sbi->mft.bitmap);
wnd_close(&sbi->used.bitmap);
- if (sbi->mft.ni)
+ if (sbi->mft.ni) {
iput(&sbi->mft.ni->vfs_inode);
+ sbi->mft.ni = NULL;
+ }
- if (sbi->security.ni)
+ if (sbi->security.ni) {
iput(&sbi->security.ni->vfs_inode);
+ sbi->security.ni = NULL;
+ }
- if (sbi->reparse.ni)
+ if (sbi->reparse.ni) {
iput(&sbi->reparse.ni->vfs_inode);
+ sbi->reparse.ni = NULL;
+ }
- if (sbi->objid.ni)
+ if (sbi->objid.ni) {
iput(&sbi->objid.ni->vfs_inode);
+ sbi->objid.ni = NULL;
+ }
- if (sbi->volume.ni)
+ if (sbi->volume.ni) {
iput(&sbi->volume.ni->vfs_inode);
+ sbi->volume.ni = NULL;
+ }
ntfs_update_mftmirr(sbi, 0);
@@ -836,7 +858,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
struct ntfs_sb_info *sbi = sb->s_fs_info;
int err;
u32 mb, gb, boot_sector_size, sct_per_clst, record_size;
- u64 sectors, clusters, mlcn, mlcn2;
+ u64 sectors, clusters, mlcn, mlcn2, dev_size0;
struct NTFS_BOOT *boot;
struct buffer_head *bh;
struct MFT_REC *rec;
@@ -845,6 +867,9 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
u32 boot_off = 0;
const char *hint = "Primary boot";
+ /* Save original dev_size. Used with alternative boot. */
+ dev_size0 = dev_size;
+
sbi->volume.blocks = dev_size >> PAGE_SHIFT;
bh = ntfs_bread(sb, 0);
@@ -853,6 +878,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
check_boot:
err = -EINVAL;
+
+ /* Corrupted image; do not read OOB */
+ if (bh->b_size - sizeof(*boot) < boot_off)
+ goto out;
+
boot = (struct NTFS_BOOT *)Add2Ptr(bh->b_data, boot_off);
if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) {
@@ -899,9 +929,17 @@ check_boot:
goto out;
}
- sbi->record_size = record_size =
- boot->record_size < 0 ? 1 << (-boot->record_size) :
- (u32)boot->record_size << cluster_bits;
+ if (boot->record_size >= 0) {
+ record_size = (u32)boot->record_size << cluster_bits;
+ } else if (-boot->record_size <= MAXIMUM_SHIFT_BYTES_PER_MFT) {
+ record_size = 1u << (-boot->record_size);
+ } else {
+ ntfs_err(sb, "%s: invalid record size %d.", hint,
+ boot->record_size);
+ goto out;
+ }
+
+ sbi->record_size = record_size;
sbi->record_bits = blksize_bits(record_size);
sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes
@@ -918,9 +956,15 @@ check_boot:
goto out;
}
- sbi->index_size = boot->index_size < 0 ?
- 1u << (-boot->index_size) :
- (u32)boot->index_size << cluster_bits;
+ if (boot->index_size >= 0) {
+ sbi->index_size = (u32)boot->index_size << cluster_bits;
+ } else if (-boot->index_size <= MAXIMUM_SHIFT_BYTES_PER_INDEX) {
+ sbi->index_size = 1u << (-boot->index_size);
+ } else {
+ ntfs_err(sb, "%s: invalid index size %d.", hint,
+ boot->index_size);
+ goto out;
+ }
/* Check index record size. */
if (sbi->index_size < SECTOR_SIZE || !is_power_of_2(sbi->index_size)) {
@@ -1055,17 +1099,17 @@ check_boot:
if (bh->b_blocknr && !sb_rdonly(sb)) {
/*
- * Alternative boot is ok but primary is not ok.
- * Do not update primary boot here 'cause it may be faked boot.
- * Let ntfs to be mounted and update boot later.
- */
+ * Alternative boot is ok but primary is not ok.
+ * Do not update primary boot here 'cause it may be faked boot.
+ * Let ntfs to be mounted and update boot later.
+ */
*boot2 = kmemdup(boot, sizeof(*boot), GFP_NOFS | __GFP_NOWARN);
}
out:
- if (err == -EINVAL && !bh->b_blocknr && dev_size > PAGE_SHIFT) {
+ if (err == -EINVAL && !bh->b_blocknr && dev_size0 > PAGE_SHIFT) {
u32 block_size = min_t(u32, sector_size, PAGE_SIZE);
- u64 lbo = dev_size - sizeof(*boot);
+ u64 lbo = dev_size0 - sizeof(*boot);
/*
* Try alternative boot (last sector)
@@ -1079,6 +1123,7 @@ out:
boot_off = lbo & (block_size - 1);
hint = "Alternative boot";
+ dev_size = dev_size0; /* restore original size. */
goto check_boot;
}
brelse(bh);
@@ -1367,7 +1412,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
}
bytes = inode->i_size;
- sbi->def_table = t = kmalloc(bytes, GFP_NOFS | __GFP_NOWARN);
+ sbi->def_table = t = kvmalloc(bytes, GFP_KERNEL);
if (!t) {
err = -ENOMEM;
goto put_inode_out;
@@ -1521,9 +1566,9 @@ load_root:
if (boot2) {
/*
- * Alternative boot is ok but primary is not ok.
- * Volume is recognized as NTFS. Update primary boot.
- */
+ * Alternative boot is ok but primary is not ok.
+ * Volume is recognized as NTFS. Update primary boot.
+ */
struct buffer_head *bh0 = sb_getblk(sb, 0);
if (bh0) {
if (buffer_locked(bh0))
@@ -1562,7 +1607,9 @@ load_root:
put_inode_out:
iput(inode);
out:
+ ntfs3_put_sbi(sbi);
kfree(boot2);
+ ntfs3_put_sbi(sbi);
return err;
}
@@ -1756,7 +1803,6 @@ static int __init init_ntfs_fs(void)
if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS))
pr_info("ntfs3: Read-only LZX/Xpress compression included\n");
-
#ifdef CONFIG_PROC_FS
/* Create "/proc/fs/ntfs3" */
proc_info_root = proc_mkdir("fs/ntfs3", NULL);
@@ -1798,7 +1844,6 @@ static void __exit exit_ntfs_fs(void)
if (proc_info_root)
remove_proc_entry("fs/ntfs3", NULL);
#endif
-
}
MODULE_LICENSE("GPL");
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 29fd391899e5..4274b6f31cfa 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -211,7 +211,8 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer,
size = le32_to_cpu(info->size);
/* Enumerate all xattrs. */
- for (ret = 0, off = 0; off < size; off += ea_size) {
+ ret = 0;
+ for (off = 0; off + sizeof(struct EA_FULL) < size; off += ea_size) {
ea = Add2Ptr(ea_all, off);
ea_size = unpacked_ea_size(ea);
@@ -219,6 +220,10 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer,
break;
if (buffer) {
+ /* Check if we can use field ea->name */
+ if (off + ea_size > size)
+ break;
+
if (ret + ea->name_len + 1 > bytes_per_buffer) {
err = -ERANGE;
goto out;
@@ -1016,7 +1021,7 @@ static const struct xattr_handler ntfs_other_xattr_handler = {
.list = ntfs_xattr_user_list,
};
-const struct xattr_handler *ntfs_xattr_handlers[] = {
+const struct xattr_handler * const ntfs_xattr_handlers[] = {
&ntfs_other_xattr_handler,
NULL,
};
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e75137a8e7cb..62464d194da3 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -193,8 +193,8 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
inode->i_mode = new_mode;
inode_set_ctime_current(inode);
di->i_mode = cpu_to_le16(inode->i_mode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index aef58f1395c8..f0937902f7b4 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7436,10 +7436,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0fdba30740ab..6ab03494fc6e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2048,9 +2048,9 @@ out_write_size:
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode_set_ctime_current(inode);
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode));
+ di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
if (handle)
ocfs2_update_inode_fsync_trans(handle, inode, 1);
}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 21472e3ed182..4d7efefa98c5 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -213,7 +213,7 @@ struct o2hb_region {
unsigned int hr_num_pages;
struct page **hr_slot_data;
- struct block_device *hr_bdev;
+ struct bdev_handle *hr_bdev_handle;
struct o2hb_disk_slot *hr_slots;
/* live node map of this region */
@@ -261,6 +261,11 @@ struct o2hb_region {
int hr_last_hb_status;
};
+static inline struct block_device *reg_bdev(struct o2hb_region *reg)
+{
+ return reg->hr_bdev_handle ? reg->hr_bdev_handle->bdev : NULL;
+}
+
struct o2hb_bio_wait_ctxt {
atomic_t wc_num_reqs;
struct completion wc_io_complete;
@@ -286,7 +291,7 @@ static void o2hb_write_timeout(struct work_struct *work)
hr_write_timeout_work.work);
mlog(ML_ERROR, "Heartbeat write timeout to device %pg after %u "
- "milliseconds\n", reg->hr_bdev,
+ "milliseconds\n", reg_bdev(reg),
jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
if (o2hb_global_heartbeat_active()) {
@@ -383,7 +388,7 @@ static void o2hb_nego_timeout(struct work_struct *work)
if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg).\n",
o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000,
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item), reg_bdev(reg));
set_bit(master_node, reg->hr_nego_node_bitmap);
}
if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap,
@@ -398,7 +403,8 @@ static void o2hb_nego_timeout(struct work_struct *work)
}
printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%pg) is down.\n",
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item),
+ reg_bdev(reg));
/* approve negotiate timeout request. */
o2hb_arm_timeout(reg);
@@ -419,7 +425,7 @@ static void o2hb_nego_timeout(struct work_struct *work)
/* negotiate timeout with master node. */
printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg), negotiate timeout with node %d.\n",
o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(&reg->hr_item),
- reg->hr_bdev, master_node);
+ reg_bdev(reg), master_node);
ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
master_node);
if (ret)
@@ -436,7 +442,8 @@ static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
nego_msg = (struct o2hb_nego_msg *)msg->buf;
printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%pg).\n",
- nego_msg->node_num, config_item_name(&reg->hr_item), reg->hr_bdev);
+ nego_msg->node_num, config_item_name(&reg->hr_item),
+ reg_bdev(reg));
if (nego_msg->node_num < O2NM_MAX_NODES)
set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
else
@@ -451,7 +458,7 @@ static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
struct o2hb_region *reg = data;
printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%pg).\n",
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item), reg_bdev(reg));
o2hb_arm_timeout(reg);
return 0;
}
@@ -515,7 +522,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
* GFP_KERNEL that the local node can get fenced. It would be
* nicest if we could pre-allocate these bios and avoid this
* all together. */
- bio = bio_alloc(reg->hr_bdev, 16, opf, GFP_ATOMIC);
+ bio = bio_alloc(reg_bdev(reg), 16, opf, GFP_ATOMIC);
if (!bio) {
mlog(ML_ERROR, "Could not alloc slots BIO!\n");
bio = ERR_PTR(-ENOMEM);
@@ -687,7 +694,7 @@ static int o2hb_check_own_slot(struct o2hb_region *reg)
errstr = ERRSTR3;
mlog(ML_ERROR, "%s (%pg): expected(%u:0x%llx, 0x%llx), "
- "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_bdev,
+ "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg_bdev(reg),
slot->ds_node_num, (unsigned long long)slot->ds_last_generation,
(unsigned long long)slot->ds_last_time, hb_block->hb_node,
(unsigned long long)le64_to_cpu(hb_block->hb_generation),
@@ -861,7 +868,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
goto unlock;
printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n",
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item), reg_bdev(reg));
set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
@@ -920,7 +927,7 @@ static int o2hb_check_slot(struct o2hb_region *reg,
* consider it a transient miss but don't populate any
* other values as they may be junk. */
mlog(ML_ERROR, "Node %d has written a bad crc to %pg\n",
- slot->ds_node_num, reg->hr_bdev);
+ slot->ds_node_num, reg_bdev(reg));
o2hb_dump_slot(hb_block);
slot->ds_equal_samples++;
@@ -1003,8 +1010,8 @@ fire_callbacks:
"of %u ms, but our count is %u ms.\n"
"Please double check your configuration values "
"for 'O2CB_HEARTBEAT_THRESHOLD'\n",
- slot->ds_node_num, reg->hr_bdev, slot_dead_ms,
- dead_ms);
+ slot->ds_node_num, reg_bdev(reg),
+ slot_dead_ms, dead_ms);
}
goto out;
}
@@ -1143,7 +1150,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* can't be sure that the new block ever made it to
* disk */
mlog(ML_ERROR, "Write error %d on device \"%pg\"\n",
- write_wc.wc_error, reg->hr_bdev);
+ write_wc.wc_error, reg_bdev(reg));
ret = write_wc.wc_error;
goto bail;
}
@@ -1169,7 +1176,7 @@ bail:
printk(KERN_NOTICE "o2hb: Unable to stabilize "
"heartbeat on region %s (%pg)\n",
config_item_name(&reg->hr_item),
- reg->hr_bdev);
+ reg_bdev(reg));
atomic_set(&reg->hr_steady_iterations, 0);
reg->hr_aborted_start = 1;
wake_up(&o2hb_steady_queue);
@@ -1489,7 +1496,7 @@ static void o2hb_region_release(struct config_item *item)
struct page *page;
struct o2hb_region *reg = to_o2hb_region(item);
- mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg->hr_bdev);
+ mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg_bdev(reg));
kfree(reg->hr_tmp_block);
@@ -1502,8 +1509,8 @@ static void o2hb_region_release(struct config_item *item)
kfree(reg->hr_slot_data);
}
- if (reg->hr_bdev)
- blkdev_put(reg->hr_bdev, NULL);
+ if (reg->hr_bdev_handle)
+ bdev_release(reg->hr_bdev_handle);
kfree(reg->hr_slots);
@@ -1562,7 +1569,7 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
unsigned long block_bytes;
unsigned int block_bits;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_handle)
return -EINVAL;
status = o2hb_read_block_input(reg, page, &block_bytes,
@@ -1591,7 +1598,7 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item,
char *p = (char *)page;
ssize_t ret;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_handle)
return -EINVAL;
ret = kstrtoull(p, 0, &tmp);
@@ -1616,7 +1623,7 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item,
unsigned long tmp;
char *p = (char *)page;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_handle)
return -EINVAL;
tmp = simple_strtoul(p, &p, 0);
@@ -1635,8 +1642,8 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
{
unsigned int ret = 0;
- if (to_o2hb_region(item)->hr_bdev)
- ret = sprintf(page, "%pg\n", to_o2hb_region(item)->hr_bdev);
+ if (to_o2hb_region(item)->hr_bdev_handle)
+ ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item)));
return ret;
}
@@ -1745,7 +1752,10 @@ out:
return ret;
}
-/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
+/*
+ * this is acting as commit; we set up all of hr_bdev_handle and hr_task or
+ * nothing
+ */
static ssize_t o2hb_region_dev_store(struct config_item *item,
const char *page,
size_t count)
@@ -1759,7 +1769,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
ssize_t ret = -EINVAL;
int live_threshold;
- if (reg->hr_bdev)
+ if (reg->hr_bdev_handle)
goto out;
/* We can't heartbeat without having had our node number
@@ -1785,16 +1795,15 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
if (!S_ISBLK(f.file->f_mapping->host->i_mode))
goto out2;
- reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev,
- BLK_OPEN_WRITE | BLK_OPEN_READ, NULL,
- NULL);
- if (IS_ERR(reg->hr_bdev)) {
- ret = PTR_ERR(reg->hr_bdev);
- reg->hr_bdev = NULL;
+ reg->hr_bdev_handle = bdev_open_by_dev(f.file->f_mapping->host->i_rdev,
+ BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL);
+ if (IS_ERR(reg->hr_bdev_handle)) {
+ ret = PTR_ERR(reg->hr_bdev_handle);
+ reg->hr_bdev_handle = NULL;
goto out2;
}
- sectsize = bdev_logical_block_size(reg->hr_bdev);
+ sectsize = bdev_logical_block_size(reg_bdev(reg));
if (sectsize != reg->hr_block_bytes) {
mlog(ML_ERROR,
"blocksize %u incorrect for device, expected %d",
@@ -1890,12 +1899,12 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
if (hb_task && o2hb_global_heartbeat_active())
printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%pg)\n",
- config_item_name(&reg->hr_item), reg->hr_bdev);
+ config_item_name(&reg->hr_item), reg_bdev(reg));
out3:
if (ret < 0) {
- blkdev_put(reg->hr_bdev, NULL);
- reg->hr_bdev = NULL;
+ bdev_release(reg->hr_bdev_handle);
+ reg->hr_bdev_handle = NULL;
}
out2:
fdput(f);
@@ -2085,7 +2094,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%pg)\n",
((atomic_read(&reg->hr_steady_iterations) == 0) ?
"stopped" : "start aborted"), config_item_name(item),
- reg->hr_bdev);
+ reg_bdev(reg));
}
/*
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8b123d543e6e..a14c8fee6ee5 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1658,7 +1658,8 @@ int __ocfs2_add_entry(handle_t *handle,
offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
if (ocfs2_dirent_would_fit(de, rec_len)) {
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_current(dir));
retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
if (retval < 0) {
mlog_errno(retval);
@@ -2962,11 +2963,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_dinode_new_extent_list(dir, di);
i_size_write(dir, sb->s_blocksize);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
di->i_size = cpu_to_le64(sb->s_blocksize);
- di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(dir).tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(dir).tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(dir));
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(dir));
ocfs2_update_inode_fsync_trans(handle, dir, 1);
/*
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 81265123ce6c..9b57d012fd5c 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -337,7 +337,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inc_nlink(inode);
inode->i_fop = &simple_dir_operations;
@@ -360,7 +360,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
inode->i_ino = get_next_ino();
inode_init_owner(&nop_mnt_idmap, inode, parent, mode);
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
ip = DLMFS_I(inode);
ip->ip_conn = DLMFS_I(parent)->ip_conn;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index c3e2961ee5db..64a6ef638495 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2162,7 +2162,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 ts;
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2183,12 +2183,12 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_igid = cpu_to_be32(i_gid_read(inode));
lvb->lvb_imode = cpu_to_be16(inode->i_mode);
lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
- lvb->lvb_iatime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
- lvb->lvb_ictime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&ctime));
- lvb->lvb_imtime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
+ ts = inode_get_atime(inode);
+ lvb->lvb_iatime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
+ ts = inode_get_ctime(inode);
+ lvb->lvb_ictime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
+ ts = inode_get_mtime(inode);
+ lvb->lvb_imtime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts));
lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
@@ -2209,7 +2209,7 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
- struct timespec64 ctime;
+ struct timespec64 ts;
mlog_meta_lvb(0, lockres);
@@ -2236,13 +2236,12 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
inode->i_mode = be16_to_cpu(lvb->lvb_imode);
set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
- ocfs2_unpack_timespec(&inode->i_atime,
- be64_to_cpu(lvb->lvb_iatime_packed));
- ocfs2_unpack_timespec(&inode->i_mtime,
- be64_to_cpu(lvb->lvb_imtime_packed));
- ocfs2_unpack_timespec(&ctime,
- be64_to_cpu(lvb->lvb_ictime_packed));
- inode_set_ctime_to_ts(inode, ctime);
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_iatime_packed));
+ inode_set_atime_to_ts(inode, ts);
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_imtime_packed));
+ inode_set_mtime_to_ts(inode, ts);
+ ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_ictime_packed));
+ inode_set_ctime_to_ts(inode, ts);
spin_unlock(&oi->ip_lock);
return 0;
}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c45596c25c66..94e2a1244442 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -233,16 +233,18 @@ int ocfs2_should_update_atime(struct inode *inode,
if (vfsmnt->mnt_flags & MNT_RELATIME) {
struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 atime = inode_get_atime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
- if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
- (timespec64_compare(&inode->i_atime, &ctime) <= 0))
+ if ((timespec64_compare(&atime, &mtime) <= 0) ||
+ (timespec64_compare(&atime, &ctime) <= 0))
return 1;
return 0;
}
now = current_time(inode);
- if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
+ if ((now.tv_sec - inode_get_atime_sec(inode) <= osb->s_atime_quantum))
return 0;
else
return 1;
@@ -275,9 +277,9 @@ int ocfs2_update_inode_atime(struct inode *inode,
* have i_rwsem to guard against concurrent changes to other
* inode fields.
*/
- inode->i_atime = current_time(inode);
- di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
- di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+ inode_set_atime_to_ts(inode, current_time(inode));
+ di->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
+ di->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, bh);
@@ -296,7 +298,7 @@ int ocfs2_set_inode_size(handle_t *handle,
i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
@@ -417,12 +419,12 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
}
i_size_write(inode, new_i_size);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
di = (struct ocfs2_dinode *) fe_bh->b_data;
di->i_size = cpu_to_le64(new_i_size);
- di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, fe_bh);
@@ -821,9 +823,9 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
i_size_write(inode, abs_to);
inode->i_blocks = ocfs2_inode_sector_count(inode);
di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode_set_ctime_current(inode);
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
di->i_mtime_nsec = di->i_ctime_nsec;
if (handle) {
ocfs2_journal_dirty(handle, di_bh);
@@ -2040,7 +2042,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
goto out_inode_unlock;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
if (ret < 0)
mlog_errno(ret);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index e8771600b930..999111bfc271 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -302,10 +302,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
inode->i_blocks = ocfs2_inode_sector_count(inode);
inode->i_mapping->a_ops = &ocfs2_aops;
}
- inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
- inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
- inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
- inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+ inode_set_atime(inode, le64_to_cpu(fe->i_atime),
+ le32_to_cpu(fe->i_atime_nsec));
+ inode_set_mtime(inode, le64_to_cpu(fe->i_mtime),
+ le32_to_cpu(fe->i_mtime_nsec));
inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
le32_to_cpu(fe->i_ctime_nsec));
@@ -1312,12 +1312,12 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
fe->i_uid = cpu_to_le32(i_uid_read(inode));
fe->i_gid = cpu_to_le32(i_gid_read(inode));
fe->i_mode = cpu_to_le16(inode->i_mode);
- fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
- fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
- fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
- fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ fe->i_atime = cpu_to_le64(inode_get_atime_sec(inode));
+ fe->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
+ fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ fe->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode));
+ fe->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
ocfs2_journal_dirty(handle, bh);
ocfs2_update_inode_fsync_trans(handle, inode, 1);
@@ -1348,10 +1348,10 @@ void ocfs2_refresh_inode(struct inode *inode,
inode->i_blocks = 0;
else
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
- inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
- inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
- inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
+ inode_set_atime(inode, le64_to_cpu(fe->i_atime),
+ le32_to_cpu(fe->i_atime_nsec));
+ inode_set_mtime(inode, le64_to_cpu(fe->i_mtime),
+ le32_to_cpu(fe->i_mtime_nsec));
inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
le32_to_cpu(fe->i_ctime_nsec));
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 05d67968a3a9..1f9ed117e78b 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -951,8 +951,8 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
di = (struct ocfs2_dinode *)di_bh->b_data;
inode_set_ctime_current(inode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 5cd6d7771cea..681e9501cdd3 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -795,8 +795,8 @@ static int ocfs2_link(struct dentry *old_dentry,
inc_nlink(inode);
inode_set_ctime_current(inode);
ocfs2_set_links_count(fe, inode->i_nlink);
- fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_journal_dirty(handle, fe_bh);
err = ocfs2_add_entry(handle, dentry, inode,
@@ -995,7 +995,7 @@ static int ocfs2_unlink(struct inode *dir,
ocfs2_set_links_count(fe, inode->i_nlink);
ocfs2_journal_dirty(handle, fe_bh);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
if (S_ISDIR(inode->i_mode))
drop_nlink(dir);
@@ -1550,8 +1550,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
if (status >= 0) {
old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
- old_di->i_ctime = cpu_to_le64(inode_get_ctime(old_inode).tv_sec);
- old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(old_inode).tv_nsec);
+ old_di->i_ctime = cpu_to_le64(inode_get_ctime_sec(old_inode));
+ old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(old_inode));
ocfs2_journal_dirty(handle, old_inode_bh);
} else
mlog_errno(status);
@@ -1592,7 +1592,7 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
drop_nlink(new_inode);
inode_set_ctime_current(new_inode);
}
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
if (update_dot_dot) {
status = ocfs2_update_entry(old_inode, handle,
@@ -1614,8 +1614,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
if (old_dir != new_dir) {
/* Keep the same times on both directories.*/
- new_dir->i_mtime = inode_set_ctime_to_ts(new_dir,
- inode_get_ctime(old_dir));
+ inode_set_mtime_to_ts(new_dir,
+ inode_set_ctime_to_ts(new_dir, inode_get_ctime(old_dir)));
/*
* This will also pick up the i_nlink change from the
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 25c8ec3c8c3a..3f80a56d0d60 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3751,8 +3751,8 @@ static int ocfs2_change_ctime(struct inode *inode,
}
inode_set_ctime_current(inode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_journal_dirty(handle, di_bh);
@@ -4075,10 +4075,10 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
*/
inode_set_ctime_current(t_inode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(t_inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(t_inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(t_inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(t_inode));
- t_inode->i_mtime = s_inode->i_mtime;
+ inode_set_mtime_to_ts(t_inode, inode_get_mtime(s_inode));
di->i_mtime = s_di->i_mtime;
di->i_mtime_nsec = s_di->i_mtime_nsec;
}
@@ -4456,7 +4456,7 @@ int ocfs2_reflink_update_dest(struct inode *dest,
if (newlen > i_size_read(dest))
i_size_write(dest, newlen);
spin_unlock(&OCFS2_I(dest)->ip_lock);
- dest->i_mtime = inode_set_ctime_current(dest);
+ inode_set_mtime_to_ts(dest, inode_set_ctime_current(dest));
ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
if (ret) {
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 6510ad783c91..3b81213ed7b8 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -87,14 +87,14 @@ static struct ocfs2_xattr_def_value_root def_xv = {
.xv.xr_list.l_count = cpu_to_le16(1),
};
-const struct xattr_handler *ocfs2_xattr_handlers[] = {
+const struct xattr_handler * const ocfs2_xattr_handlers[] = {
&ocfs2_xattr_user_handler,
&ocfs2_xattr_trusted_handler,
&ocfs2_xattr_security_handler,
NULL
};
-static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
+static const struct xattr_handler * const ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
[OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access,
[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default,
@@ -3422,8 +3422,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
}
inode_set_ctime_current(inode);
- di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
}
out:
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 00308b57f64f..65e9aa743919 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -30,7 +30,7 @@ struct ocfs2_security_xattr_info {
extern const struct xattr_handler ocfs2_xattr_user_handler;
extern const struct xattr_handler ocfs2_xattr_trusted_handler;
extern const struct xattr_handler ocfs2_xattr_security_handler;
-extern const struct xattr_handler *ocfs2_xattr_handlers[];
+extern const struct xattr_handler * const ocfs2_xattr_handlers[];
ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 2f8c1882f45c..d6cd81163030 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -51,7 +51,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode)
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
inode->i_mapping->a_ops = &omfs_aops;
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
switch (mode & S_IFMT) {
case S_IFDIR:
inode->i_op = &omfs_dir_inops;
@@ -134,8 +134,8 @@ static int __omfs_write_inode(struct inode *inode, int wait)
oi->i_head.h_magic = OMFS_IMAGIC;
oi->i_size = cpu_to_be64(inode->i_size);
- ctime = inode_get_ctime(inode).tv_sec * 1000LL +
- ((inode_get_ctime(inode).tv_nsec + 999)/1000);
+ ctime = inode_get_ctime_sec(inode) * 1000LL +
+ ((inode_get_ctime_nsec(inode) + 999)/1000);
oi->i_ctime = cpu_to_be64(ctime);
omfs_update_checksums(oi);
@@ -230,11 +230,9 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
ctime = be64_to_cpu(oi->i_ctime);
nsecs = do_div(ctime, 1000) * 1000L;
- inode->i_atime.tv_sec = ctime;
- inode->i_mtime.tv_sec = ctime;
+ inode_set_atime(inode, ctime, nsecs);
+ inode_set_mtime(inode, ctime, nsecs);
inode_set_ctime(inode, ctime, nsecs);
- inode->i_atime.tv_nsec = nsecs;
- inode->i_mtime.tv_nsec = nsecs;
inode->i_mapping->a_ops = &omfs_aops;
diff --git a/fs/open.c b/fs/open.c
index 98f6601fbac6..02dc608d40d8 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -870,6 +870,30 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
return ksys_fchown(fd, user, group);
}
+static inline int file_get_write_access(struct file *f)
+{
+ int error;
+
+ error = get_write_access(f->f_inode);
+ if (unlikely(error))
+ return error;
+ error = mnt_get_write_access(f->f_path.mnt);
+ if (unlikely(error))
+ goto cleanup_inode;
+ if (unlikely(f->f_mode & FMODE_BACKING)) {
+ error = mnt_get_write_access(backing_file_user_path(f)->mnt);
+ if (unlikely(error))
+ goto cleanup_mnt;
+ }
+ return 0;
+
+cleanup_mnt:
+ mnt_put_write_access(f->f_path.mnt);
+cleanup_inode:
+ put_write_access(f->f_inode);
+ return error;
+}
+
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *))
@@ -892,14 +916,9 @@ static int do_dentry_open(struct file *f,
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
i_readcount_inc(inode);
} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
- error = get_write_access(inode);
+ error = file_get_write_access(f);
if (unlikely(error))
goto cleanup_file;
- error = __mnt_want_write(f->f_path.mnt);
- if (unlikely(error)) {
- put_write_access(inode);
- goto cleanup_file;
- }
f->f_mode |= FMODE_WRITER;
}
@@ -1163,20 +1182,19 @@ EXPORT_SYMBOL_GPL(kernel_file_open);
/**
* backing_file_open - open a backing file for kernel internal use
- * @path: path of the file to open
+ * @user_path: path that the user reuqested to open
* @flags: open flags
* @real_path: path of the backing file
* @cred: credentials for open
*
* Open a backing file for a stackable filesystem (e.g., overlayfs).
- * @path may be on the stackable filesystem and backing inode on the
- * underlying filesystem. In this case, we want to be able to return
- * the @real_path of the backing inode. This is done by embedding the
- * returned file into a container structure that also stores the path of
- * the backing inode on the underlying filesystem, which can be
- * retrieved using backing_file_real_path().
+ * @user_path may be on the stackable filesystem and @real_path on the
+ * underlying filesystem. In this case, we want to be able to return the
+ * @user_path of the stackable filesystem. This is done by embedding the
+ * returned file into a container structure that also stores the stacked
+ * file's path, which can be retrieved using backing_file_user_path().
*/
-struct file *backing_file_open(const struct path *path, int flags,
+struct file *backing_file_open(const struct path *user_path, int flags,
const struct path *real_path,
const struct cred *cred)
{
@@ -1187,9 +1205,9 @@ struct file *backing_file_open(const struct path *path, int flags,
if (IS_ERR(f))
return f;
- f->f_path = *path;
- path_get(real_path);
- *backing_file_real_path(f) = *real_path;
+ path_get(user_path);
+ *backing_file_user_path(f) = *user_path;
+ f->f_path = *real_path;
error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
if (error) {
fput(f);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index b2457cb97fa0..c4b65a6d41cc 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -237,7 +237,7 @@ found:
if (IS_ERR(inode))
return ERR_CAST(inode);
if (inode->i_state & I_NEW) {
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
ent_oi = OP_I(inode);
ent_oi->type = ent_type;
ent_oi->u = ent_data;
@@ -387,7 +387,7 @@ static int openprom_fill_super(struct super_block *s, struct fs_context *fc)
goto out_no_root;
}
- root_inode->i_mtime = root_inode->i_atime = inode_set_ctime_current(root_inode);
+ simple_inode_init_ts(root_inode);
root_inode->i_op = &openprom_inode_operations;
root_inode->i_fop = &openprom_operations;
root_inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index b711654ca18a..926d9c0a428a 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -103,7 +103,7 @@ enum orangefs_vfs_op_states {
#define ORANGEFS_CACHE_CREATE_FLAGS 0
#endif
-extern const struct xattr_handler *orangefs_xattr_handlers[];
+extern const struct xattr_handler * const orangefs_xattr_handlers[];
extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu);
extern int orangefs_set_acl(struct mnt_idmap *idmap,
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 0a9fcfdf552f..0fdceb00ca07 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -155,14 +155,14 @@ static inline void copy_attributes_from_inode(struct inode *inode,
if (orangefs_inode->attr_valid & ATTR_ATIME) {
attrs->mask |= ORANGEFS_ATTR_SYS_ATIME;
if (orangefs_inode->attr_valid & ATTR_ATIME_SET) {
- attrs->atime = (time64_t)inode->i_atime.tv_sec;
+ attrs->atime = (time64_t) inode_get_atime_sec(inode);
attrs->mask |= ORANGEFS_ATTR_SYS_ATIME_SET;
}
}
if (orangefs_inode->attr_valid & ATTR_MTIME) {
attrs->mask |= ORANGEFS_ATTR_SYS_MTIME;
if (orangefs_inode->attr_valid & ATTR_MTIME_SET) {
- attrs->mtime = (time64_t)inode->i_mtime.tv_sec;
+ attrs->mtime = (time64_t) inode_get_mtime_sec(inode);
attrs->mask |= ORANGEFS_ATTR_SYS_MTIME_SET;
}
}
@@ -357,15 +357,15 @@ again2:
downcall.resp.getattr.attributes.owner);
inode->i_gid = make_kgid(&init_user_ns, new_op->
downcall.resp.getattr.attributes.group);
- inode->i_atime.tv_sec = (time64_t)new_op->
- downcall.resp.getattr.attributes.atime;
- inode->i_mtime.tv_sec = (time64_t)new_op->
- downcall.resp.getattr.attributes.mtime;
+ inode_set_atime(inode,
+ (time64_t)new_op->downcall.resp.getattr.attributes.atime,
+ 0);
+ inode_set_mtime(inode,
+ (time64_t)new_op->downcall.resp.getattr.attributes.mtime,
+ 0);
inode_set_ctime(inode,
(time64_t)new_op->downcall.resp.getattr.attributes.ctime,
0);
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
/* special case: mark the root inode as sticky */
inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 68b62689a63e..74ef75586f38 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -554,7 +554,7 @@ static const struct xattr_handler orangefs_xattr_default_handler = {
.set = orangefs_xattr_set_default,
};
-const struct xattr_handler *orangefs_xattr_handlers[] = {
+const struct xattr_handler * const orangefs_xattr_handlers[] = {
&orangefs_xattr_default_handler,
NULL
};
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index bae404a1bad4..ada3fcc9c6d5 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -337,7 +337,7 @@ static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry,
{
struct iattr attr = {
.ia_valid =
- ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
+ ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_CTIME,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
@@ -618,7 +618,8 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
if (err)
return err;
- if (inode->i_flags & OVL_COPY_I_FLAGS_MASK) {
+ if (inode->i_flags & OVL_COPY_I_FLAGS_MASK &&
+ (S_ISREG(c->stat.mode) || S_ISDIR(c->stat.mode))) {
/*
* Copy the fileattr inode flags that are the source of already
* copied i_flags
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index c8c8588bd98c..26b782c53910 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -188,7 +188,7 @@ static int ovl_check_encode_origin(struct dentry *dentry)
/* Lower file handle for non-upper non-decodable */
if (!ovl_dentry_upper(dentry) && !decodable)
- return 0;
+ return 1;
/* Upper file handle for pure upper */
if (!ovl_dentry_lower(dentry))
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 3b4cc633d763..ec3671ca140c 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -19,7 +19,6 @@ struct ovl_aio_req {
struct kiocb iocb;
refcount_t ref;
struct kiocb *orig_iocb;
- struct fd fd;
};
static struct kmem_cache *ovl_aio_request_cachep;
@@ -240,6 +239,7 @@ static void ovl_file_accessed(struct file *file)
{
struct inode *inode, *upperinode;
struct timespec64 ctime, uctime;
+ struct timespec64 mtime, umtime;
if (file->f_flags & O_NOATIME)
return;
@@ -252,9 +252,11 @@ static void ovl_file_accessed(struct file *file)
ctime = inode_get_ctime(inode);
uctime = inode_get_ctime(upperinode);
- if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) ||
- !timespec64_equal(&ctime, &uctime))) {
- inode->i_mtime = upperinode->i_mtime;
+ mtime = inode_get_mtime(inode);
+ umtime = inode_get_mtime(upperinode);
+ if ((!timespec64_equal(&mtime, &umtime)) ||
+ !timespec64_equal(&ctime, &uctime)) {
+ inode_set_mtime_to_ts(inode, inode_get_mtime(upperinode));
inode_set_ctime_to_ts(inode, uctime);
}
@@ -280,7 +282,7 @@ static rwf_t ovl_iocb_to_rwf(int ifl)
static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
{
if (refcount_dec_and_test(&aio_req->ref)) {
- fdput(aio_req->fd);
+ fput(aio_req->iocb.ki_filp);
kmem_cache_free(ovl_aio_request_cachep, aio_req);
}
}
@@ -342,10 +344,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!aio_req)
goto out;
- aio_req->fd = real;
- real.flags = 0;
aio_req->orig_iocb = iocb;
- kiocb_clone(&aio_req->iocb, iocb, real.file);
+ kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
aio_req->iocb.ki_complete = ovl_aio_rw_complete;
refcount_set(&aio_req->ref, 2);
ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
@@ -393,6 +393,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
+ /*
+ * Overlayfs doesn't support deferred completions, don't copy
+ * this property in case it is set by the issuer.
+ */
+ ifl &= ~IOCB_DIO_CALLER_COMP;
+
old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) {
file_start_write(real.file);
@@ -409,10 +415,8 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!aio_req)
goto out;
- aio_req->fd = real;
- real.flags = 0;
aio_req->orig_iocb = iocb;
- kiocb_clone(&aio_req->iocb, iocb, real.file);
+ kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
aio_req->iocb.ki_flags = ifl;
aio_req->iocb.ki_complete = ovl_aio_rw_complete;
refcount_set(&aio_req->ref, 2);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 83ef66644c21..b6e98a7d36ce 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -704,7 +704,8 @@ int ovl_update_time(struct inode *inode, int flags)
if (upperpath.dentry) {
touch_atime(&upperpath);
- inode->i_atime = d_inode(upperpath.dentry)->i_atime;
+ inode_set_atime_to_ts(inode,
+ inode_get_atime(d_inode(upperpath.dentry)));
}
}
return 0;
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index e9539f98e86a..d82d2a043da2 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -8,6 +8,7 @@
struct ovl_config {
char *upperdir;
char *workdir;
+ char **lowerdirs;
bool default_permissions;
int redirect_mode;
int verity_mode;
@@ -39,17 +40,8 @@ struct ovl_layer {
int idx;
/* One fsid per unique underlying sb (upper fsid == 0) */
int fsid;
- char *name;
};
-/*
- * ovl_free_fs() relies on @mnt being the first member when unmounting
- * the private mounts created for each layer. Let's check both the
- * offset and type.
- */
-static_assert(offsetof(struct ovl_layer, mnt) == 0);
-static_assert(__same_type(typeof_member(struct ovl_layer, mnt), struct vfsmount *));
-
struct ovl_path {
const struct ovl_layer *layer;
struct dentry *dentry;
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index b9355bb6d75a..f6ff23fd101c 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -157,6 +157,34 @@ const struct fs_parameter_spec ovl_parameter_spec[] = {
{}
};
+static char *ovl_next_opt(char **s)
+{
+ char *sbegin = *s;
+ char *p;
+
+ if (sbegin == NULL)
+ return NULL;
+
+ for (p = sbegin; *p; p++) {
+ if (*p == '\\') {
+ p++;
+ if (!*p)
+ break;
+ } else if (*p == ',') {
+ *p = '\0';
+ *s = p + 1;
+ return sbegin;
+ }
+ }
+ *s = NULL;
+ return sbegin;
+}
+
+static int ovl_parse_monolithic(struct fs_context *fc, void *data)
+{
+ return vfs_parse_monolithic_sep(fc, data, ovl_next_opt);
+}
+
static ssize_t ovl_parse_param_split_lowerdirs(char *str)
{
ssize_t nr_layers = 1, nr_colons = 0;
@@ -164,7 +192,8 @@ static ssize_t ovl_parse_param_split_lowerdirs(char *str)
for (s = d = str;; s++, d++) {
if (*s == '\\') {
- s++;
+ /* keep esc chars in split lowerdir */
+ *d++ = *s++;
} else if (*s == ':') {
bool next_colon = (*(s + 1) == ':');
@@ -239,7 +268,7 @@ static void ovl_unescape(char *s)
}
}
-static int ovl_mount_dir(const char *name, struct path *path)
+static int ovl_mount_dir(const char *name, struct path *path, bool upper)
{
int err = -ENOMEM;
char *tmp = kstrdup(name, GFP_KERNEL);
@@ -248,7 +277,7 @@ static int ovl_mount_dir(const char *name, struct path *path)
ovl_unescape(tmp);
err = ovl_mount_dir_noesc(tmp, path);
- if (!err && path->dentry->d_flags & DCACHE_OP_REAL) {
+ if (!err && upper && path->dentry->d_flags & DCACHE_OP_REAL) {
pr_err("filesystem on '%s' not supported as upperdir\n",
tmp);
path_put_init(path);
@@ -269,7 +298,7 @@ static int ovl_parse_param_upperdir(const char *name, struct fs_context *fc,
struct path path;
char *dup;
- err = ovl_mount_dir(name, &path);
+ err = ovl_mount_dir(name, &path, true);
if (err)
return err;
@@ -321,12 +350,6 @@ static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx)
* Set "/lower1", "/lower2", and "/lower3" as lower layers and
* "/data1" and "/data2" as data lower layers. Any existing lower
* layers are replaced.
- * (2) lowerdir=:/lower4
- * Append "/lower4" to current stack of lower layers. This requires
- * that there already is at least one lower layer configured.
- * (3) lowerdir=::/lower5
- * Append data "/lower5" as data lower layer. This requires that
- * there's at least one regular lower layer present.
*/
static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
{
@@ -348,49 +371,9 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
return 0;
}
- if (strncmp(name, "::", 2) == 0) {
- /*
- * This is a data layer.
- * There must be at least one regular lower layer
- * specified.
- */
- if (ctx->nr == 0) {
- pr_err("data lower layers without regular lower layers not allowed");
- return -EINVAL;
- }
-
- /* Skip the leading "::". */
- name += 2;
- data_layer = true;
- /*
- * A data layer is automatically an append as there
- * must've been at least one regular lower layer.
- */
- append = true;
- } else if (*name == ':') {
- /*
- * This is a regular lower layer.
- * If users want to append a layer enforce that they
- * have already specified a first layer before. It's
- * better to be strict.
- */
- if (ctx->nr == 0) {
- pr_err("cannot append layer if no previous layer has been specified");
- return -EINVAL;
- }
-
- /*
- * Once a sequence of data layers has started regular
- * lower layers are forbidden.
- */
- if (ctx->nr_data > 0) {
- pr_err("regular lower layers cannot follow data lower layers");
- return -EINVAL;
- }
-
- /* Skip the leading ":". */
- name++;
- append = true;
+ if (*name == ':') {
+ pr_err("cannot append lower layer");
+ return -EINVAL;
}
dup = kstrdup(name, GFP_KERNEL);
@@ -472,7 +455,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
l = &ctx->lower[nr];
memset(l, 0, sizeof(*l));
- err = ovl_mount_dir_noesc(dup_iter, &l->path);
+ err = ovl_mount_dir(dup_iter, &l->path, false);
if (err)
goto out_put;
@@ -682,6 +665,7 @@ static int ovl_reconfigure(struct fs_context *fc)
}
static const struct fs_context_operations ovl_context_ops = {
+ .parse_monolithic = ovl_parse_monolithic,
.parse_param = ovl_parse_param,
.get_tree = ovl_get_tree,
.reconfigure = ovl_reconfigure,
@@ -752,12 +736,12 @@ void ovl_free_fs(struct ovl_fs *ofs)
if (ofs->upperdir_locked)
ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root);
- /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */
- mounts = (struct vfsmount **) ofs->layers;
+ /* Reuse ofs->config.lowerdirs as a vfsmount array before freeing it */
+ mounts = (struct vfsmount **) ofs->config.lowerdirs;
for (i = 0; i < ofs->numlayer; i++) {
iput(ofs->layers[i].trap);
+ kfree(ofs->config.lowerdirs[i]);
mounts[i] = ofs->layers[i].mnt;
- kfree(ofs->layers[i].name);
}
kern_unmount_array(mounts, ofs->numlayer);
kfree(ofs->layers);
@@ -765,6 +749,7 @@ void ovl_free_fs(struct ovl_fs *ofs)
free_anon_bdev(ofs->fs[i].pseudo_dev);
kfree(ofs->fs);
+ kfree(ofs->config.lowerdirs);
kfree(ofs->config.upperdir);
kfree(ofs->config.workdir);
if (ofs->creator_cred)
@@ -949,16 +934,23 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry)
struct super_block *sb = dentry->d_sb;
struct ovl_fs *ofs = OVL_FS(sb);
size_t nr, nr_merged_lower = ofs->numlayer - ofs->numdatalayer;
- const struct ovl_layer *data_layers = &ofs->layers[nr_merged_lower];
-
- /* ofs->layers[0] is the upper layer */
- seq_printf(m, ",lowerdir=%s", ofs->layers[1].name);
- /* dump regular lower layers */
- for (nr = 2; nr < nr_merged_lower; nr++)
- seq_printf(m, ":%s", ofs->layers[nr].name);
- /* dump data lower layers */
- for (nr = 0; nr < ofs->numdatalayer; nr++)
- seq_printf(m, "::%s", data_layers[nr].name);
+
+ /*
+ * lowerdirs[] starts from offset 1, then
+ * >= 0 regular lower layers prefixed with : and
+ * >= 0 data-only lower layers prefixed with ::
+ *
+ * we need to escase comma and space like seq_show_option() does and
+ * we also need to escape the colon separator from lowerdir paths.
+ */
+ seq_puts(m, ",lowerdir=");
+ for (nr = 1; nr < ofs->numlayer; nr++) {
+ if (nr > 1)
+ seq_putc(m, ':');
+ if (nr >= nr_merged_lower)
+ seq_putc(m, ':');
+ seq_escape(m, ofs->config.lowerdirs[nr], ":, \t\n\\");
+ }
if (ofs->config.upperdir) {
seq_show_option(m, "upperdir", ofs->config.upperdir);
seq_show_option(m, "workdir", ofs->config.workdir);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index def266b5e2a3..6cd949c59fed 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -34,14 +34,22 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
struct dentry *real = NULL, *lower;
int err;
- /* It's an overlay file */
+ /*
+ * vfs is only expected to call d_real() with NULL from d_real_inode()
+ * and with overlay inode from file_dentry() on an overlay file.
+ *
+ * TODO: remove @inode argument from d_real() API, remove code in this
+ * function that deals with non-NULL @inode and remove d_real() call
+ * from file_dentry().
+ */
if (inode && d_inode(dentry) == inode)
return dentry;
+ else if (inode)
+ goto bug;
if (!d_is_reg(dentry)) {
- if (!inode || inode == d_inode(dentry))
- return dentry;
- goto bug;
+ /* d_real_inode() is only relevant for regular files */
+ return dentry;
}
real = ovl_dentry_upper(dentry);
@@ -104,8 +112,8 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak)
static int ovl_dentry_revalidate_common(struct dentry *dentry,
unsigned int flags, bool weak)
{
- struct ovl_entry *oe = OVL_E(dentry);
- struct ovl_path *lowerstack = ovl_lowerstack(oe);
+ struct ovl_entry *oe;
+ struct ovl_path *lowerstack;
struct inode *inode = d_inode_rcu(dentry);
struct dentry *upper;
unsigned int i;
@@ -115,6 +123,8 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry,
if (!inode)
return -ECHILD;
+ oe = OVL_I_E(inode);
+ lowerstack = ovl_lowerstack(oe);
upper = ovl_i_dentry_upper(inode);
if (upper)
ret = ovl_revalidate_real(upper, flags, weak);
@@ -167,6 +177,7 @@ static void ovl_free_inode(struct inode *inode)
struct ovl_inode *oi = OVL_I(inode);
kfree(oi->redirect);
+ kfree(oi->oe);
mutex_destroy(&oi->lock);
kmem_cache_free(ovl_inode_cachep, oi);
}
@@ -176,7 +187,7 @@ static void ovl_destroy_inode(struct inode *inode)
struct ovl_inode *oi = OVL_I(inode);
dput(oi->__upperdentry);
- ovl_free_entry(oi->oe);
+ ovl_stack_put(ovl_lowerstack(oi->oe), ovl_numlower(oi->oe));
if (S_ISDIR(inode->i_mode))
ovl_dir_cache_free(inode);
else
@@ -484,13 +495,13 @@ static const struct xattr_handler ovl_other_xattr_handler = {
.set = ovl_other_xattr_set,
};
-static const struct xattr_handler *ovl_trusted_xattr_handlers[] = {
+static const struct xattr_handler * const ovl_trusted_xattr_handlers[] = {
&ovl_own_trusted_xattr_handler,
&ovl_other_xattr_handler,
NULL
};
-static const struct xattr_handler *ovl_user_xattr_handlers[] = {
+static const struct xattr_handler * const ovl_user_xattr_handlers[] = {
&ovl_own_user_xattr_handler,
&ovl_other_xattr_handler,
NULL
@@ -569,11 +580,6 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs,
upper_layer->idx = 0;
upper_layer->fsid = 0;
- err = -ENOMEM;
- upper_layer->name = kstrdup(ofs->config.upperdir, GFP_KERNEL);
- if (!upper_layer->name)
- goto out;
-
/*
* Inherit SB_NOSEC flag from upperdir.
*
@@ -1122,7 +1128,8 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
layers[ofs->numlayer].idx = ofs->numlayer;
layers[ofs->numlayer].fsid = fsid;
layers[ofs->numlayer].fs = &ofs->fs[fsid];
- layers[ofs->numlayer].name = l->name;
+ /* Store for printing lowerdir=... in ovl_show_options() */
+ ofs->config.lowerdirs[ofs->numlayer] = l->name;
l->name = NULL;
ofs->numlayer++;
ofs->fs[fsid].is_lower = true;
@@ -1367,8 +1374,16 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
if (!layers)
goto out_err;
+ ofs->config.lowerdirs = kcalloc(ctx->nr + 1, sizeof(char *), GFP_KERNEL);
+ if (!ofs->config.lowerdirs) {
+ kfree(layers);
+ goto out_err;
+ }
ofs->layers = layers;
- /* Layer 0 is reserved for upper even if there's no upper */
+ /*
+ * Layer 0 is reserved for upper even if there's no upper.
+ * For consistency, config.lowerdirs[0] is NULL.
+ */
ofs->numlayer = 1;
sb->s_stack_depth = 0;
@@ -1481,8 +1496,16 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
ovl_trusted_xattr_handlers;
sb->s_fs_info = ofs;
+#ifdef CONFIG_FS_POSIX_ACL
sb->s_flags |= SB_POSIXACL;
+#endif
sb->s_iflags |= SB_I_SKIP_SYNC | SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+ /*
+ * Ensure that umask handling is done by the filesystems used
+ * for the the upper layer instead of overlayfs as that would
+ * lead to unexpected results.
+ */
+ sb->s_iflags |= SB_I_NOUMASK;
err = -ENOMEM;
root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 89e0d60d35b6..868afd8834c3 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1409,8 +1409,8 @@ void ovl_copyattr(struct inode *inode)
inode->i_uid = vfsuid_into_kuid(vfsuid);
inode->i_gid = vfsgid_into_kgid(vfsgid);
inode->i_mode = realinode->i_mode;
- inode->i_atime = realinode->i_atime;
- inode->i_mtime = realinode->i_mtime;
+ inode_set_atime_to_ts(inode, inode_get_atime(realinode));
+ inode_set_mtime_to_ts(inode, inode_get_mtime(realinode));
inode_set_ctime_to_ts(inode, inode_get_ctime(realinode));
i_size_write(inode, i_size_read(realinode));
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 6c1a9b1db907..8916c455a469 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -227,6 +227,36 @@ static inline bool pipe_readable(const struct pipe_inode_info *pipe)
return !pipe_empty(head, tail) || !writers;
}
+static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf,
+ unsigned int tail)
+{
+ pipe_buf_release(pipe, buf);
+
+ /*
+ * If the pipe has a watch_queue, we need additional protection
+ * by the spinlock because notifications get posted with only
+ * this spinlock, no mutex
+ */
+ if (pipe_has_watch_queue(pipe)) {
+ spin_lock_irq(&pipe->rd_wait.lock);
+#ifdef CONFIG_WATCH_QUEUE
+ if (buf->flags & PIPE_BUF_FLAG_LOSS)
+ pipe->note_loss = true;
+#endif
+ pipe->tail = ++tail;
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ return tail;
+ }
+
+ /*
+ * Without a watch_queue, we can simply increment the tail
+ * without the spinlock - the mutex is enough.
+ */
+ pipe->tail = ++tail;
+ return tail;
+}
+
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
@@ -320,17 +350,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
buf->len = 0;
}
- if (!buf->len) {
- pipe_buf_release(pipe, buf);
- spin_lock_irq(&pipe->rd_wait.lock);
-#ifdef CONFIG_WATCH_QUEUE
- if (buf->flags & PIPE_BUF_FLAG_LOSS)
- pipe->note_loss = true;
-#endif
- tail++;
- pipe->tail = tail;
- spin_unlock_irq(&pipe->rd_wait.lock);
- }
+ if (!buf->len)
+ tail = pipe_update_tail(pipe, buf, tail);
total_len -= chars;
if (!total_len)
break; /* common path: read succeeded */
@@ -437,12 +458,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
-#ifdef CONFIG_WATCH_QUEUE
- if (pipe->watch_queue) {
+ if (pipe_has_watch_queue(pipe)) {
ret = -EXDEV;
goto out;
}
-#endif
/*
* If it wasn't empty we try to merge new data into
@@ -507,16 +526,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
* it, either the reader will consume it or it'll still
* be there for the next write.
*/
- spin_lock_irq(&pipe->rd_wait.lock);
-
- head = pipe->head;
- if (pipe_full(head, pipe->tail, pipe->max_usage)) {
- spin_unlock_irq(&pipe->rd_wait.lock);
- continue;
- }
-
pipe->head = head + 1;
- spin_unlock_irq(&pipe->rd_wait.lock);
/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
@@ -537,7 +547,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
break;
}
ret += copied;
- buf->offset = 0;
buf->len = copied;
if (!iov_iter_count(from))
@@ -899,7 +908,7 @@ static struct inode * get_pipe_inode(void)
inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
return inode;
@@ -1325,10 +1334,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
unsigned int nr_slots, size;
long ret = 0;
-#ifdef CONFIG_WATCH_QUEUE
- if (pipe->watch_queue)
+ if (pipe_has_watch_queue(pipe))
return -EBUSY;
-#endif
size = round_pipe_size(arg);
nr_slots = size >> PAGE_SHIFT;
@@ -1380,10 +1387,8 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
if (file->f_op != &pipefifo_fops || !pipe)
return NULL;
-#ifdef CONFIG_WATCH_QUEUE
- if (for_splice && pipe->watch_queue)
+ if (for_splice && pipe_has_watch_queue(pipe))
return NULL;
-#endif
return pipe;
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c354..83396ab14998 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1902,7 +1902,7 @@ struct inode *proc_pid_make_inode(struct super_block *sb,
ei = PROC_I(inode);
inode->i_mode = mode;
inode->i_ino = get_next_ino();
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &proc_def_inode_operations;
/*
@@ -2218,7 +2218,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
rc = -ENOENT;
vma = find_exact_vma(mm, vm_start, vm_end);
if (vma && vma->vm_file) {
- *path = vma->vm_file->f_path;
+ *path = *file_user_path(vma->vm_file);
path_get(path);
rc = 0;
}
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6276b3938842..6e72e5ad42bc 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -113,10 +113,12 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
struct file *file;
rcu_read_lock();
- file = task_lookup_fd_rcu(task, fd);
- if (file)
- *mode = file->f_mode;
+ file = task_lookup_fdget_rcu(task, fd);
rcu_read_unlock();
+ if (file) {
+ *mode = file->f_mode;
+ fput(file);
+ }
return !!file;
}
@@ -259,12 +261,13 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
char name[10 + 1];
unsigned int len;
- f = task_lookup_next_fd_rcu(p, &fd);
+ f = task_lookup_next_fdget_rcu(p, &fd);
ctx->pos = fd + 2LL;
if (!f)
break;
data.mode = f->f_mode;
rcu_read_unlock();
+ fput(f);
data.fd = fd;
len = snprintf(name, sizeof(name), "%u", fd);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 532dc9d240f7..592ed2516f47 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -660,7 +660,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
inode->i_private = de->data;
inode->i_ino = de->low_ino;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
PROC_I(inode)->pde = de;
if (is_empty_pde(de)) {
make_empty_dir_inode(inode);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9dda7e54b2d0..9a8f32f21ff5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -289,9 +289,7 @@ struct proc_maps_private {
struct inode *inode;
struct task_struct *task;
struct mm_struct *mm;
-#ifdef CONFIG_MMU
struct vma_iterator iter;
-#endif
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;
#endif
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 4d3493579458..c6e7ebc63756 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
+ seq_path(m, file_user_path(file), "");
}
seq_putc(m, '\n');
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index c88854df0b62..bc9a2db89cfa 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -465,7 +465,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
head->count++;
spin_unlock(&sysctl_lock);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = table->mode;
if (!S_ISDIR(table->mode)) {
inode->i_mode |= S_IFREG;
diff --git a/fs/proc/self.c b/fs/proc/self.c
index ecc4da8d265e..b46fbfd22681 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -46,7 +46,7 @@ int proc_setup_self(struct super_block *s)
struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = self_inum;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3dd5be96691b..1593940ca01e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -296,7 +296,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
if (anon_name)
seq_printf(m, "[anon_shmem:%s]", anon_name->name);
else
- seq_file_path(m, file, "\n");
+ seq_path(m, file_user_path(file), "\n");
goto done;
}
@@ -1967,7 +1967,7 @@ static int show_numa_map(struct seq_file *m, void *v)
if (file) {
seq_puts(m, " file=");
- seq_file_path(m, file, "\n\t= ");
+ seq_path(m, file_user_path(file), "\n\t= ");
} else if (vma_is_initial_heap(vma)) {
seq_puts(m, " heap");
} else if (vma_is_initial_stack(vma)) {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index a8ac0dd8041e..bce674533000 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -157,7 +157,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "");
+ seq_path(m, file_user_path(file), "");
} else if (mm && vma_is_initial_stack(vma)) {
seq_pad(m, ' ');
seq_puts(m, "[stack]");
@@ -175,15 +175,28 @@ static int show_map(struct seq_file *m, void *_p)
return nommu_vma_show(m, _p);
}
-static void *m_start(struct seq_file *m, loff_t *pos)
+static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
+ loff_t *ppos)
+{
+ struct vm_area_struct *vma = vma_next(&priv->iter);
+
+ if (vma) {
+ *ppos = vma->vm_start;
+ } else {
+ *ppos = -1UL;
+ }
+
+ return vma;
+}
+
+static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
+ unsigned long last_addr = *ppos;
struct mm_struct *mm;
- struct vm_area_struct *vma;
- unsigned long addr = *pos;
- /* See m_next(). Zero at the start or after lseek. */
- if (addr == -1UL)
+ /* See proc_get_vma(). Zero at the start or after lseek. */
+ if (last_addr == -1UL)
return NULL;
/* pin the task and mm whilst we play with them */
@@ -192,44 +205,41 @@ static void *m_start(struct seq_file *m, loff_t *pos)
return ERR_PTR(-ESRCH);
mm = priv->mm;
- if (!mm || !mmget_not_zero(mm))
+ if (!mm || !mmget_not_zero(mm)) {
+ put_task_struct(priv->task);
+ priv->task = NULL;
return NULL;
+ }
if (mmap_read_lock_killable(mm)) {
mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
return ERR_PTR(-EINTR);
}
- /* start the next element from addr */
- vma = find_vma(mm, addr);
- if (vma)
- return vma;
+ vma_iter_init(&priv->iter, mm, last_addr);
- mmap_read_unlock(mm);
- mmput(mm);
- return NULL;
+ return proc_get_vma(priv, ppos);
}
-static void m_stop(struct seq_file *m, void *_vml)
+static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
+ struct mm_struct *mm = priv->mm;
- if (!IS_ERR_OR_NULL(_vml)) {
- mmap_read_unlock(priv->mm);
- mmput(priv->mm);
- }
- if (priv->task) {
- put_task_struct(priv->task);
- priv->task = NULL;
- }
+ if (!priv->task)
+ return;
+
+ mmap_read_unlock(mm);
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
}
-static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
+static void *m_next(struct seq_file *m, void *_p, loff_t *ppos)
{
- struct vm_area_struct *vma = _p;
-
- *pos = vma->vm_end;
- return find_vma(vma->vm_mm, vma->vm_end);
+ return proc_get_vma(m->private, ppos);
}
static const struct seq_operations proc_pid_maps_ops = {
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 63ac1f93289f..0e5050d6ab64 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -46,7 +46,7 @@ int proc_setup_thread_self(struct super_block *s)
struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = thread_self_inum;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 585360706b33..d41c20d1b5e8 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -223,7 +223,7 @@ static struct inode *pstore_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
return inode;
}
@@ -390,7 +390,8 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
inode->i_private = private;
if (record->time.tv_sec)
- inode->i_mtime = inode_set_ctime_to_ts(inode, record->time);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_to_ts(inode, record->time));
d_add(dentry, inode);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index a7171f5532a1..6eb9bb369b57 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -301,10 +301,8 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
i_gid_write(inode, (gid_t)le16_to_cpu(raw_inode->di_gid));
set_nlink(inode, le16_to_cpu(raw_inode->di_nlink));
inode->i_size = le32_to_cpu(raw_inode->di_size);
- inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_sec = le32_to_cpu(raw_inode->di_atime);
- inode->i_atime.tv_nsec = 0;
+ inode_set_mtime(inode, le32_to_cpu(raw_inode->di_mtime), 0);
+ inode_set_atime(inode, le32_to_cpu(raw_inode->di_atime), 0);
inode_set_ctime(inode, le32_to_cpu(raw_inode->di_ctime), 0);
inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size);
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 21f90d519f1a..a286c545717f 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -558,10 +558,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid));
i_gid_write(inode, (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid));
inode->i_size = fs64_to_cpu(sbi, raw_inode->di_size);
- inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_mtime);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_atime);
- inode->i_atime.tv_nsec = 0;
+ inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->di_mtime), 0);
+ inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->di_atime), 0);
inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->di_ctime), 0);
/* calc blocks based on 512 byte blocksize */
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 9e72bfe8bbad..31e897ad5e6a 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -233,19 +233,18 @@ static void put_quota_format(struct quota_format_type *fmt)
* All dquots are placed to the end of inuse_list when first created, and this
* list is used for invalidate operation, which must look at every dquot.
*
- * When the last reference of a dquot will be dropped, the dquot will be
- * added to releasing_dquots. We'd then queue work item which would call
+ * When the last reference of a dquot is dropped, the dquot is added to
+ * releasing_dquots. We'll then queue work item which will call
* synchronize_srcu() and after that perform the final cleanup of all the
- * dquots on the list. Both releasing_dquots and free_dquots use the
- * dq_free list_head in the dquot struct. When a dquot is removed from
- * releasing_dquots, a reference count is always subtracted, and if
- * dq_count == 0 at that point, the dquot will be added to the free_dquots.
+ * dquots on the list. Each cleaned up dquot is moved to free_dquots list.
+ * Both releasing_dquots and free_dquots use the dq_free list_head in the dquot
+ * struct.
*
- * Unused dquots (dq_count == 0) are added to the free_dquots list when freed,
- * and this list is searched whenever we need an available dquot. Dquots are
- * removed from the list as soon as they are used again, and
- * dqstats.free_dquots gives the number of dquots on the list. When
- * dquot is invalidated it's completely released from memory.
+ * Unused and cleaned up dquots are in the free_dquots list and this list is
+ * searched whenever we need an available dquot. Dquots are removed from the
+ * list as soon as they are used again and dqstats.free_dquots gives the number
+ * of dquots on the list. When dquot is invalidated it's completely released
+ * from memory.
*
* Dirty dquots are added to the dqi_dirty_list of quota_info when mark
* dirtied, and this list is searched when writing dirty dquots back to
@@ -321,6 +320,7 @@ static inline void put_dquot_last(struct dquot *dquot)
static inline void put_releasing_dquots(struct dquot *dquot)
{
list_add_tail(&dquot->dq_free, &releasing_dquots);
+ set_bit(DQ_RELEASING_B, &dquot->dq_flags);
}
static inline void remove_free_dquot(struct dquot *dquot)
@@ -328,8 +328,10 @@ static inline void remove_free_dquot(struct dquot *dquot)
if (list_empty(&dquot->dq_free))
return;
list_del_init(&dquot->dq_free);
- if (!atomic_read(&dquot->dq_count))
+ if (!test_bit(DQ_RELEASING_B, &dquot->dq_flags))
dqstats_dec(DQST_FREE_DQUOTS);
+ else
+ clear_bit(DQ_RELEASING_B, &dquot->dq_flags);
}
static inline void put_inuse(struct dquot *dquot)
@@ -581,12 +583,6 @@ restart:
continue;
/* Wait for dquot users */
if (atomic_read(&dquot->dq_count)) {
- /* dquot in releasing_dquots, flush and retry */
- if (!list_empty(&dquot->dq_free)) {
- spin_unlock(&dq_list_lock);
- goto restart;
- }
-
atomic_inc(&dquot->dq_count);
spin_unlock(&dq_list_lock);
/*
@@ -606,6 +602,15 @@ restart:
goto restart;
}
/*
+ * The last user already dropped its reference but dquot didn't
+ * get fully cleaned up yet. Restart the scan which flushes the
+ * work cleaning up released dquots.
+ */
+ if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
+ spin_unlock(&dq_list_lock);
+ goto restart;
+ }
+ /*
* Quota now has no users and it has been written on last
* dqput()
*/
@@ -696,6 +701,13 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
dq_dirty);
WARN_ON(!dquot_active(dquot));
+ /* If the dquot is releasing we should not touch it */
+ if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
+ spin_unlock(&dq_list_lock);
+ flush_delayed_work(&quota_release_work);
+ spin_lock(&dq_list_lock);
+ continue;
+ }
/* Now we have active dquot from which someone is
* holding reference so we can safely just increase
@@ -809,18 +821,18 @@ static void quota_release_workfn(struct work_struct *work)
/* Exchange the list head to avoid livelock. */
list_replace_init(&releasing_dquots, &rls_head);
spin_unlock(&dq_list_lock);
+ synchronize_srcu(&dquot_srcu);
restart:
- synchronize_srcu(&dquot_srcu);
spin_lock(&dq_list_lock);
while (!list_empty(&rls_head)) {
dquot = list_first_entry(&rls_head, struct dquot, dq_free);
- /* Dquot got used again? */
- if (atomic_read(&dquot->dq_count) > 1) {
- remove_free_dquot(dquot);
- atomic_dec(&dquot->dq_count);
- continue;
- }
+ WARN_ON_ONCE(atomic_read(&dquot->dq_count));
+ /*
+ * Note that DQ_RELEASING_B protects us from racing with
+ * invalidate_dquots() calls so we are safe to work with the
+ * dquot even after we drop dq_list_lock.
+ */
if (dquot_dirty(dquot)) {
spin_unlock(&dq_list_lock);
/* Commit dquot before releasing */
@@ -834,7 +846,6 @@ restart:
}
/* Dquot is inactive and clean, now move it to free list */
remove_free_dquot(dquot);
- atomic_dec(&dquot->dq_count);
put_dquot_last(dquot);
}
spin_unlock(&dq_list_lock);
@@ -875,6 +886,7 @@ void dqput(struct dquot *dquot)
BUG_ON(!list_empty(&dquot->dq_free));
#endif
put_releasing_dquots(dquot);
+ atomic_dec(&dquot->dq_count);
spin_unlock(&dq_list_lock);
queue_delayed_work(system_unbound_wq, &quota_release_work, 1);
}
@@ -963,7 +975,7 @@ we_slept:
dqstats_inc(DQST_LOOKUPS);
}
/* Wait for dq_lock - after this we know that either dquot_release() is
- * already finished or it will be canceled due to dq_count > 1 test */
+ * already finished or it will be canceled due to dq_count > 0 test */
wait_on_dquot(dquot);
/* Read the dquot / allocate space in quota file */
if (!dquot_active(dquot)) {
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 18e8387cab41..4ac05a9e25bc 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -65,7 +65,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
inode->i_mapping->a_ops = &ram_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
@@ -105,7 +105,7 @@ ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
error = 0;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
return error;
}
@@ -138,7 +138,8 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
if (!error) {
d_instantiate(dentry, inode);
dget(dentry);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_current(dir));
} else
iput(inode);
}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 86e55d4bb10d..c8572346556f 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1257,11 +1257,9 @@ static void init_inode(struct inode *inode, struct treepath *path)
i_uid_write(inode, sd_v1_uid(sd));
i_gid_write(inode, sd_v1_gid(sd));
inode->i_size = sd_v1_size(sd);
- inode->i_atime.tv_sec = sd_v1_atime(sd);
- inode->i_mtime.tv_sec = sd_v1_mtime(sd);
+ inode_set_atime(inode, sd_v1_atime(sd), 0);
+ inode_set_mtime(inode, sd_v1_mtime(sd), 0);
inode_set_ctime(inode, sd_v1_ctime(sd), 0);
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
inode->i_blocks = sd_v1_blocks(sd);
inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
@@ -1311,11 +1309,9 @@ static void init_inode(struct inode *inode, struct treepath *path)
i_uid_write(inode, sd_v2_uid(sd));
inode->i_size = sd_v2_size(sd);
i_gid_write(inode, sd_v2_gid(sd));
- inode->i_mtime.tv_sec = sd_v2_mtime(sd);
- inode->i_atime.tv_sec = sd_v2_atime(sd);
+ inode_set_mtime(inode, sd_v2_mtime(sd), 0);
+ inode_set_atime(inode, sd_v2_atime(sd), 0);
inode_set_ctime(inode, sd_v2_ctime(sd), 0);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_nsec = 0;
inode->i_blocks = sd_v2_blocks(sd);
rdev = sd_v2_rdev(sd);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
@@ -1370,9 +1366,9 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size)
set_sd_v2_uid(sd_v2, i_uid_read(inode));
set_sd_v2_size(sd_v2, size);
set_sd_v2_gid(sd_v2, i_gid_read(inode));
- set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
- set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
- set_sd_v2_ctime(sd_v2, inode_get_ctime(inode).tv_sec);
+ set_sd_v2_mtime(sd_v2, inode_get_mtime_sec(inode));
+ set_sd_v2_atime(sd_v2, inode_get_atime_sec(inode));
+ set_sd_v2_ctime(sd_v2, inode_get_ctime_sec(inode));
set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
@@ -1391,9 +1387,9 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
set_sd_v1_gid(sd_v1, i_gid_read(inode));
set_sd_v1_nlink(sd_v1, inode->i_nlink);
set_sd_v1_size(sd_v1, size);
- set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
- set_sd_v1_ctime(sd_v1, inode_get_ctime(inode).tv_sec);
- set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);
+ set_sd_v1_atime(sd_v1, inode_get_atime_sec(inode));
+ set_sd_v1_ctime(sd_v1, inode_get_ctime_sec(inode));
+ set_sd_v1_mtime(sd_v1, inode_get_mtime_sec(inode));
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
@@ -1984,7 +1980,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
/* uid and gid must already be set by the caller for quota init */
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_size = i_size;
inode->i_blocks = 0;
inode->i_bytes = 0;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 015bfe4e4524..171c912af50f 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -90,8 +90,7 @@ static int flush_commit_list(struct super_block *s,
static int can_dirty(struct reiserfs_journal_cnode *cn);
static int journal_join(struct reiserfs_transaction_handle *th,
struct super_block *sb);
-static void release_journal_dev(struct super_block *super,
- struct reiserfs_journal *journal);
+static void release_journal_dev(struct reiserfs_journal *journal);
static void dirty_one_transaction(struct super_block *s,
struct reiserfs_journal_list *jl);
static void flush_async_commits(struct work_struct *work);
@@ -1893,7 +1892,7 @@ static void free_journal_ram(struct super_block *sb)
* j_header_bh is on the journal dev, make sure
* not to release the journal dev until we brelse j_header_bh
*/
- release_journal_dev(sb, journal);
+ release_journal_dev(journal);
vfree(journal);
}
@@ -2387,7 +2386,7 @@ static int journal_read(struct super_block *sb)
cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
reiserfs_info(sb, "checking transaction log (%pg)\n",
- journal->j_dev_bd);
+ journal->j_bdev_handle->bdev);
start = ktime_get_seconds();
/*
@@ -2448,7 +2447,7 @@ static int journal_read(struct super_block *sb)
* device and journal device to be the same
*/
d_bh =
- reiserfs_breada(journal->j_dev_bd, cur_dblock,
+ reiserfs_breada(journal->j_bdev_handle->bdev, cur_dblock,
sb->s_blocksize,
SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
SB_ONDISK_JOURNAL_SIZE(sb));
@@ -2587,17 +2586,11 @@ static void journal_list_init(struct super_block *sb)
SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
}
-static void release_journal_dev(struct super_block *super,
- struct reiserfs_journal *journal)
+static void release_journal_dev(struct reiserfs_journal *journal)
{
- if (journal->j_dev_bd != NULL) {
- void *holder = NULL;
-
- if (journal->j_dev_bd->bd_dev != super->s_dev)
- holder = journal;
-
- blkdev_put(journal->j_dev_bd, holder);
- journal->j_dev_bd = NULL;
+ if (journal->j_bdev_handle) {
+ bdev_release(journal->j_bdev_handle);
+ journal->j_bdev_handle = NULL;
}
}
@@ -2612,7 +2605,7 @@ static int journal_init_dev(struct super_block *super,
result = 0;
- journal->j_dev_bd = NULL;
+ journal->j_bdev_handle = NULL;
jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
@@ -2623,36 +2616,37 @@ static int journal_init_dev(struct super_block *super,
if ((!jdev_name || !jdev_name[0])) {
if (jdev == super->s_dev)
holder = NULL;
- journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, holder,
- NULL);
- if (IS_ERR(journal->j_dev_bd)) {
- result = PTR_ERR(journal->j_dev_bd);
- journal->j_dev_bd = NULL;
+ journal->j_bdev_handle = bdev_open_by_dev(jdev, blkdev_mode,
+ holder, NULL);
+ if (IS_ERR(journal->j_bdev_handle)) {
+ result = PTR_ERR(journal->j_bdev_handle);
+ journal->j_bdev_handle = NULL;
reiserfs_warning(super, "sh-458",
"cannot init journal device unknown-block(%u,%u): %i",
MAJOR(jdev), MINOR(jdev), result);
return result;
} else if (jdev != super->s_dev)
- set_blocksize(journal->j_dev_bd, super->s_blocksize);
+ set_blocksize(journal->j_bdev_handle->bdev,
+ super->s_blocksize);
return 0;
}
- journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, holder,
- NULL);
- if (IS_ERR(journal->j_dev_bd)) {
- result = PTR_ERR(journal->j_dev_bd);
- journal->j_dev_bd = NULL;
+ journal->j_bdev_handle = bdev_open_by_path(jdev_name, blkdev_mode,
+ holder, NULL);
+ if (IS_ERR(journal->j_bdev_handle)) {
+ result = PTR_ERR(journal->j_bdev_handle);
+ journal->j_bdev_handle = NULL;
reiserfs_warning(super, "sh-457",
"journal_init_dev: Cannot open '%s': %i",
jdev_name, result);
return result;
}
- set_blocksize(journal->j_dev_bd, super->s_blocksize);
+ set_blocksize(journal->j_bdev_handle->bdev, super->s_blocksize);
reiserfs_info(super,
"journal_init_dev: journal device: %pg\n",
- journal->j_dev_bd);
+ journal->j_bdev_handle->bdev);
return 0;
}
@@ -2810,7 +2804,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
"journal header magic %x (device %pg) does "
"not match to magic found in super block %x",
jh->jh_journal.jp_journal_magic,
- journal->j_dev_bd,
+ journal->j_bdev_handle->bdev,
sb_jp_journal_magic(rs));
brelse(bhjh);
goto free_and_return;
@@ -2834,7 +2828,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
reiserfs_info(sb, "journal params: device %pg, size %u, "
"journal first block %u, max trans len %u, max batch %u, "
"max commit age %u, max trans age %u\n",
- journal->j_dev_bd,
+ journal->j_bdev_handle->bdev,
SB_ONDISK_JOURNAL_SIZE(sb),
SB_ONDISK_JOURNAL_1st_BLOCK(sb),
journal->j_trans_max,
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 9c5704be2435..994d6e6995ab 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -572,7 +572,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
}
dir->i_size += paste_size;
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
if (!S_ISDIR(inode->i_mode) && visible)
/* reiserfs_mkdir or reiserfs_rename will do that by itself */
reiserfs_update_sd(th, dir);
@@ -966,8 +966,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_nlink);
clear_nlink(inode);
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
reiserfs_update_sd(&th, inode);
DEC_DIR_INODE_NLINK(dir)
@@ -1075,7 +1075,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
reiserfs_update_sd(&th, inode);
dir->i_size -= (de.de_entrylen + DEH_SIZE);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
reiserfs_update_sd(&th, dir);
if (!savelink)
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 3dba8acf4e83..83cb9402e0f9 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -354,7 +354,7 @@ static int show_journal(struct seq_file *m, void *unused)
"prepare: \t%12lu\n"
"prepare_retry: \t%12lu\n",
DJP(jp_journal_1st_block),
- SB_JOURNAL(sb)->j_dev_bd,
+ SB_JOURNAL(sb)->j_bdev_handle->bdev,
DJP(jp_journal_dev),
DJP(jp_journal_size),
DJP(jp_journal_trans_max),
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index b81749492ef9..725667880e62 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -299,7 +299,7 @@ struct reiserfs_journal {
/* oldest journal block. start here for traverse */
struct reiserfs_journal_cnode *j_first;
- struct block_device *j_dev_bd;
+ struct bdev_handle *j_bdev_handle;
/* first block on s_dev of reserved area journal */
int j_1st_reserved_block;
@@ -1165,7 +1165,7 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
return bmap_nr > ((1LL << 16) - 1);
}
-extern const struct xattr_handler *reiserfs_xattr_handlers[];
+extern const struct xattr_handler * const reiserfs_xattr_handlers[];
/*
* this says about version of key of all items (but stat data) the
@@ -2699,7 +2699,7 @@ struct reiserfs_iget_args {
#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
#define journal_trans_half(blocksize) \
- ((blocksize - sizeof (struct reiserfs_journal_desc) + sizeof (__u32) - 12) / sizeof (__u32))
+ ((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32))
/* journal.c see journal.c for all the comments here */
@@ -2711,7 +2711,7 @@ struct reiserfs_journal_desc {
__le32 j_len;
__le32 j_mount_id; /* mount id of this trans */
- __le32 j_realblock[1]; /* real locations for each block */
+ __le32 j_realblock[]; /* real locations for each block */
};
#define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id)
@@ -2726,7 +2726,7 @@ struct reiserfs_journal_desc {
struct reiserfs_journal_commit {
__le32 j_trans_id; /* must match j_trans_id from the desc block */
__le32 j_len; /* ditto */
- __le32 j_realblock[1]; /* real locations for each block */
+ __le32 j_realblock[]; /* real locations for each block */
};
#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id)
@@ -2809,9 +2809,12 @@ struct reiserfs_journal_header {
#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
/* We need these to make journal.c code more readable */
-#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
-#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
-#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+#define journal_find_get_block(s, block) __find_get_block(\
+ SB_JOURNAL(s)->j_bdev_handle->bdev, block, s->s_blocksize)
+#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_bdev_handle->bdev,\
+ block, s->s_blocksize)
+#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_bdev_handle->bdev,\
+ block, s->s_blocksize)
enum reiserfs_bh_state_bits {
BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 3676e02a0232..2138ee7d271d 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -2003,7 +2003,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
pathrelse(&s_search_path);
if (update_timestamps) {
- inode->i_mtime = current_time(inode);
+ inode_set_mtime_to_ts(inode,
+ current_time(inode));
inode_set_ctime_current(inode);
}
reiserfs_update_sd(th, inode);
@@ -2028,7 +2029,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
update_and_out:
if (update_timestamps) {
/* this is truncate, not file closing */
- inode->i_mtime = current_time(inode);
+ inode_set_mtime_to_ts(inode, current_time(inode));
inode_set_ctime_current(inode);
}
reiserfs_update_sd(th, inode);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7eaf36b3de12..67b5510beded 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2587,7 +2587,7 @@ out:
return err;
if (inode->i_size < off + len - towrite)
i_size_write(inode, off + len - towrite);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
return len - towrite;
}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6000964c2b80..998035a6388e 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -780,7 +780,7 @@ static inline bool reiserfs_posix_acl_list(const char *name,
}
/* This is the implementation for the xattr plugin infrastructure */
-static inline bool reiserfs_xattr_list(const struct xattr_handler **handlers,
+static inline bool reiserfs_xattr_list(const struct xattr_handler * const *handlers,
const char *name, struct dentry *dentry)
{
if (handlers) {
@@ -911,7 +911,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
#endif
/* Actual operations that are exported to VFS-land */
-const struct xattr_handler *reiserfs_xattr_handlers[] = {
+const struct xattr_handler * const reiserfs_xattr_handlers[] = {
#ifdef CONFIG_REISERFS_FS_XATTR
&reiserfs_xattr_user_handler,
&reiserfs_xattr_trusted_handler,
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 5c35f6c76037..545ad44f96b8 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -322,7 +322,8 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
set_nlink(i, 1); /* Hard to decide.. */
i->i_size = be32_to_cpu(ri.size);
- i->i_mtime = i->i_atime = inode_set_ctime(i, 0, 0);
+ inode_set_mtime_to_ts(i,
+ inode_set_atime_to_ts(i, inode_set_ctime(i, 0, 0)));
/* set up mode and ops */
mode = romfs_modemap[nextfh & ROMFH_TYPE];
@@ -593,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb)
#ifdef CONFIG_ROMFS_ON_BLOCK
if (sb->s_bdev) {
sync_blockdev(sb->s_bdev);
- blkdev_put(sb->s_bdev, sb);
+ bdev_release(sb->s_bdev_handle);
}
#endif
}
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index b17f067e4ada..fe1bf5b6e0cb 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -15,6 +15,7 @@
static struct cached_fid *init_cached_dir(const char *path);
static void free_cached_dir(struct cached_fid *cfid);
static void smb2_close_cached_fid(struct kref *ref);
+static void cfids_laundromat_worker(struct work_struct *work);
static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,
const char *path,
@@ -169,15 +170,18 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
return -ENOENT;
}
/*
- * At this point we either have a lease already and we can just
- * return it. If not we are guaranteed to be the only thread accessing
- * this cfid.
+ * Return cached fid if it has a lease. Otherwise, it is either a new
+ * entry or laundromat worker removed it from @cfids->entries. Caller
+ * will put last reference if the latter.
*/
+ spin_lock(&cfids->cfid_list_lock);
if (cfid->has_lease) {
+ spin_unlock(&cfids->cfid_list_lock);
*ret_cfid = cfid;
kfree(utf16_path);
return 0;
}
+ spin_unlock(&cfids->cfid_list_lock);
/*
* Skip any prefix paths in @path as lookup_positive_unlocked() ends up
@@ -294,9 +298,11 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
goto oshr_free;
}
}
+ spin_lock(&cfids->cfid_list_lock);
cfid->dentry = dentry;
cfid->time = jiffies;
cfid->has_lease = true;
+ spin_unlock(&cfids->cfid_list_lock);
oshr_free:
kfree(utf16_path);
@@ -305,24 +311,28 @@ oshr_free:
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
spin_lock(&cfids->cfid_list_lock);
- if (rc && !cfid->has_lease) {
- if (cfid->on_list) {
- list_del(&cfid->entry);
- cfid->on_list = false;
- cfids->num_entries--;
+ if (!cfid->has_lease) {
+ if (rc) {
+ if (cfid->on_list) {
+ list_del(&cfid->entry);
+ cfid->on_list = false;
+ cfids->num_entries--;
+ }
+ rc = -ENOENT;
+ } else {
+ /*
+ * We are guaranteed to have two references at this
+ * point. One for the caller and one for a potential
+ * lease. Release the Lease-ref so that the directory
+ * will be closed when the caller closes the cached
+ * handle.
+ */
+ spin_unlock(&cfids->cfid_list_lock);
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
+ goto out;
}
- rc = -ENOENT;
}
spin_unlock(&cfids->cfid_list_lock);
- if (!rc && !cfid->has_lease) {
- /*
- * We are guaranteed to have two references at this point.
- * One for the caller and one for a potential lease.
- * Release the Lease-ref so that the directory will be closed
- * when the caller closes the cached handle.
- */
- kref_put(&cfid->refcount, smb2_close_cached_fid);
- }
if (rc) {
if (cfid->is_open)
SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
@@ -330,7 +340,7 @@ oshr_free:
free_cached_dir(cfid);
cfid = NULL;
}
-
+out:
if (rc == 0) {
*ret_cfid = cfid;
atomic_inc(&tcon->num_remote_opens);
@@ -452,6 +462,9 @@ void invalidate_all_cached_dirs(struct cifs_tcon *tcon)
struct cached_fid *cfid, *q;
LIST_HEAD(entry);
+ if (cfids == NULL)
+ return;
+
spin_lock(&cfids->cfid_list_lock);
list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
list_move(&cfid->entry, &entry);
@@ -569,53 +582,51 @@ static void free_cached_dir(struct cached_fid *cfid)
kfree(cfid);
}
-static int
-cifs_cfids_laundromat_thread(void *p)
+static void cfids_laundromat_worker(struct work_struct *work)
{
- struct cached_fids *cfids = p;
+ struct cached_fids *cfids;
struct cached_fid *cfid, *q;
- struct list_head entry;
+ LIST_HEAD(entry);
- while (!kthread_should_stop()) {
- ssleep(1);
- INIT_LIST_HEAD(&entry);
- if (kthread_should_stop())
- return 0;
- spin_lock(&cfids->cfid_list_lock);
- list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
- if (time_after(jiffies, cfid->time + HZ * dir_cache_timeout)) {
- list_del(&cfid->entry);
- list_add(&cfid->entry, &entry);
- cfids->num_entries--;
- }
- }
- spin_unlock(&cfids->cfid_list_lock);
+ cfids = container_of(work, struct cached_fids, laundromat_work.work);
- list_for_each_entry_safe(cfid, q, &entry, entry) {
+ spin_lock(&cfids->cfid_list_lock);
+ list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
+ if (cfid->time &&
+ time_after(jiffies, cfid->time + HZ * dir_cache_timeout)) {
cfid->on_list = false;
- list_del(&cfid->entry);
+ list_move(&cfid->entry, &entry);
+ cfids->num_entries--;
+ /* To prevent race with smb2_cached_lease_break() */
+ kref_get(&cfid->refcount);
+ }
+ }
+ spin_unlock(&cfids->cfid_list_lock);
+
+ list_for_each_entry_safe(cfid, q, &entry, entry) {
+ list_del(&cfid->entry);
+ /*
+ * Cancel and wait for the work to finish in case we are racing
+ * with it.
+ */
+ cancel_work_sync(&cfid->lease_break);
+ if (cfid->has_lease) {
/*
- * Cancel, and wait for the work to finish in
- * case we are racing with it.
+ * Our lease has not yet been cancelled from the server
+ * so we need to drop the reference.
*/
- cancel_work_sync(&cfid->lease_break);
- if (cfid->has_lease) {
- /*
- * We lease has not yet been cancelled from
- * the server so we need to drop the reference.
- */
- spin_lock(&cfids->cfid_list_lock);
- cfid->has_lease = false;
- spin_unlock(&cfids->cfid_list_lock);
- kref_put(&cfid->refcount, smb2_close_cached_fid);
- }
+ spin_lock(&cfids->cfid_list_lock);
+ cfid->has_lease = false;
+ spin_unlock(&cfids->cfid_list_lock);
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
}
+ /* Drop the extra reference opened above */
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
}
-
- return 0;
+ queue_delayed_work(cifsiod_wq, &cfids->laundromat_work,
+ dir_cache_timeout * HZ);
}
-
struct cached_fids *init_cached_dirs(void)
{
struct cached_fids *cfids;
@@ -626,19 +637,10 @@ struct cached_fids *init_cached_dirs(void)
spin_lock_init(&cfids->cfid_list_lock);
INIT_LIST_HEAD(&cfids->entries);
- /*
- * since we're in a cifs function already, we know that
- * this will succeed. No need for try_module_get().
- */
- __module_get(THIS_MODULE);
- cfids->laundromat = kthread_run(cifs_cfids_laundromat_thread,
- cfids, "cifsd-cfid-laundromat");
- if (IS_ERR(cfids->laundromat)) {
- cifs_dbg(VFS, "Failed to start cfids laundromat thread.\n");
- kfree(cfids);
- module_put(THIS_MODULE);
- return NULL;
- }
+ INIT_DELAYED_WORK(&cfids->laundromat_work, cfids_laundromat_worker);
+ queue_delayed_work(cifsiod_wq, &cfids->laundromat_work,
+ dir_cache_timeout * HZ);
+
return cfids;
}
@@ -651,11 +653,10 @@ void free_cached_dirs(struct cached_fids *cfids)
struct cached_fid *cfid, *q;
LIST_HEAD(entry);
- if (cfids->laundromat) {
- kthread_stop(cfids->laundromat);
- cfids->laundromat = NULL;
- module_put(THIS_MODULE);
- }
+ if (cfids == NULL)
+ return;
+
+ cancel_delayed_work_sync(&cfids->laundromat_work);
spin_lock(&cfids->cfid_list_lock);
list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h
index a82ff2cea789..81ba0fd5cc16 100644
--- a/fs/smb/client/cached_dir.h
+++ b/fs/smb/client/cached_dir.h
@@ -57,7 +57,7 @@ struct cached_fids {
spinlock_t cfid_list_lock;
int num_entries;
struct list_head entries;
- struct task_struct *laundromat;
+ struct delayed_work laundromat_work;
};
extern struct cached_fids *init_cached_dirs(void);
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 41daebd220ff..8ca3d7606bb4 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -127,7 +127,7 @@ extern int cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
struct dentry *direntry, const char *symname);
#ifdef CONFIG_CIFS_XATTR
-extern const struct xattr_handler *cifs_xattr_handlers[];
+extern const struct xattr_handler * const cifs_xattr_handlers[];
extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
#else
# define cifs_xattr_handlers NULL
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 032d8716f671..02082621d8e0 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -1807,6 +1807,7 @@ static inline bool is_retryable_error(int error)
#define MID_RETRY_NEEDED 8 /* session closed while this request out */
#define MID_RESPONSE_MALFORMED 0x10
#define MID_SHUTDOWN 0x20
+#define MID_RESPONSE_READY 0x40 /* ready for other process handle the rsp */
/* Flags */
#define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */
@@ -1943,7 +1944,7 @@ require use of the stronger protocol */
* cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once
* ->can_cache_brlcks
* cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc
- * cached_fid->fid_mutex cifs_tcon->crfid tconInfoAlloc
+ * cached_fid->fid_mutex cifs_tcon->crfid tcon_info_alloc
* cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo
* cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo
* ->invalidHandle initiate_cifs_search
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 7d8035846680..0c37eefa18a5 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -512,7 +512,7 @@ extern int CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses);
extern struct cifs_ses *sesInfoAlloc(void);
extern void sesInfoFree(struct cifs_ses *);
-extern struct cifs_tcon *tconInfoAlloc(void);
+extern struct cifs_tcon *tcon_info_alloc(bool dir_leases_enabled);
extern void tconInfoFree(struct cifs_tcon *);
extern int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 687754791bf0..7b923e36501b 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -1882,7 +1882,8 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
}
}
- tcon = tconInfoAlloc();
+ /* no need to setup directory caching on IPC share, so pass in false */
+ tcon = tcon_info_alloc(false);
if (tcon == NULL)
return -ENOMEM;
@@ -2473,8 +2474,9 @@ cifs_put_tcon(struct cifs_tcon *tcon)
static struct cifs_tcon *
cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
{
- int rc, xid;
struct cifs_tcon *tcon;
+ bool nohandlecache;
+ int rc, xid;
tcon = cifs_find_tcon(ses, ctx);
if (tcon) {
@@ -2492,11 +2494,17 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
goto out_fail;
}
- tcon = tconInfoAlloc();
+ if (ses->server->dialect >= SMB20_PROT_ID &&
+ (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING))
+ nohandlecache = ctx->nohandlecache;
+ else
+ nohandlecache = true;
+ tcon = tcon_info_alloc(!nohandlecache);
if (tcon == NULL) {
rc = -ENOMEM;
goto out_fail;
}
+ tcon->nohandlecache = nohandlecache;
if (ctx->snapshot_time) {
if (ses->server->vals->protocol_id == 0) {
@@ -2658,10 +2666,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
tcon->nocase = ctx->nocase;
tcon->broken_sparse_sup = ctx->no_sparse;
tcon->max_cached_dirs = ctx->max_cached_dirs;
- if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING)
- tcon->nohandlecache = ctx->nohandlecache;
- else
- tcon->nohandlecache = true;
tcon->nodelete = ctx->nodelete;
tcon->local_lease = ctx->local_lease;
INIT_LIST_HEAD(&tcon->pending_opens);
@@ -2891,9 +2895,9 @@ bind_socket(struct TCP_Server_Info *server)
if (server->srcaddr.ss_family != AF_UNSPEC) {
/* Bind to the specified local IP address */
struct socket *socket = server->ssocket;
- rc = socket->ops->bind(socket,
- (struct sockaddr *) &server->srcaddr,
- sizeof(server->srcaddr));
+ rc = kernel_bind(socket,
+ (struct sockaddr *) &server->srcaddr,
+ sizeof(server->srcaddr));
if (rc < 0) {
struct sockaddr_in *saddr4;
struct sockaddr_in6 *saddr6;
@@ -3042,8 +3046,8 @@ generic_ip_connect(struct TCP_Server_Info *server)
socket->sk->sk_sndbuf,
socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
- rc = socket->ops->connect(socket, saddr, slen,
- server->noblockcnt ? O_NONBLOCK : 0);
+ rc = kernel_connect(socket, saddr, slen,
+ server->noblockcnt ? O_NONBLOCK : 0);
/*
* When mounting SMB root file systems, we do not want to block in
* connect. Otherwise bail out and then let cifs_reconnect() perform
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 2108b3b40ce9..cf17e3dd703e 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -1085,7 +1085,8 @@ int cifs_close(struct inode *inode, struct file *file)
!test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) &&
dclose) {
if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode,
+ inode_set_ctime_current(inode));
}
spin_lock(&cinode->deferred_lock);
cifs_add_deferred_close(cfile, dclose);
@@ -2596,7 +2597,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
write_data, to - from, &offset);
cifsFileInfo_put(open_file);
/* Does mm or vfs already set times? */
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
if ((bytes_written > 0) && (offset))
rc = 0;
else if (bytes_written < 0)
@@ -4647,11 +4648,13 @@ static void cifs_readahead(struct readahead_control *ractl)
static int cifs_readpage_worker(struct file *file, struct page *page,
loff_t *poffset)
{
+ struct inode *inode = file_inode(file);
+ struct timespec64 atime, mtime;
char *read_data;
int rc;
/* Is the page cached? */
- rc = cifs_readpage_from_fscache(file_inode(file), page);
+ rc = cifs_readpage_from_fscache(inode, page);
if (rc == 0)
goto read_complete;
@@ -4666,11 +4669,10 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
cifs_dbg(FYI, "Bytes read %d\n", rc);
/* we do not want atime to be less than mtime, it broke some apps */
- file_inode(file)->i_atime = current_time(file_inode(file));
- if (timespec64_compare(&(file_inode(file)->i_atime), &(file_inode(file)->i_mtime)))
- file_inode(file)->i_atime = file_inode(file)->i_mtime;
- else
- file_inode(file)->i_atime = current_time(file_inode(file));
+ atime = inode_set_atime_to_ts(inode, current_time(inode));
+ mtime = inode_get_mtime(inode);
+ if (timespec64_compare(&atime, &mtime))
+ inode_set_atime_to_ts(inode, inode_get_mtime(inode));
if (PAGE_SIZE > rc)
memset(read_data + rc, 0, PAGE_SIZE - rc);
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index e45ce31bbda7..a3493da12ad1 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -1541,6 +1541,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
cifs_parse_mount_err:
kfree_sensitive(ctx->password);
+ ctx->password = NULL;
return -EINVAL;
}
diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h
index 84f3b09367d2..a3d73720914f 100644
--- a/fs/smb/client/fscache.h
+++ b/fs/smb/client/fscache.h
@@ -49,12 +49,12 @@ static inline
void cifs_fscache_fill_coherency(struct inode *inode,
struct cifs_fscache_inode_coherency_data *cd)
{
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
memset(cd, 0, sizeof(*cd));
- cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec);
- cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec);
+ cd->last_write_time_sec = cpu_to_le64(mtime.tv_sec);
+ cd->last_write_time_nsec = cpu_to_le32(mtime.tv_nsec);
cd->last_change_time_sec = cpu_to_le64(ctime.tv_sec);
cd->last_change_time_nsec = cpu_to_le32(ctime.tv_nsec);
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index de2dfbaae821..3abfe77bfa46 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -82,6 +82,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
{
struct cifs_fscache_inode_coherency_data cd;
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+ struct timespec64 mtime;
cifs_dbg(FYI, "%s: revalidating inode %llu\n",
__func__, cifs_i->uniqueid);
@@ -101,7 +102,8 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
/* revalidate if mtime or size have changed */
fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode);
- if (timespec64_equal(&inode->i_mtime, &fattr->cf_mtime) &&
+ mtime = inode_get_mtime(inode);
+ if (timespec64_equal(&mtime, &fattr->cf_mtime) &&
cifs_i->server_eof == fattr->cf_eof) {
cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
__func__, cifs_i->uniqueid);
@@ -164,10 +166,10 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
fattr->cf_ctime = timestamp_truncate(fattr->cf_ctime, inode);
/* we do not want atime to be less than mtime, it broke some apps */
if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime) < 0)
- inode->i_atime = fattr->cf_mtime;
+ inode_set_atime_to_ts(inode, fattr->cf_mtime);
else
- inode->i_atime = fattr->cf_atime;
- inode->i_mtime = fattr->cf_mtime;
+ inode_set_atime_to_ts(inode, fattr->cf_atime);
+ inode_set_mtime_to_ts(inode, fattr->cf_mtime);
inode_set_ctime_to_ts(inode, fattr->cf_ctime);
inode->i_rdev = fattr->cf_rdev;
cifs_nlink_fattr_to_inode(inode, fattr);
@@ -1816,7 +1818,7 @@ out_reval:
when needed */
inode_set_ctime_current(inode);
}
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
cifs_inode = CIFS_I(dir);
CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
unlink_out:
@@ -2131,7 +2133,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
cifsInode->time = 0;
inode_set_ctime_current(d_inode(direntry));
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
rmdir_exit:
free_dentry_path(page);
@@ -2337,9 +2339,6 @@ unlink_target:
/* force revalidate to go get info when needed */
CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
- source_dir->i_mtime = target_dir->i_mtime = inode_set_ctime_to_ts(source_dir,
- inode_set_ctime_current(target_dir));
-
cifs_rename_exit:
kfree(info_buf_source);
free_dentry_path(page2);
@@ -2680,7 +2679,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
}
cifsFileInfo_put(cfile);
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
int cifs_truncate_page(struct address_space *mapping, loff_t from)
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index 366b755ca913..35b176457bbe 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -113,18 +113,22 @@ sesInfoFree(struct cifs_ses *buf_to_free)
}
struct cifs_tcon *
-tconInfoAlloc(void)
+tcon_info_alloc(bool dir_leases_enabled)
{
struct cifs_tcon *ret_buf;
ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL);
if (!ret_buf)
return NULL;
- ret_buf->cfids = init_cached_dirs();
- if (!ret_buf->cfids) {
- kfree(ret_buf);
- return NULL;
+
+ if (dir_leases_enabled == true) {
+ ret_buf->cfids = init_cached_dirs();
+ if (!ret_buf->cfids) {
+ kfree(ret_buf);
+ return NULL;
+ }
}
+ /* else ret_buf->cfids is already set to NULL above */
atomic_inc(&tconInfoAllocCount);
ret_buf->status = TID_NEW;
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index b41e2e872b22..0b89f7008ac0 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -539,6 +539,9 @@ static int parse_create_response(struct cifs_open_info_data *data,
int rc = 0;
switch (rsp->hdr.Status) {
+ case STATUS_IO_REPARSE_TAG_NOT_HANDLED:
+ reparse_point = true;
+ break;
case STATUS_STOPPED_ON_SYMLINK:
rc = smb2_parse_symlink_response(cifs_sb, iov,
&data->symlink_target);
diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c
index 194799ddd382..1a90dd78b238 100644
--- a/fs/smb/client/smb2maperror.c
+++ b/fs/smb/client/smb2maperror.c
@@ -877,8 +877,6 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
"STATUS_IO_REPARSE_TAG_MISMATCH"},
{STATUS_IO_REPARSE_DATA_INVALID, -EIO,
"STATUS_IO_REPARSE_DATA_INVALID"},
- {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EIO,
- "STATUS_IO_REPARSE_TAG_NOT_HANDLED"},
{STATUS_REPARSE_POINT_NOT_RESOLVED, -EIO,
"STATUS_REPARSE_POINT_NOT_RESOLVED"},
{STATUS_DIRECTORY_IS_A_REPARSE_POINT, -EIO,
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index d9eda2e958b4..f4849a8ad40b 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -297,7 +297,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)",
credits->value, new_val);
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
spin_lock(&server->req_lock);
@@ -1161,7 +1161,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
/* Use a fudge factor of 256 bytes in case we collide
* with a different set_EAs command.
*/
- if(CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE -
+ if (CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE -
MAX_SMB2_CLOSE_RESPONSE_SIZE - 256 <
used_len + ea_name_len + ea_value_len + 1) {
rc = -ENOSPC;
@@ -1403,12 +1403,14 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon,
/* Creation time should not need to be updated on close */
if (file_inf.LastWriteTime)
- inode->i_mtime = cifs_NTtimeToUnix(file_inf.LastWriteTime);
+ inode_set_mtime_to_ts(inode,
+ cifs_NTtimeToUnix(file_inf.LastWriteTime));
if (file_inf.ChangeTime)
inode_set_ctime_to_ts(inode,
cifs_NTtimeToUnix(file_inf.ChangeTime));
if (file_inf.LastAccessTime)
- inode->i_atime = cifs_NTtimeToUnix(file_inf.LastAccessTime);
+ inode_set_atime_to_ts(inode,
+ cifs_NTtimeToUnix(file_inf.LastAccessTime));
/*
* i_blocks is not related to (i_size / i_blksize),
@@ -4591,7 +4593,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
if (shdr->Command != SMB2_READ) {
cifs_server_dbg(VFS, "only big read responses are supported\n");
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
if (server->ops->is_session_expired &&
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 092b0087c9dc..c75a80bb6d9e 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -89,20 +89,26 @@ smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd,
struct TCP_Server_Info *server)
{
struct smb3_hdr_req *smb3_hdr;
+
shdr->ProtocolId = SMB2_PROTO_NUMBER;
shdr->StructureSize = cpu_to_le16(64);
shdr->Command = smb2_cmd;
- if (server->dialect >= SMB30_PROT_ID) {
- /* After reconnect SMB3 must set ChannelSequence on subsequent reqs */
- smb3_hdr = (struct smb3_hdr_req *)shdr;
- /* if primary channel is not set yet, use default channel for chan sequence num */
- if (SERVER_IS_CHAN(server))
- smb3_hdr->ChannelSequence =
- cpu_to_le16(server->primary_server->channel_sequence_num);
- else
- smb3_hdr->ChannelSequence = cpu_to_le16(server->channel_sequence_num);
- }
+
if (server) {
+ /* After reconnect SMB3 must set ChannelSequence on subsequent reqs */
+ if (server->dialect >= SMB30_PROT_ID) {
+ smb3_hdr = (struct smb3_hdr_req *)shdr;
+ /*
+ * if primary channel is not set yet, use default
+ * channel for chan sequence num
+ */
+ if (SERVER_IS_CHAN(server))
+ smb3_hdr->ChannelSequence =
+ cpu_to_le16(server->primary_server->channel_sequence_num);
+ else
+ smb3_hdr->ChannelSequence =
+ cpu_to_le16(server->channel_sequence_num);
+ }
spin_lock(&server->req_lock);
/* Request up to 10 credits but don't go over the limit. */
if (server->credits >= server->max_credits)
@@ -842,7 +848,7 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
iov[num].iov_base = create_posix_buf(mode);
if (mode == ACL_NO_MODE)
- cifs_dbg(FYI, "Invalid mode\n");
+ cifs_dbg(FYI, "%s: no mode\n", __func__);
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = sizeof(struct create_posix);
@@ -2234,7 +2240,7 @@ create_durable_v2_buf(struct cifs_open_parms *oparms)
* (most servers default to 120 seconds) and most clients default to 0.
* This can be overridden at mount ("handletimeout=") if the user wants
* a different persistent (or resilient) handle timeout for all opens
- * opens on a particular SMB3 mount.
+ * on a particular SMB3 mount.
*/
buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout);
buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
@@ -2379,7 +2385,7 @@ add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp)
return 0;
}
-/* See See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */
+/* See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */
static void setup_owner_group_sids(char *buf)
{
struct owner_group_sids *sids = (struct owner_group_sids *)buf;
@@ -3124,6 +3130,7 @@ void
SMB2_ioctl_free(struct smb_rqst *rqst)
{
int i;
+
if (rqst && rqst->rq_iov) {
cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
for (i = 1; i < rqst->rq_nvec; i++)
@@ -3871,7 +3878,7 @@ void smb2_reconnect_server(struct work_struct *work)
goto done;
/* allocate a dummy tcon struct used for reconnect */
- tcon = tconInfoAlloc();
+ tcon = tcon_info_alloc(false);
if (!tcon) {
resched = true;
list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) {
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index 2a2aec8c6112..94df9eec3d8d 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -1401,10 +1401,13 @@ create_conn:
server->smbd_conn = smbd_get_connection(
server, (struct sockaddr *) &server->dstaddr);
- if (server->smbd_conn)
+ if (server->smbd_conn) {
cifs_dbg(VFS, "RDMA transport re-established\n");
-
- return server->smbd_conn ? 0 : -ENOENT;
+ trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr);
+ return 0;
+ }
+ trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr);
+ return -ENOENT;
}
static void destroy_caches_and_workqueue(struct smbd_connection *info)
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index a7e4755bed0f..de199ec9f726 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -935,6 +935,8 @@ DEFINE_EVENT(smb3_connect_class, smb3_##name, \
TP_ARGS(hostname, conn_id, addr))
DEFINE_SMB3_CONNECT_EVENT(connect_done);
+DEFINE_SMB3_CONNECT_EVENT(smbd_connect_done);
+DEFINE_SMB3_CONNECT_EVENT(smbd_connect_err);
DECLARE_EVENT_CLASS(smb3_connect_err_class,
TP_PROTO(char *hostname, __u64 conn_id,
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 1b5d9794ed5b..14710afdc2a3 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -18,7 +18,7 @@
#include <linux/bvec.h>
#include <linux/highmem.h>
#include <linux/uaccess.h>
-#include <asm/processor.h>
+#include <linux/processor.h>
#include <linux/mempool.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
@@ -35,6 +35,8 @@
void
cifs_wake_up_task(struct mid_q_entry *mid)
{
+ if (mid->mid_state == MID_RESPONSE_RECEIVED)
+ mid->mid_state = MID_RESPONSE_READY;
wake_up_process(mid->callback_data);
}
@@ -87,7 +89,8 @@ static void __release_mid(struct kref *refcount)
struct TCP_Server_Info *server = midEntry->server;
if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) &&
- midEntry->mid_state == MID_RESPONSE_RECEIVED &&
+ (midEntry->mid_state == MID_RESPONSE_RECEIVED ||
+ midEntry->mid_state == MID_RESPONSE_READY) &&
server->ops->handle_cancelled_mid)
server->ops->handle_cancelled_mid(midEntry, server);
@@ -737,7 +740,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
int error;
error = wait_event_state(server->response_q,
- midQ->mid_state != MID_REQUEST_SUBMITTED,
+ midQ->mid_state != MID_REQUEST_SUBMITTED &&
+ midQ->mid_state != MID_RESPONSE_RECEIVED,
(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE));
if (error < 0)
return -ERESTARTSYS;
@@ -890,7 +894,7 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
spin_lock(&server->mid_lock);
switch (mid->mid_state) {
- case MID_RESPONSE_RECEIVED:
+ case MID_RESPONSE_READY:
spin_unlock(&server->mid_lock);
return rc;
case MID_RETRY_NEEDED:
@@ -989,6 +993,9 @@ cifs_compound_callback(struct mid_q_entry *mid)
credits.instance = server->reconnect_instance;
add_credits(server, &credits, mid->optype);
+
+ if (mid->mid_state == MID_RESPONSE_RECEIVED)
+ mid->mid_state = MID_RESPONSE_READY;
}
static void
@@ -1209,7 +1216,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
send_cancel(server, &rqst[i], midQ[i]);
spin_lock(&server->mid_lock);
midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
- if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) {
+ if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ[i]->mid_state == MID_RESPONSE_RECEIVED) {
midQ[i]->callback = cifs_cancelled_callback;
cancelled_mid[i] = true;
credits[i].value = 0;
@@ -1230,7 +1238,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
}
if (!midQ[i]->resp_buf ||
- midQ[i]->mid_state != MID_RESPONSE_RECEIVED) {
+ midQ[i]->mid_state != MID_RESPONSE_READY) {
rc = -EIO;
cifs_dbg(FYI, "Bad MID state?\n");
goto out;
@@ -1417,7 +1425,8 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc != 0) {
send_cancel(server, &rqst, midQ);
spin_lock(&server->mid_lock);
- if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
+ if (midQ->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ->mid_state == MID_RESPONSE_RECEIVED) {
/* no longer considered to be "in-flight" */
midQ->callback = release_mid;
spin_unlock(&server->mid_lock);
@@ -1434,7 +1443,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
}
if (!midQ->resp_buf || !out_buf ||
- midQ->mid_state != MID_RESPONSE_RECEIVED) {
+ midQ->mid_state != MID_RESPONSE_READY) {
rc = -EIO;
cifs_server_dbg(VFS, "Bad MID state?\n");
goto out;
@@ -1558,14 +1567,16 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
/* Wait for a reply - allow signals to interrupt. */
rc = wait_event_interruptible(server->response_q,
- (!(midQ->mid_state == MID_REQUEST_SUBMITTED)) ||
+ (!(midQ->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ->mid_state == MID_RESPONSE_RECEIVED)) ||
((server->tcpStatus != CifsGood) &&
(server->tcpStatus != CifsNew)));
/* Were we interrupted by a signal ? */
spin_lock(&server->srv_lock);
if ((rc == -ERESTARTSYS) &&
- (midQ->mid_state == MID_REQUEST_SUBMITTED) &&
+ (midQ->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ->mid_state == MID_RESPONSE_RECEIVED) &&
((server->tcpStatus == CifsGood) ||
(server->tcpStatus == CifsNew))) {
spin_unlock(&server->srv_lock);
@@ -1596,7 +1607,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
send_cancel(server, &rqst, midQ);
spin_lock(&server->mid_lock);
- if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
+ if (midQ->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ->mid_state == MID_RESPONSE_RECEIVED) {
/* no longer considered to be "in-flight" */
midQ->callback = release_mid;
spin_unlock(&server->mid_lock);
@@ -1616,7 +1628,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
/* rcvd frame is ok */
- if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) {
+ if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_READY) {
rc = -EIO;
cifs_tcon_dbg(VFS, "Bad MID state?\n");
goto out;
diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c
index 4ad5531686d8..ac199160bce6 100644
--- a/fs/smb/client/xattr.c
+++ b/fs/smb/client/xattr.c
@@ -478,7 +478,7 @@ static const struct xattr_handler smb3_ntsd_full_xattr_handler = {
.set = cifs_xattr_set,
};
-const struct xattr_handler *cifs_xattr_handlers[] = {
+const struct xattr_handler * const cifs_xattr_handlers[] = {
&cifs_user_xattr_handler,
&cifs_os2_xattr_handler,
&cifs_cifs_acl_xattr_handler,
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 0d990c2f33cd..4b38c3a285f6 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -84,6 +84,8 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
spin_lock_init(&conn->llist_lock);
INIT_LIST_HEAD(&conn->lock_list);
+ init_rwsem(&conn->session_lock);
+
down_write(&conn_list_lock);
list_add(&conn->conns_list, &conn_list);
up_write(&conn_list_lock);
@@ -197,6 +199,9 @@ int ksmbd_conn_write(struct ksmbd_work *work)
if (work->send_no_response)
return 0;
+ if (!work->iov_idx)
+ return -EINVAL;
+
ksmbd_conn_lock(conn);
sent = conn->transport->ops->writev(conn->transport, work->iov,
work->iov_cnt,
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index ab2583f030ce..3c005246a32e 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -50,6 +50,7 @@ struct ksmbd_conn {
struct nls_table *local_nls;
struct unicode_map *um;
struct list_head conns_list;
+ struct rw_semaphore session_lock;
/* smb session 1 per user */
struct xarray sessions;
unsigned long last_active;
diff --git a/fs/smb/server/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c
index 408cddf2f094..d2c81a8a11dd 100644
--- a/fs/smb/server/mgmt/tree_connect.c
+++ b/fs/smb/server/mgmt/tree_connect.c
@@ -73,7 +73,10 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
tree_conn->user = sess->user;
tree_conn->share_conf = sc;
+ tree_conn->t_state = TREE_NEW;
status.tree_conn = tree_conn;
+ atomic_set(&tree_conn->refcount, 1);
+ init_waitqueue_head(&tree_conn->refcount_q);
ret = xa_err(xa_store(&sess->tree_conns, tree_conn->id, tree_conn,
GFP_KERNEL));
@@ -93,14 +96,33 @@ out_error:
return status;
}
+void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon)
+{
+ /*
+ * Checking waitqueue to releasing tree connect on
+ * tree disconnect. waitqueue_active is safe because it
+ * uses atomic operation for condition.
+ */
+ if (!atomic_dec_return(&tcon->refcount) &&
+ waitqueue_active(&tcon->refcount_q))
+ wake_up(&tcon->refcount_q);
+}
+
int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
struct ksmbd_tree_connect *tree_conn)
{
int ret;
+ write_lock(&sess->tree_conns_lock);
+ xa_erase(&sess->tree_conns, tree_conn->id);
+ write_unlock(&sess->tree_conns_lock);
+
+ if (!atomic_dec_and_test(&tree_conn->refcount))
+ wait_event(tree_conn->refcount_q,
+ atomic_read(&tree_conn->refcount) == 0);
+
ret = ksmbd_ipc_tree_disconnect_request(sess->id, tree_conn->id);
ksmbd_release_tree_conn_id(sess, tree_conn->id);
- xa_erase(&sess->tree_conns, tree_conn->id);
ksmbd_share_config_put(tree_conn->share_conf);
kfree(tree_conn);
return ret;
@@ -111,11 +133,15 @@ struct ksmbd_tree_connect *ksmbd_tree_conn_lookup(struct ksmbd_session *sess,
{
struct ksmbd_tree_connect *tcon;
+ read_lock(&sess->tree_conns_lock);
tcon = xa_load(&sess->tree_conns, id);
if (tcon) {
- if (test_bit(TREE_CONN_EXPIRE, &tcon->status))
+ if (tcon->t_state != TREE_CONNECTED)
+ tcon = NULL;
+ else if (!atomic_inc_not_zero(&tcon->refcount))
tcon = NULL;
}
+ read_unlock(&sess->tree_conns_lock);
return tcon;
}
@@ -129,8 +155,18 @@ int ksmbd_tree_conn_session_logoff(struct ksmbd_session *sess)
if (!sess)
return -EINVAL;
- xa_for_each(&sess->tree_conns, id, tc)
+ xa_for_each(&sess->tree_conns, id, tc) {
+ write_lock(&sess->tree_conns_lock);
+ if (tc->t_state == TREE_DISCONNECTED) {
+ write_unlock(&sess->tree_conns_lock);
+ ret = -ENOENT;
+ continue;
+ }
+ tc->t_state = TREE_DISCONNECTED;
+ write_unlock(&sess->tree_conns_lock);
+
ret |= ksmbd_tree_conn_disconnect(sess, tc);
+ }
xa_destroy(&sess->tree_conns);
return ret;
}
diff --git a/fs/smb/server/mgmt/tree_connect.h b/fs/smb/server/mgmt/tree_connect.h
index 562d647ad9fa..6377a70b811c 100644
--- a/fs/smb/server/mgmt/tree_connect.h
+++ b/fs/smb/server/mgmt/tree_connect.h
@@ -14,7 +14,11 @@ struct ksmbd_share_config;
struct ksmbd_user;
struct ksmbd_conn;
-#define TREE_CONN_EXPIRE 1
+enum {
+ TREE_NEW = 0,
+ TREE_CONNECTED,
+ TREE_DISCONNECTED
+};
struct ksmbd_tree_connect {
int id;
@@ -27,7 +31,9 @@ struct ksmbd_tree_connect {
int maximal_access;
bool posix_extensions;
- unsigned long status;
+ atomic_t refcount;
+ wait_queue_head_t refcount_q;
+ unsigned int t_state;
};
struct ksmbd_tree_conn_status {
@@ -46,6 +52,7 @@ struct ksmbd_session;
struct ksmbd_tree_conn_status
ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
const char *share_name);
+void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon);
int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
struct ksmbd_tree_connect *tree_conn);
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index 8a5dcab05614..15f68ee05089 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -174,7 +174,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn)
unsigned long id;
struct ksmbd_session *sess;
- down_write(&sessions_table_lock);
+ down_write(&conn->session_lock);
xa_for_each(&conn->sessions, id, sess) {
if (sess->state != SMB2_SESSION_VALID ||
time_after(jiffies,
@@ -185,7 +185,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn)
continue;
}
}
- up_write(&sessions_table_lock);
+ up_write(&conn->session_lock);
}
int ksmbd_session_register(struct ksmbd_conn *conn,
@@ -227,7 +227,9 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
}
}
}
+ up_write(&sessions_table_lock);
+ down_write(&conn->session_lock);
xa_for_each(&conn->sessions, id, sess) {
unsigned long chann_id;
struct channel *chann;
@@ -244,7 +246,7 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
ksmbd_session_destroy(sess);
}
}
- up_write(&sessions_table_lock);
+ up_write(&conn->session_lock);
}
struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
@@ -252,9 +254,11 @@ struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
{
struct ksmbd_session *sess;
+ down_read(&conn->session_lock);
sess = xa_load(&conn->sessions, id);
if (sess)
sess->last_active = jiffies;
+ up_read(&conn->session_lock);
return sess;
}
@@ -351,6 +355,7 @@ static struct ksmbd_session *__session_create(int protocol)
xa_init(&sess->ksmbd_chann_list);
xa_init(&sess->rpc_handle_list);
sess->sequence_number = 1;
+ rwlock_init(&sess->tree_conns_lock);
ret = __init_smb2_session(sess);
if (ret)
diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h
index f99d475b28db..63cb08fffde8 100644
--- a/fs/smb/server/mgmt/user_session.h
+++ b/fs/smb/server/mgmt/user_session.h
@@ -60,6 +60,7 @@ struct ksmbd_session {
struct ksmbd_file_table file_table;
unsigned long last_active;
+ rwlock_t tree_conns_lock;
};
static inline int test_session_flag(struct ksmbd_session *sess, int bit)
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index 5ab2f52f9b35..3079e607c5fe 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -115,8 +115,10 @@ static int __process_request(struct ksmbd_work *work, struct ksmbd_conn *conn,
if (check_conn_state(work))
return SERVER_HANDLER_CONTINUE;
- if (ksmbd_verify_smb_message(work))
+ if (ksmbd_verify_smb_message(work)) {
+ conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
return SERVER_HANDLER_ABORT;
+ }
command = conn->ops->get_cmd_val(work);
*cmd = command;
@@ -239,6 +241,8 @@ static void __handle_ksmbd_work(struct ksmbd_work *work,
} while (is_chained == true);
send:
+ if (work->tcon)
+ ksmbd_tree_connect_put(work->tcon);
smb3_preauth_hash_rsp(work);
if (work->sess && work->sess->enc && work->encrypted &&
conn->ops->encrypt_resp) {
diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c
index e881df1d10cb..23bd3d1209df 100644
--- a/fs/smb/server/smb2misc.c
+++ b/fs/smb/server/smb2misc.c
@@ -440,10 +440,8 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
validate_credit:
if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) &&
- smb2_validate_credit_charge(work->conn, hdr)) {
- work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
+ smb2_validate_credit_charge(work->conn, hdr))
return 1;
- }
return 0;
}
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 749660110878..658209839729 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -231,11 +231,12 @@ void set_smb2_rsp_status(struct ksmbd_work *work, __le32 err)
{
struct smb2_hdr *rsp_hdr;
- if (work->next_smb2_rcv_hdr_off)
- rsp_hdr = ksmbd_resp_buf_next(work);
- else
- rsp_hdr = smb2_get_msg(work->response_buf);
+ rsp_hdr = smb2_get_msg(work->response_buf);
rsp_hdr->Status = err;
+
+ work->iov_idx = 0;
+ work->iov_cnt = 0;
+ work->next_smb2_rcv_hdr_off = 0;
smb2_set_err_rsp(work);
}
@@ -1993,6 +1994,9 @@ int smb2_tree_connect(struct ksmbd_work *work)
if (conn->posix_ext_supported)
status.tree_conn->posix_extensions = true;
+ write_lock(&sess->tree_conns_lock);
+ status.tree_conn->t_state = TREE_CONNECTED;
+ write_unlock(&sess->tree_conns_lock);
rsp->StructureSize = cpu_to_le16(16);
out_err1:
rsp->Capabilities = 0;
@@ -2122,27 +2126,50 @@ int smb2_tree_disconnect(struct ksmbd_work *work)
ksmbd_debug(SMB, "request\n");
+ if (!tcon) {
+ ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId);
+
+ rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
+ err = -ENOENT;
+ goto err_out;
+ }
+
+ ksmbd_close_tree_conn_fds(work);
+
+ write_lock(&sess->tree_conns_lock);
+ if (tcon->t_state == TREE_DISCONNECTED) {
+ write_unlock(&sess->tree_conns_lock);
+ rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
+ err = -ENOENT;
+ goto err_out;
+ }
+
+ WARN_ON_ONCE(atomic_dec_and_test(&tcon->refcount));
+ tcon->t_state = TREE_DISCONNECTED;
+ write_unlock(&sess->tree_conns_lock);
+
+ err = ksmbd_tree_conn_disconnect(sess, tcon);
+ if (err) {
+ rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
+ goto err_out;
+ }
+
+ work->tcon = NULL;
+
rsp->StructureSize = cpu_to_le16(4);
err = ksmbd_iov_pin_rsp(work, rsp,
sizeof(struct smb2_tree_disconnect_rsp));
if (err) {
rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES;
- smb2_set_err_rsp(work);
- return err;
+ goto err_out;
}
- if (!tcon || test_and_set_bit(TREE_CONN_EXPIRE, &tcon->status)) {
- ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId);
+ return 0;
- rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
- smb2_set_err_rsp(work);
- return -ENOENT;
- }
+err_out:
+ smb2_set_err_rsp(work);
+ return err;
- ksmbd_close_tree_conn_fds(work);
- ksmbd_tree_conn_disconnect(sess, tcon);
- work->tcon = NULL;
- return 0;
}
/**
@@ -2164,17 +2191,17 @@ int smb2_session_logoff(struct ksmbd_work *work)
ksmbd_debug(SMB, "request\n");
- sess_id = le64_to_cpu(req->hdr.SessionId);
-
- rsp->StructureSize = cpu_to_le16(4);
- err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp));
- if (err) {
- rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES;
+ ksmbd_conn_lock(conn);
+ if (!ksmbd_conn_good(conn)) {
+ ksmbd_conn_unlock(conn);
+ rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
smb2_set_err_rsp(work);
- return err;
+ return -ENOENT;
}
-
+ sess_id = le64_to_cpu(req->hdr.SessionId);
ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_RECONNECT);
+ ksmbd_conn_unlock(conn);
+
ksmbd_close_session_fds(work);
ksmbd_conn_wait_idle(conn, sess_id);
@@ -2196,6 +2223,14 @@ int smb2_session_logoff(struct ksmbd_work *work)
ksmbd_free_user(sess->user);
sess->user = NULL;
ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_NEGOTIATE);
+
+ rsp->StructureSize = cpu_to_le16(4);
+ err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp));
+ if (err) {
+ rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES;
+ smb2_set_err_rsp(work);
+ return err;
+ }
return 0;
}
@@ -3370,8 +3405,10 @@ err_out:
}
ksmbd_revert_fsids(work);
err_out1:
- if (!rc)
+ if (!rc) {
+ ksmbd_update_fstate(&work->sess->file_table, fp, FP_INITED);
rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len);
+ }
if (rc) {
if (rc == -EINVAL)
rsp->hdr.Status = STATUS_INVALID_PARAMETER;
@@ -4797,9 +4834,9 @@ static void find_file_posix_info(struct smb2_query_info_rsp *rsp,
file_info = (struct smb311_posix_qinfo *)rsp->Buffer;
file_info->CreationTime = cpu_to_le64(fp->create_time);
- time = ksmbd_UnixTimeToNT(inode->i_atime);
+ time = ksmbd_UnixTimeToNT(inode_get_atime(inode));
file_info->LastAccessTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode->i_mtime);
+ time = ksmbd_UnixTimeToNT(inode_get_mtime(inode));
file_info->LastWriteTime = cpu_to_le64(time);
time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
file_info->ChangeTime = cpu_to_le64(time);
@@ -5406,9 +5443,9 @@ int smb2_close(struct ksmbd_work *work)
rsp->EndOfFile = cpu_to_le64(inode->i_size);
rsp->Attributes = fp->f_ci->m_fattr;
rsp->CreationTime = cpu_to_le64(fp->create_time);
- time = ksmbd_UnixTimeToNT(inode->i_atime);
+ time = ksmbd_UnixTimeToNT(inode_get_atime(inode));
rsp->LastAccessTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode->i_mtime);
+ time = ksmbd_UnixTimeToNT(inode_get_mtime(inode));
rsp->LastWriteTime = cpu_to_le64(time);
time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
rsp->ChangeTime = cpu_to_le64(time);
@@ -6115,12 +6152,12 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work)
memcpy(aux_payload_buf, rpc_resp->payload, rpc_resp->payload_sz);
nbytes = rpc_resp->payload_sz;
- kvfree(rpc_resp);
err = ksmbd_iov_pin_rsp_read(work, (void *)rsp,
offsetof(struct smb2_read_rsp, Buffer),
aux_payload_buf, nbytes);
if (err)
goto out;
+ kvfree(rpc_resp);
} else {
err = ksmbd_iov_pin_rsp(work, (void *)rsp,
offsetof(struct smb2_read_rsp, Buffer));
@@ -6312,7 +6349,7 @@ int smb2_read(struct ksmbd_work *work)
aux_payload_buf,
nbytes);
kvfree(aux_payload_buf);
-
+ aux_payload_buf = NULL;
nbytes = 0;
if (remain_bytes < 0) {
err = (int)remain_bytes;
@@ -7028,10 +7065,6 @@ skip:
ksmbd_debug(SMB,
"would have to wait for getting lock\n");
- spin_lock(&work->conn->llist_lock);
- list_add_tail(&smb_lock->clist,
- &work->conn->lock_list);
- spin_unlock(&work->conn->llist_lock);
list_add(&smb_lock->llist, &rollback_list);
argv = kmalloc(sizeof(void *), GFP_KERNEL);
@@ -7062,9 +7095,6 @@ skip:
if (work->state != KSMBD_WORK_ACTIVE) {
list_del(&smb_lock->llist);
- spin_lock(&work->conn->llist_lock);
- list_del(&smb_lock->clist);
- spin_unlock(&work->conn->llist_lock);
locks_free_lock(flock);
if (work->state == KSMBD_WORK_CANCELLED) {
@@ -7084,19 +7114,16 @@ skip:
}
list_del(&smb_lock->llist);
- spin_lock(&work->conn->llist_lock);
- list_del(&smb_lock->clist);
- spin_unlock(&work->conn->llist_lock);
release_async_work(work);
goto retry;
} else if (!rc) {
+ list_add(&smb_lock->llist, &rollback_list);
spin_lock(&work->conn->llist_lock);
list_add_tail(&smb_lock->clist,
&work->conn->lock_list);
list_add_tail(&smb_lock->flist,
&fp->lock_list);
spin_unlock(&work->conn->llist_lock);
- list_add(&smb_lock->llist, &rollback_list);
ksmbd_debug(SMB, "successful in taking lock\n");
} else {
goto out;
@@ -8036,10 +8063,10 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
goto err_out;
}
- opinfo_put(opinfo);
- ksmbd_fd_put(work, fp);
opinfo->op_state = OPLOCK_STATE_NONE;
wake_up_interruptible_all(&opinfo->oplock_q);
+ opinfo_put(opinfo);
+ ksmbd_fd_put(work, fp);
rsp->StructureSize = cpu_to_le16(24);
rsp->OplockLevel = rsp_oplevel;
diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c
index e5e438bf5499..6c0305be895e 100644
--- a/fs/smb/server/smbacl.c
+++ b/fs/smb/server/smbacl.c
@@ -1420,7 +1420,6 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
out:
posix_acl_release(fattr.cf_acls);
posix_acl_release(fattr.cf_dacls);
- mark_inode_dirty(inode);
return rc;
}
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index f41f8d6108ce..c91eac6514dd 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -106,7 +106,7 @@ int ksmbd_query_inode_status(struct inode *inode)
ci = __ksmbd_inode_lookup(inode);
if (ci) {
ret = KSMBD_INODE_STATUS_OK;
- if (ci->m_flags & S_DEL_PENDING)
+ if (ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS))
ret = KSMBD_INODE_STATUS_PENDING_DELETE;
atomic_dec(&ci->m_count);
}
@@ -116,7 +116,7 @@ int ksmbd_query_inode_status(struct inode *inode)
bool ksmbd_inode_pending_delete(struct ksmbd_file *fp)
{
- return (fp->f_ci->m_flags & S_DEL_PENDING);
+ return (fp->f_ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS));
}
void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp)
@@ -333,6 +333,9 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp)
static struct ksmbd_file *ksmbd_fp_get(struct ksmbd_file *fp)
{
+ if (fp->f_state != FP_INITED)
+ return NULL;
+
if (!atomic_inc_not_zero(&fp->refcount))
return NULL;
return fp;
@@ -382,15 +385,20 @@ int ksmbd_close_fd(struct ksmbd_work *work, u64 id)
return 0;
ft = &work->sess->file_table;
- read_lock(&ft->lock);
+ write_lock(&ft->lock);
fp = idr_find(ft->idr, id);
if (fp) {
set_close_state_blocked_works(fp);
- if (!atomic_dec_and_test(&fp->refcount))
+ if (fp->f_state != FP_INITED)
fp = NULL;
+ else {
+ fp->f_state = FP_CLOSED;
+ if (!atomic_dec_and_test(&fp->refcount))
+ fp = NULL;
+ }
}
- read_unlock(&ft->lock);
+ write_unlock(&ft->lock);
if (!fp)
return -EINVAL;
@@ -570,6 +578,7 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp)
fp->tcon = work->tcon;
fp->volatile_id = KSMBD_NO_FID;
fp->persistent_id = KSMBD_NO_FID;
+ fp->f_state = FP_NEW;
fp->f_ci = ksmbd_inode_get(fp);
if (!fp->f_ci) {
@@ -591,6 +600,17 @@ err_out:
return ERR_PTR(ret);
}
+void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp,
+ unsigned int state)
+{
+ if (!fp)
+ return;
+
+ write_lock(&ft->lock);
+ fp->f_state = state;
+ write_unlock(&ft->lock);
+}
+
static int
__close_file_table_ids(struct ksmbd_file_table *ft,
struct ksmbd_tree_connect *tcon,
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index fcb13413fa8d..03d0bf941216 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -60,6 +60,12 @@ struct ksmbd_inode {
__le32 m_fattr;
};
+enum {
+ FP_NEW = 0,
+ FP_INITED,
+ FP_CLOSED
+};
+
struct ksmbd_file {
struct file *filp;
u64 persistent_id;
@@ -98,6 +104,7 @@ struct ksmbd_file {
/* if ls is happening on directory, below is valid*/
struct ksmbd_readdir_data readdir_data;
int dot_dotdot[2];
+ unsigned int f_state;
};
static inline void set_ctx_actor(struct dir_context *ctx,
@@ -142,6 +149,8 @@ int ksmbd_close_inode_fds(struct ksmbd_work *work, struct inode *inode);
int ksmbd_init_global_file_table(void);
void ksmbd_free_global_file_table(void);
void ksmbd_set_fd_limit(unsigned long limit);
+void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp,
+ unsigned int state);
/*
* INODE hash
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index c6e626b00546..aa3411354e66 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -59,9 +59,9 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
i_uid_write(inode, i_uid);
i_gid_write(inode, i_gid);
inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
- inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
- inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
- inode_set_ctime(inode, inode->i_mtime.tv_sec, 0);
+ inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0);
+ inode_set_atime(inode, inode_get_mtime_sec(inode), 0);
+ inode_set_ctime(inode, inode_get_mtime_sec(inode), 0);
inode->i_mode = le16_to_cpu(sqsh_ino->mode);
inode->i_size = 0;
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index a6164fdf9435..5a756e6790b5 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -111,4 +111,4 @@ extern const struct address_space_operations squashfs_symlink_aops;
extern const struct inode_operations squashfs_symlink_inode_ops;
/* xattr.c */
-extern const struct xattr_handler *squashfs_xattr_handlers[];
+extern const struct xattr_handler * const squashfs_xattr_handlers[];
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index e1e3f3dd5a06..ce6608cabd49 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -262,7 +262,7 @@ static const struct xattr_handler *squashfs_xattr_handler(int type)
}
}
-const struct xattr_handler *squashfs_xattr_handlers[] = {
+const struct xattr_handler * const squashfs_xattr_handlers[] = {
&squashfs_xattr_user_handler,
&squashfs_xattr_trusted_handler,
&squashfs_xattr_security_handler,
diff --git a/fs/stack.c b/fs/stack.c
index b5e01bdb5f5f..f18920119944 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -66,8 +66,8 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src)
dest->i_uid = src->i_uid;
dest->i_gid = src->i_gid;
dest->i_rdev = src->i_rdev;
- dest->i_atime = src->i_atime;
- dest->i_mtime = src->i_mtime;
+ inode_set_atime_to_ts(dest, inode_get_atime(src));
+ inode_set_mtime_to_ts(dest, inode_get_mtime(src));
inode_set_ctime_to_ts(dest, inode_get_ctime(src));
dest->i_blkbits = src->i_blkbits;
dest->i_flags = src->i_flags;
diff --git a/fs/stat.c b/fs/stat.c
index 6822ac77aec2..24bb0209e459 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -27,37 +27,6 @@
#include "mount.h"
/**
- * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED
- * @stat: where to store the resulting values
- * @request_mask: STATX_* values requested
- * @inode: inode from which to grab the c/mtime
- *
- * Given @inode, grab the ctime and mtime out if it and store the result
- * in @stat. When fetching the value, flag it as queried so the next write
- * will use a fine-grained timestamp.
- */
-void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode)
-{
- atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec;
-
- /* If neither time was requested, then don't report them */
- if (!(request_mask & (STATX_CTIME|STATX_MTIME))) {
- stat->result_mask &= ~(STATX_CTIME|STATX_MTIME);
- return;
- }
-
- stat->mtime = inode->i_mtime;
- stat->ctime.tv_sec = inode->__i_ctime.tv_sec;
- /*
- * Atomically set the QUERIED flag and fetch the new value with
- * the flag masked off.
- */
- stat->ctime.tv_nsec = atomic_long_fetch_or(I_CTIME_QUERIED, pnsec) &
- ~I_CTIME_QUERIED;
-}
-EXPORT_SYMBOL(fill_mg_cmtime);
-
-/**
* generic_fillattr - Fill in the basic attributes from the inode struct
* @idmap: idmap of the mount the inode was found from
* @request_mask: statx request_mask
@@ -88,15 +57,9 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->i_rdev;
stat->size = i_size_read(inode);
- stat->atime = inode->i_atime;
-
- if (is_mgtime(inode)) {
- fill_mg_cmtime(stat, request_mask, inode);
- } else {
- stat->mtime = inode->i_mtime;
- stat->ctime = inode_get_ctime(inode);
- }
-
+ stat->atime = inode_get_atime(inode);
+ stat->mtime = inode_get_mtime(inode);
+ stat->ctime = inode_get_ctime(inode);
stat->blksize = i_blocksize(inode);
stat->blocks = inode->i_blocks;
@@ -419,12 +382,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
#ifdef __ARCH_WANT_NEW_STAT
-#if BITS_PER_LONG == 32
-# define choose_32_64(a,b) a
-#else
-# define choose_32_64(a,b) b
-#endif
-
#ifndef INIT_STRUCT_STAT_PADDING
# define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
#endif
diff --git a/fs/super.c b/fs/super.c
index 2d762ce67f6e..c7b452e12e4c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1419,32 +1419,48 @@ EXPORT_SYMBOL(sget_dev);
#ifdef CONFIG_BLOCK
/*
- * Lock a super block that the callers holds a reference to.
+ * Lock the superblock that is holder of the bdev. Returns the superblock
+ * pointer if we successfully locked the superblock and it is alive. Otherwise
+ * we return NULL and just unlock bdev->bd_holder_lock.
*
- * The caller needs to ensure that the super_block isn't being freed while
- * calling this function, e.g. by holding a lock over the call to this function
- * and the place that clears the pointer to the superblock used by this function
- * before freeing the superblock.
+ * The function must be called with bdev->bd_holder_lock and releases it.
*/
-static bool super_lock_shared_active(struct super_block *sb)
+static struct super_block *bdev_super_lock_shared(struct block_device *bdev)
+ __releases(&bdev->bd_holder_lock)
{
- bool born = super_lock_shared(sb);
+ struct super_block *sb = bdev->bd_holder;
+ bool born;
+
+ lockdep_assert_held(&bdev->bd_holder_lock);
+ lockdep_assert_not_held(&sb->s_umount);
+ lockdep_assert_not_held(&bdev->bd_disk->open_mutex);
+
+ /* Make sure sb doesn't go away from under us */
+ spin_lock(&sb_lock);
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+ mutex_unlock(&bdev->bd_holder_lock);
+ born = super_lock_shared(sb);
if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
super_unlock_shared(sb);
- return false;
+ put_super(sb);
+ return NULL;
}
- return true;
+ /*
+ * The superblock is active and we hold s_umount, we can drop our
+ * temporary reference now.
+ */
+ put_super(sb);
+ return sb;
}
static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
{
- struct super_block *sb = bdev->bd_holder;
-
- /* bd_holder_lock ensures that the sb isn't freed */
- lockdep_assert_held(&bdev->bd_holder_lock);
+ struct super_block *sb;
- if (!super_lock_shared_active(sb))
+ sb = bdev_super_lock_shared(bdev);
+ if (!sb)
return;
if (!surprise)
@@ -1459,11 +1475,10 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
static void fs_bdev_sync(struct block_device *bdev)
{
- struct super_block *sb = bdev->bd_holder;
-
- lockdep_assert_held(&bdev->bd_holder_lock);
+ struct super_block *sb;
- if (!super_lock_shared_active(sb))
+ sb = bdev_super_lock_shared(bdev);
+ if (!sb)
return;
sync_filesystem(sb);
super_unlock_shared(sb);
@@ -1479,14 +1494,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
struct fs_context *fc)
{
blk_mode_t mode = sb_open_mode(sb_flags);
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
- bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
- if (IS_ERR(bdev)) {
+ bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
+ if (IS_ERR(bdev_handle)) {
if (fc)
errorf(fc, "%s: Can't open blockdev", fc->source);
- return PTR_ERR(bdev);
+ return PTR_ERR(bdev_handle);
}
+ bdev = bdev_handle->bdev;
/*
* This really should be in blkdev_get_by_dev, but right now can't due
@@ -1494,7 +1511,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
* writable from userspace even for a read-only block device.
*/
if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
- blkdev_put(bdev, sb);
+ bdev_release(bdev_handle);
return -EACCES;
}
@@ -1510,10 +1527,11 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
mutex_unlock(&bdev->bd_fsfreeze_mutex);
if (fc)
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
- blkdev_put(bdev, sb);
+ bdev_release(bdev_handle);
return -EBUSY;
}
spin_lock(&sb_lock);
+ sb->s_bdev_handle = bdev_handle;
sb->s_bdev = bdev;
sb->s_bdi = bdi_get(bdev->bd_disk->bdi);
if (bdev_stable_writes(bdev))
@@ -1646,7 +1664,7 @@ void kill_block_super(struct super_block *sb)
generic_shutdown_super(sb);
if (bdev) {
sync_blockdev(bdev);
- blkdev_put(bdev, sb);
+ bdev_release(sb->s_bdev_handle);
}
}
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 2f5ead88d00b..2e126d72d619 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -224,7 +224,7 @@ got_it:
memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2);
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
err = sysv_handle_dirsync(dir);
out_page:
@@ -249,7 +249,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
}
de->inode = 0;
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
return sysv_handle_dirsync(inode);
}
@@ -346,7 +346,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page,
}
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
return sysv_handle_dirsync(inode);
}
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 6719da5889d9..269df6d49815 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -165,7 +165,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
dirty_sb(sb);
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = fs16_to_cpu(sbi, ino);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_blocks = 0;
memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data));
SYSV_I(inode)->i_dir_start_lookup = 0;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 0aa3827d8178..5a915b2e68f5 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -200,11 +200,9 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid));
set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink));
inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size);
- inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime);
- inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime);
+ inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->i_atime), 0);
+ inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->i_mtime), 0);
inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0);
- inode->i_atime.tv_nsec = 0;
- inode->i_mtime.tv_nsec = 0;
inode->i_blocks = 0;
si = SYSV_I(inode);
@@ -253,9 +251,9 @@ static int __sysv_write_inode(struct inode *inode, int wait)
raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode)));
raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink);
raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size);
- raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec);
- raw_inode->i_mtime = cpu_to_fs32(sbi, inode->i_mtime.tv_sec);
- raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime(inode).tv_sec);
+ raw_inode->i_atime = cpu_to_fs32(sbi, inode_get_atime_sec(inode));
+ raw_inode->i_mtime = cpu_to_fs32(sbi, inode_get_mtime_sec(inode));
+ raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime_sec(inode));
si = SYSV_I(inode);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index edb94e55de8e..725981474e5f 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -423,7 +423,7 @@ do_indirects:
}
n++;
}
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (IS_SYNC(inode))
sysv_sync_inode (inode);
else
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 237c6f370ad9..8c8d64e76103 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -70,6 +70,7 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags);
static int dcache_dir_open_wrapper(struct inode *inode, struct file *file);
+static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx);
static int eventfs_release(struct inode *inode, struct file *file);
static const struct inode_operations eventfs_root_dir_inode_operations = {
@@ -79,7 +80,7 @@ static const struct inode_operations eventfs_root_dir_inode_operations = {
static const struct file_operations eventfs_file_operations = {
.open = dcache_dir_open_wrapper,
.read = generic_read_dir,
- .iterate_shared = dcache_readdir,
+ .iterate_shared = dcache_readdir_wrapper,
.llseek = generic_file_llseek,
.release = eventfs_release,
};
@@ -185,17 +186,49 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
/**
* eventfs_set_ef_status_free - set the ef->status to free
+ * @ti: the tracefs_inode of the dentry
* @dentry: dentry who's status to be freed
*
* eventfs_set_ef_status_free will be called if no more
* references remain
*/
-void eventfs_set_ef_status_free(struct dentry *dentry)
+void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
{
struct tracefs_inode *ti_parent;
- struct eventfs_file *ef;
+ struct eventfs_inode *ei;
+ struct eventfs_file *ef, *tmp;
+
+ /* The top level events directory may be freed by this */
+ if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) {
+ LIST_HEAD(ef_del_list);
+
+ mutex_lock(&eventfs_mutex);
+
+ ei = ti->private;
+
+ /* Record all the top level files */
+ list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+ lockdep_is_held(&eventfs_mutex)) {
+ list_add_tail(&ef->del_list, &ef_del_list);
+ }
+
+ /* Nothing should access this, but just in case! */
+ ti->private = NULL;
+
+ mutex_unlock(&eventfs_mutex);
+
+ /* Now safely free the top level files and their children */
+ list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
+ list_del(&ef->del_list);
+ eventfs_remove(ef);
+ }
+
+ kfree(ei);
+ return;
+ }
mutex_lock(&eventfs_mutex);
+
ti_parent = get_tracefs(dentry->d_parent->d_inode);
if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE))
goto out;
@@ -364,6 +397,11 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
return ret;
}
+struct dentry_list {
+ void *cursor;
+ struct dentry **dentries;
+};
+
/**
* eventfs_release - called to release eventfs file/dir
* @inode: inode to be released
@@ -372,26 +410,25 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
static int eventfs_release(struct inode *inode, struct file *file)
{
struct tracefs_inode *ti;
- struct eventfs_inode *ei;
- struct eventfs_file *ef;
- struct dentry *dentry;
- int idx;
+ struct dentry_list *dlist = file->private_data;
+ void *cursor;
+ int i;
ti = get_tracefs(inode);
if (!(ti->flags & TRACEFS_EVENT_INODE))
return -EINVAL;
- ei = ti->private;
- idx = srcu_read_lock(&eventfs_srcu);
- list_for_each_entry_srcu(ef, &ei->e_top_files, list,
- srcu_read_lock_held(&eventfs_srcu)) {
- mutex_lock(&eventfs_mutex);
- dentry = ef->dentry;
- mutex_unlock(&eventfs_mutex);
- if (dentry)
- dput(dentry);
+ if (WARN_ON_ONCE(!dlist))
+ return -EINVAL;
+
+ for (i = 0; dlist->dentries && dlist->dentries[i]; i++) {
+ dput(dlist->dentries[i]);
}
- srcu_read_unlock(&eventfs_srcu, idx);
+
+ cursor = dlist->cursor;
+ kfree(dlist->dentries);
+ kfree(dlist);
+ file->private_data = cursor;
return dcache_dir_close(inode, file);
}
@@ -410,21 +447,70 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
struct tracefs_inode *ti;
struct eventfs_inode *ei;
struct eventfs_file *ef;
+ struct dentry_list *dlist;
+ struct dentry **dentries = NULL;
struct dentry *dentry = file_dentry(file);
+ struct dentry *d;
struct inode *f_inode = file_inode(file);
+ int cnt = 0;
int idx;
+ int ret;
ti = get_tracefs(f_inode);
if (!(ti->flags & TRACEFS_EVENT_INODE))
return -EINVAL;
+ if (WARN_ON_ONCE(file->private_data))
+ return -EINVAL;
+
+ dlist = kmalloc(sizeof(*dlist), GFP_KERNEL);
+ if (!dlist)
+ return -ENOMEM;
+
ei = ti->private;
idx = srcu_read_lock(&eventfs_srcu);
- list_for_each_entry_rcu(ef, &ei->e_top_files, list) {
- create_dentry(ef, dentry, false);
+ list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+ srcu_read_lock_held(&eventfs_srcu)) {
+ d = create_dentry(ef, dentry, false);
+ if (d) {
+ struct dentry **tmp;
+
+ tmp = krealloc(dentries, sizeof(d) * (cnt + 2), GFP_KERNEL);
+ if (!tmp)
+ break;
+ tmp[cnt] = d;
+ tmp[cnt + 1] = NULL;
+ cnt++;
+ dentries = tmp;
+ }
}
srcu_read_unlock(&eventfs_srcu, idx);
- return dcache_dir_open(inode, file);
+ ret = dcache_dir_open(inode, file);
+
+ /*
+ * dcache_dir_open() sets file->private_data to a dentry cursor.
+ * Need to save that but also save all the dentries that were
+ * opened by this function.
+ */
+ dlist->cursor = file->private_data;
+ dlist->dentries = dentries;
+ file->private_data = dlist;
+ return ret;
+}
+
+/*
+ * This just sets the file->private_data back to the cursor and back.
+ */
+static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx)
+{
+ struct dentry_list *dlist = file->private_data;
+ int ret;
+
+ file->private_data = dlist->cursor;
+ ret = dcache_readdir(file, ctx);
+ dlist->cursor = file->private_data;
+ file->private_data = dlist;
+ return ret;
}
/**
@@ -491,6 +577,9 @@ struct dentry *eventfs_create_events_dir(const char *name,
struct tracefs_inode *ti;
struct inode *inode;
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
if (IS_ERR(dentry))
return dentry;
@@ -507,7 +596,7 @@ struct dentry *eventfs_create_events_dir(const char *name,
INIT_LIST_HEAD(&ei->e_top_files);
ti = get_tracefs(inode);
- ti->flags |= TRACEFS_EVENT_INODE;
+ ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE;
ti->private = ei;
inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
@@ -538,6 +627,9 @@ struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
struct eventfs_inode *ei_parent;
struct eventfs_file *ef;
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
if (!parent)
return ERR_PTR(-EINVAL);
@@ -569,6 +661,9 @@ struct eventfs_file *eventfs_add_dir(const char *name,
{
struct eventfs_file *ef;
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
if (!ef_parent)
return ERR_PTR(-EINVAL);
@@ -606,6 +701,9 @@ int eventfs_add_events_file(const char *name, umode_t mode,
struct eventfs_inode *ei;
struct eventfs_file *ef;
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return -ENODEV;
+
if (!parent)
return -EINVAL;
@@ -654,6 +752,9 @@ int eventfs_add_file(const char *name, umode_t mode,
{
struct eventfs_file *ef;
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return -ENODEV;
+
if (!ef_parent)
return -EINVAL;
@@ -791,7 +892,6 @@ void eventfs_remove(struct eventfs_file *ef)
void eventfs_remove_events_dir(struct dentry *dentry)
{
struct tracefs_inode *ti;
- struct eventfs_inode *ei;
if (!dentry || !dentry->d_inode)
return;
@@ -800,8 +900,6 @@ void eventfs_remove_events_dir(struct dentry *dentry)
if (!ti || !(ti->flags & TRACEFS_EVENT_INODE))
return;
- ei = ti->private;
d_invalidate(dentry);
dput(dentry);
- kfree(ei);
}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index de5b72216b1a..429603d865a9 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -152,7 +152,7 @@ struct inode *tracefs_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
}
return inode;
}
@@ -385,7 +385,7 @@ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
ti = get_tracefs(inode);
if (ti && ti->flags & TRACEFS_EVENT_INODE)
- eventfs_set_ef_status_free(dentry);
+ eventfs_set_ef_status_free(ti, dentry);
iput(inode);
}
@@ -673,6 +673,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent,
*/
struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
{
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
return __create_dir(name, parent, &simple_dir_inode_operations);
}
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 69c2b1d87c46..4f2e49e2197b 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -3,7 +3,8 @@
#define _TRACEFS_INTERNAL_H
enum {
- TRACEFS_EVENT_INODE = BIT(1),
+ TRACEFS_EVENT_INODE = BIT(1),
+ TRACEFS_EVENT_TOP_INODE = BIT(2),
};
struct tracefs_inode {
@@ -24,6 +25,6 @@ struct inode *tracefs_get_inode(struct super_block *sb);
struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
struct dentry *eventfs_failed_creating(struct dentry *dentry);
struct dentry *eventfs_end_creating(struct dentry *dentry);
-void eventfs_set_ef_status_free(struct dentry *dentry);
+void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry);
#endif /* _TRACEFS_INTERNAL_H */
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 3125e76376ee..921f9033d0d2 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -88,8 +88,7 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
}
const struct fscrypt_operations ubifs_crypt_operations = {
- .flags = FS_CFLG_OWN_PAGES,
- .key_prefix = "ubifs:",
+ .legacy_key_prefix = "ubifs:",
.get_context = ubifs_crypt_get_context,
.set_context = ubifs_crypt_set_context,
.empty_dir = ubifs_crypt_empty_dir,
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index eef9e527d9ff..d013c5b3f1ed 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -237,14 +237,14 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
pr_err("\tuid %u\n", (unsigned int)i_uid_read(inode));
pr_err("\tgid %u\n", (unsigned int)i_gid_read(inode));
pr_err("\tatime %u.%u\n",
- (unsigned int)inode->i_atime.tv_sec,
- (unsigned int)inode->i_atime.tv_nsec);
+ (unsigned int) inode_get_atime_sec(inode),
+ (unsigned int) inode_get_atime_nsec(inode));
pr_err("\tmtime %u.%u\n",
- (unsigned int)inode->i_mtime.tv_sec,
- (unsigned int)inode->i_mtime.tv_nsec);
+ (unsigned int) inode_get_mtime_sec(inode),
+ (unsigned int) inode_get_mtime_nsec(inode));
pr_err("\tctime %u.%u\n",
- (unsigned int) inode_get_ctime(inode).tv_sec,
- (unsigned int) inode_get_ctime(inode).tv_nsec);
+ (unsigned int) inode_get_ctime_sec(inode),
+ (unsigned int) inode_get_ctime_nsec(inode));
pr_err("\tcreat_sqnum %llu\n", ui->creat_sqnum);
pr_err("\txattr_size %u\n", ui->xattr_size);
pr_err("\txattr_cnt %u\n", ui->xattr_cnt);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 2f48c58d47cd..7af442de44c3 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -96,7 +96,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
inode->i_flags |= S_NOCMTIME;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_mapping->nrpages = 0;
if (!is_xattr) {
@@ -324,7 +324,8 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -767,7 +768,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
inode_set_ctime_current(inode);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -841,7 +843,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
drop_nlink(inode);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
@@ -944,7 +947,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
drop_nlink(dir);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
@@ -1018,7 +1022,8 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inc_nlink(dir);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err) {
ubifs_err(c, "cannot create directory, error %d", err);
@@ -1109,7 +1114,8 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -1209,7 +1215,8 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e5382f0b2587..2e65fd2dbdc3 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1088,9 +1088,9 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr)
if (attr->ia_valid & ATTR_GID)
inode->i_gid = attr->ia_gid;
if (attr->ia_valid & ATTR_ATIME)
- inode->i_atime = attr->ia_atime;
+ inode_set_atime_to_ts(inode, attr->ia_atime);
if (attr->ia_valid & ATTR_MTIME)
- inode->i_mtime = attr->ia_mtime;
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
if (attr->ia_valid & ATTR_CTIME)
inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (attr->ia_valid & ATTR_MODE) {
@@ -1192,7 +1192,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
ui->ui_size = inode->i_size;
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
/* Other attributes may be changed at the same time as well */
do_attr_changes(inode, attr);
err = ubifs_jnl_truncate(c, inode, old_size, new_size);
@@ -1239,7 +1239,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
if (attr->ia_valid & ATTR_SIZE) {
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
/* 'truncate_setsize()' changed @i_size, update @ui_size */
ui->ui_size = inode->i_size;
}
@@ -1365,9 +1365,9 @@ static inline int mctime_update_needed(const struct inode *inode,
const struct timespec64 *now)
{
struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
- if (!timespec64_equal(&inode->i_mtime, now) ||
- !timespec64_equal(&ctime, now))
+ if (!timespec64_equal(&mtime, now) || !timespec64_equal(&ctime, now))
return 1;
return 0;
}
@@ -1429,7 +1429,7 @@ static int update_mctime(struct inode *inode)
return err;
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
@@ -1567,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
struct ubifs_inode *ui = ubifs_inode(inode);
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index ffc9beee7be6..d69d2154645b 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -452,12 +452,12 @@ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
ino->ch.node_type = UBIFS_INO_NODE;
ino_key_init_flash(c, &ino->key, inode->i_ino);
ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum);
- ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec);
- ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- ino->ctime_sec = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- ino->ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
- ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec);
- ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ino->atime_sec = cpu_to_le64(inode_get_atime_sec(inode));
+ ino->atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode));
+ ino->ctime_sec = cpu_to_le64(inode_get_ctime_sec(inode));
+ ino->ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
+ ino->mtime_sec = cpu_to_le64(inode_get_mtime_sec(inode));
+ ino->mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
ino->uid = cpu_to_le32(i_uid_read(inode));
ino->gid = cpu_to_le32(i_gid_read(inode));
ino->mode = cpu_to_le32(inode->i_mode);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b08fb28d16b5..366941d4a18a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -142,10 +142,10 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
set_nlink(inode, le32_to_cpu(ino->nlink));
i_uid_write(inode, le32_to_cpu(ino->uid));
i_gid_write(inode, le32_to_cpu(ino->gid));
- inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec);
- inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec);
- inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec);
- inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec);
+ inode_set_atime(inode, (int64_t)le64_to_cpu(ino->atime_sec),
+ le32_to_cpu(ino->atime_nsec));
+ inode_set_mtime(inode, (int64_t)le64_to_cpu(ino->mtime_sec),
+ le32_to_cpu(ino->mtime_nsec));
inode_set_ctime(inode, (int64_t)le64_to_cpu(ino->ctime_sec),
le32_to_cpu(ino->ctime_nsec));
inode->i_mode = le32_to_cpu(ino->mode);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index ebb3ad6b5e7e..62633816d7d0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -2043,7 +2043,7 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
size_t size);
#ifdef CONFIG_UBIFS_FS_XATTR
-extern const struct xattr_handler *ubifs_xattr_handlers[];
+extern const struct xattr_handler * const ubifs_xattr_handlers[];
ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
void ubifs_evict_xattr_inode(struct ubifs_info *c, ino_t xattr_inum);
int ubifs_purge_xattrs(struct inode *host);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 406c82eab513..0847db521984 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -735,7 +735,7 @@ static const struct xattr_handler ubifs_security_xattr_handler = {
};
#endif
-const struct xattr_handler *ubifs_xattr_handlers[] = {
+const struct xattr_handler * const ubifs_xattr_handlers[] = {
&ubifs_user_xattr_handler,
&ubifs_trusted_xattr_handler,
#ifdef CONFIG_UBIFS_FS_SECURITY
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 6b558cbbeb6b..5f1f969f4134 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -100,8 +100,8 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
else
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
- iinfo->i_crtime = inode->i_mtime;
+ simple_inode_init_ts(inode);
+ iinfo->i_crtime = inode_get_mtime(inode);
if (unlikely(insert_inode_locked(inode) < 0)) {
make_bad_inode(inode);
iput(inode);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a17a6184cc39..d8493449d4c5 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1296,7 +1296,7 @@ set_size:
goto out_unlock;
}
update_time:
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
if (IS_SYNC(inode))
udf_sync_inode(inode);
else
@@ -1327,7 +1327,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
int bs = inode->i_sb->s_blocksize;
int ret = -EIO;
uint32_t uid, gid;
- struct timespec64 ctime;
+ struct timespec64 ts;
reread:
if (iloc->partitionReferenceNum >= sbi->s_partitions) {
@@ -1504,10 +1504,12 @@ reread:
inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
(inode->i_sb->s_blocksize_bits - 9);
- udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime);
- udf_disk_stamp_to_time(&inode->i_mtime, fe->modificationTime);
- udf_disk_stamp_to_time(&ctime, fe->attrTime);
- inode_set_ctime_to_ts(inode, ctime);
+ udf_disk_stamp_to_time(&ts, fe->accessTime);
+ inode_set_atime_to_ts(inode, ts);
+ udf_disk_stamp_to_time(&ts, fe->modificationTime);
+ inode_set_mtime_to_ts(inode, ts);
+ udf_disk_stamp_to_time(&ts, fe->attrTime);
+ inode_set_ctime_to_ts(inode, ts);
iinfo->i_unique = le64_to_cpu(fe->uniqueID);
iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
@@ -1519,11 +1521,13 @@ reread:
inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
(inode->i_sb->s_blocksize_bits - 9);
- udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime);
- udf_disk_stamp_to_time(&inode->i_mtime, efe->modificationTime);
+ udf_disk_stamp_to_time(&ts, efe->accessTime);
+ inode_set_atime_to_ts(inode, ts);
+ udf_disk_stamp_to_time(&ts, efe->modificationTime);
+ inode_set_mtime_to_ts(inode, ts);
+ udf_disk_stamp_to_time(&ts, efe->attrTime);
+ inode_set_ctime_to_ts(inode, ts);
udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime);
- udf_disk_stamp_to_time(&ctime, efe->attrTime);
- inode_set_ctime_to_ts(inode, ctime);
iinfo->i_unique = le64_to_cpu(efe->uniqueID);
iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
@@ -1798,8 +1802,8 @@ static int udf_update_inode(struct inode *inode, int do_sync)
inode->i_sb->s_blocksize - sizeof(struct fileEntry));
fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
- udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
- udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
+ udf_time_to_disk_stamp(&fe->accessTime, inode_get_atime(inode));
+ udf_time_to_disk_stamp(&fe->modificationTime, inode_get_mtime(inode));
udf_time_to_disk_stamp(&fe->attrTime, inode_get_ctime(inode));
memset(&(fe->impIdent), 0, sizeof(struct regid));
strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
@@ -1829,12 +1833,14 @@ static int udf_update_inode(struct inode *inode, int do_sync)
cpu_to_le32(inode->i_sb->s_blocksize);
}
- udf_adjust_time(iinfo, inode->i_atime);
- udf_adjust_time(iinfo, inode->i_mtime);
+ udf_adjust_time(iinfo, inode_get_atime(inode));
+ udf_adjust_time(iinfo, inode_get_mtime(inode));
udf_adjust_time(iinfo, inode_get_ctime(inode));
- udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime);
- udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime);
+ udf_time_to_disk_stamp(&efe->accessTime,
+ inode_get_atime(inode));
+ udf_time_to_disk_stamp(&efe->modificationTime,
+ inode_get_mtime(inode));
udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
udf_time_to_disk_stamp(&efe->attrTime, inode_get_ctime(inode));
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index ae55ab8859b6..3508ac484da3 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -365,7 +365,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
*(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse =
cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
udf_fiiter_write_fi(&iter, NULL);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
udf_fiiter_release(&iter);
udf_add_fid_counter(dir->i_sb, false, 1);
@@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
udf_fiiter_release(&iter);
udf_add_fid_counter(dir->i_sb, true, 1);
inc_nlink(dir);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
d_instantiate_new(dentry, inode);
@@ -523,8 +523,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_size = 0;
inode_dec_link_count(dir);
udf_add_fid_counter(dir->i_sb, true, -1);
- dir->i_mtime = inode_set_ctime_to_ts(dir,
- inode_set_ctime_current(inode));
+ inode_set_mtime_to_ts(dir,
+ inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
mark_inode_dirty(dir);
ret = 0;
end_rmdir:
@@ -555,7 +555,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
set_nlink(inode, 1);
}
udf_fiiter_delete_entry(&iter);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
inode_dec_link_count(inode);
udf_add_fid_counter(dir->i_sb, false, -1);
@@ -748,7 +748,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
udf_add_fid_counter(dir->i_sb, false, 1);
inode_set_ctime_current(inode);
mark_inode_dirty(inode);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
ihold(inode);
d_instantiate(dentry, inode);
@@ -866,8 +866,8 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode),
-1);
}
- old_dir->i_mtime = inode_set_ctime_current(old_dir);
- new_dir->i_mtime = inode_set_ctime_current(new_dir);
+ inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
+ inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir));
mark_inode_dirty(old_dir);
mark_inode_dirty(new_dir);
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index fd57f03b6c93..27c85d92d1dc 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -107,7 +107,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
ufs_commit_chunk(page, pos, len);
ufs_put_page(page);
if (update_times)
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
ufs_handle_dirsync(dir);
}
@@ -397,7 +397,7 @@ got_it:
ufs_set_de_type(sb, de, inode->i_mode);
ufs_commit_chunk(page, pos, rec_len);
- dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
err = ufs_handle_dirsync(dir);
@@ -539,7 +539,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
pde->d_reclen = cpu_to_fs16(sb, to - from);
dir->d_ino = 0;
ufs_commit_chunk(page, pos, to - from);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
err = ufs_handle_dirsync(inode);
out:
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index a1e7bd9d1f98..73531827ecee 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -292,7 +292,7 @@ cg_found:
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_blocks = 0;
inode->i_generation = 0;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
ufsi->i_flags = UFS_I(dir)->i_flags;
ufsi->i_lastfrag = 0;
ufsi->i_shadow = 0;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 21a4779a2de5..338e4b97312f 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -579,13 +579,15 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode));
inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size);
- inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec);
+ inode_set_atime(inode,
+ (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec),
+ 0);
inode_set_ctime(inode,
(signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec),
0);
- inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec);
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_nsec = 0;
+ inode_set_mtime(inode,
+ (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec),
+ 0);
inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks);
inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen);
ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags);
@@ -626,12 +628,12 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
i_gid_write(inode, fs32_to_cpu(sb, ufs2_inode->ui_gid));
inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size);
- inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime);
+ inode_set_atime(inode, fs64_to_cpu(sb, ufs2_inode->ui_atime),
+ fs32_to_cpu(sb, ufs2_inode->ui_atimensec));
inode_set_ctime(inode, fs64_to_cpu(sb, ufs2_inode->ui_ctime),
fs32_to_cpu(sb, ufs2_inode->ui_ctimensec));
- inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime);
- inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec);
- inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec);
+ inode_set_mtime(inode, fs64_to_cpu(sb, ufs2_inode->ui_mtime),
+ fs32_to_cpu(sb, ufs2_inode->ui_mtimensec));
inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks);
inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen);
ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags);
@@ -725,12 +727,14 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode));
ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
- ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec);
+ ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb,
+ inode_get_atime_sec(inode));
ufs_inode->ui_atime.tv_usec = 0;
ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb,
- inode_get_ctime(inode).tv_sec);
+ inode_get_ctime_sec(inode));
ufs_inode->ui_ctime.tv_usec = 0;
- ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec);
+ ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb,
+ inode_get_mtime_sec(inode));
ufs_inode->ui_mtime.tv_usec = 0;
ufs_inode->ui_blocks = cpu_to_fs32(sb, inode->i_blocks);
ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags);
@@ -770,13 +774,15 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
ufs_inode->ui_gid = cpu_to_fs32(sb, i_gid_read(inode));
ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
- ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec);
- ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec);
- ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime(inode).tv_sec);
+ ufs_inode->ui_atime = cpu_to_fs64(sb, inode_get_atime_sec(inode));
+ ufs_inode->ui_atimensec = cpu_to_fs32(sb,
+ inode_get_atime_nsec(inode));
+ ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime_sec(inode));
ufs_inode->ui_ctimensec = cpu_to_fs32(sb,
- inode_get_ctime(inode).tv_nsec);
- ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec);
- ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec);
+ inode_get_ctime_nsec(inode));
+ ufs_inode->ui_mtime = cpu_to_fs64(sb, inode_get_mtime_sec(inode));
+ ufs_inode->ui_mtimensec = cpu_to_fs32(sb,
+ inode_get_mtime_nsec(inode));
ufs_inode->ui_blocks = cpu_to_fs64(sb, inode->i_blocks);
ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags);
@@ -1208,7 +1214,7 @@ static int ufs_truncate(struct inode *inode, loff_t size)
truncate_setsize(inode, size);
ufs_truncate_blocks(inode);
- inode->i_mtime = inode_set_ctime_current(inode);
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
out:
UFSD("EXIT: err %d\n", err);
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index 83f20dd15522..72ac9320e6a3 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -126,12 +126,12 @@ int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
do_div(allocated, 512);
inode->i_blocks = allocated;
- inode->i_atime = ns_to_timespec64(
- info->access_time.ns_relative_to_unix_epoch);
+ inode_set_atime_to_ts(inode,
+ ns_to_timespec64(info->access_time.ns_relative_to_unix_epoch));
inode_set_ctime_to_ts(inode,
ns_to_timespec64(info->change_time.ns_relative_to_unix_epoch));
- inode->i_mtime = ns_to_timespec64(
- info->modification_time.ns_relative_to_unix_epoch);
+ inode_set_mtime_to_ts(inode,
+ ns_to_timespec64(info->modification_time.ns_relative_to_unix_epoch));
return 0;
}
@@ -194,7 +194,7 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
struct vboxsf_sbi *sbi;
struct vboxsf_inode *sf_i;
struct shfl_fsobjinfo info;
- struct timespec64 prev_mtime;
+ struct timespec64 mtime, prev_mtime;
struct inode *inode;
int err;
@@ -202,7 +202,7 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
return -EINVAL;
inode = d_inode(dentry);
- prev_mtime = inode->i_mtime;
+ prev_mtime = inode_get_mtime(inode);
sf_i = VBOXSF_I(inode);
sbi = VBOXSF_SBI(dentry->d_sb);
if (!sf_i->force_restat) {
@@ -225,7 +225,8 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
* page-cache for it. Note this also gets triggered by our own writes,
* this is unavoidable.
*/
- if (timespec64_compare(&inode->i_mtime, &prev_mtime) > 0)
+ mtime = inode_get_mtime(inode);
+ if (timespec64_compare(&mtime, &prev_mtime) > 0)
invalidate_inode_pages2(inode->i_mapping);
return 0;
diff --git a/fs/xattr.c b/fs/xattr.c
index efd4736bc94b..09d927603433 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -56,7 +56,7 @@ strcmp_prefix(const char *a, const char *a_prefix)
static const struct xattr_handler *
xattr_resolve_name(struct inode *inode, const char **name)
{
- const struct xattr_handler **handlers = inode->i_sb->s_xattr;
+ const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
const struct xattr_handler *handler;
if (!(inode->i_opflags & IOP_XATTR)) {
@@ -162,7 +162,7 @@ xattr_permission(struct mnt_idmap *idmap, struct inode *inode,
int
xattr_supports_user_prefix(struct inode *inode)
{
- const struct xattr_handler **handlers = inode->i_sb->s_xattr;
+ const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
const struct xattr_handler *handler;
if (!(inode->i_opflags & IOP_XATTR)) {
@@ -999,7 +999,7 @@ int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name)
ssize_t
generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
+ const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr;
ssize_t remaining_size = buffer_size;
int err = 0;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index c9d653168ad0..ed0bc8cbc703 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -147,7 +147,7 @@ config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
depends on XFS_ONLINE_SCRUB
- select FS_DEBUG
+ select XFS_DEBUG
help
If you say Y here, the kernel will gather usage data about
the online metadata check subsystem. This includes the number
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index e9cc481b4ddf..f9f4d694640d 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -1001,6 +1001,12 @@ xfs_ag_shrink_space(
error = -ENOSPC;
goto resv_init_out;
}
+
+ /* Update perag geometry */
+ pag->block_count -= delta;
+ __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min,
+ &pag->agino_max);
+
xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH);
xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH);
return 0;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index a35781577cad..543f3748c2a3 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -220,8 +220,10 @@ xfs_inode_from_disk(
* a time before epoch is converted to a time long after epoch
* on 64 bit systems.
*/
- inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime);
- inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime);
+ inode_set_atime_to_ts(inode,
+ xfs_inode_from_disk_ts(from, from->di_atime));
+ inode_set_mtime_to_ts(inode,
+ xfs_inode_from_disk_ts(from, from->di_mtime));
inode_set_ctime_to_ts(inode,
xfs_inode_from_disk_ts(from, from->di_ctime));
@@ -315,8 +317,8 @@ xfs_inode_to_disk(
to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff);
to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16);
- to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
- to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
+ to->di_atime = xfs_inode_to_disk_ts(ip, inode_get_atime(inode));
+ to->di_mtime = xfs_inode_to_disk_ts(ip, inode_get_mtime(inode));
to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode));
to->di_nlink = cpu_to_be32(inode->i_nlink);
to->di_gen = cpu_to_be32(inode->i_generation);
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 2420865f3007..a5100a11faf9 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -131,4 +131,26 @@ void xlog_check_buf_cancel_table(struct xlog *log);
#define xlog_check_buf_cancel_table(log) do { } while (0)
#endif
+/*
+ * Transform a regular reservation into one suitable for recovery of a log
+ * intent item.
+ *
+ * Intent recovery only runs a single step of the transaction chain and defers
+ * the rest to a separate transaction. Therefore, we reduce logcount to 1 here
+ * to avoid livelocks if the log grant space is nearly exhausted due to the
+ * recovered intent pinning the tail. Keep the same logflags to avoid tripping
+ * asserts elsewhere. Struct copies abound below.
+ */
+static inline struct xfs_trans_res
+xlog_recover_resv(const struct xfs_trans_res *r)
+{
+ struct xfs_trans_res ret = {
+ .tr_logres = r->tr_logres,
+ .tr_logcount = 1,
+ .tr_logflags = r->tr_logflags,
+ };
+
+ return ret;
+}
+
#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index fa180ab66b73..396648acb5be 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -970,6 +970,7 @@ xfs_rtfree_extent(
xfs_mount_t *mp; /* file system mount structure */
xfs_fsblock_t sb; /* summary file block number */
struct xfs_buf *sumbp = NULL; /* summary file block buffer */
+ struct timespec64 atime;
mp = tp->t_mountp;
@@ -999,7 +1000,10 @@ xfs_rtfree_extent(
mp->m_sb.sb_rextents) {
if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM))
mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
- *(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0;
+
+ atime = inode_get_atime(VFS_I(mp->m_rbmip));
+ *((uint64_t *)&atime) = 0;
+ inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
}
return 0;
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 5e174685a77c..6264daaab37b 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -266,7 +266,8 @@ xfs_validate_sb_write(
return -EFSCORRUPTED;
}
- if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ if (!xfs_is_readonly(mp) &&
+ xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
xfs_alert(mp,
"Corruption detected in superblock read-only compatible features (0x%x)!",
(sbp->sb_features_ro_compat &
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index ad22656376d3..70e97ea6eee7 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -62,12 +62,12 @@ xfs_trans_ichgtime(
ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- /* If the mtime changes, then ctime must also change */
- ASSERT(flags & XFS_ICHGTIME_CHG);
+ tv = current_time(inode);
- tv = inode_set_ctime_current(inode);
if (flags & XFS_ICHGTIME_MOD)
- inode->i_mtime = tv;
+ inode_set_mtime_to_ts(inode, tv);
+ if (flags & XFS_ICHGTIME_CHG)
+ inode_set_ctime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_CREATE)
ip->i_crtime = tv;
}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 7d3aa14d81b5..4849efcaa33a 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -588,6 +588,8 @@ out_nofix:
out_teardown:
error = xchk_teardown(sc, error);
out_sc:
+ if (error != -ENOENT)
+ xchk_stats_merge(mp, sm, &run);
kfree(sc);
out:
trace_xchk_done(XFS_I(file_inode(file)), sm, error);
@@ -595,8 +597,6 @@ out:
sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
error = 0;
}
- if (error != -ENOENT)
- xchk_stats_merge(mp, sm, &run);
return error;
need_drain:
error = xchk_teardown(sc, 0);
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index aeb92624176b..cd91db4a5548 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -185,7 +185,10 @@ xchk_stats_merge_one(
{
struct xchk_scrub_stats *css;
- ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR);
+ if (sm->sm_type >= XFS_SCRUB_TYPE_NR) {
+ ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR);
+ return;
+ }
css = &cs->cs_stats[sm->sm_type];
spin_lock(&css->css_lock);
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index d98e8e77c684..090c3ead43fd 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -10,7 +10,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_format.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/scrub.h"
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 5db87b34fb6e..89c7a9f4f930 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -333,7 +333,6 @@ xfs_attr_inactive(
int error = 0;
mp = dp->i_mount;
- ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
xfs_ilock(dp, lock_mode);
if (!xfs_inode_has_attr_fork(dp))
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 2788a6f2edcd..36fe2abb16e6 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -547,7 +547,7 @@ xfs_attri_item_recover(
struct xfs_inode *ip;
struct xfs_da_args *args;
struct xfs_trans *tp;
- struct xfs_trans_res tres;
+ struct xfs_trans_res resv;
struct xfs_attri_log_format *attrp;
struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
int error;
@@ -618,8 +618,9 @@ xfs_attri_item_recover(
goto out;
}
- xfs_init_attr_trans(args, &tres, &total);
- error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp);
+ xfs_init_attr_trans(args, &resv, &total);
+ resv = xlog_recover_resv(&resv);
+ error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp);
if (error)
goto out;
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 7551c3ec4ea5..e736a0844c89 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -490,6 +490,7 @@ xfs_bui_item_recover(
struct list_head *capture_list)
{
struct xfs_bmap_intent fake = { };
+ struct xfs_trans_res resv;
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
@@ -515,7 +516,8 @@ xfs_bui_item_recover(
return error;
/* Allocate transaction and do the work. */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv,
XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
if (error)
goto err_rele;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index fcefab687285..40e0a1f1f753 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1644,7 +1644,7 @@ xfs_swap_extents(
uint64_t f;
int resblks = 0;
unsigned int flags = 0;
- struct timespec64 ctime;
+ struct timespec64 ctime, mtime;
/*
* Lock the inodes against other IO, page faults and truncate to
@@ -1758,10 +1758,11 @@ xfs_swap_extents(
* under it.
*/
ctime = inode_get_ctime(VFS_I(ip));
+ mtime = inode_get_mtime(VFS_I(ip));
if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) ||
(sbp->bs_ctime.tv_nsec != ctime.tv_nsec) ||
- (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
- (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+ (sbp->bs_mtime.tv_sec != mtime.tv_sec) ||
+ (sbp->bs_mtime.tv_nsec != mtime.tv_nsec)) {
error = -EBUSY;
goto out_trans_cancel;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c1ece4a08ff4..003e157241da 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1945,8 +1945,6 @@ void
xfs_free_buftarg(
struct xfs_buftarg *btp)
{
- struct block_device *bdev = btp->bt_bdev;
-
unregister_shrinker(&btp->bt_shrinker);
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
percpu_counter_destroy(&btp->bt_io_count);
@@ -1954,8 +1952,8 @@ xfs_free_buftarg(
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
/* the main block device is closed by kill_block_super */
- if (bdev != btp->bt_mount->m_super->s_bdev)
- blkdev_put(bdev, btp->bt_mount->m_super);
+ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
+ bdev_release(btp->bt_bdev_handle);
kmem_free(btp);
}
@@ -1990,16 +1988,15 @@ xfs_setsize_buftarg(
*/
STATIC int
xfs_setsize_buftarg_early(
- xfs_buftarg_t *btp,
- struct block_device *bdev)
+ xfs_buftarg_t *btp)
{
- return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
+ return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev));
}
struct xfs_buftarg *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct block_device *bdev)
+ struct bdev_handle *bdev_handle)
{
xfs_buftarg_t *btp;
const struct dax_holder_operations *ops = NULL;
@@ -2010,9 +2007,10 @@ xfs_alloc_buftarg(
btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
btp->bt_mount = mp;
- btp->bt_dev = bdev->bd_dev;
- btp->bt_bdev = bdev;
- btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
+ btp->bt_bdev_handle = bdev_handle;
+ btp->bt_dev = bdev_handle->bdev->bd_dev;
+ btp->bt_bdev = bdev_handle->bdev;
+ btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
/*
@@ -2022,7 +2020,7 @@ xfs_alloc_buftarg(
ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
DEFAULT_RATELIMIT_BURST);
- if (xfs_setsize_buftarg_early(btp, bdev))
+ if (xfs_setsize_buftarg_early(btp))
goto error_free;
if (list_lru_init(&btp->bt_lru))
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index df8f47953bb4..ada9d310b7d3 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -98,6 +98,7 @@ typedef unsigned int xfs_buf_flags_t;
*/
typedef struct xfs_buftarg {
dev_t bt_dev;
+ struct bdev_handle *bt_bdev_handle;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
u64 bt_dax_part_off;
@@ -364,7 +365,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
* Handling of buftargs.
*/
struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
- struct block_device *bdev);
+ struct bdev_handle *bdev_handle);
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index afc4c78b9eed..d5787991bb5b 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (C) 2010, 2023 Red Hat, Inc.
* All Rights Reserved.
*/
#include "xfs.h"
@@ -19,21 +19,147 @@
#include "xfs_log.h"
#include "xfs_ag.h"
-STATIC int
-xfs_trim_extents(
+/*
+ * Notes on an efficient, low latency fstrim algorithm
+ *
+ * We need to walk the filesystem free space and issue discards on the free
+ * space that meet the search criteria (size and location). We cannot issue
+ * discards on extents that might be in use, or are so recently in use they are
+ * still marked as busy. To serialise against extent state changes whilst we are
+ * gathering extents to trim, we must hold the AGF lock to lock out other
+ * allocations and extent free operations that might change extent state.
+ *
+ * However, we cannot just hold the AGF for the entire AG free space walk whilst
+ * we issue discards on each free space that is found. Storage devices can have
+ * extremely slow discard implementations (e.g. ceph RBD) and so walking a
+ * couple of million free extents and issuing synchronous discards on each
+ * extent can take a *long* time. Whilst we are doing this walk, nothing else
+ * can access the AGF, and we can stall transactions and hence the log whilst
+ * modifications wait for the AGF lock to be released. This can lead hung tasks
+ * kicking the hung task timer and rebooting the system. This is bad.
+ *
+ * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
+ * lock, gathers a range of inode cluster buffers that are allocated, drops the
+ * AGI lock and then reads all the inode cluster buffers and processes them. It
+ * loops doing this, using a cursor to keep track of where it is up to in the AG
+ * for each iteration to restart the INOBT lookup from.
+ *
+ * We can't do this exactly with free space - once we drop the AGF lock, the
+ * state of the free extent is out of our control and we cannot run a discard
+ * safely on it in this situation. Unless, of course, we've marked the free
+ * extent as busy and undergoing a discard operation whilst we held the AGF
+ * locked.
+ *
+ * This is exactly how online discard works - free extents are marked busy when
+ * they are freed, and once the extent free has been committed to the journal,
+ * the busy extent record is marked as "undergoing discard" and the discard is
+ * then issued on the free extent. Once the discard completes, the busy extent
+ * record is removed and the extent is able to be allocated again.
+ *
+ * In the context of fstrim, if we find a free extent we need to discard, we
+ * don't have to discard it immediately. All we need to do it record that free
+ * extent as being busy and under discard, and all the allocation routines will
+ * now avoid trying to allocate it. Hence if we mark the extent as busy under
+ * the AGF lock, we can safely discard it without holding the AGF lock because
+ * nothing will attempt to allocate that free space until the discard completes.
+ *
+ * This also allows us to issue discards asynchronously like we do with online
+ * discard, and so for fast devices fstrim will run much faster as we can have
+ * multiple discard operations in flight at once, as well as pipeline the free
+ * extent search so that it overlaps in flight discard IO.
+ */
+
+struct workqueue_struct *xfs_discard_wq;
+
+static void
+xfs_discard_endio_work(
+ struct work_struct *work)
+{
+ struct xfs_busy_extents *extents =
+ container_of(work, struct xfs_busy_extents, endio_work);
+
+ xfs_extent_busy_clear(extents->mount, &extents->extent_list, false);
+ kmem_free(extents->owner);
+}
+
+/*
+ * Queue up the actual completion to a thread to avoid IRQ-safe locking for
+ * pagb_lock.
+ */
+static void
+xfs_discard_endio(
+ struct bio *bio)
+{
+ struct xfs_busy_extents *extents = bio->bi_private;
+
+ INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
+ queue_work(xfs_discard_wq, &extents->endio_work);
+ bio_put(bio);
+}
+
+/*
+ * Walk the discard list and issue discards on all the busy extents in the
+ * list. We plug and chain the bios so that we only need a single completion
+ * call to clear all the busy extents once the discards are complete.
+ */
+int
+xfs_discard_extents(
+ struct xfs_mount *mp,
+ struct xfs_busy_extents *extents)
+{
+ struct xfs_extent_busy *busyp;
+ struct bio *bio = NULL;
+ struct blk_plug plug;
+ int error = 0;
+
+ blk_start_plug(&plug);
+ list_for_each_entry(busyp, &extents->extent_list, list) {
+ trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+ busyp->length);
+
+ error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+ XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+ XFS_FSB_TO_BB(mp, busyp->length),
+ GFP_NOFS, &bio);
+ if (error && error != -EOPNOTSUPP) {
+ xfs_info(mp,
+ "discard failed for extent [0x%llx,%u], error %d",
+ (unsigned long long)busyp->bno,
+ busyp->length,
+ error);
+ break;
+ }
+ }
+
+ if (bio) {
+ bio->bi_private = extents;
+ bio->bi_end_io = xfs_discard_endio;
+ submit_bio(bio);
+ } else {
+ xfs_discard_endio_work(&extents->endio_work);
+ }
+ blk_finish_plug(&plug);
+
+ return error;
+}
+
+
+static int
+xfs_trim_gather_extents(
struct xfs_perag *pag,
xfs_daddr_t start,
xfs_daddr_t end,
xfs_daddr_t minlen,
+ struct xfs_alloc_rec_incore *tcur,
+ struct xfs_busy_extents *extents,
uint64_t *blocks_trimmed)
{
struct xfs_mount *mp = pag->pag_mount;
- struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
struct xfs_btree_cur *cur;
struct xfs_buf *agbp;
- struct xfs_agf *agf;
int error;
int i;
+ int batch = 100;
/*
* Force out the log. This means any transactions that might have freed
@@ -45,20 +171,28 @@ xfs_trim_extents(
error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
if (error)
return error;
- agf = agbp->b_addr;
cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
/*
- * Look up the longest btree in the AGF and start with it.
+ * Look up the extent length requested in the AGF and start with it.
*/
- error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i);
+ if (tcur->ar_startblock == NULLAGBLOCK)
+ error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i);
+ else
+ error = xfs_alloc_lookup_le(cur, tcur->ar_startblock,
+ tcur->ar_blockcount, &i);
if (error)
goto out_del_cursor;
+ if (i == 0) {
+ /* nothing of that length left in the AG, we are done */
+ tcur->ar_blockcount = 0;
+ goto out_del_cursor;
+ }
/*
* Loop until we are done with all extents that are large
- * enough to be worth discarding.
+ * enough to be worth discarding or we hit batch limits.
*/
while (i) {
xfs_agblock_t fbno;
@@ -73,7 +207,16 @@ xfs_trim_extents(
error = -EFSCORRUPTED;
break;
}
- ASSERT(flen <= be32_to_cpu(agf->agf_longest));
+
+ if (--batch <= 0) {
+ /*
+ * Update the cursor to point at this extent so we
+ * restart the next batch from this extent.
+ */
+ tcur->ar_startblock = fbno;
+ tcur->ar_blockcount = flen;
+ break;
+ }
/*
* use daddr format for all range/len calculations as that is
@@ -88,6 +231,7 @@ xfs_trim_extents(
*/
if (dlen < minlen) {
trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
+ tcur->ar_blockcount = 0;
break;
}
@@ -110,29 +254,103 @@ xfs_trim_extents(
goto next_extent;
}
- trace_xfs_discard_extent(mp, pag->pag_agno, fbno, flen);
- error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS);
- if (error)
- break;
+ xfs_extent_busy_insert_discard(pag, fbno, flen,
+ &extents->extent_list);
*blocks_trimmed += flen;
-
next_extent:
error = xfs_btree_decrement(cur, 0, &i);
if (error)
break;
- if (fatal_signal_pending(current)) {
- error = -ERESTARTSYS;
- break;
- }
+ /*
+ * If there's no more records in the tree, we are done. Set the
+ * cursor block count to 0 to indicate to the caller that there
+ * is no more extents to search.
+ */
+ if (i == 0)
+ tcur->ar_blockcount = 0;
}
+ /*
+ * If there was an error, release all the gathered busy extents because
+ * we aren't going to issue a discard on them any more.
+ */
+ if (error)
+ xfs_extent_busy_clear(mp, &extents->extent_list, false);
out_del_cursor:
xfs_btree_del_cursor(cur, error);
xfs_buf_relse(agbp);
return error;
}
+static bool
+xfs_trim_should_stop(void)
+{
+ return fatal_signal_pending(current) || freezing(current);
+}
+
+/*
+ * Iterate the free list gathering extents and discarding them. We need a cursor
+ * for the repeated iteration of gather/discard loop, so use the longest extent
+ * we found in the last batch as the key to start the next.
+ */
+static int
+xfs_trim_extents(
+ struct xfs_perag *pag,
+ xfs_daddr_t start,
+ xfs_daddr_t end,
+ xfs_daddr_t minlen,
+ uint64_t *blocks_trimmed)
+{
+ struct xfs_alloc_rec_incore tcur = {
+ .ar_blockcount = pag->pagf_longest,
+ .ar_startblock = NULLAGBLOCK,
+ };
+ int error = 0;
+
+ do {
+ struct xfs_busy_extents *extents;
+
+ extents = kzalloc(sizeof(*extents), GFP_KERNEL);
+ if (!extents) {
+ error = -ENOMEM;
+ break;
+ }
+
+ extents->mount = pag->pag_mount;
+ extents->owner = extents;
+ INIT_LIST_HEAD(&extents->extent_list);
+
+ error = xfs_trim_gather_extents(pag, start, end, minlen,
+ &tcur, extents, blocks_trimmed);
+ if (error) {
+ kfree(extents);
+ break;
+ }
+
+ /*
+ * We hand the extent list to the discard function here so the
+ * discarded extents can be removed from the busy extent list.
+ * This allows the discards to run asynchronously with gathering
+ * the next round of extents to discard.
+ *
+ * However, we must ensure that we do not reference the extent
+ * list after this function call, as it may have been freed by
+ * the time control returns to us.
+ */
+ error = xfs_discard_extents(pag->pag_mount, extents);
+ if (error)
+ break;
+
+ if (xfs_trim_should_stop())
+ break;
+
+ } while (tcur.ar_blockcount != 0);
+
+ return error;
+
+}
+
/*
* trim a range of the filesystem.
*
@@ -195,12 +413,12 @@ xfs_ioc_trim(
for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) {
error = xfs_trim_extents(pag, start, end, minlen,
&blocks_trimmed);
- if (error) {
+ if (error)
last_error = error;
- if (error == -ERESTARTSYS) {
- xfs_perag_rele(pag);
- break;
- }
+
+ if (xfs_trim_should_stop()) {
+ xfs_perag_rele(pag);
+ break;
}
}
diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h
index de92d9cc958f..2b1a85223a56 100644
--- a/fs/xfs/xfs_discard.h
+++ b/fs/xfs/xfs_discard.h
@@ -3,8 +3,10 @@
#define XFS_DISCARD_H 1
struct fstrim_range;
-struct list_head;
+struct xfs_mount;
+struct xfs_busy_extents;
-extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+int xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy);
+int xfs_ioc_trim(struct xfs_mount *mp, struct fstrim_range __user *fstrim);
#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 1064c2342876..7cd09c3a82cb 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -146,6 +146,20 @@ xfs_nfs_get_inode(
return ERR_PTR(error);
}
+ /*
+ * Reload the incore unlinked list to avoid failure in inodegc.
+ * Use an unlocked check here because unrecovered unlinked inodes
+ * should be somewhat rare.
+ */
+ if (xfs_inode_unlinked_incomplete(ip)) {
+ error = xfs_inode_reload_unlinked(ip);
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ xfs_irele(ip);
+ return ERR_PTR(error);
+ }
+ }
+
if (VFS_I(ip)->i_generation != generation) {
xfs_irele(ip);
return ERR_PTR(-ESTALE);
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 7c2fdc71e42d..9ecfdcdc752f 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -19,13 +19,13 @@
#include "xfs_log.h"
#include "xfs_ag.h"
-void
-xfs_extent_busy_insert(
- struct xfs_trans *tp,
+static void
+xfs_extent_busy_insert_list(
struct xfs_perag *pag,
xfs_agblock_t bno,
xfs_extlen_t len,
- unsigned int flags)
+ unsigned int flags,
+ struct list_head *busy_list)
{
struct xfs_extent_busy *new;
struct xfs_extent_busy *busyp;
@@ -40,7 +40,7 @@ xfs_extent_busy_insert(
new->flags = flags;
/* trace before insert to be able to see failed inserts */
- trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len);
+ trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len);
spin_lock(&pag->pagb_lock);
rbp = &pag->pagb_tree.rb_node;
@@ -62,10 +62,33 @@ xfs_extent_busy_insert(
rb_link_node(&new->rb_node, parent, rbp);
rb_insert_color(&new->rb_node, &pag->pagb_tree);
- list_add(&new->list, &tp->t_busy);
+ /* always process discard lists in fifo order */
+ list_add_tail(&new->list, busy_list);
spin_unlock(&pag->pagb_lock);
}
+void
+xfs_extent_busy_insert(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ unsigned int flags)
+{
+ xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy);
+}
+
+void
+xfs_extent_busy_insert_discard(
+ struct xfs_perag *pag,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ struct list_head *busy_list)
+{
+ xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED,
+ busy_list);
+}
+
/*
* Search for a busy extent within the range of the extent we are about to
* allocate. You need to be holding the busy extent tree lock when calling
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index c37bf87e6781..0639aab336f3 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -16,9 +16,6 @@ struct xfs_alloc_arg;
/*
* Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
* have been freed but whose transactions aren't committed to disk yet.
- *
- * Note that we use the transaction ID to record the transaction, not the
- * transaction structure itself. See xfs_extent_busy_insert() for details.
*/
struct xfs_extent_busy {
struct rb_node rb_node; /* ag by-bno indexed search tree */
@@ -31,11 +28,32 @@ struct xfs_extent_busy {
#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */
};
+/*
+ * List used to track groups of related busy extents all the way through
+ * to discard completion.
+ */
+struct xfs_busy_extents {
+ struct xfs_mount *mount;
+ struct list_head extent_list;
+ struct work_struct endio_work;
+
+ /*
+ * Owner is the object containing the struct xfs_busy_extents to free
+ * once the busy extents have been processed. If only the
+ * xfs_busy_extents object needs freeing, then point this at itself.
+ */
+ void *owner;
+};
+
void
xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag,
xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
void
+xfs_extent_busy_insert_discard(struct xfs_perag *pag, xfs_agblock_t bno,
+ xfs_extlen_t len, struct list_head *busy_list);
+
+void
xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list,
bool do_discard);
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index f1a5ecf099aa..3fa8789820ad 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -660,6 +660,7 @@ xfs_efi_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
+ struct xfs_trans_res resv;
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
struct xfs_mount *mp = lip->li_log->l_mp;
struct xfs_efd_log_item *efdp;
@@ -683,7 +684,8 @@ xfs_efi_item_recover(
}
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp);
if (error)
return error;
efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 10403ba9b58f..736e5545f584 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -565,6 +565,19 @@ err:
}
#endif /* CONFIG_XFS_RT */
+static inline bool
+rmap_not_shareable(struct xfs_mount *mp, const struct xfs_rmap_irec *r)
+{
+ if (!xfs_has_reflink(mp))
+ return true;
+ if (XFS_RMAP_NON_INODE_OWNER(r->rm_owner))
+ return true;
+ if (r->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
+ XFS_RMAP_UNWRITTEN))
+ return true;
+ return false;
+}
+
/* Execute a getfsmap query against the regular data device. */
STATIC int
__xfs_getfsmap_datadev(
@@ -598,7 +611,6 @@ __xfs_getfsmap_datadev(
* low to the fsmap low key and max out the high key to the end
* of the AG.
*/
- info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb);
info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
if (error)
@@ -608,12 +620,9 @@ __xfs_getfsmap_datadev(
/* Adjust the low key if we are continuing from where we left off. */
if (info->low.rm_blockcount == 0) {
- /* empty */
- } else if (XFS_RMAP_NON_INODE_OWNER(info->low.rm_owner) ||
- (info->low.rm_flags & (XFS_RMAP_ATTR_FORK |
- XFS_RMAP_BMBT_BLOCK |
- XFS_RMAP_UNWRITTEN))) {
- info->low.rm_startblock += info->low.rm_blockcount;
+ /* No previous record from which to continue */
+ } else if (rmap_not_shareable(mp, &info->low)) {
+ /* Last record seen was an unshareable extent */
info->low.rm_owner = 0;
info->low.rm_offset = 0;
@@ -621,8 +630,10 @@ __xfs_getfsmap_datadev(
if (XFS_FSB_TO_DADDR(mp, start_fsb) >= eofs)
return 0;
} else {
+ /* Last record seen was a shareable file data extent */
info->low.rm_offset += info->low.rm_blockcount;
}
+ info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb);
info->high.rm_startblock = -1U;
info->high.rm_owner = ULLONG_MAX;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index e541f5c0bc25..3c210ac83713 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -113,7 +113,7 @@ xfs_inode_alloc(
INIT_LIST_HEAD(&ip->i_ioend_list);
spin_lock_init(&ip->i_ioend_lock);
ip->i_next_unlinked = NULLAGINO;
- ip->i_prev_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = 0;
return ip;
}
@@ -443,7 +443,7 @@ xfs_inodegc_queue_all(
int cpu;
bool ret = false;
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list)) {
mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
@@ -463,7 +463,7 @@ xfs_inodegc_wait_all(
int error = 0;
flush_workqueue(mp->m_inodegc_wq);
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
struct xfs_inodegc *gc;
gc = per_cpu_ptr(mp->m_inodegc, cpu);
@@ -1845,9 +1845,17 @@ xfs_inodegc_worker(
struct xfs_inodegc, work);
struct llist_node *node = llist_del_all(&gc->list);
struct xfs_inode *ip, *n;
+ struct xfs_mount *mp = gc->mp;
unsigned int nofs_flag;
- ASSERT(gc->cpu == smp_processor_id());
+ /*
+ * Clear the cpu mask bit and ensure that we have seen the latest
+ * update of the gc structure associated with this CPU. This matches
+ * with the release semantics used when setting the cpumask bit in
+ * xfs_inodegc_queue.
+ */
+ cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask);
+ smp_mb__after_atomic();
WRITE_ONCE(gc->items, 0);
@@ -1862,7 +1870,7 @@ xfs_inodegc_worker(
nofs_flag = memalloc_nofs_save();
ip = llist_entry(node, struct xfs_inode, i_gclist);
- trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
+ trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits));
WRITE_ONCE(gc->shrinker_hits, 0);
llist_for_each_entry_safe(ip, n, node, i_gclist) {
@@ -2057,6 +2065,7 @@ xfs_inodegc_queue(
struct xfs_inodegc *gc;
int items;
unsigned int shrinker_hits;
+ unsigned int cpu_nr;
unsigned long queue_delay = 1;
trace_xfs_inode_set_need_inactive(ip);
@@ -2064,18 +2073,28 @@ xfs_inodegc_queue(
ip->i_flags |= XFS_NEED_INACTIVE;
spin_unlock(&ip->i_flags_lock);
- gc = get_cpu_ptr(mp->m_inodegc);
+ cpu_nr = get_cpu();
+ gc = this_cpu_ptr(mp->m_inodegc);
llist_add(&ip->i_gclist, &gc->list);
items = READ_ONCE(gc->items);
WRITE_ONCE(gc->items, items + 1);
shrinker_hits = READ_ONCE(gc->shrinker_hits);
/*
+ * Ensure the list add is always seen by anyone who finds the cpumask
+ * bit set. This effectively gives the cpumask bit set operation
+ * release ordering semantics.
+ */
+ smp_mb__before_atomic();
+ if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
+ cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);
+
+ /*
* We queue the work while holding the current CPU so that the work
* is scheduled to run on this CPU.
*/
if (!xfs_is_inodegc_enabled(mp)) {
- put_cpu_ptr(gc);
+ put_cpu();
return;
}
@@ -2085,7 +2104,7 @@ xfs_inodegc_queue(
trace_xfs_inodegc_queue(mp, __return_address);
mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
queue_delay);
- put_cpu_ptr(gc);
+ put_cpu();
if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
trace_xfs_inodegc_throttle(mp, __return_address);
@@ -2094,47 +2113,6 @@ xfs_inodegc_queue(
}
/*
- * Fold the dead CPU inodegc queue into the current CPUs queue.
- */
-void
-xfs_inodegc_cpu_dead(
- struct xfs_mount *mp,
- unsigned int dead_cpu)
-{
- struct xfs_inodegc *dead_gc, *gc;
- struct llist_node *first, *last;
- unsigned int count = 0;
-
- dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
- cancel_delayed_work_sync(&dead_gc->work);
-
- if (llist_empty(&dead_gc->list))
- return;
-
- first = dead_gc->list.first;
- last = first;
- while (last->next) {
- last = last->next;
- count++;
- }
- dead_gc->list.first = NULL;
- dead_gc->items = 0;
-
- /* Add pending work to current CPU */
- gc = get_cpu_ptr(mp->m_inodegc);
- llist_add_batch(first, last, &gc->list);
- count += READ_ONCE(gc->items);
- WRITE_ONCE(gc->items, count);
-
- if (xfs_is_inodegc_enabled(mp)) {
- trace_xfs_inodegc_queue(mp, __return_address);
- mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
- 0);
- }
- put_cpu_ptr(gc);
-}
-
-/*
* We set the inode flag atomically with the radix tree tag. Once we get tag
* lookups on the radix tree, this inode flag can go away.
*
@@ -2195,7 +2173,7 @@ xfs_inodegc_shrinker_count(
if (!xfs_is_inodegc_enabled(mp))
return 0;
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list))
return XFS_INODEGC_SHRINKER_COUNT;
@@ -2220,7 +2198,7 @@ xfs_inodegc_shrinker_scan(
trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list)) {
unsigned int h = READ_ONCE(gc->shrinker_hits);
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 2fa6f2e09d07..905944dafbe5 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -79,7 +79,6 @@ void xfs_inodegc_push(struct xfs_mount *mp);
int xfs_inodegc_flush(struct xfs_mount *mp);
void xfs_inodegc_stop(struct xfs_mount *mp);
void xfs_inodegc_start(struct xfs_mount *mp);
-void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
int xfs_inodegc_register_shrinker(struct xfs_mount *mp);
#endif
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 360fe83a334f..36f5cf802c07 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -844,8 +844,8 @@ xfs_init_new_inode(
ASSERT(ip->i_nblocks == 0);
tv = inode_set_ctime_current(inode);
- inode->i_mtime = tv;
- inode->i_atime = tv;
+ inode_set_mtime_to_ts(inode, tv);
+ inode_set_atime_to_ts(inode, tv);
ip->i_extsize = 0;
ip->i_diflags = 0;
@@ -1642,8 +1642,11 @@ xfs_inode_needs_inactive(
if (VFS_I(ip)->i_mode == 0)
return false;
- /* If this is a read-only mount, don't do this (would generate I/O) */
- if (xfs_is_readonly(mp))
+ /*
+ * If this is a read-only mount, don't do this (would generate I/O)
+ * unless we're in log recovery and cleaning the iunlinked list.
+ */
+ if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
return false;
/* If the log isn't running, push inodes straight to reclaim. */
@@ -1703,8 +1706,11 @@ xfs_inactive(
mp = ip->i_mount;
ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
- /* If this is a read-only mount, don't do this (would generate I/O) */
- if (xfs_is_readonly(mp))
+ /*
+ * If this is a read-only mount, don't do this (would generate I/O)
+ * unless we're in log recovery and cleaning the iunlinked list.
+ */
+ if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
goto out;
/* Metadata inodes require explicit resource cleanup. */
@@ -1736,9 +1742,21 @@ xfs_inactive(
ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
truncate = 1;
- error = xfs_qm_dqattach(ip);
- if (error)
- goto out;
+ if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) {
+ /*
+ * If this inode is being inactivated during a quotacheck and
+ * has not yet been scanned by quotacheck, we /must/ remove
+ * the dquots from the inode before inactivation changes the
+ * block and inode counts. Most probably this is a result of
+ * reloading the incore iunlinked list to purge unrecovered
+ * unlinked inodes.
+ */
+ xfs_qm_dqdetach(ip);
+ } else {
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ goto out;
+ }
if (S_ISLNK(VFS_I(ip)->i_mode))
error = xfs_inactive_symlink(ip);
@@ -1822,12 +1840,17 @@ xfs_iunlink_lookup(
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+ if (!ip) {
+ /* Caller can handle inode not being in memory. */
+ rcu_read_unlock();
+ return NULL;
+ }
/*
- * Inode not in memory or in RCU freeing limbo should not happen.
- * Warn about this and let the caller handle the failure.
+ * Inode in RCU freeing limbo should not happen. Warn about this and
+ * let the caller handle the failure.
*/
- if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
+ if (WARN_ON_ONCE(!ip->i_ino)) {
rcu_read_unlock();
return NULL;
}
@@ -1836,7 +1859,10 @@ xfs_iunlink_lookup(
return ip;
}
-/* Update the prev pointer of the next agino. */
+/*
+ * Update the prev pointer of the next agino. Returns -ENOLINK if the inode
+ * is not in cache.
+ */
static int
xfs_iunlink_update_backref(
struct xfs_perag *pag,
@@ -1851,7 +1877,8 @@ xfs_iunlink_update_backref(
ip = xfs_iunlink_lookup(pag, next_agino);
if (!ip)
- return -EFSCORRUPTED;
+ return -ENOLINK;
+
ip->i_prev_unlinked = prev_agino;
return 0;
}
@@ -1895,6 +1922,64 @@ xfs_iunlink_update_bucket(
return 0;
}
+/*
+ * Load the inode @next_agino into the cache and set its prev_unlinked pointer
+ * to @prev_agino. Caller must hold the AGI to synchronize with other changes
+ * to the unlinked list.
+ */
+STATIC int
+xfs_iunlink_reload_next(
+ struct xfs_trans *tp,
+ struct xfs_buf *agibp,
+ xfs_agino_t prev_agino,
+ xfs_agino_t next_agino)
+{
+ struct xfs_perag *pag = agibp->b_pag;
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_inode *next_ip = NULL;
+ xfs_ino_t ino;
+ int error;
+
+ ASSERT(next_agino != NULLAGINO);
+
+#ifdef DEBUG
+ rcu_read_lock();
+ next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino);
+ ASSERT(next_ip == NULL);
+ rcu_read_unlock();
+#endif
+
+ xfs_info_ratelimited(mp,
+ "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.",
+ next_agino, pag->pag_agno);
+
+ /*
+ * Use an untrusted lookup just to be cautious in case the AGI has been
+ * corrupted and now points at a free inode. That shouldn't happen,
+ * but we'd rather shut down now since we're already running in a weird
+ * situation.
+ */
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
+ error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
+ if (error)
+ return error;
+
+ /* If this is not an unlinked inode, something is very wrong. */
+ if (VFS_I(next_ip)->i_nlink != 0) {
+ error = -EFSCORRUPTED;
+ goto rele;
+ }
+
+ next_ip->i_prev_unlinked = prev_agino;
+ trace_xfs_iunlink_reload_next(next_ip);
+rele:
+ ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
+ if (xfs_is_quotacheck_running(mp) && next_ip)
+ xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED);
+ xfs_irele(next_ip);
+ return error;
+}
+
static int
xfs_iunlink_insert_inode(
struct xfs_trans *tp,
@@ -1926,6 +2011,8 @@ xfs_iunlink_insert_inode(
* inode.
*/
error = xfs_iunlink_update_backref(pag, agino, next_agino);
+ if (error == -ENOLINK)
+ error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
if (error)
return error;
@@ -1941,6 +2028,7 @@ xfs_iunlink_insert_inode(
}
/* Point the head of the list to point to this inode. */
+ ip->i_prev_unlinked = NULLAGINO;
return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
}
@@ -2020,6 +2108,9 @@ xfs_iunlink_remove_inode(
*/
error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
ip->i_next_unlinked);
+ if (error == -ENOLINK)
+ error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
+ ip->i_next_unlinked);
if (error)
return error;
@@ -2040,7 +2131,7 @@ xfs_iunlink_remove_inode(
}
ip->i_next_unlinked = NULLAGINO;
- ip->i_prev_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = 0;
return error;
}
@@ -3529,3 +3620,117 @@ xfs_iunlock2_io_mmap(
if (ip1 != ip2)
inode_unlock(VFS_I(ip1));
}
+
+/*
+ * Reload the incore inode list for this inode. Caller should ensure that
+ * the link count cannot change, either by taking ILOCK_SHARED or otherwise
+ * preventing other threads from executing.
+ */
+int
+xfs_inode_reload_unlinked_bucket(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *agibp;
+ struct xfs_agi *agi;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ xfs_agino_t prev_agino, next_agino;
+ unsigned int bucket;
+ bool foundit = false;
+ int error;
+
+ /* Grab the first inode in the list */
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_ialloc_read_agi(pag, tp, &agibp);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ /*
+ * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the
+ * incore unlinked list pointers for this inode. Check once more to
+ * see if we raced with anyone else to reload the unlinked list.
+ */
+ if (!xfs_inode_unlinked_incomplete(ip)) {
+ foundit = true;
+ goto out_agibp;
+ }
+
+ bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
+ agi = agibp->b_addr;
+
+ trace_xfs_inode_reload_unlinked_bucket(ip);
+
+ xfs_info_ratelimited(mp,
+ "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.",
+ agino, agno);
+
+ prev_agino = NULLAGINO;
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+ while (next_agino != NULLAGINO) {
+ struct xfs_inode *next_ip = NULL;
+
+ /* Found this caller's inode, set its backlink. */
+ if (next_agino == agino) {
+ next_ip = ip;
+ next_ip->i_prev_unlinked = prev_agino;
+ foundit = true;
+ goto next_inode;
+ }
+
+ /* Try in-memory lookup first. */
+ next_ip = xfs_iunlink_lookup(pag, next_agino);
+ if (next_ip)
+ goto next_inode;
+
+ /* Inode not in memory, try reloading it. */
+ error = xfs_iunlink_reload_next(tp, agibp, prev_agino,
+ next_agino);
+ if (error)
+ break;
+
+ /* Grab the reloaded inode. */
+ next_ip = xfs_iunlink_lookup(pag, next_agino);
+ if (!next_ip) {
+ /* No incore inode at all? We reloaded it... */
+ ASSERT(next_ip != NULL);
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+next_inode:
+ prev_agino = next_agino;
+ next_agino = next_ip->i_next_unlinked;
+ }
+
+out_agibp:
+ xfs_trans_brelse(tp, agibp);
+ /* Should have found this inode somewhere in the iunlinked bucket. */
+ if (!error && !foundit)
+ error = -EFSCORRUPTED;
+ return error;
+}
+
+/* Decide if this inode is missing its unlinked list and reload it. */
+int
+xfs_inode_reload_unlinked(
+ struct xfs_inode *ip)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc_empty(ip->i_mount, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_inode_unlinked_incomplete(ip))
+ error = xfs_inode_reload_unlinked_bucket(tp, ip);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ xfs_trans_cancel(tp);
+
+ return error;
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 7547caf2f2ab..0c5bdb91152e 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -68,8 +68,21 @@ typedef struct xfs_inode {
uint64_t i_diflags2; /* XFS_DIFLAG2_... */
struct timespec64 i_crtime; /* time created */
- /* unlinked list pointers */
+ /*
+ * Unlinked list pointers. These point to the next and previous inodes
+ * in the AGI unlinked bucket list, respectively. These fields can
+ * only be updated with the AGI locked.
+ *
+ * i_next_unlinked caches di_next_unlinked.
+ */
xfs_agino_t i_next_unlinked;
+
+ /*
+ * If the inode is not on an unlinked list, this field is zero. If the
+ * inode is the first element in an unlinked list, this field is
+ * NULLAGINO. Otherwise, i_prev_unlinked points to the previous inode
+ * in the unlinked list.
+ */
xfs_agino_t i_prev_unlinked;
/* VFS inode */
@@ -81,6 +94,11 @@ typedef struct xfs_inode {
struct list_head i_ioend_list;
} xfs_inode_t;
+static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip)
+{
+ return ip->i_prev_unlinked != 0;
+}
+
static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip)
{
return ip->i_forkoff > 0;
@@ -326,6 +344,9 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
*/
#define XFS_INACTIVATING (1 << 13)
+/* Quotacheck is running but inode has not been added to quota counts. */
+#define XFS_IQUOTAUNCHECKED (1 << 14)
+
/* All inode state flags related to inode reclaim. */
#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \
XFS_IRECLAIM | \
@@ -340,7 +361,7 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
#define XFS_IRECLAIM_RESET_FLAGS \
(XFS_IRECLAIMABLE | XFS_IRECLAIM | \
XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \
- XFS_INACTIVATING)
+ XFS_INACTIVATING | XFS_IQUOTAUNCHECKED)
/*
* Flags for inode locking.
@@ -575,4 +596,13 @@ void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+static inline bool
+xfs_inode_unlinked_incomplete(
+ struct xfs_inode *ip)
+{
+ return VFS_I(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip);
+}
+int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip);
+int xfs_inode_reload_unlinked(struct xfs_inode *ip);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 127b2410eb20..17c51804f9c6 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -526,8 +526,8 @@ xfs_inode_to_log_dinode(
to->di_projid_hi = ip->i_projid >> 16;
memset(to->di_pad3, 0, sizeof(to->di_pad3));
- to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
- to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
+ to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode_get_atime(inode));
+ to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode_get_mtime(inode));
to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode));
to->di_nlink = inode->i_nlink;
to->di_gen = inode->i_generation;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 2ededd3f6b8c..fdfda4fba12b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -572,11 +572,11 @@ xfs_vn_getattr(
stat->uid = vfsuid_into_kuid(vfsuid);
stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
- stat->atime = inode->i_atime;
+ stat->atime = inode_get_atime(inode);
+ stat->mtime = inode_get_mtime(inode);
+ stat->ctime = inode_get_ctime(inode);
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
- fill_mg_cmtime(stat, request_mask, inode);
-
if (xfs_has_v3inodes(mp)) {
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
@@ -584,6 +584,11 @@ xfs_vn_getattr(
}
}
+ if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
+ stat->change_cookie = inode_query_iversion(inode);
+ stat->result_mask |= STATX_CHANGE_COOKIE;
+ }
+
/*
* Note: If you add another clause to set an attribute flag, please
* update attributes_mask below.
@@ -917,7 +922,7 @@ xfs_setattr_size(
if (newsize != oldsize &&
!(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
iattr->ia_ctime = iattr->ia_mtime =
- current_mgtime(inode);
+ current_time(inode);
iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
}
@@ -1062,9 +1067,9 @@ xfs_vn_update_time(
now = current_time(inode);
if (flags & S_MTIME)
- inode->i_mtime = now;
+ inode_set_mtime_to_ts(inode, now);
if (flags & S_ATIME)
- inode->i_atime = now;
+ inode_set_atime_to_ts(inode, now);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, log_flags);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c2093cb56092..14462614fcc8 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -80,6 +80,17 @@ xfs_bulkstat_one_int(
if (error)
goto out;
+ /* Reload the incore unlinked list to avoid failure in inodegc. */
+ if (xfs_inode_unlinked_incomplete(ip)) {
+ error = xfs_inode_reload_unlinked_bucket(tp, ip);
+ if (error) {
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ xfs_irele(ip);
+ return error;
+ }
+ }
+
ASSERT(ip != NULL);
ASSERT(ip->i_imap.im_blkno != 0);
inode = VFS_I(ip);
@@ -96,12 +107,12 @@ xfs_bulkstat_one_int(
buf->bs_size = ip->i_disk_size;
buf->bs_nlink = inode->i_nlink;
- buf->bs_atime = inode->i_atime.tv_sec;
- buf->bs_atime_nsec = inode->i_atime.tv_nsec;
- buf->bs_mtime = inode->i_mtime.tv_sec;
- buf->bs_mtime_nsec = inode->i_mtime.tv_nsec;
- buf->bs_ctime = inode_get_ctime(inode).tv_sec;
- buf->bs_ctime_nsec = inode_get_ctime(inode).tv_nsec;
+ buf->bs_atime = inode_get_atime_sec(inode);
+ buf->bs_atime_nsec = inode_get_atime_nsec(inode);
+ buf->bs_mtime = inode_get_mtime_sec(inode);
+ buf->bs_mtime_nsec = inode_get_mtime_nsec(inode);
+ buf->bs_ctime = inode_get_ctime_sec(inode);
+ buf->bs_ctime_nsec = inode_get_ctime_nsec(inode);
buf->bs_gen = inode->i_generation;
buf->bs_mode = inode->i_mode;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 79004d193e54..51c100c86177 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -715,15 +715,7 @@ xfs_log_mount(
* just worked.
*/
if (!xfs_has_norecovery(mp)) {
- /*
- * log recovery ignores readonly state and so we need to clear
- * mount-based read only state so it can write to disk.
- */
- bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY,
- &mp->m_opstate);
error = xlog_recover(log);
- if (readonly)
- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
if (error) {
xfs_warn(mp, "log mount/recovery failed: error %d",
error);
@@ -772,7 +764,6 @@ xfs_log_mount_finish(
struct xfs_mount *mp)
{
struct xlog *log = mp->m_log;
- bool readonly;
int error = 0;
if (xfs_has_norecovery(mp)) {
@@ -781,12 +772,6 @@ xfs_log_mount_finish(
}
/*
- * log recovery ignores readonly state and so we need to clear
- * mount-based read only state so it can write to disk.
- */
- readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
-
- /*
* During the second phase of log recovery, we need iget and
* iput to behave like they do for an active filesystem.
* xfs_fs_drop_inode needs to be able to prevent the deletion
@@ -835,8 +820,6 @@ xfs_log_mount_finish(
xfs_buftarg_drain(mp->m_ddev_targp);
clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
- if (readonly)
- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
/* Make sure the log is dead if we're returning failure. */
ASSERT(!error || xlog_is_shutdown(log));
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index eccbfb99e894..67a99d94701e 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -16,8 +16,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_trace.h"
-
-struct workqueue_struct *xfs_discard_wq;
+#include "xfs_discard.h"
/*
* Allocate a new ticket. Failing to get a new ticket makes it really hard to
@@ -103,7 +102,7 @@ xlog_cil_ctx_alloc(void)
ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
INIT_LIST_HEAD(&ctx->committing);
- INIT_LIST_HEAD(&ctx->busy_extents);
+ INIT_LIST_HEAD(&ctx->busy_extents.extent_list);
INIT_LIST_HEAD(&ctx->log_items);
INIT_LIST_HEAD(&ctx->lv_chain);
INIT_WORK(&ctx->push_work, xlog_cil_push_work);
@@ -124,7 +123,7 @@ xlog_cil_push_pcp_aggregate(
struct xlog_cil_pcp *cilpcp;
int cpu;
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &ctx->cil_pcpmask) {
cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
ctx->ticket->t_curr_res += cilpcp->space_reserved;
@@ -132,7 +131,7 @@ xlog_cil_push_pcp_aggregate(
if (!list_empty(&cilpcp->busy_extents)) {
list_splice_init(&cilpcp->busy_extents,
- &ctx->busy_extents);
+ &ctx->busy_extents.extent_list);
}
if (!list_empty(&cilpcp->log_items))
list_splice_init(&cilpcp->log_items, &ctx->log_items);
@@ -165,7 +164,13 @@ xlog_cil_insert_pcp_aggregate(
if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags))
return;
- for_each_online_cpu(cpu) {
+ /*
+ * We can race with other cpus setting cil_pcpmask. However, we've
+ * atomically cleared PCP_SPACE which forces other threads to add to
+ * the global space used count. cil_pcpmask is a superset of cilpcp
+ * structures that could have a nonzero space_used.
+ */
+ for_each_cpu(cpu, &ctx->cil_pcpmask) {
int old, prev;
cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
@@ -554,6 +559,7 @@ xlog_cil_insert_items(
int iovhdr_res = 0, split_res = 0, ctx_res = 0;
int space_used;
int order;
+ unsigned int cpu_nr;
struct xlog_cil_pcp *cilpcp;
ASSERT(tp);
@@ -577,7 +583,12 @@ xlog_cil_insert_items(
* can't be scheduled away between split sample/update operations that
* are done without outside locking to serialise them.
*/
- cilpcp = get_cpu_ptr(cil->xc_pcp);
+ cpu_nr = get_cpu();
+ cilpcp = this_cpu_ptr(cil->xc_pcp);
+
+ /* Tell the future push that there was work added by this CPU. */
+ if (!cpumask_test_cpu(cpu_nr, &ctx->cil_pcpmask))
+ cpumask_test_and_set_cpu(cpu_nr, &ctx->cil_pcpmask);
/*
* We need to take the CIL checkpoint unit reservation on the first
@@ -663,7 +674,7 @@ xlog_cil_insert_items(
continue;
list_add_tail(&lip->li_cil, &cilpcp->log_items);
}
- put_cpu_ptr(cilpcp);
+ put_cpu();
/*
* If we've overrun the reservation, dump the tx details before we move
@@ -696,76 +707,6 @@ xlog_cil_free_logvec(
}
}
-static void
-xlog_discard_endio_work(
- struct work_struct *work)
-{
- struct xfs_cil_ctx *ctx =
- container_of(work, struct xfs_cil_ctx, discard_endio_work);
- struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
-
- xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
- kmem_free(ctx);
-}
-
-/*
- * Queue up the actual completion to a thread to avoid IRQ-safe locking for
- * pagb_lock. Note that we need a unbounded workqueue, otherwise we might
- * get the execution delayed up to 30 seconds for weird reasons.
- */
-static void
-xlog_discard_endio(
- struct bio *bio)
-{
- struct xfs_cil_ctx *ctx = bio->bi_private;
-
- INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work);
- queue_work(xfs_discard_wq, &ctx->discard_endio_work);
- bio_put(bio);
-}
-
-static void
-xlog_discard_busy_extents(
- struct xfs_mount *mp,
- struct xfs_cil_ctx *ctx)
-{
- struct list_head *list = &ctx->busy_extents;
- struct xfs_extent_busy *busyp;
- struct bio *bio = NULL;
- struct blk_plug plug;
- int error = 0;
-
- ASSERT(xfs_has_discard(mp));
-
- blk_start_plug(&plug);
- list_for_each_entry(busyp, list, list) {
- trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
- busyp->length);
-
- error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
- XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
- XFS_FSB_TO_BB(mp, busyp->length),
- GFP_NOFS, &bio);
- if (error && error != -EOPNOTSUPP) {
- xfs_info(mp,
- "discard failed for extent [0x%llx,%u], error %d",
- (unsigned long long)busyp->bno,
- busyp->length,
- error);
- break;
- }
- }
-
- if (bio) {
- bio->bi_private = ctx;
- bio->bi_end_io = xlog_discard_endio;
- submit_bio(bio);
- } else {
- xlog_discard_endio_work(&ctx->discard_endio_work);
- }
- blk_finish_plug(&plug);
-}
-
/*
* Mark all items committed and clear busy extents. We free the log vector
* chains in a separate pass so that we unpin the log items as quickly as
@@ -795,8 +736,8 @@ xlog_cil_committed(
xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain,
ctx->start_lsn, abort);
- xfs_extent_busy_sort(&ctx->busy_extents);
- xfs_extent_busy_clear(mp, &ctx->busy_extents,
+ xfs_extent_busy_sort(&ctx->busy_extents.extent_list);
+ xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list,
xfs_has_discard(mp) && !abort);
spin_lock(&ctx->cil->xc_push_lock);
@@ -805,10 +746,14 @@ xlog_cil_committed(
xlog_cil_free_logvec(&ctx->lv_chain);
- if (!list_empty(&ctx->busy_extents))
- xlog_discard_busy_extents(mp, ctx);
- else
- kmem_free(ctx);
+ if (!list_empty(&ctx->busy_extents.extent_list)) {
+ ctx->busy_extents.mount = mp;
+ ctx->busy_extents.owner = ctx;
+ xfs_discard_extents(mp, &ctx->busy_extents);
+ return;
+ }
+
+ kmem_free(ctx);
}
void
@@ -1791,38 +1736,6 @@ out_shutdown:
}
/*
- * Move dead percpu state to the relevant CIL context structures.
- *
- * We have to lock the CIL context here to ensure that nothing is modifying
- * the percpu state, either addition or removal. Both of these are done under
- * the CIL context lock, so grabbing that exclusively here will ensure we can
- * safely drain the cilpcp for the CPU that is dying.
- */
-void
-xlog_cil_pcp_dead(
- struct xlog *log,
- unsigned int cpu)
-{
- struct xfs_cil *cil = log->l_cilp;
- struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
- struct xfs_cil_ctx *ctx;
-
- down_write(&cil->xc_ctx_lock);
- ctx = cil->xc_ctx;
- if (ctx->ticket)
- ctx->ticket->t_curr_res += cilpcp->space_reserved;
- cilpcp->space_reserved = 0;
-
- if (!list_empty(&cilpcp->log_items))
- list_splice_init(&cilpcp->log_items, &ctx->log_items);
- if (!list_empty(&cilpcp->busy_extents))
- list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents);
- atomic_add(cilpcp->space_used, &ctx->space_used);
- cilpcp->space_used = 0;
- up_write(&cil->xc_ctx_lock);
-}
-
-/*
* Perform initial CIL structure initialisation.
*/
int
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 1bd2963e8fbd..fa3ad1d7b31c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -6,6 +6,8 @@
#ifndef __XFS_LOG_PRIV_H__
#define __XFS_LOG_PRIV_H__
+#include "xfs_extent_busy.h" /* for struct xfs_busy_extents */
+
struct xfs_buf;
struct xlog;
struct xlog_ticket;
@@ -223,14 +225,19 @@ struct xfs_cil_ctx {
struct xlog_in_core *commit_iclog;
struct xlog_ticket *ticket; /* chkpt ticket */
atomic_t space_used; /* aggregate size of regions */
- struct list_head busy_extents; /* busy extents in chkpt */
+ struct xfs_busy_extents busy_extents;
struct list_head log_items; /* log items in chkpt */
struct list_head lv_chain; /* logvecs being pushed */
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
- struct work_struct discard_endio_work;
struct work_struct push_work;
atomic_t order_id;
+
+ /*
+ * CPUs that could have added items to the percpu CIL data. Access is
+ * coordinated with xc_ctx_lock.
+ */
+ struct cpumask cil_pcpmask;
};
/*
@@ -278,9 +285,6 @@ struct xfs_cil {
wait_queue_head_t xc_push_wait; /* background push throttle */
void __percpu *xc_pcp; /* percpu CIL structures */
-#ifdef CONFIG_HOTPLUG_CPU
- struct list_head xc_pcp_list;
-#endif
} ____cacheline_aligned_in_smp;
/* xc_flags bit values */
@@ -705,9 +709,4 @@ xlog_kvmalloc(
return p;
}
-/*
- * CIL CPU dead notifier
- */
-void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu);
-
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 82c81d20459d..13b94d2e605b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -329,7 +329,7 @@ xlog_find_verify_cycle(
* try a smaller size. We need to be able to read at least
* a log sector, or we're out of luck.
*/
- bufblks = 1 << ffs(nbblks);
+ bufblks = roundup_pow_of_two(nbblks);
while (bufblks > log->l_logBBsize)
bufblks >>= 1;
while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
@@ -1528,7 +1528,7 @@ xlog_write_log_records(
* a smaller size. We need to be able to write at least a
* log sector, or we're out of luck.
*/
- bufblks = 1 << ffs(blocks);
+ bufblks = roundup_pow_of_two(blocks);
while (bufblks > log->l_logBBsize)
bufblks >>= 1;
while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a25eece3be2b..d19cca099bc3 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -60,6 +60,7 @@ struct xfs_error_cfg {
* Per-cpu deferred inode inactivation GC lists.
*/
struct xfs_inodegc {
+ struct xfs_mount *mp;
struct llist_head list;
struct delayed_work work;
int error;
@@ -67,9 +68,7 @@ struct xfs_inodegc {
/* approximate count of inodes in the list */
unsigned int items;
unsigned int shrinker_hits;
-#if defined(DEBUG) || defined(XFS_WARN)
unsigned int cpu;
-#endif
};
/*
@@ -98,7 +97,6 @@ typedef struct xfs_mount {
xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
- struct list_head m_mount_list; /* global mount list */
void __percpu *m_inodegc; /* percpu inodegc structures */
/*
@@ -249,6 +247,9 @@ typedef struct xfs_mount {
unsigned int *m_errortag;
struct xfs_kobj m_errortag_kobj;
#endif
+
+ /* cpus that have inodes queued for inactivation */
+ struct cpumask m_inodegc_cpumask;
} xfs_mount_t;
#define M_IGEO(mp) (&(mp)->m_ino_geo)
@@ -404,6 +405,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_WARNED_SHRINK 8
/* Kernel has logged a warning about logged xattr updates being used. */
#define XFS_OPSTATE_WARNED_LARP 9
+/* Mount time quotacheck is running */
+#define XFS_OPSTATE_QUOTACHECK_RUNNING 10
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -426,6 +429,11 @@ __XFS_IS_OPSTATE(inode32, INODE32)
__XFS_IS_OPSTATE(readonly, READONLY)
__XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED)
__XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
+#ifdef CONFIG_XFS_QUOTA
+__XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING)
+#else
+# define xfs_is_quotacheck_running(mp) (false)
+#endif
static inline bool
xfs_should_warn(struct xfs_mount *mp, long nr)
@@ -443,7 +451,8 @@ xfs_should_warn(struct xfs_mount *mp, long nr)
{ (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \
{ (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \
{ (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \
- { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }
+ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \
+ { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }
/*
* Max and min values for mount-option defined I/O
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index 4a9bbd3fe120..a7daa522e00f 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -126,8 +126,8 @@ xfs_dax_notify_ddev_failure(
struct xfs_rmap_irec ri_low = { };
struct xfs_rmap_irec ri_high;
struct xfs_agf *agf;
- xfs_agblock_t agend;
struct xfs_perag *pag;
+ xfs_agblock_t range_agend;
pag = xfs_perag_get(mp, agno);
error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
@@ -148,10 +148,10 @@ xfs_dax_notify_ddev_failure(
ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);
agf = agf_bp->b_addr;
- agend = min(be32_to_cpu(agf->agf_length),
+ range_agend = min(be32_to_cpu(agf->agf_length) - 1,
ri_high.rm_startblock);
notify.startblock = ri_low.rm_startblock;
- notify.blockcount = agend - ri_low.rm_startblock;
+ notify.blockcount = range_agend + 1 - ri_low.rm_startblock;
error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
xfs_dax_failure_fn, &notify);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 6abcc34fafd8..086e78a6143a 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1160,6 +1160,19 @@ xfs_qm_dqusage_adjust(
if (error)
return error;
+ /*
+ * Reload the incore unlinked list to avoid failure in inodegc.
+ * Use an unlocked check here because unrecovered unlinked inodes
+ * should be somewhat rare.
+ */
+ if (xfs_inode_unlinked_incomplete(ip)) {
+ error = xfs_inode_reload_unlinked(ip);
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ goto error0;
+ }
+ }
+
ASSERT(ip->i_delayed_blks == 0);
if (XFS_IS_REALTIME_INODE(ip)) {
@@ -1173,6 +1186,7 @@ xfs_qm_dqusage_adjust(
}
nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks;
+ xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED);
/*
* Add the (disk blocks and inode) resources occupied by this
@@ -1319,8 +1333,10 @@ xfs_qm_quotacheck(
flags |= XFS_PQUOTA_CHKD;
}
+ xfs_set_quotacheck_running(mp);
error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true,
NULL);
+ xfs_clear_quotacheck_running(mp);
/*
* On error, the inode walk may have partially populated the dquot
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index edd8587658d5..2d4444d61e98 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -477,6 +477,7 @@ xfs_cui_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
+ struct xfs_trans_res resv;
struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
@@ -514,8 +515,9 @@ xfs_cui_item_recover(
* doesn't fit. We need to reserve enough blocks to handle a
* full btree split on either end of the refcount range.
*/
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
- mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv, mp->m_refc_maxlevels * 2, 0,
+ XFS_TRANS_RESERVE, &tp);
if (error)
return error;
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 520c7ebdfed8..0e0e747028da 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -507,6 +507,7 @@ xfs_rui_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
+ struct xfs_trans_res resv;
struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
struct xfs_rud_log_item *rudp;
struct xfs_trans *tp;
@@ -530,8 +531,9 @@ xfs_rui_item_recover(
}
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
- mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp);
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv, mp->m_rmap_maxlevels, 0,
+ XFS_TRANS_RESERVE, &tp);
if (error)
return error;
rudp = xfs_trans_get_rud(tp, ruip);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 16534e9873f6..2e1a4e5cd03d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1420,25 +1420,26 @@ xfs_rtunmount_inodes(
*/
int /* error */
xfs_rtpick_extent(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_extlen_t len, /* allocation length (rtextents) */
- xfs_rtblock_t *pick) /* result rt extent */
-{
- xfs_rtblock_t b; /* result block */
- int log2; /* log of sequence number */
- uint64_t resid; /* residual after log removed */
- uint64_t seq; /* sequence number of file creation */
- uint64_t *seqp; /* pointer to seqno in inode */
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_extlen_t len, /* allocation length (rtextents) */
+ xfs_rtblock_t *pick) /* result rt extent */
+ {
+ xfs_rtblock_t b; /* result block */
+ int log2; /* log of sequence number */
+ uint64_t resid; /* residual after log removed */
+ uint64_t seq; /* sequence number of file creation */
+ struct timespec64 ts; /* temporary timespec64 storage */
ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
- seqp = (uint64_t *)&VFS_I(mp->m_rbmip)->i_atime;
if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) {
mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
- *seqp = 0;
+ seq = 0;
+ } else {
+ ts = inode_get_atime(VFS_I(mp->m_rbmip));
+ seq = (uint64_t)ts.tv_sec;
}
- seq = *seqp;
if ((log2 = xfs_highbit64(seq)) == -1)
b = 0;
else {
@@ -1450,7 +1451,8 @@ xfs_rtpick_extent(
if (b + len > mp->m_sb.sb_rextents)
b = mp->m_sb.sb_rextents - len;
}
- *seqp = seq + 1;
+ ts.tv_sec = (time64_t)seq + 1;
+ inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
*pick = b;
return 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 1f77014c6e1a..f0ae07828153 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -56,28 +56,6 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */
static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
#endif
-#ifdef CONFIG_HOTPLUG_CPU
-static LIST_HEAD(xfs_mount_list);
-static DEFINE_SPINLOCK(xfs_mount_list_lock);
-
-static inline void xfs_mount_list_add(struct xfs_mount *mp)
-{
- spin_lock(&xfs_mount_list_lock);
- list_add(&mp->m_mount_list, &xfs_mount_list);
- spin_unlock(&xfs_mount_list_lock);
-}
-
-static inline void xfs_mount_list_del(struct xfs_mount *mp)
-{
- spin_lock(&xfs_mount_list_lock);
- list_del(&mp->m_mount_list);
- spin_unlock(&xfs_mount_list_lock);
-}
-#else /* !CONFIG_HOTPLUG_CPU */
-static inline void xfs_mount_list_add(struct xfs_mount *mp) {}
-static inline void xfs_mount_list_del(struct xfs_mount *mp) {}
-#endif
-
enum xfs_dax_mode {
XFS_DAX_INODE = 0,
XFS_DAX_ALWAYS = 1,
@@ -383,14 +361,15 @@ STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
const char *name,
- struct block_device **bdevp)
+ struct bdev_handle **handlep)
{
int error = 0;
- *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
- mp->m_super, &fs_holder_ops);
- if (IS_ERR(*bdevp)) {
- error = PTR_ERR(*bdevp);
+ *handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ mp->m_super, &fs_holder_ops);
+ if (IS_ERR(*handlep)) {
+ error = PTR_ERR(*handlep);
+ *handlep = NULL;
xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
}
@@ -455,7 +434,7 @@ xfs_open_devices(
{
struct super_block *sb = mp->m_super;
struct block_device *ddev = sb->s_bdev;
- struct block_device *logdev = NULL, *rtdev = NULL;
+ struct bdev_handle *logdev_handle = NULL, *rtdev_handle = NULL;
int error;
/*
@@ -468,17 +447,19 @@ xfs_open_devices(
* Open real time and log devices - order is important.
*/
if (mp->m_logname) {
- error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
+ error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle);
if (error)
goto out_relock;
}
if (mp->m_rtname) {
- error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
+ error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle);
if (error)
goto out_close_logdev;
- if (rtdev == ddev || rtdev == logdev) {
+ if (rtdev_handle->bdev == ddev ||
+ (logdev_handle &&
+ rtdev_handle->bdev == logdev_handle->bdev)) {
xfs_warn(mp,
"Cannot mount filesystem with identical rtdev and ddev/logdev.");
error = -EINVAL;
@@ -490,22 +471,25 @@ xfs_open_devices(
* Setup xfs_mount buffer target pointers
*/
error = -ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
- if (rtdev) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
+ if (rtdev_handle) {
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
- if (logdev && logdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
+ if (logdev_handle && logdev_handle->bdev != ddev) {
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
mp->m_logdev_targp = mp->m_ddev_targp;
+ /* Handle won't be used, drop it */
+ if (logdev_handle)
+ bdev_release(logdev_handle);
}
error = 0;
@@ -519,11 +503,11 @@ out_relock:
out_free_ddev_targ:
xfs_free_buftarg(mp->m_ddev_targp);
out_close_rtdev:
- if (rtdev)
- blkdev_put(rtdev, sb);
+ if (rtdev_handle)
+ bdev_release(rtdev_handle);
out_close_logdev:
- if (logdev && logdev != ddev)
- blkdev_put(logdev, sb);
+ if (logdev_handle)
+ bdev_release(logdev_handle);
goto out_relock;
}
@@ -1135,9 +1119,8 @@ xfs_inodegc_init_percpu(
for_each_possible_cpu(cpu) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
-#if defined(DEBUG) || defined(XFS_WARN)
gc->cpu = cpu;
-#endif
+ gc->mp = mp;
init_llist_head(&gc->list);
gc->items = 0;
gc->error = 0;
@@ -1168,7 +1151,6 @@ xfs_fs_put_super(
xfs_freesb(mp);
xchk_mount_stats_free(mp);
free_percpu(mp->m_stats.xs_stats);
- xfs_mount_list_del(mp);
xfs_inodegc_free_percpu(mp);
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
@@ -1577,13 +1559,6 @@ xfs_fs_fill_super(
if (error)
goto out_destroy_counters;
- /*
- * All percpu data structures requiring cleanup when a cpu goes offline
- * must be allocated before adding this @mp to the cpu-dead handler's
- * mount list.
- */
- xfs_mount_list_add(mp);
-
/* Allocate stats memory before we do operations that might use it */
mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
if (!mp->m_stats.xs_stats) {
@@ -1781,7 +1756,6 @@ xfs_fs_fill_super(
out_free_stats:
free_percpu(mp->m_stats.xs_stats);
out_destroy_inodegc:
- xfs_mount_list_del(mp);
xfs_inodegc_free_percpu(mp);
out_destroy_counters:
xfs_destroy_percpu_counters(mp);
@@ -2065,7 +2039,7 @@ static struct file_system_type xfs_fs_type = {
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
.kill_sb = xfs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("xfs");
@@ -2326,49 +2300,6 @@ xfs_destroy_workqueues(void)
destroy_workqueue(xfs_alloc_wq);
}
-#ifdef CONFIG_HOTPLUG_CPU
-static int
-xfs_cpu_dead(
- unsigned int cpu)
-{
- struct xfs_mount *mp, *n;
-
- spin_lock(&xfs_mount_list_lock);
- list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) {
- spin_unlock(&xfs_mount_list_lock);
- xfs_inodegc_cpu_dead(mp, cpu);
- xlog_cil_pcp_dead(mp->m_log, cpu);
- spin_lock(&xfs_mount_list_lock);
- }
- spin_unlock(&xfs_mount_list_lock);
- return 0;
-}
-
-static int __init
-xfs_cpu_hotplug_init(void)
-{
- int error;
-
- error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL,
- xfs_cpu_dead);
- if (error < 0)
- xfs_alert(NULL,
-"Failed to initialise CPU hotplug, error %d. XFS is non-functional.",
- error);
- return error;
-}
-
-static void
-xfs_cpu_hotplug_destroy(void)
-{
- cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD);
-}
-
-#else /* !CONFIG_HOTPLUG_CPU */
-static inline int xfs_cpu_hotplug_init(void) { return 0; }
-static inline void xfs_cpu_hotplug_destroy(void) {}
-#endif
-
STATIC int __init
init_xfs_fs(void)
{
@@ -2385,13 +2316,9 @@ init_xfs_fs(void)
xfs_dir_startup();
- error = xfs_cpu_hotplug_init();
- if (error)
- goto out;
-
error = xfs_init_caches();
if (error)
- goto out_destroy_hp;
+ goto out;
error = xfs_init_workqueues();
if (error)
@@ -2475,8 +2402,6 @@ init_xfs_fs(void)
xfs_destroy_workqueues();
out_destroy_caches:
xfs_destroy_caches();
- out_destroy_hp:
- xfs_cpu_hotplug_destroy();
out:
return error;
}
@@ -2500,7 +2425,6 @@ exit_xfs_fs(void)
xfs_destroy_workqueues();
xfs_destroy_caches();
xfs_uuid_table_free();
- xfs_cpu_hotplug_destroy();
}
module_init(init_xfs_fs);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 902c7f67a117..3926cf7f2a6e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3824,6 +3824,51 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
__entry->new_ptr)
);
+TRACE_EVENT(xfs_iunlink_reload_next,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->prev_agino = ip->i_prev_unlinked;
+ __entry->next_agino = ip->i_next_unlinked;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xfs_inode_reload_unlinked_bucket,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ ),
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x bucket %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS)
+);
+
DECLARE_EVENT_CLASS(xfs_ag_inode_class,
TP_PROTO(struct xfs_inode *ip),
TP_ARGS(ip),
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 43e5c219aaed..987843f84d03 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -46,6 +46,17 @@ xfs_attr_grab_log_assist(
if (xfs_sb_version_haslogxattrs(&mp->m_sb))
return 0;
+ /*
+ * Check if the filesystem featureset is new enough to set this log
+ * incompat feature bit. Strictly speaking, the minimum requirement is
+ * a V5 filesystem for the superblock field, but we'll require rmap
+ * or reflink to avoid having to deal with really old kernels.
+ */
+ if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) {
+ error = -EOPNOTSUPP;
+ goto drop_incompat;
+ }
+
/* Enable log-assisted xattrs. */
error = xfs_add_incompat_log_feature(mp,
XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
@@ -175,7 +186,7 @@ static const struct xattr_handler xfs_xattr_security_handler = {
.set = xfs_xattr_set,
};
-const struct xattr_handler *xfs_xattr_handlers[] = {
+const struct xattr_handler * const xfs_xattr_handlers[] = {
&xfs_xattr_user_handler,
&xfs_xattr_trusted_handler,
&xfs_xattr_security_handler,
diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h
index 2b09133b1b9b..cec766cad26c 100644
--- a/fs/xfs/xfs_xattr.h
+++ b/fs/xfs/xfs_xattr.h
@@ -8,6 +8,6 @@
int xfs_attr_change(struct xfs_da_args *args);
-extern const struct xattr_handler *xfs_xattr_handlers[];
+extern const struct xattr_handler * const xfs_xattr_handlers[];
#endif /* __XFS_XATTR_H__ */
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 9d1a9808fbbb..e6a75401677d 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -658,8 +658,8 @@ static struct inode *zonefs_get_file_inode(struct inode *dir,
inode->i_ino = ino;
inode->i_mode = z->z_mode;
- inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
- inode_get_ctime(dir));
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, inode_get_ctime(dir))));
inode->i_uid = z->z_uid;
inode->i_gid = z->z_gid;
inode->i_size = z->z_wpoffset;
@@ -695,8 +695,8 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb,
inode->i_ino = ino;
inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555);
inode->i_size = sbi->s_zgroup[ztype].g_nr_zones;
- inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
- inode_get_ctime(root));
+ inode_set_mtime_to_ts(inode,
+ inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, inode_get_ctime(root))));
inode->i_private = &sbi->s_zgroup[ztype];
set_nlink(inode, 2);
@@ -1319,7 +1319,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
inode->i_ino = bdev_nr_zones(sb->s_bdev);
inode->i_mode = S_IFDIR | 0555;
- inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ simple_inode_init_ts(inode);
inode->i_op = &zonefs_dir_inode_operations;
inode->i_fop = &zonefs_dir_operations;
inode->i_size = 2;