Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/cache.c | 4
-rw-r--r--  fs/9p/fid.c | 75
-rw-r--r--  fs/9p/fid.h | 6
-rw-r--r--  fs/9p/v9fs.c | 2
-rw-r--r--  fs/9p/v9fs.h | 10
-rw-r--r--  fs/9p/vfs_addr.c | 53
-rw-r--r--  fs/9p/vfs_dentry.c | 4
-rw-r--r--  fs/9p/vfs_dir.c | 2
-rw-r--r--  fs/9p/vfs_file.c | 9
-rw-r--r--  fs/9p/vfs_inode.c | 102
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 79
-rw-r--r--  fs/9p/vfs_super.c | 8
-rw-r--r--  fs/9p/xattr.c | 8
-rw-r--r--  fs/Kconfig | 24
-rw-r--r--  fs/Kconfig.binfmt | 41
-rw-r--r--  fs/Makefile | 3
-rw-r--r--  fs/adfs/inode.c | 10
-rw-r--r--  fs/affs/file.c | 27
-rw-r--r--  fs/affs/super.c | 2
-rw-r--r--  fs/affs/symlink.c | 5
-rw-r--r--  fs/afs/callback.c | 2
-rw-r--r--  fs/afs/cell.c | 61
-rw-r--r--  fs/afs/cmservice.c | 4
-rw-r--r--  fs/afs/dir.c | 67
-rw-r--r--  fs/afs/dir_edit.c | 10
-rw-r--r--  fs/afs/dir_silly.c | 4
-rw-r--r--  fs/afs/dynroot.c | 2
-rw-r--r--  fs/afs/file.c | 40
-rw-r--r--  fs/afs/flock.c | 2
-rw-r--r--  fs/afs/fs_operation.c | 6
-rw-r--r--  fs/afs/fsclient.c | 2
-rw-r--r--  fs/afs/inode.c | 55
-rw-r--r--  fs/afs/internal.h | 46
-rw-r--r--  fs/afs/misc.c | 6
-rw-r--r--  fs/afs/mntpt.c | 6
-rw-r--r--  fs/afs/proc.c | 6
-rw-r--r--  fs/afs/rotate.c | 4
-rw-r--r--  fs/afs/rxrpc.c | 46
-rw-r--r--  fs/afs/security.c | 3
-rw-r--r--  fs/afs/server.c | 46
-rw-r--r--  fs/afs/super.c | 6
-rw-r--r--  fs/afs/vl_list.c | 19
-rw-r--r--  fs/afs/volume.c | 24
-rw-r--r--  fs/afs/write.c | 31
-rw-r--r--  fs/afs/yfsclient.c | 3
-rw-r--r--  fs/aio.c | 38
-rw-r--r--  fs/anon_inodes.c | 2
-rw-r--r--  fs/attr.c | 80
-rw-r--r--  fs/autofs/autofs_i.h | 7
-rw-r--r--  fs/autofs/expire.c | 2
-rw-r--r--  fs/autofs/inode.c | 1
-rw-r--r--  fs/autofs/root.c | 108
-rw-r--r--  fs/befs/linuxvfs.c | 29
-rw-r--r--  fs/bfs/file.c | 11
-rw-r--r--  fs/binfmt_aout.c | 342
-rw-r--r--  fs/binfmt_elf.c | 4
-rw-r--r--  fs/binfmt_flat.c | 239
-rw-r--r--  fs/btrfs/Makefile | 2
-rw-r--r--  fs/btrfs/acl.c | 39
-rw-r--r--  fs/btrfs/async-thread.c | 122
-rw-r--r--  fs/btrfs/async-thread.h | 8
-rw-r--r--  fs/btrfs/backref.c | 243
-rw-r--r--  fs/btrfs/backref.h | 23
-rw-r--r--  fs/btrfs/block-group.c | 496
-rw-r--r--  fs/btrfs/block-group.h | 53
-rw-r--r--  fs/btrfs/block-rsv.c | 24
-rw-r--r--  fs/btrfs/block-rsv.h | 24
-rw-r--r--  fs/btrfs/btrfs_inode.h | 80
-rw-r--r--  fs/btrfs/check-integrity.c | 176
-rw-r--r--  fs/btrfs/check-integrity.h | 6
-rw-r--r--  fs/btrfs/compression.c | 461
-rw-r--r--  fs/btrfs/compression.h | 24
-rw-r--r--  fs/btrfs/ctree.c | 148
-rw-r--r--  fs/btrfs/ctree.h | 576
-rw-r--r--  fs/btrfs/delalloc-space.c | 28
-rw-r--r--  fs/btrfs/delalloc-space.h | 3
-rw-r--r--  fs/btrfs/delayed-inode.c | 649
-rw-r--r--  fs/btrfs/delayed-inode.h | 45
-rw-r--r--  fs/btrfs/delayed-ref.c | 8
-rw-r--r--  fs/btrfs/delayed-ref.h | 1
-rw-r--r--  fs/btrfs/dev-replace.c | 83
-rw-r--r--  fs/btrfs/dev-replace.h | 4
-rw-r--r--  fs/btrfs/dir-item.c | 31
-rw-r--r--  fs/btrfs/disk-io.c | 838
-rw-r--r--  fs/btrfs/disk-io.h | 42
-rw-r--r--  fs/btrfs/extent-io-tree.c | 1673
-rw-r--r--  fs/btrfs/extent-io-tree.h | 126
-rw-r--r--  fs/btrfs/extent-tree.c | 315
-rw-r--r--  fs/btrfs/extent_io.c | 3839
-rw-r--r--  fs/btrfs/extent_io.h | 77
-rw-r--r--  fs/btrfs/extent_map.c | 347
-rw-r--r--  fs/btrfs/extent_map.h | 8
-rw-r--r--  fs/btrfs/file-item.c | 38
-rw-r--r--  fs/btrfs/file.c | 1229
-rw-r--r--  fs/btrfs/free-space-cache.c | 129
-rw-r--r--  fs/btrfs/free-space-cache.h | 1
-rw-r--r--  fs/btrfs/free-space-tree.c | 10
-rw-r--r--  fs/btrfs/inode.c | 3189
-rw-r--r--  fs/btrfs/ioctl.c | 479
-rw-r--r--  fs/btrfs/locking.c | 119
-rw-r--r--  fs/btrfs/locking.h | 15
-rw-r--r--  fs/btrfs/lzo.c | 28
-rw-r--r--  fs/btrfs/misc.h | 35
-rw-r--r--  fs/btrfs/ordered-data.c | 90
-rw-r--r--  fs/btrfs/ordered-data.h | 18
-rw-r--r--  fs/btrfs/props.c | 104
-rw-r--r--  fs/btrfs/props.h | 8
-rw-r--r--  fs/btrfs/qgroup.c | 103
-rw-r--r--  fs/btrfs/qgroup.h | 15
-rw-r--r--  fs/btrfs/raid56.c | 1364
-rw-r--r--  fs/btrfs/raid56.h | 169
-rw-r--r--  fs/btrfs/reflink.c | 64
-rw-r--r--  fs/btrfs/relocation.c | 81
-rw-r--r--  fs/btrfs/root-tree.c | 22
-rw-r--r--  fs/btrfs/scrub.c | 2390
-rw-r--r--  fs/btrfs/send.c | 1486
-rw-r--r--  fs/btrfs/send.h | 180
-rw-r--r--  fs/btrfs/space-info.c | 211
-rw-r--r--  fs/btrfs/space-info.h | 23
-rw-r--r--  fs/btrfs/struct-funcs.c | 11
-rw-r--r--  fs/btrfs/subpage.c | 59
-rw-r--r--  fs/btrfs/subpage.h | 2
-rw-r--r--  fs/btrfs/super.c | 192
-rw-r--r--  fs/btrfs/sysfs.c | 396
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 3
-rw-r--r--  fs/btrfs/tests/extent-buffer-tests.c | 3
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c | 7
-rw-r--r--  fs/btrfs/tests/free-space-tests.c | 22
-rw-r--r--  fs/btrfs/tests/inode-tests.c | 10
-rw-r--r--  fs/btrfs/transaction.c | 188
-rw-r--r--  fs/btrfs/tree-checker.c | 80
-rw-r--r--  fs/btrfs/tree-checker.h | 1
-rw-r--r--  fs/btrfs/tree-log.c | 1639
-rw-r--r--  fs/btrfs/tree-log.h | 11
-rw-r--r--  fs/btrfs/verity.c | 3
-rw-r--r--  fs/btrfs/volumes.c | 784
-rw-r--r--  fs/btrfs/volumes.h | 145
-rw-r--r--  fs/btrfs/xattr.c | 54
-rw-r--r--  fs/btrfs/zlib.c | 42
-rw-r--r--  fs/btrfs/zoned.c | 662
-rw-r--r--  fs/btrfs/zoned.h | 50
-rw-r--r--  fs/btrfs/zstd.c | 47
-rw-r--r--  fs/buffer.c | 589
-rw-r--r--  fs/cachefiles/Kconfig | 12
-rw-r--r--  fs/cachefiles/Makefile | 1
-rw-r--r--  fs/cachefiles/daemon.c | 117
-rw-r--r--  fs/cachefiles/interface.c | 2
-rw-r--r--  fs/cachefiles/internal.h | 79
-rw-r--r--  fs/cachefiles/io.c | 76
-rw-r--r--  fs/cachefiles/namei.c | 49
-rw-r--r--  fs/cachefiles/ondemand.c | 514
-rw-r--r--  fs/cachefiles/xattr.c | 2
-rw-r--r--  fs/ceph/addr.c | 169
-rw-r--r--  fs/ceph/cache.c | 4
-rw-r--r--  fs/ceph/cache.h | 2
-rw-r--r--  fs/ceph/caps.c | 225
-rw-r--r--  fs/ceph/dir.c | 79
-rw-r--r--  fs/ceph/file.c | 150
-rw-r--r--  fs/ceph/inode.c | 56
-rw-r--r--  fs/ceph/mds_client.c | 296
-rw-r--r--  fs/ceph/mds_client.h | 15
-rw-r--r--  fs/ceph/mdsmap.c | 22
-rw-r--r--  fs/ceph/quota.c | 19
-rw-r--r--  fs/ceph/snap.c | 8
-rw-r--r--  fs/ceph/super.c | 22
-rw-r--r--  fs/ceph/super.h | 70
-rw-r--r--  fs/ceph/xattr.c | 36
-rw-r--r--  fs/cifs/Makefile | 8
-rw-r--r--  fs/cifs/cached_dir.c | 388
-rw-r--r--  fs/cifs/cached_dir.h | 64
-rw-r--r--  fs/cifs/cifs_debug.c | 97
-rw-r--r--  fs/cifs/cifs_swn.c | 4
-rw-r--r--  fs/cifs/cifsacl.c | 2
-rw-r--r--  fs/cifs/cifsencrypt.c | 17
-rw-r--r--  fs/cifs/cifsfs.c | 106
-rw-r--r--  fs/cifs/cifsfs.h | 7
-rw-r--r--  fs/cifs/cifsglob.h | 352
-rw-r--r--  fs/cifs/cifsproto.h | 29
-rw-r--r--  fs/cifs/cifsroot.c | 2
-rw-r--r--  fs/cifs/cifssmb.c | 480
-rw-r--r--  fs/cifs/connect.c | 537
-rw-r--r--  fs/cifs/dfs_cache.c | 123
-rw-r--r--  fs/cifs/dir.c | 8
-rw-r--r--  fs/cifs/file.c | 378
-rw-r--r--  fs/cifs/fs_context.c | 42
-rw-r--r--  fs/cifs/fs_context.h | 12
-rw-r--r--  fs/cifs/fscache.c | 8
-rw-r--r--  fs/cifs/fscache.h | 26
-rw-r--r--  fs/cifs/inode.c | 69
-rw-r--r--  fs/cifs/ioctl.c | 2
-rw-r--r--  fs/cifs/link.c | 11
-rw-r--r--  fs/cifs/misc.c | 111
-rw-r--r--  fs/cifs/netlink.c | 1
-rw-r--r--  fs/cifs/netmisc.c | 6
-rw-r--r--  fs/cifs/readdir.c | 186
-rw-r--r--  fs/cifs/sess.c | 229
-rw-r--r--  fs/cifs/smb1ops.c | 16
-rw-r--r--  fs/cifs/smb2file.c | 1
-rw-r--r--  fs/cifs/smb2inode.c | 16
-rw-r--r--  fs/cifs/smb2misc.c | 82
-rw-r--r--  fs/cifs/smb2ops.c | 723
-rw-r--r--  fs/cifs/smb2pdu.c | 123
-rw-r--r--  fs/cifs/smb2pdu.h | 22
-rw-r--r--  fs/cifs/smb2proto.h | 16
-rw-r--r--  fs/cifs/smb2transport.c | 45
-rw-r--r--  fs/cifs/smbdirect.c | 8
-rw-r--r--  fs/cifs/trace.h | 47
-rw-r--r--  fs/cifs/transport.c | 402
-rw-r--r--  fs/cifs/xattr.c | 5
-rw-r--r--  fs/coda/symlink.c | 16
-rw-r--r--  fs/coredump.c | 45
-rw-r--r--  fs/cramfs/README | 8
-rw-r--r--  fs/cramfs/inode.c | 24
-rw-r--r--  fs/crypto/bio.c | 16
-rw-r--r--  fs/crypto/crypto.c | 10
-rw-r--r--  fs/crypto/fname.c | 47
-rw-r--r--  fs/crypto/fscrypt_private.h | 103
-rw-r--r--  fs/crypto/hooks.c | 16
-rw-r--r--  fs/crypto/inline_crypt.c | 221
-rw-r--r--  fs/crypto/keyring.c | 559
-rw-r--r--  fs/crypto/keysetup.c | 118
-rw-r--r--  fs/crypto/keysetup_v1.c | 4
-rw-r--r--  fs/crypto/policy.c | 178
-rw-r--r--  fs/d_path.c | 5
-rw-r--r--  fs/dax.c | 522
-rw-r--r--  fs/dcache.c | 141
-rw-r--r--  fs/debugfs/file.c | 16
-rw-r--r--  fs/debugfs/inode.c | 59
-rw-r--r--  fs/direct-io.c | 81
-rw-r--r--  fs/dlm/Kconfig | 9
-rw-r--r--  fs/dlm/Makefile | 2
-rw-r--r--  fs/dlm/ast.c | 19
-rw-r--r--  fs/dlm/ast.h | 1
-rw-r--r--  fs/dlm/config.c | 21
-rw-r--r--  fs/dlm/config.h | 3
-rw-r--r--  fs/dlm/dir.c | 2
-rw-r--r--  fs/dlm/dlm_internal.h | 100
-rw-r--r--  fs/dlm/lock.c | 958
-rw-r--r--  fs/dlm/lock.h | 19
-rw-r--r--  fs/dlm/lockspace.c | 75
-rw-r--r--  fs/dlm/lockspace.h | 14
-rw-r--r--  fs/dlm/lowcomms.c | 20
-rw-r--r--  fs/dlm/member.c | 41
-rw-r--r--  fs/dlm/midcomms.c | 61
-rw-r--r--  fs/dlm/netlink.c | 1
-rw-r--r--  fs/dlm/plock.c | 213
-rw-r--r--  fs/dlm/rcom.c | 120
-rw-r--r--  fs/dlm/recover.c | 49
-rw-r--r--  fs/dlm/recoverd.c | 35
-rw-r--r--  fs/dlm/requestqueue.c | 20
-rw-r--r--  fs/dlm/user.c | 54
-rw-r--r--  fs/dlm/util.c | 92
-rw-r--r--  fs/dlm/util.h | 8
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 2
-rw-r--r--  fs/ecryptfs/file.c | 40
-rw-r--r--  fs/ecryptfs/inode.c | 2
-rw-r--r--  fs/ecryptfs/main.c | 2
-rw-r--r--  fs/ecryptfs/mmap.c | 15
-rw-r--r--  fs/efivarfs/Makefile | 2
-rw-r--r--  fs/efivarfs/internal.h | 40
-rw-r--r--  fs/efivarfs/super.c | 15
-rw-r--r--  fs/efivarfs/vars.c | 738
-rw-r--r--  fs/efs/inode.c | 8
-rw-r--r--  fs/efs/symlink.c | 5
-rw-r--r--  fs/erofs/Kconfig | 10
-rw-r--r--  fs/erofs/Makefile | 1
-rw-r--r--  fs/erofs/compress.h | 2
-rw-r--r--  fs/erofs/data.c | 71
-rw-r--r--  fs/erofs/decompressor.c | 70
-rw-r--r--  fs/erofs/decompressor_lzma.c | 4
-rw-r--r--  fs/erofs/dir.c | 20
-rw-r--r--  fs/erofs/erofs_fs.h | 90
-rw-r--r--  fs/erofs/fscache.c | 671
-rw-r--r--  fs/erofs/inode.c | 42
-rw-r--r--  fs/erofs/internal.h | 142
-rw-r--r--  fs/erofs/namei.c | 18
-rw-r--r--  fs/erofs/super.c | 307
-rw-r--r--  fs/erofs/sysfs.c | 23
-rw-r--r--  fs/erofs/utils.c | 2
-rw-r--r--  fs/erofs/xattr.h | 2
-rw-r--r--  fs/erofs/zdata.c | 999
-rw-r--r--  fs/erofs/zdata.h | 159
-rw-r--r--  fs/erofs/zmap.c | 120
-rw-r--r--  fs/erofs/zpvec.h | 159
-rw-r--r--  fs/eventfd.c | 10
-rw-r--r--  fs/eventpoll.c | 22
-rw-r--r--  fs/exec.c | 47
-rw-r--r--  fs/exfat/balloc.c | 8
-rw-r--r--  fs/exfat/dir.c | 6
-rw-r--r--  fs/exfat/exfat_fs.h | 26
-rw-r--r--  fs/exfat/fatent.c | 50
-rw-r--r--  fs/exfat/file.c | 87
-rw-r--r--  fs/exfat/inode.c | 51
-rw-r--r--  fs/exfat/misc.c | 27
-rw-r--r--  fs/exfat/namei.c | 53
-rw-r--r--  fs/exfat/nls.c | 4
-rw-r--r--  fs/exfat/super.c | 23
-rw-r--r--  fs/exportfs/expfs.c | 12
-rw-r--r--  fs/ext2/dir.c | 29
-rw-r--r--  fs/ext2/ext2.h | 1
-rw-r--r--  fs/ext2/inode.c | 77
-rw-r--r--  fs/ext2/namei.c | 10
-rw-r--r--  fs/ext2/super.c | 53
-rw-r--r--  fs/ext2/xattr.c | 170
-rw-r--r--  fs/ext4/Makefile | 1
-rw-r--r--  fs/ext4/balloc.c | 2
-rw-r--r--  fs/ext4/crypto.c | 246
-rw-r--r--  fs/ext4/dir.c | 6
-rw-r--r--  fs/ext4/ext4.h | 131
-rw-r--r--  fs/ext4/ext4_jbd2.c | 3
-rw-r--r--  fs/ext4/extents.c | 163
-rw-r--r--  fs/ext4/extents_status.c | 6
-rw-r--r--  fs/ext4/fast_commit.c | 269
-rw-r--r--  fs/ext4/fast_commit.h | 3
-rw-r--r--  fs/ext4/file.c | 47
-rw-r--r--  fs/ext4/ialloc.c | 2
-rw-r--r--  fs/ext4/indirect.c | 4
-rw-r--r--  fs/ext4/inline.c | 92
-rw-r--r--  fs/ext4/inode.c | 243
-rw-r--r--  fs/ext4/ioctl.c | 189
-rw-r--r--  fs/ext4/mballoc.c | 409
-rw-r--r--  fs/ext4/mballoc.h | 1
-rw-r--r--  fs/ext4/migrate.c | 9
-rw-r--r--  fs/ext4/mmp.c | 13
-rw-r--r--  fs/ext4/move_extent.c | 43
-rw-r--r--  fs/ext4/namei.c | 266
-rw-r--r--  fs/ext4/orphan.c | 24
-rw-r--r--  fs/ext4/page-io.c | 6
-rw-r--r--  fs/ext4/readpage.c | 14
-rw-r--r--  fs/ext4/resize.c | 51
-rw-r--r--  fs/ext4/super.c | 1682
-rw-r--r--  fs/ext4/symlink.c | 66
-rw-r--r--  fs/ext4/verity.c | 15
-rw-r--r--  fs/ext4/xattr.c | 170
-rw-r--r--  fs/ext4/xattr.h | 16
-rw-r--r--  fs/f2fs/checkpoint.c | 22
-rw-r--r--  fs/f2fs/compress.c | 266
-rw-r--r--  fs/f2fs/data.c | 415
-rw-r--r--  fs/f2fs/debug.c | 20
-rw-r--r--  fs/f2fs/dir.c | 3
-rw-r--r--  fs/f2fs/f2fs.h | 298
-rw-r--r--  fs/f2fs/file.c | 412
-rw-r--r--  fs/f2fs/gc.c | 195
-rw-r--r--  fs/f2fs/gc.h | 21
-rw-r--r--  fs/f2fs/hash.c | 11
-rw-r--r--  fs/f2fs/inline.c | 29
-rw-r--r--  fs/f2fs/inode.c | 38
-rw-r--r--  fs/f2fs/iostat.c | 31
-rw-r--r--  fs/f2fs/namei.c | 55
-rw-r--r--  fs/f2fs/node.c | 53
-rw-r--r--  fs/f2fs/node.h | 1
-rw-r--r--  fs/f2fs/recovery.c | 10
-rw-r--r--  fs/f2fs/segment.c | 616
-rw-r--r--  fs/f2fs/segment.h | 43
-rw-r--r--  fs/f2fs/super.c | 230
-rw-r--r--  fs/f2fs/sysfs.c | 56
-rw-r--r--  fs/f2fs/verity.c | 11
-rw-r--r--  fs/fat/dir.c | 8
-rw-r--r--  fs/fat/fat.h | 14
-rw-r--r--  fs/fat/fatent.c | 7
-rw-r--r--  fs/fat/file.c | 33
-rw-r--r--  fs/fat/inode.c | 39
-rw-r--r--  fs/fat/misc.c | 78
-rw-r--r--  fs/fat/namei_vfat.c | 235
-rw-r--r--  fs/fcntl.c | 10
-rw-r--r--  fs/fhandle.c | 2
-rw-r--r--  fs/file.c | 113
-rw-r--r--  fs/file_table.c | 36
-rw-r--r--  fs/freevxfs/vxfs.h | 27
-rw-r--r--  fs/freevxfs/vxfs_bmap.c | 26
-rw-r--r--  fs/freevxfs/vxfs_dir.h | 27
-rw-r--r--  fs/freevxfs/vxfs_extern.h | 27
-rw-r--r--  fs/freevxfs/vxfs_fshead.c | 26
-rw-r--r--  fs/freevxfs/vxfs_fshead.h | 27
-rw-r--r--  fs/freevxfs/vxfs_immed.c | 76
-rw-r--r--  fs/freevxfs/vxfs_inode.c | 26
-rw-r--r--  fs/freevxfs/vxfs_inode.h | 27
-rw-r--r--  fs/freevxfs/vxfs_lookup.c | 26
-rw-r--r--  fs/freevxfs/vxfs_olt.c | 26
-rw-r--r--  fs/freevxfs/vxfs_olt.h | 27
-rw-r--r--  fs/freevxfs/vxfs_subr.c | 49
-rw-r--r--  fs/freevxfs/vxfs_super.c | 26
-rw-r--r--  fs/fs-writeback.c | 99
-rw-r--r--  fs/fscache/Kconfig | 3
-rw-r--r--  fs/fscache/cache.c | 2
-rw-r--r--  fs/fscache/cookie.c | 35
-rw-r--r--  fs/fscache/internal.h | 4
-rw-r--r--  fs/fscache/io.c | 5
-rw-r--r--  fs/fscache/volume.c | 4
-rw-r--r--  fs/fsopen.c | 4
-rw-r--r--  fs/fuse/control.c | 4
-rw-r--r--  fs/fuse/dax.c | 6
-rw-r--r--  fs/fuse/dev.c | 7
-rw-r--r--  fs/fuse/dir.c | 26
-rw-r--r--  fs/fuse/file.c | 58
-rw-r--r--  fs/fuse/inode.c | 16
-rw-r--r--  fs/fuse/ioctl.c | 15
-rw-r--r--  fs/fuse/virtio_fs.c | 15
-rw-r--r--  fs/gfs2/aops.c | 109
-rw-r--r--  fs/gfs2/bmap.c | 16
-rw-r--r--  fs/gfs2/dir.c | 7
-rw-r--r--  fs/gfs2/export.c | 6
-rw-r--r--  fs/gfs2/file.c | 152
-rw-r--r--  fs/gfs2/glock.c | 237
-rw-r--r--  fs/gfs2/glock.h | 14
-rw-r--r--  fs/gfs2/glops.c | 31
-rw-r--r--  fs/gfs2/incore.h | 6
-rw-r--r--  fs/gfs2/inode.h | 2
-rw-r--r--  fs/gfs2/lock_dlm.c | 4
-rw-r--r--  fs/gfs2/log.c | 9
-rw-r--r--  fs/gfs2/log.h | 2
-rw-r--r--  fs/gfs2/lops.c | 32
-rw-r--r--  fs/gfs2/lops.h | 2
-rw-r--r--  fs/gfs2/main.c | 13
-rw-r--r--  fs/gfs2/meta_io.c | 22
-rw-r--r--  fs/gfs2/meta_io.h | 8
-rw-r--r--  fs/gfs2/ops_fstype.c | 2
-rw-r--r--  fs/gfs2/quota.c | 92
-rw-r--r--  fs/gfs2/recovery.c | 22
-rw-r--r--  fs/gfs2/rgrp.c | 23
-rw-r--r--  fs/gfs2/rgrp.h | 5
-rw-r--r--  fs/gfs2/super.c | 2
-rw-r--r--  fs/gfs2/xattr.c | 2
-rw-r--r--  fs/hfs/bnode.c | 4
-rw-r--r--  fs/hfs/extent.c | 6
-rw-r--r--  fs/hfs/hfs_fs.h | 2
-rw-r--r--  fs/hfs/inode.c | 38
-rw-r--r--  fs/hfsplus/bnode.c | 4
-rw-r--r--  fs/hfsplus/extents.c | 8
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 4
-rw-r--r--  fs/hfsplus/inode.c | 38
-rw-r--r--  fs/hfsplus/part_tbl.c | 5
-rw-r--r--  fs/hfsplus/super.c | 4
-rw-r--r--  fs/hfsplus/wrapper.c | 12
-rw-r--r--  fs/hostfs/hostfs_kern.c | 15
-rw-r--r--  fs/hpfs/file.c | 10
-rw-r--r--  fs/hpfs/namei.c | 5
-rw-r--r--  fs/hugetlbfs/inode.c | 224
-rw-r--r--  fs/inode.c | 223
-rw-r--r--  fs/internal.h | 47
-rw-r--r--  fs/io-wq.c | 1424
-rw-r--r--  fs/io-wq.h | 227
-rw-r--r--  fs/io_uring.c | 12038
-rw-r--r--  fs/iomap/buffered-io.c | 170
-rw-r--r--  fs/iomap/direct-io.c | 66
-rw-r--r--  fs/iomap/trace.h | 3
-rw-r--r--  fs/isofs/compress.c | 7
-rw-r--r--  fs/isofs/inode.c | 15
-rw-r--r--  fs/isofs/rock.c | 7
-rw-r--r--  fs/jbd2/checkpoint.c | 6
-rw-r--r--  fs/jbd2/commit.c | 64
-rw-r--r--  fs/jbd2/journal.c | 95
-rw-r--r--  fs/jbd2/recovery.c | 35
-rw-r--r--  fs/jbd2/revoke.c | 8
-rw-r--r--  fs/jbd2/transaction.c | 62
-rw-r--r--  fs/jffs2/erase.c | 6
-rw-r--r--  fs/jffs2/file.c | 23
-rw-r--r--  fs/jffs2/fs.c | 3
-rw-r--r--  fs/jffs2/gc.c | 2
-rw-r--r--  fs/jffs2/os-linux.h | 2
-rw-r--r--  fs/jffs2/wbuf.c | 6
-rw-r--r--  fs/jfs/Makefile | 2
-rw-r--r--  fs/jfs/file.c | 4
-rw-r--r--  fs/jfs/inode.c | 45
-rw-r--r--  fs/jfs/ioctl.c | 5
-rw-r--r--  fs/jfs/jfs_dmap.c | 71
-rw-r--r--  fs/jfs/jfs_dtree.c | 298
-rw-r--r--  fs/jfs/jfs_extent.c | 255
-rw-r--r--  fs/jfs/jfs_logmgr.c | 8
-rw-r--r--  fs/jfs/jfs_metapage.c | 23
-rw-r--r--  fs/jfs/jfs_mount.c | 4
-rw-r--r--  fs/jfs/jfs_txnmgr.c | 34
-rw-r--r--  fs/jfs/jfs_xtree.c | 961
-rw-r--r--  fs/jfs/jfs_xtree.h | 4
-rw-r--r--  fs/jfs/super.c | 8
-rw-r--r--  fs/kernel_read_file.c | 38
-rw-r--r--  fs/kernfs/dir.c | 152
-rw-r--r--  fs/kernfs/file.c | 249
-rw-r--r--  fs/kernfs/kernfs-internal.h | 5
-rw-r--r--  fs/kernfs/mount.c | 19
-rw-r--r--  fs/ksmbd/auth.c | 71
-rw-r--r--  fs/ksmbd/auth.h | 14
-rw-r--r--  fs/ksmbd/connection.c | 39
-rw-r--r--  fs/ksmbd/connection.h | 39
-rw-r--r--  fs/ksmbd/ksmbd_netlink.h | 8
-rw-r--r--  fs/ksmbd/mgmt/share_config.c | 56
-rw-r--r--  fs/ksmbd/mgmt/share_config.h | 7
-rw-r--r--  fs/ksmbd/mgmt/tree_connect.c | 23
-rw-r--r--  fs/ksmbd/mgmt/tree_connect.h | 4
-rw-r--r--  fs/ksmbd/mgmt/user_session.c | 95
-rw-r--r--  fs/ksmbd/mgmt/user_session.h | 13
-rw-r--r--  fs/ksmbd/misc.c | 94
-rw-r--r--  fs/ksmbd/misc.h | 6
-rw-r--r--  fs/ksmbd/ndr.c | 8
-rw-r--r--  fs/ksmbd/oplock.c | 103
-rw-r--r--  fs/ksmbd/oplock.h | 2
-rw-r--r--  fs/ksmbd/server.c | 12
-rw-r--r--  fs/ksmbd/smb2misc.c | 14
-rw-r--r--  fs/ksmbd/smb2pdu.c | 483
-rw-r--r--  fs/ksmbd/smb2pdu.h | 7
-rw-r--r--  fs/ksmbd/smb_common.c | 10
-rw-r--r--  fs/ksmbd/smb_common.h | 2
-rw-r--r--  fs/ksmbd/smbacl.c | 143
-rw-r--r--  fs/ksmbd/smbacl.h | 20
-rw-r--r--  fs/ksmbd/transport_ipc.c | 4
-rw-r--r--  fs/ksmbd/transport_rdma.c | 381
-rw-r--r--  fs/ksmbd/transport_rdma.h | 8
-rw-r--r--  fs/ksmbd/transport_tcp.c | 5
-rw-r--r--  fs/ksmbd/unicode.h | 3
-rw-r--r--  fs/ksmbd/vfs.c | 78
-rw-r--r--  fs/ksmbd/vfs.h | 6
-rw-r--r--  fs/ksmbd/vfs_cache.c | 4
-rw-r--r--  fs/ksmbd/vfs_cache.h | 1
-rw-r--r--  fs/libfs.c | 18
-rw-r--r--  fs/lockd/host.c | 2
-rw-r--r--  fs/lockd/svc4proc.c | 36
-rw-r--r--  fs/lockd/svclock.c | 10
-rw-r--r--  fs/lockd/svcproc.c | 29
-rw-r--r--  fs/lockd/svcsubs.c | 14
-rw-r--r--  fs/lockd/xdr4.c | 19
-rw-r--r--  fs/locks.c | 139
-rw-r--r--  fs/mbcache.c | 128
-rw-r--r--  fs/minix/inode.c | 11
-rw-r--r--  fs/mount.h | 1
-rw-r--r--  fs/mpage.c | 143
-rw-r--r--  fs/namei.c | 414
-rw-r--r--  fs/namespace.c | 30
-rw-r--r--  fs/netfs/buffered_read.c | 51
-rw-r--r--  fs/netfs/internal.h | 2
-rw-r--r--  fs/netfs/objects.c | 8
-rw-r--r--  fs/nfs/Kconfig | 4
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 13
-rw-r--r--  fs/nfs/blocklayout/dev.c | 42
-rw-r--r--  fs/nfs/callback_proc.c | 1
-rw-r--r--  fs/nfs/callback_xdr.c | 1
-rw-r--r--  fs/nfs/client.c | 13
-rw-r--r--  fs/nfs/dir.c | 114
-rw-r--r--  fs/nfs/direct.c | 81
-rw-r--r--  fs/nfs/file.c | 140
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 9
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 4
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayoutdev.c | 6
-rw-r--r--  fs/nfs/fs_context.c | 28
-rw-r--r--  fs/nfs/fscache.c | 7
-rw-r--r--  fs/nfs/fscache.h | 14
-rw-r--r--  fs/nfs/inode.c | 2
-rw-r--r--  fs/nfs/internal.h | 93
-rw-r--r--  fs/nfs/nfs3client.c | 1
-rw-r--r--  fs/nfs/nfs42proc.c | 11
-rw-r--r--  fs/nfs/nfs42xattr.c | 9
-rw-r--r--  fs/nfs/nfs42xdr.c | 170
-rw-r--r--  fs/nfs/nfs4client.c | 4
-rw-r--r--  fs/nfs/nfs4file.c | 17
-rw-r--r--  fs/nfs/nfs4idmap.c | 46
-rw-r--r--  fs/nfs/nfs4namespace.c | 9
-rw-r--r--  fs/nfs/nfs4proc.c | 239
-rw-r--r--  fs/nfs/nfs4state.c | 30
-rw-r--r--  fs/nfs/nfs4xdr.c | 99
-rw-r--r--  fs/nfs/nfstrace.h | 215
-rw-r--r--  fs/nfs/pagelist.c | 3
-rw-r--r--  fs/nfs/pnfs.c | 24
-rw-r--r--  fs/nfs/pnfs.h | 1
-rw-r--r--  fs/nfs/read.c | 7
-rw-r--r--  fs/nfs/super.c | 29
-rw-r--r--  fs/nfs/symlink.c | 16
-rw-r--r--  fs/nfs/unlink.c | 9
-rw-r--r--  fs/nfs/write.c | 159
-rw-r--r--  fs/nfsd/acl.h | 6
-rw-r--r--  fs/nfsd/cache.h | 2
-rw-r--r--  fs/nfsd/filecache.c | 840
-rw-r--r--  fs/nfsd/filecache.h | 15
-rw-r--r--  fs/nfsd/netns.h | 7
-rw-r--r--  fs/nfsd/nfs2acl.c | 35
-rw-r--r--  fs/nfsd/nfs3acl.c | 7
-rw-r--r--  fs/nfsd/nfs3proc.c | 213
-rw-r--r--  fs/nfsd/nfs3xdr.c | 18
-rw-r--r--  fs/nfsd/nfs4acl.c | 46
-rw-r--r--  fs/nfsd/nfs4callback.c | 51
-rw-r--r--  fs/nfsd/nfs4idmap.c | 8
-rw-r--r--  fs/nfsd/nfs4layouts.c | 2
-rw-r--r--  fs/nfsd/nfs4proc.c | 810
-rw-r--r--  fs/nfsd/nfs4recover.c | 22
-rw-r--r--  fs/nfsd/nfs4state.c | 680
-rw-r--r--  fs/nfsd/nfs4xdr.c | 236
-rw-r--r--  fs/nfsd/nfscache.c | 18
-rw-r--r--  fs/nfsd/nfsctl.c | 79
-rw-r--r--  fs/nfsd/nfsd.h | 23
-rw-r--r--  fs/nfsd/nfsfh.c | 35
-rw-r--r--  fs/nfsd/nfsfh.h | 58
-rw-r--r--  fs/nfsd/nfsproc.c | 64
-rw-r--r--  fs/nfsd/nfssvc.c | 2
-rw-r--r--  fs/nfsd/nfsxdr.c | 4
-rw-r--r--  fs/nfsd/state.h | 43
-rw-r--r--  fs/nfsd/stats.c | 14
-rw-r--r--  fs/nfsd/trace.h | 478
-rw-r--r--  fs/nfsd/vfs.c | 657
-rw-r--r--  fs/nfsd/vfs.h | 47
-rw-r--r--  fs/nfsd/xdr4.h | 70
-rw-r--r--  fs/nilfs2/btnode.c | 8
-rw-r--r--  fs/nilfs2/btnode.h | 4
-rw-r--r--  fs/nilfs2/btree.c | 6
-rw-r--r--  fs/nilfs2/dir.c | 2
-rw-r--r--  fs/nilfs2/gcinode.c | 7
-rw-r--r--  fs/nilfs2/inode.c | 27
-rw-r--r--  fs/nilfs2/ioctl.c | 6
-rw-r--r--  fs/nilfs2/mdt.c | 19
-rw-r--r--  fs/nilfs2/nilfs.h | 3
-rw-r--r--  fs/nilfs2/page.c | 60
-rw-r--r--  fs/nilfs2/recovery.c | 2
-rw-r--r--  fs/nilfs2/sufile.c | 4
-rw-r--r--  fs/nilfs2/the_nilfs.c | 4
-rw-r--r--  fs/notify/dnotify/dnotify.c | 13
-rw-r--r--  fs/notify/fanotify/fanotify.c | 39
-rw-r--r--  fs/notify/fanotify/fanotify.h | 22
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 215
-rw-r--r--  fs/notify/fdinfo.c | 27
-rw-r--r--  fs/notify/fsnotify.c | 108
-rw-r--r--  fs/notify/fsnotify.h | 4
-rw-r--r--  fs/notify/group.c | 32
-rw-r--r--  fs/notify/inotify/inotify.h | 19
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 2
-rw-r--r--  fs/notify/inotify/inotify_user.c | 49
-rw-r--r--  fs/notify/mark.c | 112
-rw-r--r--  fs/nsfs.c | 2
-rw-r--r--  fs/ntfs/aops.c | 52
-rw-r--r--  fs/ntfs/aops.h | 13
-rw-r--r--  fs/ntfs/attrib.c | 10
-rw-r--r--  fs/ntfs/compress.c | 6
-rw-r--r--  fs/ntfs/file.c | 17
-rw-r--r--  fs/ntfs/inode.c | 4
-rw-r--r--  fs/ntfs/logfile.c | 2
-rw-r--r--  fs/ntfs/mft.c | 4
-rw-r--r--  fs/ntfs/mft.h | 2
-rw-r--r--  fs/ntfs/super.c | 3
-rw-r--r--  fs/ntfs3/attrib.c | 557
-rw-r--r--  fs/ntfs3/bitmap.c | 12
-rw-r--r--  fs/ntfs3/file.c | 131
-rw-r--r--  fs/ntfs3/frecord.c | 138
-rw-r--r--  fs/ntfs3/fslog.c | 16
-rw-r--r--  fs/ntfs3/fsntfs.c | 94
-rw-r--r--  fs/ntfs3/index.c | 33
-rw-r--r--  fs/ntfs3/inode.c | 67
-rw-r--r--  fs/ntfs3/namei.c | 6
-rw-r--r--  fs/ntfs3/ntfs_fs.h | 30
-rw-r--r--  fs/ntfs3/record.c | 27
-rw-r--r--  fs/ntfs3/run.c | 108
-rw-r--r--  fs/ntfs3/super.c | 37
-rw-r--r--  fs/ntfs3/xattr.c | 93
-rw-r--r--  fs/ocfs2/alloc.c | 2
-rw-r--r--  fs/ocfs2/aops.c | 49
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 8
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 75
-rw-r--r--  fs/ocfs2/dir.c | 10
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 12
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 21
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 14
-rw-r--r--  fs/ocfs2/dlmfs/userdlm.c | 17
-rw-r--r--  fs/ocfs2/dlmglue.c | 8
-rw-r--r--  fs/ocfs2/file.c | 4
-rw-r--r--  fs/ocfs2/heartbeat.c | 27
-rw-r--r--  fs/ocfs2/inode.c | 4
-rw-r--r--  fs/ocfs2/ioctl.c | 5
-rw-r--r--  fs/ocfs2/journal.c | 47
-rw-r--r--  fs/ocfs2/journal.h | 2
-rw-r--r--  fs/ocfs2/namei.c | 1
-rw-r--r--  fs/ocfs2/ocfs2.h | 4
-rw-r--r--  fs/ocfs2/quota_global.c | 2
-rw-r--r--  fs/ocfs2/quota_local.c | 10
-rw-r--r--  fs/ocfs2/refcounttree.c | 48
-rw-r--r--  fs/ocfs2/reservations.c | 4
-rw-r--r--  fs/ocfs2/reservations.h | 9
-rw-r--r--  fs/ocfs2/slot_map.c | 46
-rw-r--r--  fs/ocfs2/stack_user.c | 2
-rw-r--r--  fs/ocfs2/super.c | 206
-rw-r--r--  fs/ocfs2/symlink.c | 5
-rw-r--r--  fs/omfs/file.c | 11
-rw-r--r--  fs/open.c | 151
-rw-r--r--  fs/orangefs/file.c | 4
-rw-r--r--  fs/orangefs/inode.c | 52
-rw-r--r--  fs/overlayfs/copy_up.c | 105
-rw-r--r--  fs/overlayfs/dir.c | 147
-rw-r--r--  fs/overlayfs/export.c | 7
-rw-r--r--  fs/overlayfs/file.c | 56
-rw-r--r--  fs/overlayfs/inode.c | 168
-rw-r--r--  fs/overlayfs/namei.c | 57
-rw-r--r--  fs/overlayfs/overlayfs.h | 237
-rw-r--r--  fs/overlayfs/ovl_entry.h | 7
-rw-r--r--  fs/overlayfs/readdir.c | 88
-rw-r--r--  fs/overlayfs/super.c | 93
-rw-r--r--  fs/overlayfs/util.c | 105
-rw-r--r--  fs/pipe.c | 40
-rw-r--r--  fs/posix_acl.c | 363
-rw-r--r--  fs/proc/array.c | 5
-rw-r--r--  fs/proc/base.c | 72
-rw-r--r--  fs/proc/cpuinfo.c | 6
-rw-r--r--  fs/proc/fd.c | 23
-rw-r--r--  fs/proc/generic.c | 3
-rw-r--r--  fs/proc/inode.c | 22
-rw-r--r--  fs/proc/kcore.c | 14
-rw-r--r--  fs/proc/kmsg.c | 1
-rw-r--r--  fs/proc/meminfo.c | 9
-rw-r--r--  fs/proc/nommu.c | 1
-rw-r--r--  fs/proc/proc_net.c | 12
-rw-r--r--  fs/proc/proc_sysctl.c | 93
-rw-r--r--  fs/proc/proc_tty.c | 2
-rw-r--r--  fs/proc/root.c | 8
-rw-r--r--  fs/proc/task_mmu.c | 23
-rw-r--r--  fs/proc/vmcore.c | 131
-rw-r--r--  fs/proc_namespace.c | 2
-rw-r--r--  fs/pstore/inode.c | 1
-rw-r--r--  fs/pstore/platform.c | 1
-rw-r--r--  fs/pstore/zone.c | 12
-rw-r--r--  fs/qnx4/inode.c | 7
-rw-r--r--  fs/qnx6/inode.c | 6
-rw-r--r--  fs/quota/dquot.c | 29
-rw-r--r--  fs/quota/quota_tree.c | 73
-rw-r--r--  fs/read_write.c | 145
-rw-r--r--  fs/readdir.c | 68
-rw-r--r--  fs/reiserfs/file.c | 2
-rw-r--r--  fs/reiserfs/inode.c | 56
-rw-r--r--  fs/reiserfs/journal.c | 26
-rw-r--r--  fs/reiserfs/prints.c | 2
-rw-r--r--  fs/reiserfs/resize.c | 2
-rw-r--r--  fs/reiserfs/stree.c | 4
-rw-r--r--  fs/reiserfs/super.c | 9
-rw-r--r--  fs/reiserfs/xattr.c | 29
-rw-r--r--  fs/remap_range.c | 45
-rw-r--r--  fs/romfs/super.c | 9
-rw-r--r--  fs/seq_file.c | 32
-rw-r--r--  fs/smbfs_common/smb2pdu.h | 108
-rw-r--r--  fs/smbfs_common/smbfsctl.h | 6
-rw-r--r--  fs/splice.c | 64
-rw-r--r--  fs/squashfs/Makefile | 4
-rw-r--r--  fs/squashfs/block.c | 30
-rw-r--r--  fs/squashfs/decompressor.h | 1
-rw-r--r--  fs/squashfs/file.c | 151
-rw-r--r--  fs/squashfs/file_direct.c | 90
-rw-r--r--  fs/squashfs/lz4_wrapper.c | 7
-rw-r--r--  fs/squashfs/lzo_wrapper.c | 7
-rw-r--r--  fs/squashfs/page_actor.c | 53
-rw-r--r--  fs/squashfs/page_actor.h | 62
-rw-r--r--  fs/squashfs/super.c | 33
-rw-r--r--  fs/squashfs/symlink.c | 5
-rw-r--r--  fs/squashfs/xz_wrapper.c | 11
-rw-r--r--  fs/squashfs/zlib_wrapper.c | 12
-rw-r--r--  fs/squashfs/zstd_wrapper.c | 12
-rw-r--r--  fs/stat.c | 35
-rw-r--r--  fs/super.c | 43
-rw-r--r--  fs/sync.c | 9
-rw-r--r--  fs/sysfs/file.c | 13
-rw-r--r--  fs/sysv/itree.c | 10
-rw-r--r--  fs/sysv/super.c | 4
-rw-r--r--  fs/tracefs/inode.c | 33
-rw-r--r--  fs/ubifs/budget.c | 7
-rw-r--r--  fs/ubifs/file.c | 68
-rw-r--r--  fs/ubifs/super.c | 4
-rw-r--r--  fs/ubifs/ubifs.h | 2
-rw-r--r--  fs/ubifs/xattr.c | 2
-rw-r--r--  fs/udf/dir.c | 2
-rw-r--r--  fs/udf/directory.c | 2
-rw-r--r--  fs/udf/file.c | 15
-rw-r--r--  fs/udf/inode.c | 12
-rw-r--r--  fs/udf/namei.c | 8
-rw-r--r--  fs/udf/symlink.c | 5
-rw-r--r--  fs/ufs/balloc.c | 2
-rw-r--r--  fs/ufs/dir.c | 2
-rw-r--r--  fs/ufs/inode.c | 13
-rw-r--r--  fs/ufs/util.c | 11
-rw-r--r--  fs/userfaultfd.c | 58
-rw-r--r--  fs/vboxsf/file.c | 5
-rw-r--r--  fs/verity/Kconfig | 11
-rw-r--r--  fs/verity/enable.c | 33
-rw-r--r--  fs/verity/fsverity_private.h | 15
-rw-r--r--  fs/verity/measure.c | 43
-rw-r--r--  fs/verity/open.c | 12
-rw-r--r--  fs/verity/read_metadata.c | 11
-rw-r--r--  fs/verity/verify.c | 14
-rw-r--r--  fs/xattr.c | 153
-rw-r--r--  fs/xfs/Makefile | 7
-rw-r--r--  fs/xfs/libxfs/xfs_ag.c | 174
-rw-r--r--  fs/xfs/libxfs/xfs_ag.h | 75
-rw-r--r--  fs/xfs/libxfs/xfs_ag_resv.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 157
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 60
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.c | 9
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 1681
-rw-r--r--  fs/xfs/libxfs/xfs_attr.h | 190
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 109
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.h | 3
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.c | 52
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 251
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 58
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 19
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 188
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h | 26
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 15
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.h | 28
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h | 9
-rw-r--r--  fs/xfs/libxfs/xfs_defer.c | 97
-rw-r--r--  fs/xfs/libxfs/xfs_defer.h | 3
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c | 10
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_block.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_sf.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_errortag.h | 8
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 191
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h | 41
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 94
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.h | 27
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c | 20
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c | 129
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.c | 114
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.h | 103
-rw-r--r--  fs/xfs/libxfs/xfs_log_format.h | 87
-rw-r--r--  fs/xfs/libxfs/xfs_log_recover.h | 16
-rw-r--r--  fs/xfs/libxfs/xfs_log_rlimit.c | 75
-rw-r--r--  fs/xfs/libxfs/xfs_quota_defs.h | 50
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.c | 33
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.h | 13
-rw-r--r--  fs/xfs/libxfs/xfs_refcount_btree.c | 5
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.c | 169
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.h | 7
-rw-r--r--  fs/xfs/libxfs/xfs_rmap_btree.c | 9
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.c | 9
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 80
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h | 24
-rw-r--r--  fs/xfs/libxfs/xfs_symlink_remote.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.c | 227
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.h | 16
-rw-r--r--  fs/xfs/libxfs/xfs_types.c | 73
-rw-r--r--  fs/xfs/libxfs/xfs_types.h | 20
-rw-r--r--  fs/xfs/scrub/agheader.c | 25
-rw-r--r--  fs/xfs/scrub/agheader_repair.c | 21
-rw-r--r--  fs/xfs/scrub/alloc.c | 7
-rw-r--r--  fs/xfs/scrub/bmap.c | 42
-rw-r--r--  fs/xfs/scrub/btree.c | 2
-rw-r--r--  fs/xfs/scrub/common.c | 8
-rw-r--r--  fs/xfs/scrub/dabtree.c | 2
-rw-r--r--  fs/xfs/scrub/dir.c | 10
-rw-r--r--  fs/xfs/scrub/fscounters.c | 4
-rw-r--r--  fs/xfs/scrub/health.c | 2
-rw-r--r--  fs/xfs/scrub/ialloc.c | 12
-rw-r--r--  fs/xfs/scrub/inode.c | 20
-rw-r--r--  fs/xfs/scrub/parent.c | 4
-rw-r--r--  fs/xfs/scrub/quota.c | 2
-rw-r--r--  fs/xfs/scrub/refcount.c | 9
-rw-r--r--  fs/xfs/scrub/repair.c | 49
-rw-r--r--  fs/xfs/scrub/rmap.c | 6
-rw-r--r--  fs/xfs/scrub/rtbitmap.c | 9
-rw-r--r--  fs/xfs/scrub/scrub.c | 17
-rw-r--r--  fs/xfs/scrub/symlink.c | 6
-rw-r--r--  fs/xfs/xfs_acl.c | 7
-rw-r--r--  fs/xfs/xfs_acl.h | 8
-rw-r--r--  fs/xfs/xfs_aops.c | 16
-rw-r--r--  fs/xfs/xfs_attr_inactive.c | 23
-rw-r--r--  fs/xfs/xfs_attr_item.c | 888
-rw-r--r--  fs/xfs/xfs_attr_item.h | 54
-rw-r--r--  fs/xfs/xfs_attr_list.c | 10
-rw-r--r--  fs/xfs/xfs_bio_io.c | 2
-rw-r--r--  fs/xfs/xfs_bmap_item.c | 27
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 66
-rw-r--r--  fs/xfs/xfs_buf.c | 311
-rw-r--r--  fs/xfs/xfs_buf.h | 69
-rw-r--r--  fs/xfs/xfs_buf_item.h | 24
-rw-r--r--  fs/xfs/xfs_buf_item_recover.c | 66
-rw-r--r--  fs/xfs/xfs_dir2_readdir.c | 2
-rw-r--r--  fs/xfs/xfs_discard.c | 10
-rw-r--r--  fs/xfs/xfs_dquot.c | 20
-rw-r--r--  fs/xfs/xfs_dquot.h | 8
-rw-r--r--  fs/xfs/xfs_error.c | 9
-rw-r--r--  fs/xfs/xfs_error.h | 20
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 41
-rw-r--r--  fs/xfs/xfs_file.c | 100
-rw-r--r--  fs/xfs/xfs_filestream.c | 11
-rw-r--r--  fs/xfs/xfs_fsmap.c | 9
-rw-r--r--  fs/xfs/xfs_fsops.c | 30
-rw-r--r--  fs/xfs/xfs_globals.c | 1
-rw-r--r--  fs/xfs/xfs_icache.c | 81
-rw-r--r--  fs/xfs/xfs_icache.h | 1
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 1
-rw-r--r--  fs/xfs/xfs_inode.c | 859
-rw-r--r--  fs/xfs/xfs_inode.h | 99
-rw-r--r--  fs/xfs/xfs_inode_item.c | 96
-rw-r--r--  fs/xfs/xfs_inode_item_recover.c | 145
-rw-r--r--  fs/xfs/xfs_ioctl.c | 23
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 2
-rw-r--r--  fs/xfs/xfs_iomap.c | 82
-rw-r--r--  fs/xfs/xfs_iomap.h | 1
-rw-r--r--  fs/xfs/xfs_iops.c | 44
-rw-r--r--  fs/xfs/xfs_iops.h | 3
-rw-r--r--  fs/xfs/xfs_itable.c | 19
-rw-r--r--  fs/xfs/xfs_itable.h | 5
-rw-r--r--  fs/xfs/xfs_iunlink_item.c | 180
-rw-r--r--  fs/xfs/xfs_iunlink_item.h | 27
-rw-r--r--  fs/xfs/xfs_iwalk.h | 2
-rw-r--r--  fs/xfs/xfs_linux.h | 2
-rw-r--r--  fs/xfs/xfs_log.c | 836
-rw-r--r--  fs/xfs/xfs_log.h | 100
-rw-r--r--  fs/xfs/xfs_log_cil.c | 805
-rw-r--r--  fs/xfs/xfs_log_priv.h | 148
-rw-r--r--  fs/xfs/xfs_log_recover.c | 293
-rw-r--r--  fs/xfs/xfs_message.c | 58
-rw-r--r--  fs/xfs/xfs_message.h | 61
-rw-r--r--  fs/xfs/xfs_mount.c | 95
-rw-r--r--  fs/xfs/xfs_mount.h | 53
-rw-r--r--  fs/xfs/xfs_notify_failure.c | 226
-rw-r--r--  fs/xfs/xfs_ondisk.h | 2
-rw-r--r--  fs/xfs/xfs_qm.c | 35
-rw-r--r--  fs/xfs/xfs_qm.h | 5
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 35
-rw-r--r--  fs/xfs/xfs_quotaops.c | 8
-rw-r--r--  fs/xfs/xfs_refcount_item.c | 25
-rw-r--r--  fs/xfs/xfs_reflink.c | 356
-rw-r--r--  fs/xfs/xfs_reflink.h | 3
-rw-r--r--  fs/xfs/xfs_rmap_item.c | 25
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 41
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 9
-rw-r--r--  fs/xfs/xfs_super.c | 108
-rw-r--r--  fs/xfs/xfs_super.h | 2
-rw-r--r--  fs/xfs/xfs_symlink.c | 7
-rw-r--r--  fs/xfs/xfs_sysctl.h | 1
-rw-r--r--  fs/xfs/xfs_sysfs.c | 24
-rw-r--r--  fs/xfs/xfs_trace.h | 104
-rw-r--r--  fs/xfs/xfs_trans.c | 147
-rw-r--r--  fs/xfs/xfs_trans.h | 47
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 4
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 3
-rw-r--r--  fs/xfs/xfs_xattr.c | 96
-rw-r--r--  fs/xfs/xfs_xattr.h | 13
-rw-r--r--  fs/zonefs/Makefile | 2
-rw-r--r--  fs/zonefs/super.c | 394
-rw-r--r--  fs/zonefs/sysfs.c | 139
-rw-r--r--  fs/zonefs/trace.h | 4
-rw-r--r--  fs/zonefs/zonefs.h | 18
934 files changed, 49948 insertions, 49840 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 1c8dc696d516..cebba4eaa0b5 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -62,12 +62,12 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
version = cpu_to_le32(v9inode->qid.version);
path = cpu_to_le64(v9inode->qid.path);
v9ses = v9fs_inode2v9ses(inode);
- v9inode->netfs_ctx.cache =
+ v9inode->netfs.cache =
fscache_acquire_cookie(v9fs_session_cache(v9ses),
0,
&path, sizeof(path),
&version, sizeof(version),
- i_size_read(&v9inode->vfs_inode));
+ i_size_read(&v9inode->netfs.inode));
p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
inode, v9fs_inode_cookie(v9inode));
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 79df61fe0e59..23cf9b2fbfe4 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -28,14 +28,18 @@ static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid)
/**
* v9fs_fid_add - add a fid to a dentry
* @dentry: dentry that the fid is being added to
- * @fid: fid to add
+ * @pfid: fid to add, NULLed out
*
*/
-void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
+void v9fs_fid_add(struct dentry *dentry, struct p9_fid **pfid)
{
+ struct p9_fid *fid = *pfid;
+
spin_lock(&dentry->d_lock);
__add_fid(dentry, fid);
spin_unlock(&dentry->d_lock);
+
+ *pfid = NULL;
}
/**
@@ -56,7 +60,7 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid)
h = (struct hlist_head *)&inode->i_private;
hlist_for_each_entry(fid, h, ilist) {
if (uid_eq(fid->uid, uid)) {
- refcount_inc(&fid->count);
+ p9_fid_get(fid);
ret = fid;
break;
}
@@ -68,15 +72,19 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid)
/**
* v9fs_open_fid_add - add an open fid to an inode
* @inode: inode that the fid is being added to
- * @fid: fid to add
+ * @pfid: fid to add, NULLed out
*
*/
-void v9fs_open_fid_add(struct inode *inode, struct p9_fid *fid)
+void v9fs_open_fid_add(struct inode *inode, struct p9_fid **pfid)
{
+ struct p9_fid *fid = *pfid;
+
spin_lock(&inode->i_lock);
hlist_add_head(&fid->ilist, (struct hlist_head *)&inode->i_private);
spin_unlock(&inode->i_lock);
+
+ *pfid = NULL;
}
@@ -104,7 +112,7 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
hlist_for_each_entry(fid, h, dlist) {
if (any || uid_eq(fid->uid, uid)) {
ret = fid;
- refcount_inc(&ret->count);
+ p9_fid_get(ret);
break;
}
}
@@ -150,9 +158,9 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
{
struct dentry *ds;
const unsigned char **wnames, *uname;
- int i, n, l, clone, access;
+ int i, n, l, access;
struct v9fs_session_info *v9ses;
- struct p9_fid *fid, *old_fid = NULL;
+ struct p9_fid *fid, *root_fid, *old_fid;
v9ses = v9fs_dentry2v9ses(dentry);
access = v9ses->flags & V9FS_ACCESS_MASK;
@@ -169,17 +177,17 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
fid = v9fs_fid_find(ds, uid, any);
if (fid) {
/* Found the parent fid do a lookup with that */
- struct p9_fid *ofid = fid;
+ old_fid = fid;
- fid = p9_client_walk(ofid, 1, &dentry->d_name.name, 1);
- p9_client_clunk(ofid);
+ fid = p9_client_walk(old_fid, 1, &dentry->d_name.name, 1);
+ p9_fid_put(old_fid);
goto fid_out;
}
up_read(&v9ses->rename_sem);
/* start from the root and try to do a lookup */
- fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any);
- if (!fid) {
+ root_fid = v9fs_fid_find(dentry->d_sb->s_root, uid, any);
+ if (!root_fid) {
/* the user is not attached to the fs yet */
if (access == V9FS_ACCESS_SINGLE)
return ERR_PTR(-EPERM);
@@ -194,13 +202,13 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
if (IS_ERR(fid))
return fid;
- v9fs_fid_add(dentry->d_sb->s_root, fid);
+ root_fid = p9_fid_get(fid);
+ v9fs_fid_add(dentry->d_sb->s_root, &fid);
}
/* If we are root ourself just return that */
- if (dentry->d_sb->s_root == dentry) {
- refcount_inc(&fid->count);
- return fid;
- }
+ if (dentry->d_sb->s_root == dentry)
+ return root_fid;
+
/*
* Do a multipath walk with attached root.
* When walking parent we need to make sure we
@@ -212,30 +220,27 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
fid = ERR_PTR(n);
goto err_out;
}
- clone = 1;
+ fid = root_fid;
+ old_fid = root_fid;
i = 0;
while (i < n) {
l = min(n - i, P9_MAXWELEM);
/*
* We need to hold rename lock when doing a multipath
- * walk to ensure none of the patch component change
+ * walk to ensure none of the path components change
*/
- fid = p9_client_walk(fid, l, &wnames[i], clone);
+ fid = p9_client_walk(old_fid, l, &wnames[i],
+ old_fid == root_fid /* clone */);
+ /* non-cloning walk will return the same fid */
+ if (fid != old_fid) {
+ p9_fid_put(old_fid);
+ old_fid = fid;
+ }
if (IS_ERR(fid)) {
- if (old_fid) {
- /*
- * If we fail, clunk fid which are mapping
- * to path component and not the last component
- * of the path.
- */
- p9_client_clunk(old_fid);
- }
kfree(wnames);
goto err_out;
}
- old_fid = fid;
i += l;
- clone = 0;
}
kfree(wnames);
fid_out:
@@ -243,11 +248,11 @@ fid_out:
spin_lock(&dentry->d_lock);
if (d_unhashed(dentry)) {
spin_unlock(&dentry->d_lock);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
fid = ERR_PTR(-ENOENT);
} else {
__add_fid(dentry, fid);
- refcount_inc(&fid->count);
+ p9_fid_get(fid);
spin_unlock(&dentry->d_lock);
}
}
@@ -304,7 +309,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
fid = clone_fid(ofid);
if (IS_ERR(fid))
goto error_out;
- p9_client_clunk(ofid);
+ p9_fid_put(ofid);
/*
* writeback fid will only be used to write back the
* dirty pages. We always request for the open fid in read-write
@@ -313,7 +318,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
*/
err = p9_client_open(fid, O_RDWR);
if (err < 0) {
- p9_client_clunk(fid);
+ p9_fid_put(fid);
fid = ERR_PTR(err);
goto error_out;
}
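
The hunks above are the heart of this series: every p9_client_clunk() and bare refcount_inc(&fid->count) becomes p9_fid_put() and p9_fid_get(). A minimal sketch of the resulting discipline, with a hypothetical helper name; it assumes, as the converted code implies, that p9_fid_get() returns its argument with an extra reference held and that the last p9_fid_put() clunks the fid on the server:

/* Illustrative only; not part of the patch. */
static struct p9_fid *example_walk_one(struct p9_fid *dir,
				       const unsigned char *name)
{
	struct p9_fid *fid;

	/* clone walk (last argument 1): dir itself is left untouched */
	fid = p9_client_walk(dir, 1, &name, 1);
	p9_fid_put(dir);	/* drop our reference; last put clunks */
	return fid;		/* caller owns the new fid, or ERR_PTR */
}

A caller that still needs dir afterwards would take its own reference first with p9_fid_get(dir), exactly as the root_fid handling above does.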
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index f7f33509e169..8a4e8cd12ca2 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -13,9 +13,9 @@ static inline struct p9_fid *v9fs_parent_fid(struct dentry *dentry)
{
return v9fs_fid_lookup(dentry->d_parent);
}
-void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
+void v9fs_fid_add(struct dentry *dentry, struct p9_fid **fid);
struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
-void v9fs_open_fid_add(struct inode *inode, struct p9_fid *fid);
+void v9fs_open_fid_add(struct inode *inode, struct p9_fid **fid);
static inline struct p9_fid *clone_fid(struct p9_fid *fid)
{
return IS_ERR(fid) ? fid : p9_client_walk(fid, 0, NULL, 1);
@@ -29,7 +29,7 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
return fid;
nfid = clone_fid(fid);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return nfid;
}
#endif
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index e28ddf763b3b..0129de2ea31a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -625,7 +625,7 @@ static void v9fs_inode_init_once(void *foo)
struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
memset(&v9inode->qid, 0, sizeof(v9inode->qid));
- inode_init_once(&v9inode->vfs_inode);
+ inode_init_once(&v9inode->netfs.inode);
}
/**
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index ec0e8df3b2eb..6acabc2e7dc9 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -109,11 +109,7 @@ struct v9fs_session_info {
#define V9FS_INO_INVALID_ATTR 0x01
struct v9fs_inode {
- struct {
- /* These must be contiguous */
- struct inode vfs_inode; /* the VFS's inode record */
- struct netfs_i_context netfs_ctx; /* Netfslib context */
- };
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct p9_qid qid;
unsigned int cache_validity;
struct p9_fid *writeback_fid;
@@ -122,13 +118,13 @@ struct v9fs_inode {
static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
{
- return container_of(inode, struct v9fs_inode, vfs_inode);
+ return container_of(inode, struct v9fs_inode, netfs.inode);
}
static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode)
{
#ifdef CONFIG_9P_FSCACHE
- return netfs_i_cookie(&v9inode->vfs_inode);
+ return netfs_i_cookie(&v9inode->netfs);
#else
return NULL;
#endif
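
The v9fs.h hunk tracks the netfs library change that folds the VFS inode and the per-inode netfs context into a single struct netfs_inode, which must sit at the start of the filesystem's inode. A simplified sketch of the layout being relied on (the real definition lives in include/linux/netfs.h and carries more fields; treat this as an abbreviation, not the upstream struct):

/* Abbreviated sketch, not the full upstream definition. */
struct netfs_inode {
	struct inode inode;			/* must be first */
	const struct netfs_request_ops *ops;
#if IS_ENABLED(CONFIG_FSCACHE)
	struct fscache_cookie *cache;
#endif
};

With that layout, &v9inode->netfs.inode is the address the VFS hands back, so the V9FS_I() container_of() above resolves with plain pointer arithmetic, and netfs_i_cookie(&v9inode->netfs) can reach ->cache without a second per-inode allocation.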
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 501128188343..47b9a1122f34 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -58,23 +58,35 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
*/
static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
{
+ struct inode *inode = file_inode(file);
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct p9_fid *fid = file->private_data;
- refcount_inc(&fid->count);
+ BUG_ON(!fid);
+
+ /* we might need to read from a fid that was opened write-only
+ * for read-modify-write of page cache, use the writeback fid
+ * for that */
+ if (rreq->origin == NETFS_READ_FOR_WRITE &&
+ (fid->mode & O_ACCMODE) == O_WRONLY) {
+ fid = v9inode->writeback_fid;
+ BUG_ON(!fid);
+ }
+
+ p9_fid_get(fid);
rreq->netfs_priv = fid;
return 0;
}
/**
- * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_request
- * @mapping: unused mapping of request to cleanup
- * @priv: private data to cleanup, a fid, guaranted non-null.
+ * v9fs_free_request - Cleanup request initialized by v9fs_init_rreq
+ * @rreq: The I/O request to clean up
*/
-static void v9fs_req_cleanup(struct address_space *mapping, void *priv)
+static void v9fs_free_request(struct netfs_io_request *rreq)
{
- struct p9_fid *fid = priv;
+ struct p9_fid *fid = rreq->netfs_priv;
- p9_client_clunk(fid);
+ p9_fid_put(fid);
}
/**
@@ -94,35 +106,34 @@ static int v9fs_begin_cache_operation(struct netfs_io_request *rreq)
const struct netfs_request_ops v9fs_req_ops = {
.init_request = v9fs_init_request,
+ .free_request = v9fs_free_request,
.begin_cache_operation = v9fs_begin_cache_operation,
.issue_read = v9fs_issue_read,
- .cleanup = v9fs_req_cleanup,
};
/**
- * v9fs_release_page - release the private state associated with a page
- * @page: The page to be released
+ * v9fs_release_folio - release the private state associated with a folio
+ * @folio: The folio to be released
* @gfp: The caller's allocation restrictions
*
- * Returns 1 if the page can be released, false otherwise.
+ * Returns true if the page can be released, false otherwise.
*/
-static int v9fs_release_page(struct page *page, gfp_t gfp)
+static bool v9fs_release_folio(struct folio *folio, gfp_t gfp)
{
- struct folio *folio = page_folio(page);
struct inode *inode = folio_inode(folio);
if (folio_test_private(folio))
- return 0;
+ return false;
#ifdef CONFIG_9P_FSCACHE
if (folio_test_fscache(folio)) {
if (current_is_kswapd() || !(gfp & __GFP_FS))
- return 0;
+ return false;
folio_wait_fscache(folio);
}
#endif
fscache_note_page_release(v9fs_inode_cookie(V9FS_I(inode)));
- return 1;
+ return true;
}
static void v9fs_invalidate_folio(struct folio *folio, size_t offset,
@@ -141,7 +152,7 @@ static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
transferred_or_error != -ENOBUFS) {
version = cpu_to_le32(v9inode->qid.version);
fscache_invalidate(v9fs_inode_cookie(v9inode), &version,
- i_size_read(&v9inode->vfs_inode), 0);
+ i_size_read(&v9inode->netfs.inode), 0);
}
}
@@ -260,7 +271,7 @@ v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int flags,
+ loff_t pos, unsigned int len,
struct page **subpagep, void **fsdata)
{
int retval;
@@ -275,7 +286,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
*/
- retval = netfs_write_begin(filp, mapping, pos, len, flags, &folio, fsdata);
+ retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata);
if (retval < 0)
return retval;
@@ -336,13 +347,13 @@ static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio)
#endif
const struct address_space_operations v9fs_addr_operations = {
- .readpage = netfs_readpage,
+ .read_folio = netfs_read_folio,
.readahead = netfs_readahead,
.dirty_folio = v9fs_dirty_folio,
.writepage = v9fs_vfs_writepage,
.write_begin = v9fs_write_begin,
.write_end = v9fs_write_end,
- .releasepage = v9fs_release_page,
+ .release_folio = v9fs_release_folio,
.invalidate_folio = v9fs_invalidate_folio,
.launder_folio = v9fs_launder_folio,
.direct_IO = v9fs_direct_IO,
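
Two related changes land in vfs_addr.c: request teardown moves from a ->cleanup(mapping, priv) callback to a ->free_request(rreq) hook that sees the whole request, and v9fs_init_request() now substitutes the inode's writeback fid when netfs issues a read on behalf of a write (NETFS_READ_FOR_WRITE) through a fid that was opened write-only. Distilled to the reference pairing, under the assumption that netfslib calls ->init_request() once per request and ->free_request() exactly once at teardown:

/* Condensed sketch of the pairing; function names are hypothetical. */
static int example_init_request(struct netfs_io_request *rreq,
				struct file *file)
{
	struct p9_fid *fid = file->private_data;

	rreq->netfs_priv = p9_fid_get(fid);	/* one reference per request */
	return 0;
}

static void example_free_request(struct netfs_io_request *rreq)
{
	p9_fid_put(rreq->netfs_priv);		/* the paired put */
}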
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 1c609e99d280..f89f01734587 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -54,7 +54,7 @@ static void v9fs_dentry_release(struct dentry *dentry)
p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
dentry, dentry);
hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
- p9_client_clunk(hlist_entry(p, struct p9_fid, dlist));
+ p9_fid_put(hlist_entry(p, struct p9_fid, dlist));
dentry->d_fsdata = NULL;
}
@@ -85,7 +85,7 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
retval = v9fs_refresh_inode_dotl(fid, inode);
else
retval = v9fs_refresh_inode(fid, inode);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (retval == -ENOENT)
return 0;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 958680f7f23e..000fbaae9b18 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -218,7 +218,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
spin_lock(&inode->i_lock);
hlist_del(&fid->ilist);
spin_unlock(&inode->i_lock);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
}
if ((filp->f_mode & FMODE_WRITE)) {
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 2573c08f335c..aec43ba83799 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -63,15 +63,16 @@ int v9fs_file_open(struct inode *inode, struct file *file)
err = p9_client_open(fid, omode);
if (err < 0) {
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return err;
}
if ((file->f_flags & O_APPEND) &&
(!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
generic_file_llseek(file, 0, SEEK_END);
+
+ file->private_data = fid;
}
- file->private_data = fid;
mutex_lock(&v9inode->v_mutex);
if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
!v9inode->writeback_fid &&
@@ -95,10 +96,10 @@ int v9fs_file_open(struct inode *inode, struct file *file)
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
fscache_use_cookie(v9fs_inode_cookie(v9inode),
file->f_mode & FMODE_WRITE);
- v9fs_open_fid_add(inode, fid);
+ v9fs_open_fid_add(inode, &fid);
return 0;
out_error:
- p9_client_clunk(file->private_data);
+ p9_fid_put(file->private_data);
file->private_data = NULL;
return err;
}
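
v9fs_open_fid_add() and v9fs_fid_add() now take struct p9_fid ** and NULL the caller's pointer on success, transferring ownership of the reference; this is what lets later error paths in the series call p9_fid_put() unconditionally. A sketch of the convention from the caller's side (hypothetical function, with omode conversion elided; it assumes p9_fid_put() tolerates NULL, as those error paths imply):

/* Illustrative caller, not part of the patch. */
static int example_open(struct inode *inode, struct file *file, int omode)
{
	struct p9_fid *fid = v9fs_fid_lookup(file_dentry(file));
	int err;

	if (IS_ERR(fid))
		return PTR_ERR(fid);

	err = p9_client_open(fid, omode);
	if (err < 0) {
		p9_fid_put(fid);		/* still ours on failure */
		return err;
	}

	file->private_data = fid;
	v9fs_open_fid_add(inode, &fid);		/* consumes the ref; fid == NULL now */
	return 0;
}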
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 55367ecb9442..4d1a4a8d9277 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -234,7 +234,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
v9inode->writeback_fid = NULL;
v9inode->cache_validity = 0;
mutex_init(&v9inode->v_mutex);
- return &v9inode->vfs_inode;
+ return &v9inode->netfs.inode;
}
/**
@@ -252,7 +252,8 @@ void v9fs_free_inode(struct inode *inode)
*/
static void v9fs_set_netfs_context(struct inode *inode)
{
- netfs_i_context_init(inode, &v9fs_req_ops);
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ netfs_inode_init(&v9inode->netfs, &v9fs_req_ops);
}
int v9fs_init_inode(struct v9fs_session_info *v9ses,
@@ -398,10 +399,8 @@ void v9fs_evict_inode(struct inode *inode)
fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false);
/* clunk the fid stashed in writeback_fid */
- if (v9inode->writeback_fid) {
- p9_client_clunk(v9inode->writeback_fid);
- v9inode->writeback_fid = NULL;
- }
+ p9_fid_put(v9inode->writeback_fid);
+ v9inode->writeback_fid = NULL;
}
static int v9fs_test_inode(struct inode *inode, void *data)
@@ -568,7 +567,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
if (v9fs_proto_dotl(v9ses))
retval = p9_client_unlinkat(dfid, dentry->d_name.name,
v9fs_at_to_dotl_flags(flags));
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
if (retval == -EOPNOTSUPP) {
/* Try the one based on path */
v9fid = v9fs_fid_clone(dentry);
@@ -632,14 +631,12 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
- p9_client_clunk(dfid);
- return ERR_PTR(err);
+ goto error;
}
err = p9_client_fcreate(ofid, name, perm, mode, extension);
if (err < 0) {
p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
- p9_client_clunk(dfid);
goto error;
}
@@ -650,8 +647,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS,
"p9_client_walk failed %d\n", err);
- fid = NULL;
- p9_client_clunk(dfid);
goto error;
}
/*
@@ -662,21 +657,17 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS,
"inode creation failed %d\n", err);
- p9_client_clunk(dfid);
goto error;
}
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
}
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
return ofid;
error:
- if (ofid)
- p9_client_clunk(ofid);
-
- if (fid)
- p9_client_clunk(fid);
-
+ p9_fid_put(dfid);
+ p9_fid_put(ofid);
+ p9_fid_put(fid);
return ERR_PTR(err);
}
@@ -707,7 +698,7 @@ v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
return PTR_ERR(fid);
v9fs_invalidate_inode_attr(dir);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return 0;
}
@@ -743,7 +734,7 @@ static int v9fs_vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
}
if (fid)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return err;
}
@@ -784,7 +775,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
*/
name = dentry->d_name.name;
fid = p9_client_walk(dfid, 1, &name, 1);
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
if (fid == ERR_PTR(-ENOENT))
inode = NULL;
else if (IS_ERR(fid))
@@ -803,11 +794,11 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
res = d_splice_alias(inode, dentry);
if (!IS_ERR(fid)) {
if (!res)
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
else if (!IS_ERR(res))
- v9fs_fid_add(res, fid);
+ v9fs_fid_add(res, &fid);
else
- p9_client_clunk(fid);
+ p9_fid_put(fid);
}
return res;
}
@@ -846,7 +837,6 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
v9fs_proto_dotu(v9ses)));
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
- fid = NULL;
goto error;
}
@@ -881,7 +871,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
fscache_use_cookie(v9fs_inode_cookie(v9inode),
file->f_mode & FMODE_WRITE);
- v9fs_open_fid_add(inode, fid);
+ v9fs_open_fid_add(inode, &fid);
file->f_mode |= FMODE_CREATED;
out:
@@ -889,8 +879,7 @@ out:
return err;
error:
- if (fid)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
goto out;
}
@@ -938,9 +927,9 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
struct inode *old_inode;
struct inode *new_inode;
struct v9fs_session_info *v9ses;
- struct p9_fid *oldfid, *dfid;
- struct p9_fid *olddirfid;
- struct p9_fid *newdirfid;
+ struct p9_fid *oldfid = NULL, *dfid = NULL;
+ struct p9_fid *olddirfid = NULL;
+ struct p9_fid *newdirfid = NULL;
struct p9_wstat wstat;
if (flags)
@@ -957,21 +946,22 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
dfid = v9fs_parent_fid(old_dentry);
olddirfid = clone_fid(dfid);
- if (dfid && !IS_ERR(dfid))
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
+ dfid = NULL;
if (IS_ERR(olddirfid)) {
retval = PTR_ERR(olddirfid);
- goto done;
+ goto error;
}
dfid = v9fs_parent_fid(new_dentry);
newdirfid = clone_fid(dfid);
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
+ dfid = NULL;
if (IS_ERR(newdirfid)) {
retval = PTR_ERR(newdirfid);
- goto clunk_olddir;
+ goto error;
}
down_write(&v9ses->rename_sem);
@@ -982,7 +972,7 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
retval = p9_client_rename(oldfid, newdirfid,
new_dentry->d_name.name);
if (retval != -EOPNOTSUPP)
- goto clunk_newdir;
+ goto error_locked;
}
if (old_dentry->d_parent != new_dentry->d_parent) {
/*
@@ -991,14 +981,14 @@ v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n");
retval = -EXDEV;
- goto clunk_newdir;
+ goto error_locked;
}
v9fs_blank_wstat(&wstat);
wstat.muid = v9ses->uname;
wstat.name = new_dentry->d_name.name;
retval = p9_client_wstat(oldfid, &wstat);
-clunk_newdir:
+error_locked:
if (!retval) {
if (new_inode) {
if (S_ISDIR(new_inode->i_mode))
@@ -1019,13 +1009,11 @@ clunk_newdir:
d_move(old_dentry, new_dentry);
}
up_write(&v9ses->rename_sem);
- p9_client_clunk(newdirfid);
-
-clunk_olddir:
- p9_client_clunk(olddirfid);
-done:
- p9_client_clunk(oldfid);
+error:
+ p9_fid_put(newdirfid);
+ p9_fid_put(olddirfid);
+ p9_fid_put(oldfid);
return retval;
}
@@ -1059,7 +1047,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
return PTR_ERR(fid);
st = p9_client_stat(fid);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (IS_ERR(st))
return PTR_ERR(st);
@@ -1135,7 +1123,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns,
retval = p9_client_wstat(fid, &wstat);
if (use_dentry)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (retval < 0)
return retval;
@@ -1250,17 +1238,17 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry,
return ERR_PTR(-ECHILD);
v9ses = v9fs_dentry2v9ses(dentry);
- fid = v9fs_fid_lookup(dentry);
+ if (!v9fs_proto_dotu(v9ses))
+ return ERR_PTR(-EBADF);
+
p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
+ fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return ERR_CAST(fid);
- if (!v9fs_proto_dotu(v9ses))
- return ERR_PTR(-EBADF);
-
st = p9_client_stat(fid);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (IS_ERR(st))
return ERR_CAST(st);
@@ -1307,7 +1295,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
return PTR_ERR(fid);
v9fs_invalidate_inode_attr(dir);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return 0;
}
@@ -1363,7 +1351,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
v9fs_refresh_inode(oldfid, d_inode(old_dentry));
v9fs_invalidate_inode_attr(dir);
}
- p9_client_clunk(oldfid);
+ p9_fid_put(oldfid);
return retval;
}
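
The rename and create hunks above all converge on the same cleanup shape: initialise every fid pointer to NULL, funnel failures to a single label, and put everything unconditionally. This only works because, as the converted error paths assume, p9_fid_put() is a no-op on NULL and on ERR_PTR values. A condensed sketch of the pattern (hypothetical function, actual work elided):

static int example_two_fid_op(struct dentry *d1, struct dentry *d2)
{
	struct p9_fid *fid1 = NULL, *fid2 = NULL;
	int retval;

	fid1 = v9fs_fid_lookup(d1);
	if (IS_ERR(fid1)) {
		retval = PTR_ERR(fid1);
		goto error;		/* fid2 is NULL: its put is a no-op */
	}

	fid2 = v9fs_fid_lookup(d2);
	if (IS_ERR(fid2)) {
		retval = PTR_ERR(fid2);
		goto error;		/* put of an ERR_PTR is also a no-op */
	}

	retval = 0;			/* ... actual work elided ... */
error:
	p9_fid_put(fid2);
	p9_fid_put(fid1);
	return retval;
}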
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index d17502a738a9..5cfa4b4f070f 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -238,7 +238,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
struct inode *inode;
struct p9_fid *fid = NULL;
struct v9fs_inode *v9inode;
- struct p9_fid *dfid, *ofid, *inode_fid;
+ struct p9_fid *dfid = NULL, *ofid = NULL, *inode_fid = NULL;
struct v9fs_session_info *v9ses;
struct posix_acl *pacl = NULL, *dacl = NULL;
struct dentry *res = NULL;
@@ -285,36 +285,34 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (err) {
p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
err);
- goto error;
+ goto out;
}
err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
mode, gid, &qid);
if (err < 0) {
p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
err);
- goto error;
+ goto out;
}
v9fs_invalidate_inode_attr(dir);
/* instantiate inode and assign the unopened fid to the dentry */
fid = p9_client_walk(dfid, 1, &name, 1);
- p9_client_clunk(dfid);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
- fid = NULL;
- goto error;
+ goto out;
}
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
- goto error;
+ goto out;
}
/* Now set the ACL based on the default value */
v9fs_set_create_acl(inode, fid, dacl, pacl);
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
v9inode = V9FS_I(inode);
@@ -333,7 +331,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (IS_ERR(inode_fid)) {
err = PTR_ERR(inode_fid);
mutex_unlock(&v9inode->v_mutex);
- goto err_clunk_old_fid;
+ goto out;
}
v9inode->writeback_fid = (void *) inode_fid;
}
@@ -341,25 +339,20 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
/* Since we are opening a file, assign the open fid to the file */
err = finish_open(file, dentry, generic_file_open);
if (err)
- goto err_clunk_old_fid;
+ goto out;
file->private_data = ofid;
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
fscache_use_cookie(v9fs_inode_cookie(v9inode),
file->f_mode & FMODE_WRITE);
- v9fs_open_fid_add(inode, ofid);
+ v9fs_open_fid_add(inode, &ofid);
file->f_mode |= FMODE_CREATED;
out:
+ p9_fid_put(dfid);
+ p9_fid_put(ofid);
+ p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
dput(res);
return err;
-
-error:
- if (fid)
- p9_client_clunk(fid);
-err_clunk_old_fid:
- if (ofid)
- p9_client_clunk(ofid);
- goto out;
}
/**
@@ -397,7 +390,6 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
- dfid = NULL;
goto error;
}
@@ -419,7 +411,6 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
err);
- fid = NULL;
goto error;
}
@@ -432,10 +423,9 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
err);
goto error;
}
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
v9fs_set_create_acl(inode, fid, dacl, pacl);
d_instantiate(dentry, inode);
- fid = NULL;
err = 0;
} else {
/*
@@ -454,10 +444,9 @@ static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns,
inc_nlink(dir);
v9fs_invalidate_inode_attr(dir);
error:
- if (fid)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
return err;
}
@@ -486,7 +475,7 @@ v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns,
*/
st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (IS_ERR(st))
return PTR_ERR(st);
@@ -600,7 +589,7 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
retval = p9_client_setattr(fid, &p9attr);
if (retval < 0) {
if (use_dentry)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return retval;
}
@@ -616,12 +605,12 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns,
retval = v9fs_acl_chmod(inode, fid);
if (retval < 0) {
if (use_dentry)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return retval;
}
}
if (use_dentry)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return 0;
}
@@ -740,7 +729,6 @@ v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir,
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
err);
- fid = NULL;
goto error;
}
@@ -752,9 +740,8 @@ v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir,
err);
goto error;
}
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
- fid = NULL;
err = 0;
} else {
/* Not in cached mode. No need to populate inode with stat */
@@ -767,10 +754,8 @@ v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir,
}
error:
- if (fid)
- p9_client_clunk(fid);
-
- p9_client_clunk(dfid);
+ p9_fid_put(fid);
+ p9_fid_put(dfid);
return err;
}
@@ -800,14 +785,14 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
oldfid = v9fs_fid_lookup(old_dentry);
if (IS_ERR(oldfid)) {
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
return PTR_ERR(oldfid);
}
err = p9_client_link(dfid, oldfid, dentry->d_name.name);
- p9_client_clunk(dfid);
- p9_client_clunk(oldfid);
+ p9_fid_put(dfid);
+ p9_fid_put(oldfid);
if (err < 0) {
p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
return err;
@@ -823,7 +808,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
return PTR_ERR(fid);
v9fs_refresh_inode_dotl(fid, d_inode(old_dentry));
- p9_client_clunk(fid);
+ p9_fid_put(fid);
}
ihold(d_inode(old_dentry));
d_instantiate(dentry, d_inode(old_dentry));
@@ -863,7 +848,6 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
- dfid = NULL;
goto error;
}
@@ -888,7 +872,6 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
err = PTR_ERR(fid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
err);
- fid = NULL;
goto error;
}
@@ -902,9 +885,8 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
goto error;
}
v9fs_set_create_acl(inode, fid, dacl, pacl);
- v9fs_fid_add(dentry, fid);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
- fid = NULL;
err = 0;
} else {
/*
@@ -920,10 +902,9 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir,
d_instantiate(dentry, inode);
}
error:
- if (fid)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
- p9_client_clunk(dfid);
+ p9_fid_put(dfid);
return err;
}
@@ -953,7 +934,7 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry,
if (IS_ERR(fid))
return ERR_CAST(fid);
retval = p9_client_readlink(fid, &target);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
if (retval)
return ERR_PTR(retval);
set_delayed_call(done, kfree_link, target);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 97e23b4e6982..2d9ee073d12c 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -184,13 +184,13 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
retval = v9fs_get_acl(inode, fid);
if (retval)
goto release_sb;
- v9fs_fid_add(root, fid);
+ v9fs_fid_add(root, &fid);
p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n");
return dget(sb->s_root);
clunk_fid:
- p9_client_clunk(fid);
+ p9_fid_put(fid);
v9fs_session_close(v9ses);
free_session:
kfree(v9ses);
@@ -203,7 +203,7 @@ release_sb:
* attached the fid to dentry so it won't get clunked
* automatically.
*/
- p9_client_clunk(fid);
+ p9_fid_put(fid);
deactivate_locked_super(sb);
return ERR_PTR(retval);
}
@@ -270,7 +270,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
}
res = simple_statfs(dentry, buf);
done:
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return res;
}
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index a824441b95a2..1f9298a4bd42 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -44,7 +44,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
if (err)
retval = err;
}
- p9_client_clunk(attr_fid);
+ p9_fid_put(attr_fid);
return retval;
}
@@ -71,7 +71,7 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
if (IS_ERR(fid))
return PTR_ERR(fid);
ret = v9fs_fid_xattr_get(fid, name, buffer, buffer_size);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return ret;
}
@@ -98,7 +98,7 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
if (IS_ERR(fid))
return PTR_ERR(fid);
ret = v9fs_fid_xattr_set(fid, name, value, value_len, flags);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
return ret;
}
@@ -128,7 +128,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
retval);
else
p9_client_write(fid, 0, &from, &retval);
- err = p9_client_clunk(fid);
+ err = p9_fid_put(fid);
if (!retval && err)
retval = err;
return retval;
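
Worth noting in v9fs_fid_xattr_set() above: p9_fid_put() returns an int because dropping the last reference issues the actual TCLUNK, which can itself fail. The code applies a "first failure wins" rule: the write's status stays in retval, and the put's error is surfaced only when the write succeeded. Condensed, with an illustrative write call:

    int retval = example_write(fid, buf, len);      /* 0 on success or -errno */
    int err = p9_fid_put(fid);                      /* 0, or -errno from the clunk */

    if (!retval && err)
            retval = err;   /* report the clunk failure only if the write was clean */
    return retval;
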
diff --git a/fs/Kconfig b/fs/Kconfig
index 30b751c7f11a..a547307c1ae8 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -245,20 +245,26 @@ config HUGETLBFS
config HUGETLB_PAGE
def_bool HUGETLBFS
-config HUGETLB_PAGE_FREE_VMEMMAP
+#
+# Select this config option from the architecture Kconfig if the architecture
+# wants to enable HugeTLB Vmemmap Optimization (HVO).
+#
+config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+ bool
+
+config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
- depends on X86_64
+ depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
depends on SPARSEMEM_VMEMMAP
-config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON
- bool "Default freeing vmemmap pages of HugeTLB to on"
+config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
+ bool "HugeTLB Vmemmap Optimization (HVO) defaults to on"
default n
- depends on HUGETLB_PAGE_FREE_VMEMMAP
+ depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
help
- When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap
- pages associated with each HugeTLB page is default off. Say Y here
- to enable freeing vmemmap pages of HugeTLB by default. It can then
- be disabled on the command line via hugetlb_free_vmemmap=off.
+ The HugeTLB Vmemmap Optimization (HVO) defaults to off. Say Y here to
+ enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off
+ (boot command line) or hugetlb_optimize_vmemmap (sysctl).
config MEMFD_CREATE
def_bool TMPFS || HUGETLBFS
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 21c6332fa785..f14478643b91 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y if !BINFMT_ELF
- depends on (ARM || (SUPERH && !MMU))
+ depends on ARM || ((M68K || SUPERH) && !MMU)
select ELFCORE
help
ELF FDPIC binaries are based on ELF, but allow the individual load
@@ -142,45 +142,6 @@ config BINFMT_ZFLAT
help
Support FLAT format compressed binaries
-config BINFMT_SHARED_FLAT
- bool "Enable shared FLAT support"
- depends on BINFMT_FLAT
- help
- Support FLAT shared libraries
-
-config HAVE_AOUT
- def_bool n
-
-config BINFMT_AOUT
- tristate "Kernel support for a.out and ECOFF binaries"
- depends on HAVE_AOUT
- help
- A.out (Assembler.OUTput) is a set of formats for libraries and
- executables used in the earliest versions of UNIX. Linux used
- the a.out formats QMAGIC and ZMAGIC until they were replaced
- with the ELF format.
-
- The conversion to ELF started in 1995. This option is primarily
- provided for historical interest and for the benefit of those
- who need to run binaries from that era.
-
- Most people should answer N here. If you think you may have
- occasional use for this format, enable module support above
- and answer M here to compile this support as a module called
- binfmt_aout.
-
- If any crucial components of your system (such as /sbin/init
- or /lib/ld.so) are still in a.out format, you will have to
- say Y here.
-
-config OSF4_COMPAT
- bool "OSF/1 v4 readv/writev compatibility"
- depends on ALPHA && BINFMT_AOUT
- help
- Say Y if you are using OSF/1 binaries (like Netscape and Acrobat)
- with v4 shared libraries freely available from Compaq. If you're
- going to use shared libraries from Tru64 version 5.0 or later, say N.
-
config BINFMT_MISC
tristate "Kernel support for MISC binaries"
help
diff --git a/fs/Makefile b/fs/Makefile
index 208a74e0b00e..4dea17840761 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -34,13 +34,10 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
-obj-$(CONFIG_IO_URING) += io_uring.o
-obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
obj-$(CONFIG_FILE_LOCKING) += locks.o
-obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o
obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 561bc748c04a..ee22278b0cfc 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -38,9 +38,9 @@ static int adfs_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, adfs_get_block, wbc);
}
-static int adfs_readpage(struct file *file, struct page *page)
+static int adfs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, adfs_get_block);
+ return block_read_full_folio(folio, adfs_get_block);
}
static void adfs_write_failed(struct address_space *mapping, loff_t to)
@@ -52,13 +52,13 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to)
}
static int adfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
adfs_get_block,
&ADFS_I(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -75,7 +75,7 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
static const struct address_space_operations adfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = adfs_readpage,
+ .read_folio = adfs_read_folio,
.writepage = adfs_writepage,
.write_begin = adfs_write_begin,
.write_end = generic_write_end,
diff --git a/fs/affs/file.c b/fs/affs/file.c
index b3f81d84ff4c..cefa222f7881 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -375,9 +375,9 @@ static int affs_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, affs_get_block, wbc);
}
-static int affs_readpage(struct file *file, struct page *page)
+static int affs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, affs_get_block);
+ return block_read_full_folio(folio, affs_get_block);
}
static void affs_write_failed(struct address_space *mapping, loff_t to)
@@ -414,13 +414,13 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
static int affs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
affs_get_block,
&AFFS_I(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -455,7 +455,7 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations affs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = affs_readpage,
+ .read_folio = affs_read_folio,
.writepage = affs_writepage,
.write_begin = affs_write_begin,
.write_end = affs_write_end,
@@ -526,7 +526,6 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create)
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
struct buffer_head *bh;
- char *data;
unsigned pos = 0;
u32 bidx, boff, bsize;
u32 tmp;
@@ -545,15 +544,12 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create)
return PTR_ERR(bh);
tmp = min(bsize - boff, to - pos);
BUG_ON(pos + tmp > to || tmp > bsize);
- data = kmap_atomic(page);
- memcpy(data + pos, AFFS_DATA(bh) + boff, tmp);
- kunmap_atomic(data);
+ memcpy_to_page(page, pos, AFFS_DATA(bh) + boff, tmp);
affs_brelse(bh);
bidx++;
pos += tmp;
boff = 0;
}
- flush_dcache_page(page);
return 0;
}
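
memcpy_to_page() above replaces an open-coded kmap_atomic()/memcpy()/kunmap_atomic() triple, and it also explains the vanishing flush_dcache_page() at the end of the loop: the helper flushes after each copy. Roughly what it expands to (a sketch after linux/highmem.h, minus the bounds check):

    static inline void memcpy_to_page_sketch(struct page *page, size_t offset,
                                             const char *from, size_t len)
    {
            char *to = kmap_local_page(page);

            memcpy(to + offset, from, len);
            flush_dcache_page(page);        /* why the explicit flush was dropped */
            kunmap_local(to);
    }
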
@@ -629,8 +625,9 @@ out:
}
static int
-affs_readpage_ofs(struct file *file, struct page *page)
+affs_read_folio_ofs(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
u32 to;
int err;
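
The "struct page *page = &folio->page;" line above is the standard interim bridge for a ->read_folio implementation whose body still thinks in struct page terms. It is only correct while the mapping is limited to order-0 (single-page) folios, which holds here because affs never calls mapping_set_large_folios(). In sketch form, with a hypothetical legacy body:

    static int example_read_folio(struct file *file, struct folio *folio)
    {
            struct page *page = &folio->page;       /* valid for order-0 folios only */

            return example_legacy_readpage_body(file, page);
    }
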
@@ -650,7 +647,7 @@ affs_readpage_ofs(struct file *file, struct page *page)
}
static int affs_write_begin_ofs(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
@@ -670,7 +667,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
}
index = pos >> PAGE_SHIFT;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -837,7 +834,7 @@ err_bh:
const struct address_space_operations affs_aops_ofs = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = affs_readpage_ofs,
+ .read_folio = affs_read_folio_ofs,
//.writepage = affs_writepage_ofs,
.write_begin = affs_write_begin_ofs,
.write_end = affs_write_end_ofs
@@ -887,7 +884,7 @@ affs_truncate(struct inode *inode)
loff_t isize = inode->i_size;
int res;
- res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, 0, &page, &fsdata);
+ res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &page, &fsdata);
if (!res)
res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata);
else
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 4c5f30a83336..58b391446ae1 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -276,7 +276,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
char *vol = match_strdup(&args[0]);
if (!vol)
return 0;
- strlcpy(volume, vol, 32);
+ strscpy(volume, vol, 32);
kfree(vol);
break;
}
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index a7531b26e8f0..31d6446dc166 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -11,8 +11,9 @@
#include "affs.h"
-static int affs_symlink_readpage(struct file *file, struct page *page)
+static int affs_symlink_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct buffer_head *bh;
struct inode *inode = page->mapping->host;
char *link = page_address(page);
@@ -67,7 +68,7 @@ fail:
}
const struct address_space_operations affs_symlink_aops = {
- .readpage = affs_symlink_readpage,
+ .read_folio = affs_symlink_read_folio,
};
const struct inode_operations affs_symlink_inode_operations = {
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 1b4d5809808d..a484fa642808 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -30,7 +30,7 @@ void afs_invalidate_mmap_work(struct work_struct *work)
{
struct afs_vnode *vnode = container_of(work, struct afs_vnode, cb_work);
- unmap_mapping_pages(vnode->vfs_inode.i_mapping, 0, 0, false);
+ unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
}
void afs_server_init_callback_work(struct work_struct *work)
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 07ad744eef77..988c2ac7cece 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -158,7 +158,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->name[i] = tolower(name[i]);
cell->name[i] = 0;
- atomic_set(&cell->ref, 1);
+ refcount_set(&cell->ref, 1);
atomic_set(&cell->active, 0);
INIT_WORK(&cell->manager, afs_manage_cell_work);
cell->volumes = RB_ROOT;
@@ -287,7 +287,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
cell = candidate;
candidate = NULL;
atomic_set(&cell->active, 2);
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 2, afs_cell_trace_insert);
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 2, afs_cell_trace_insert);
rb_link_node_rcu(&cell->net_node, parent, pp);
rb_insert_color(&cell->net_node, &net->cells);
up_write(&net->cells_lock);
@@ -295,7 +295,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
afs_queue_cell(cell, afs_cell_trace_get_queue_new);
wait_for_cell:
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), atomic_read(&cell->active),
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), atomic_read(&cell->active),
afs_cell_trace_wait);
_debug("wait_for_cell");
wait_var_event(&cell->state,
@@ -490,13 +490,13 @@ static void afs_cell_destroy(struct rcu_head *rcu)
{
struct afs_cell *cell = container_of(rcu, struct afs_cell, rcu);
struct afs_net *net = cell->net;
- int u;
+ int r;
_enter("%p{%s}", cell, cell->name);
- u = atomic_read(&cell->ref);
- ASSERTCMP(u, ==, 0);
- trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), afs_cell_trace_free);
+ r = refcount_read(&cell->ref);
+ ASSERTCMP(r, ==, 0);
+ trace_afs_cell(cell->debug_id, r, atomic_read(&cell->active), afs_cell_trace_free);
afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers));
afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias);
@@ -539,13 +539,10 @@ void afs_cells_timer(struct timer_list *timer)
*/
struct afs_cell *afs_get_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
- int u;
+ int r;
- if (atomic_read(&cell->ref) <= 0)
- BUG();
-
- u = atomic_inc_return(&cell->ref);
- trace_afs_cell(cell->debug_id, u, atomic_read(&cell->active), reason);
+ __refcount_inc(&cell->ref, &r);
+ trace_afs_cell(cell->debug_id, r + 1, atomic_read(&cell->active), reason);
return cell;
}
@@ -556,12 +553,14 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
if (cell) {
unsigned int debug_id = cell->debug_id;
- unsigned int u, a;
+ unsigned int a;
+ bool zero;
+ int r;
a = atomic_read(&cell->active);
- u = atomic_dec_return(&cell->ref);
- trace_afs_cell(debug_id, u, a, reason);
- if (u == 0) {
+ zero = __refcount_dec_and_test(&cell->ref, &r);
+ trace_afs_cell(debug_id, r - 1, a, reason);
+ if (zero) {
a = atomic_read(&cell->active);
WARN(a != 0, "Cell active count %u > 0\n", a);
call_rcu(&cell->rcu, afs_cell_destroy);
@@ -574,14 +573,12 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
*/
struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
- int u, a;
-
- if (atomic_read(&cell->ref) <= 0)
- BUG();
+ int r, a;
- u = atomic_read(&cell->ref);
+ r = refcount_read(&cell->ref);
+ WARN_ON(r == 0);
a = atomic_inc_return(&cell->active);
- trace_afs_cell(cell->debug_id, u, a, reason);
+ trace_afs_cell(cell->debug_id, r, a, reason);
return cell;
}
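
One pattern repeats across the afs refcount_t conversion above: the __refcount_*() helpers report the counter's pre-operation value through their int * argument, so the traced "new" count is computed as r + 1 after an increment (and r - 1 after a decrement) rather than re-read from the counter, which could race. Sketched with a stand-in object and tracepoint:

    static struct example_obj *example_get(struct example_obj *obj)
    {
            int r;

            __refcount_inc(&obj->ref, &r);          /* r = value before the inc */
            trace_example(obj->debug_id, r + 1);    /* new count, no re-read */
            return obj;
    }
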
@@ -593,7 +590,7 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
{
unsigned int debug_id;
time64_t now, expire_delay;
- int u, a;
+ int r, a;
if (!cell)
return;
@@ -607,9 +604,9 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
expire_delay = afs_cell_gc_delay;
debug_id = cell->debug_id;
- u = atomic_read(&cell->ref);
+ r = refcount_read(&cell->ref);
a = atomic_dec_return(&cell->active);
- trace_afs_cell(debug_id, u, a, reason);
+ trace_afs_cell(debug_id, r, a, reason);
WARN_ON(a == 0);
if (a == 1)
/* 'cell' may now be garbage collected. */
@@ -621,11 +618,11 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
*/
void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
- int u, a;
+ int r, a;
- u = atomic_read(&cell->ref);
+ r = refcount_read(&cell->ref);
a = atomic_read(&cell->active);
- trace_afs_cell(cell->debug_id, u, a, reason);
+ trace_afs_cell(cell->debug_id, r, a, reason);
}
/*
@@ -739,7 +736,7 @@ again:
active = 1;
if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) {
rb_erase(&cell->net_node, &net->cells);
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref), 0,
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 0,
afs_cell_trace_unuse_delete);
smp_store_release(&cell->state, AFS_CELL_REMOVED);
}
@@ -866,7 +863,7 @@ void afs_manage_cells(struct work_struct *work)
bool sched_cell = false;
active = atomic_read(&cell->active);
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref),
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
active, afs_cell_trace_manage);
ASSERTCMP(active, >=, 1);
@@ -874,7 +871,7 @@ void afs_manage_cells(struct work_struct *work)
if (purging) {
if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) {
active = atomic_dec_return(&cell->active);
- trace_afs_cell(cell->debug_id, atomic_read(&cell->ref),
+ trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
active, afs_cell_trace_unuse_pin);
}
}
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a3f5de28be79..0a090d614e76 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -212,8 +212,8 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
* to maintain cache coherency.
*/
if (call->server) {
- trace_afs_server(call->server,
- atomic_read(&call->server->ref),
+ trace_afs_server(call->server->debug_id,
+ refcount_read(&call->server->ref),
atomic_read(&call->server->active),
afs_server_trace_callback);
afs_break_callbacks(call->server, call->count, call->request);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 932e61e28e5d..230c2d19116d 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -24,9 +24,9 @@ static int afs_readdir(struct file *file, struct dir_context *ctx);
static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
static int afs_d_delete(const struct dentry *dentry);
static void afs_d_iput(struct dentry *dentry, struct inode *inode);
-static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen,
+static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
-static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
+static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
static int afs_create(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl);
@@ -41,7 +41,7 @@ static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags);
-static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags);
+static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags);
static void afs_dir_invalidate_folio(struct folio *folio, size_t offset,
size_t length);
@@ -75,7 +75,7 @@ const struct inode_operations afs_dir_inode_operations = {
const struct address_space_operations afs_dir_aops = {
.dirty_folio = afs_dir_dirty_folio,
- .releasepage = afs_dir_releasepage,
+ .release_folio = afs_dir_release_folio,
.invalidate_folio = afs_dir_invalidate_folio,
};
@@ -109,7 +109,7 @@ struct afs_lookup_cookie {
*/
static void afs_dir_read_cleanup(struct afs_read *req)
{
- struct address_space *mapping = req->vnode->vfs_inode.i_mapping;
+ struct address_space *mapping = req->vnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t last = req->nr_pages - 1;
@@ -153,7 +153,7 @@ static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio,
block = kmap_local_folio(folio, offset);
if (block->hdr.magic != AFS_DIR_MAGIC) {
printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n",
- __func__, dvnode->vfs_inode.i_ino,
+ __func__, dvnode->netfs.inode.i_ino,
pos, offset, size, ntohs(block->hdr.magic));
trace_afs_dir_check_failed(dvnode, pos + offset, i_size);
kunmap_local(block);
@@ -183,7 +183,7 @@ error:
static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
{
union afs_xdr_dir_block *block;
- struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct address_space *mapping = dvnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t last = req->nr_pages - 1;
size_t offset, size;
@@ -217,7 +217,7 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
*/
static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
{
- struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct address_space *mapping = dvnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t last = req->nr_pages - 1;
int ret = 0;
@@ -269,7 +269,7 @@ static int afs_dir_open(struct inode *inode, struct file *file)
static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
__acquires(&dvnode->validate_lock)
{
- struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct address_space *mapping = dvnode->netfs.inode.i_mapping;
struct afs_read *req;
loff_t i_size;
int nr_pages, i;
@@ -287,7 +287,7 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
req->cleanup = afs_dir_read_cleanup;
expand:
- i_size = i_size_read(&dvnode->vfs_inode);
+ i_size = i_size_read(&dvnode->netfs.inode);
if (i_size < 2048) {
ret = afs_bad(dvnode, afs_file_error_dir_small);
goto error;
@@ -305,7 +305,7 @@ expand:
req->actual_len = i_size; /* May change */
req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */
req->data_version = dvnode->status.data_version; /* May change */
- iov_iter_xarray(&req->def_iter, READ, &dvnode->vfs_inode.i_mapping->i_pages,
+ iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages,
0, i_size);
req->iter = &req->def_iter;
@@ -463,8 +463,11 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
}
/* skip if starts before the current position */
- if (offset < curr)
+ if (offset < curr) {
+ if (next > curr)
+ ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
continue;
+ }
/* found the next entry */
if (!dir_emit(ctx, dire->u.name, nlen,
@@ -565,7 +568,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx)
* - if afs_dir_iterate_block() spots this function, it'll pass the FID
* uniquifier through dtype
*/
-static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
+static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
int nlen, loff_t fpos, u64 ino, unsigned dtype)
{
struct afs_lookup_one_cookie *cookie =
@@ -581,16 +584,16 @@ static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name,
if (cookie->name.len != nlen ||
memcmp(cookie->name.name, name, nlen) != 0) {
- _leave(" = 0 [no]");
- return 0;
+ _leave(" = true [keep looking]");
+ return true;
}
cookie->fid.vnode = ino;
cookie->fid.unique = dtype;
cookie->found = 1;
- _leave(" = -1 [found]");
- return -1;
+ _leave(" = false [found]");
+ return false;
}
/*
@@ -633,12 +636,11 @@ static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry,
* - if afs_dir_iterate_block() spots this function, it'll pass the FID
* uniquifier through dtype
*/
-static int afs_lookup_filldir(struct dir_context *ctx, const char *name,
+static bool afs_lookup_filldir(struct dir_context *ctx, const char *name,
int nlen, loff_t fpos, u64 ino, unsigned dtype)
{
struct afs_lookup_cookie *cookie =
container_of(ctx, struct afs_lookup_cookie, ctx);
- int ret;
_enter("{%s,%u},%s,%u,,%llu,%u",
cookie->name.name, cookie->name.len, name, nlen,
@@ -660,12 +662,10 @@ static int afs_lookup_filldir(struct dir_context *ctx, const char *name,
cookie->fids[1].unique = dtype;
cookie->found = 1;
if (cookie->one_only)
- return -1;
+ return false;
}
- ret = cookie->nr_fids >= 50 ? -1 : 0;
- _leave(" = %d", ret);
- return ret;
+ return cookie->nr_fids < 50;
}
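
The two filldir changes above track a VFS-wide interface change: dir_context actors now return bool, where true means "keep iterating" and false means "stop", replacing the old 0 / non-zero int convention (note how -1 became false and 0 became true). A minimal actor in the new style, with an illustrative cookie:

    struct example_cookie {
            struct dir_context ctx;         /* embedded iteration context */
            unsigned int nr_seen;
    };

    static bool example_filldir(struct dir_context *ctx, const char *name,
                                int len, loff_t pos, u64 ino, unsigned int dtype)
    {
            struct example_cookie *c =
                    container_of(ctx, struct example_cookie, ctx);

            /* ... record the entry somewhere ... */
            return ++c->nr_seen < 50;       /* false ends the iteration */
    }
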
/*
@@ -894,7 +894,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
out_op:
if (op->error == 0) {
- inode = &op->file[1].vnode->vfs_inode;
+ inode = &op->file[1].vnode->netfs.inode;
op->file[1].vnode = NULL;
}
@@ -1136,7 +1136,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
afs_stat_v(dir, n_reval);
/* search the directory for this vnode */
- ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key, &dir_version);
+ ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, key, &dir_version);
switch (ret) {
case 0:
/* the filename maps to something */
@@ -1167,7 +1167,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
_debug("%pd: file deleted (uq %u -> %u I:%u)",
dentry, fid.unique,
vnode->fid.unique,
- vnode->vfs_inode.i_generation);
+ vnode->netfs.inode.i_generation);
goto not_found;
}
goto out_valid;
@@ -1365,7 +1365,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
if (d_really_is_positive(dentry)) {
struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
- clear_nlink(&vnode->vfs_inode);
+ clear_nlink(&vnode->netfs.inode);
set_bit(AFS_VNODE_DELETED, &vnode->flags);
clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
@@ -1484,8 +1484,8 @@ static void afs_dir_remove_link(struct afs_operation *op)
/* Already done */
} else if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
write_seqlock(&vnode->cb_lock);
- drop_nlink(&vnode->vfs_inode);
- if (vnode->vfs_inode.i_nlink == 0) {
+ drop_nlink(&vnode->netfs.inode);
+ if (vnode->netfs.inode.i_nlink == 0) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
__afs_break_callback(vnode, afs_cb_break_for_unlink);
}
@@ -1501,7 +1501,7 @@ static void afs_dir_remove_link(struct afs_operation *op)
op->error = ret;
}
- _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, op->error);
+ _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error);
}
static void afs_unlink_success(struct afs_operation *op)
@@ -1677,8 +1677,8 @@ static void afs_link_success(struct afs_operation *op)
afs_update_dentry_version(op, dvp, op->dentry);
if (op->dentry_2->d_parent == op->dentry->d_parent)
afs_update_dentry_version(op, dvp, op->dentry_2);
- ihold(&vp->vnode->vfs_inode);
- d_instantiate(op->dentry, &vp->vnode->vfs_inode);
+ ihold(&vp->vnode->netfs.inode);
+ d_instantiate(op->dentry, &vp->vnode->netfs.inode);
}
static void afs_link_put(struct afs_operation *op)
@@ -2002,9 +2002,8 @@ error:
* Release a directory folio and clean up its private state if it's not busy
* - return true if the folio can now be released, false if not
*/
-static int afs_dir_releasepage(struct page *subpage, gfp_t gfp_flags)
+static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- struct folio *folio = page_folio(subpage);
struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
_enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio));
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index d98e109ecee9..0ab7752d1b75 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -109,7 +109,7 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
*/
static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
{
- struct address_space *mapping = vnode->vfs_inode.i_mapping;
+ struct address_space *mapping = vnode->netfs.inode.i_mapping;
struct folio *folio;
folio = __filemap_get_folio(mapping, index,
@@ -216,7 +216,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
_enter(",,{%d,%s},", name->len, name->name);
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
(i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
@@ -336,7 +336,7 @@ found_space:
if (b < AFS_DIR_BLOCKS_WITH_CTR)
meta->meta.alloc_ctrs[b] -= need_slots;
- inode_inc_iversion_raw(&vnode->vfs_inode);
+ inode_inc_iversion_raw(&vnode->netfs.inode);
afs_stat_v(vnode, n_dir_cr);
_debug("Insert %s in %u[%u]", name->name, b, slot);
@@ -383,7 +383,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
_enter(",,{%d,%s},", name->len, name->name);
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
if (i_size < AFS_DIR_BLOCK_SIZE ||
i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
(i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
@@ -463,7 +463,7 @@ found_dirent:
if (b < AFS_DIR_BLOCKS_WITH_CTR)
meta->meta.alloc_ctrs[b] += need_slots;
- inode_set_iversion_raw(&vnode->vfs_inode, vnode->status.data_version);
+ inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version);
afs_stat_v(vnode, n_dir_rm);
_debug("Remove %s from %u[%u]", name->name, b, slot);
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 45cfd50a9521..bb5807e87fa4 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -131,7 +131,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
goto out;
} while (!d_is_negative(sdentry));
- ihold(&vnode->vfs_inode);
+ ihold(&vnode->netfs.inode);
ret = afs_do_silly_rename(dvnode, vnode, dentry, sdentry, key);
switch (ret) {
@@ -148,7 +148,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
d_drop(sdentry);
}
- iput(&vnode->vfs_inode);
+ iput(&vnode->netfs.inode);
dput(sdentry);
out:
_leave(" = %d", ret);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index f120bcb8bf73..d7d9402ff718 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
/* there shouldn't be an existing inode */
BUG_ON(!(inode->i_state & I_NEW));
- netfs_i_context_init(inode, NULL);
+ netfs_inode_init(&vnode->netfs, NULL);
inode->i_size = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
if (root) {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 26292a110a8f..d1cfb235c4b9 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,10 +19,10 @@
#include "internal.h"
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
-static int afs_symlink_readpage(struct file *file, struct page *page);
+static int afs_symlink_read_folio(struct file *file, struct folio *folio);
static void afs_invalidate_folio(struct folio *folio, size_t offset,
size_t length);
-static int afs_releasepage(struct page *page, gfp_t gfp_flags);
+static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags);
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
static void afs_vm_open(struct vm_area_struct *area);
@@ -50,11 +50,11 @@ const struct inode_operations afs_file_inode_operations = {
};
const struct address_space_operations afs_file_aops = {
- .readpage = netfs_readpage,
+ .read_folio = netfs_read_folio,
.readahead = netfs_readahead,
.dirty_folio = afs_dirty_folio,
.launder_folio = afs_launder_folio,
- .releasepage = afs_releasepage,
+ .release_folio = afs_release_folio,
.invalidate_folio = afs_invalidate_folio,
.write_begin = afs_write_begin,
.write_end = afs_write_end,
@@ -63,8 +63,8 @@ const struct address_space_operations afs_file_aops = {
};
const struct address_space_operations afs_symlink_aops = {
- .readpage = afs_symlink_readpage,
- .releasepage = afs_releasepage,
+ .read_folio = afs_symlink_read_folio,
+ .release_folio = afs_release_folio,
.invalidate_folio = afs_invalidate_folio,
};
@@ -194,7 +194,7 @@ int afs_release(struct inode *inode, struct file *file)
afs_put_wb_key(af->wb);
if ((file->f_mode & FMODE_WRITE)) {
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
afs_set_cache_aux(vnode, &aux);
fscache_unuse_cookie(afs_vnode_cache(vnode), &aux, &i_size);
} else {
@@ -325,18 +325,17 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)
fsreq->iter = &fsreq->def_iter;
iov_iter_xarray(&fsreq->def_iter, READ,
- &fsreq->vnode->vfs_inode.i_mapping->i_pages,
+ &fsreq->vnode->netfs.inode.i_mapping->i_pages,
fsreq->pos, fsreq->len);
afs_fetch_data(fsreq->vnode, fsreq);
afs_put_read(fsreq);
}
-static int afs_symlink_readpage(struct file *file, struct page *page)
+static int afs_symlink_read_folio(struct file *file, struct folio *folio)
{
- struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+ struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host);
struct afs_read *fsreq;
- struct folio *folio = page_folio(page);
int ret;
fsreq = afs_alloc_read(GFP_NOFS);
@@ -347,13 +346,13 @@ static int afs_symlink_readpage(struct file *file, struct page *page)
fsreq->len = folio_size(folio);
fsreq->vnode = vnode;
fsreq->iter = &fsreq->def_iter;
- iov_iter_xarray(&fsreq->def_iter, READ, &page->mapping->i_pages,
+ iov_iter_xarray(&fsreq->def_iter, READ, &folio->mapping->i_pages,
fsreq->pos, fsreq->len);
ret = afs_fetch_data(fsreq->vnode, fsreq);
if (ret == 0)
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return ret;
}
@@ -376,24 +375,24 @@ static int afs_begin_cache_operation(struct netfs_io_request *rreq)
}
static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
- struct folio *folio, void **_fsdata)
+ struct folio **foliop, void **_fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
}
-static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv)
+static void afs_free_request(struct netfs_io_request *rreq)
{
- key_put(netfs_priv);
+ key_put(rreq->netfs_priv);
}
const struct netfs_request_ops afs_req_ops = {
.init_request = afs_init_request,
+ .free_request = afs_free_request,
.begin_cache_operation = afs_begin_cache_operation,
.check_write_begin = afs_check_write_begin,
.issue_read = afs_issue_read,
- .cleanup = afs_priv_cleanup,
};
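
Two netfs interface updates are visible above: check_write_begin() now takes a struct folio ** so the hook may release or replace the folio, and the old ->cleanup(mapping, netfs_priv) hook becomes ->free_request(), which receives the whole request and finds its private data in rreq->netfs_priv. The new hook pairs naturally with ->init_request(); a sketch, where the key-lookup helper is hypothetical:

    static int example_init_request(struct netfs_io_request *rreq, struct file *file)
    {
            rreq->netfs_priv = key_get(example_key_for(file));
            return 0;
    }

    static void example_free_request(struct netfs_io_request *rreq)
    {
            key_put(rreq->netfs_priv);      /* balances the init-time key_get() */
    }
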
int afs_write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -482,16 +481,15 @@ static void afs_invalidate_folio(struct folio *folio, size_t offset,
* release a page and clean up its private state if it's not busy
* - return true if the page can now be released, false if not
*/
-static int afs_releasepage(struct page *page, gfp_t gfp)
+static bool afs_release_folio(struct folio *folio, gfp_t gfp)
{
- struct folio *folio = page_folio(page);
struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
_enter("{{%llx:%llu}[%lu],%lx},%x",
vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags,
gfp);
- /* deny if page is being written to the cache and the caller hasn't
+ /* deny if folio is being written to the cache and the caller hasn't
* elected to wait */
#ifdef CONFIG_AFS_FSCACHE
if (folio_test_fscache(folio)) {
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index c4210a3964d8..bbcc5afd1576 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -76,7 +76,7 @@ void afs_lock_op_done(struct afs_call *call)
if (call->error == 0) {
spin_lock(&vnode->lock);
trace_afs_flock_ev(vnode, NULL, afs_flock_timestamp, 0);
- vnode->locked_at = call->reply_time;
+ vnode->locked_at = call->issue_time;
afs_schedule_lock_extension(vnode);
spin_unlock(&vnode->lock);
}
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index d222dfbe976b..7a3803ce3a22 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -232,14 +232,14 @@ int afs_put_operation(struct afs_operation *op)
if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode)
clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags);
if (op->file[0].put_vnode)
- iput(&op->file[0].vnode->vfs_inode);
+ iput(&op->file[0].vnode->netfs.inode);
if (op->file[1].put_vnode)
- iput(&op->file[1].vnode->vfs_inode);
+ iput(&op->file[1].vnode->netfs.inode);
if (op->more_files) {
for (i = 0; i < op->nr_files - 2; i++)
if (op->more_files[i].put_vnode)
- iput(&op->more_files[i].vnode->vfs_inode);
+ iput(&op->more_files[i].vnode->netfs.inode);
kfree(op->more_files);
}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 4943413d9c5f..7d37f63ef0f0 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -131,7 +131,7 @@ bad:
static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry)
{
- return ktime_divns(call->reply_time, NSEC_PER_SEC) + expiry;
+ return ktime_divns(call->issue_time, NSEC_PER_SEC) + expiry;
}
static void xdr_decode_AFSCallBack(const __be32 **_bp,
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 2fe402483ad5..6d3a3dbe4928 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
*/
static void afs_set_netfs_context(struct afs_vnode *vnode)
{
- netfs_i_context_init(&vnode->vfs_inode, &afs_req_ops);
+ netfs_inode_init(&vnode->netfs, &afs_req_ops);
}
/*
@@ -96,7 +96,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_flags |= S_NOATIME;
inode->i_uid = make_kuid(&init_user_ns, status->owner);
inode->i_gid = make_kgid(&init_user_ns, status->group);
- set_nlink(&vnode->vfs_inode, status->nlink);
+ set_nlink(&vnode->netfs.inode, status->nlink);
switch (status->type) {
case AFS_FTYPE_FILE:
@@ -104,12 +104,14 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_op = &afs_file_inode_operations;
inode->i_fop = &afs_file_operations;
inode->i_mapping->a_ops = &afs_file_aops;
+ mapping_set_large_folios(inode->i_mapping);
break;
case AFS_FTYPE_DIR:
inode->i_mode = S_IFDIR | (status->mode & S_IALLUGO);
inode->i_op = &afs_dir_inode_operations;
inode->i_fop = &afs_dir_file_operations;
inode->i_mapping->a_ops = &afs_dir_aops;
+ mapping_set_large_folios(inode->i_mapping);
break;
case AFS_FTYPE_SYMLINK:
/* Symlinks with a mode of 0644 are actually mountpoints. */
@@ -139,7 +141,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
afs_set_netfs_context(vnode);
vnode->invalid_before = status->data_version;
- inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
+ inode_set_iversion_raw(&vnode->netfs.inode, status->data_version);
if (!vp->scb.have_cb) {
/* it's a symlink we just created (the fileserver
@@ -163,7 +165,7 @@ static void afs_apply_status(struct afs_operation *op,
{
struct afs_file_status *status = &vp->scb.status;
struct afs_vnode *vnode = vp->vnode;
- struct inode *inode = &vnode->vfs_inode;
+ struct inode *inode = &vnode->netfs.inode;
struct timespec64 t;
umode_t mode;
bool data_changed = false;
@@ -246,7 +248,7 @@ static void afs_apply_status(struct afs_operation *op,
* idea of what the size should be that's not the same as
* what's on the server.
*/
- vnode->netfs_ctx.remote_i_size = status->size;
+ vnode->netfs.remote_i_size = status->size;
if (change_size) {
afs_set_i_size(vnode, status->size);
inode->i_ctime = t;
@@ -289,7 +291,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
*/
if (vp->scb.status.abort_code == VNOVNODE) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
- clear_nlink(&vnode->vfs_inode);
+ clear_nlink(&vnode->netfs.inode);
__afs_break_callback(vnode, afs_cb_break_for_deleted);
op->flags &= ~AFS_OPERATION_DIR_CONFLICT;
}
@@ -306,8 +308,8 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
if (vp->scb.have_cb)
afs_apply_callback(op, vp);
} else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) {
- drop_nlink(&vnode->vfs_inode);
- if (vnode->vfs_inode.i_nlink == 0) {
+ drop_nlink(&vnode->netfs.inode);
+ if (vnode->netfs.inode.i_nlink == 0) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
__afs_break_callback(vnode, afs_cb_break_for_deleted);
}
@@ -326,7 +328,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
struct afs_vnode *vnode = vp->vnode;
int ret;
- if (vnode->vfs_inode.i_state & I_NEW) {
+ if (vnode->netfs.inode.i_state & I_NEW) {
ret = afs_inode_init_from_status(op, vp, vnode);
op->error = ret;
if (ret == 0)
@@ -430,7 +432,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
struct afs_vnode_cache_aux aux;
if (vnode->status.type != AFS_FTYPE_FILE) {
- vnode->netfs_ctx.cache = NULL;
+ vnode->netfs.cache = NULL;
return;
}
@@ -457,7 +459,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp)
{
struct afs_vnode_param *dvp = &op->file[0];
- struct super_block *sb = dvp->vnode->vfs_inode.i_sb;
+ struct super_block *sb = dvp->vnode->netfs.inode.i_sb;
struct afs_vnode *vnode;
struct inode *inode;
int ret;
@@ -582,10 +584,10 @@ static void afs_zap_data(struct afs_vnode *vnode)
/* nuke all the non-dirty pages that aren't locked, mapped or being
* written back in a regular file and completely discard the pages in a
* directory or symlink */
- if (S_ISREG(vnode->vfs_inode.i_mode))
- invalidate_remote_inode(&vnode->vfs_inode);
+ if (S_ISREG(vnode->netfs.inode.i_mode))
+ invalidate_remote_inode(&vnode->netfs.inode);
else
- invalidate_inode_pages2(vnode->vfs_inode.i_mapping);
+ invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
}
/*
@@ -683,8 +685,8 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
key_serial(key));
if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
- if (vnode->vfs_inode.i_nlink)
- clear_nlink(&vnode->vfs_inode);
+ if (vnode->netfs.inode.i_nlink)
+ clear_nlink(&vnode->netfs.inode);
goto valid;
}
@@ -740,10 +742,23 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path,
{
struct inode *inode = d_inode(path->dentry);
struct afs_vnode *vnode = AFS_FS_I(inode);
- int seq = 0;
+ struct key *key;
+ int ret, seq = 0;
_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
+ if (vnode->volume &&
+ !(query_flags & AT_STATX_DONT_SYNC) &&
+ !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+ key = afs_request_key(vnode->volume->cell);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+ ret = afs_validate(vnode, key);
+ key_put(key);
+ if (ret < 0)
+ return ret;
+ }
+
do {
read_seqbegin_or_lock(&vnode->cb_lock, &seq);
generic_fillattr(&init_user_ns, inode, stat);
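
The getattr hunk above has a user-visible consequence: stat() on a kAFS file whose callback promise has lapsed now revalidates with the fileserver first, unless the caller passes AT_STATX_DONT_SYNC to ask for cached attributes. An illustrative userspace probe (the path is made up; statx(2) needs _GNU_SOURCE with glibc):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    int main(void)
    {
            struct statx sx;

            /* Cached attributes only: skips the server round-trip that a
             * plain stat may now perform on an unpromised vnode. */
            if (statx(AT_FDCWD, "/afs/example.org/somefile",
                      AT_STATX_DONT_SYNC, STATX_BASIC_STATS, &sx) == -1) {
                    perror("statx");
                    return 1;
            }
            printf("size=%llu\n", (unsigned long long)sx.stx_size);
            return 0;
    }
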
@@ -814,7 +829,7 @@ void afs_evict_inode(struct inode *inode)
static void afs_setattr_success(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
- struct inode *inode = &vp->vnode->vfs_inode;
+ struct inode *inode = &vp->vnode->netfs.inode;
loff_t old_i_size = i_size_read(inode);
op->setattr.old_i_size = old_i_size;
@@ -831,7 +846,7 @@ static void afs_setattr_success(struct afs_operation *op)
static void afs_setattr_edit_file(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
- struct inode *inode = &vp->vnode->vfs_inode;
+ struct inode *inode = &vp->vnode->netfs.inode;
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
loff_t size = op->setattr.attr->ia_size;
@@ -863,7 +878,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | ATTR_TOUCH;
struct afs_operation *op;
struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
- struct inode *inode = &vnode->vfs_inode;
+ struct inode *inode = &vnode->netfs.inode;
loff_t i_size;
int ret;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 7b7ef945dc78..723d162078a3 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -122,7 +122,7 @@ struct afs_call {
};
struct afs_operation *op;
unsigned int server_index;
- atomic_t usage;
+ refcount_t ref;
enum afs_call_state state;
spinlock_t state_lock;
int error; /* error code */
@@ -137,7 +137,6 @@ struct afs_call {
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
bool upgrade; /* T to request service upgrade */
- bool have_reply_time; /* T if have got reply_time */
bool intr; /* T if interruptible */
bool unmarshalling_error; /* T if an unmarshalling error occurred */
u16 service_id; /* Actual service ID (after upgrade) */
@@ -151,7 +150,7 @@ struct afs_call {
} __attribute__((packed));
__be64 tmp64;
};
- ktime_t reply_time; /* Time of first reply packet */
+ ktime_t issue_time; /* Time of issue of operation */
};
struct afs_call_type {
@@ -311,7 +310,7 @@ struct afs_net {
atomic_t n_lookup; /* Number of lookups done */
atomic_t n_reval; /* Number of dentries needing revalidation */
atomic_t n_inval; /* Number of invalidations by the server */
- atomic_t n_relpg; /* Number of invalidations by releasepage */
+ atomic_t n_relpg; /* Number of invalidations by release_folio */
atomic_t n_read_dir; /* Number of directory pages read */
atomic_t n_dir_cr; /* Number of directory entry creation edits */
atomic_t n_dir_rm; /* Number of directory entry removal edits */
@@ -365,7 +364,7 @@ struct afs_cell {
struct hlist_node proc_link; /* /proc cell list link */
time64_t dns_expiry; /* Time AFSDB/SRV record expires */
time64_t last_inactive; /* Time of last drop of usage count */
- atomic_t ref; /* Struct refcount */
+ refcount_t ref; /* Struct refcount */
atomic_t active; /* Active usage counter */
unsigned long flags;
#define AFS_CELL_FL_NO_GC 0 /* The cell was added manually, don't auto-gc */
@@ -410,7 +409,7 @@ struct afs_vlserver {
#define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */
#define AFS_VLSERVER_FL_RESPONDING 3 /* VL server is responding */
rwlock_t lock; /* Lock on addresses */
- atomic_t usage;
+ refcount_t ref;
unsigned int rtt; /* Server's current RTT in uS */
/* Probe state */
@@ -446,7 +445,7 @@ struct afs_vlserver_entry {
struct afs_vlserver_list {
struct rcu_head rcu;
- atomic_t usage;
+ refcount_t ref;
u8 nr_servers;
u8 index; /* Server currently in use */
u8 preferred; /* Preferred server */
@@ -517,7 +516,7 @@ struct afs_server {
#define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */
#define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */
#define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */
- atomic_t ref; /* Object refcount */
+ refcount_t ref; /* Object refcount */
atomic_t active; /* Active user count */
u32 addr_version; /* Address list version */
unsigned int rtt; /* Server's current RTT in uS */
@@ -571,7 +570,7 @@ struct afs_volume {
struct rcu_head rcu;
afs_volid_t vid; /* volume ID */
};
- atomic_t usage;
+ refcount_t ref;
time64_t update_at; /* Time at which to next update */
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
struct rb_node cell_node; /* Link in cell->volumes */
@@ -619,12 +618,7 @@ enum afs_lock_state {
* leak from one inode to another.
*/
struct afs_vnode {
- struct {
- /* These must be contiguous */
- struct inode vfs_inode; /* the VFS's inode record */
- struct netfs_i_context netfs_ctx; /* Netfslib context */
- };
-
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct afs_volume *volume; /* volume on which vnode resides */
struct afs_fid fid; /* the file identifier for this inode */
struct afs_file_status status; /* AFS status info for this file */
@@ -675,7 +669,7 @@ struct afs_vnode {
static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
{
#ifdef CONFIG_AFS_FSCACHE
- return netfs_i_cookie(&vnode->vfs_inode);
+ return netfs_i_cookie(&vnode->netfs);
#else
return NULL;
#endif
@@ -685,7 +679,7 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode,
struct fscache_cookie *cookie)
{
#ifdef CONFIG_AFS_FSCACHE
- vnode->netfs_ctx.cache = cookie;
+ vnode->netfs.cache = cookie;
#endif
}
@@ -892,7 +886,7 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl
afs_set_cache_aux(vnode, &aux);
fscache_invalidate(afs_vnode_cache(vnode), &aux,
- i_size_read(&vnode->vfs_inode), flags);
+ i_size_read(&vnode->netfs.inode), flags);
}
/*
@@ -1217,7 +1211,7 @@ static inline struct afs_net *afs_i2net(struct inode *inode)
static inline struct afs_net *afs_v2net(struct afs_vnode *vnode)
{
- return afs_i2net(&vnode->vfs_inode);
+ return afs_i2net(&vnode->netfs.inode);
}
static inline struct afs_net *afs_sock2net(struct sock *sk)
@@ -1498,14 +1492,14 @@ extern int afs_end_vlserver_operation(struct afs_vl_cursor *);
*/
static inline struct afs_vlserver *afs_get_vlserver(struct afs_vlserver *vlserver)
{
- atomic_inc(&vlserver->usage);
+ refcount_inc(&vlserver->ref);
return vlserver;
}
static inline struct afs_vlserver_list *afs_get_vlserverlist(struct afs_vlserver_list *vllist)
{
if (vllist)
- atomic_inc(&vllist->usage);
+ refcount_inc(&vllist->ref);
return vllist;
}
@@ -1535,7 +1529,7 @@ bool afs_dirty_folio(struct address_space *, struct folio *);
#define afs_dirty_folio filemap_dirty_folio
#endif
extern int afs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata);
extern int afs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
@@ -1593,12 +1587,12 @@ extern void yfs_fs_store_opaque_acl2(struct afs_operation *);
*/
static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
{
- return container_of(inode, struct afs_vnode, vfs_inode);
+ return container_of(inode, struct afs_vnode, netfs.inode);
}
static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
{
- return &vnode->vfs_inode;
+ return &vnode->netfs.inode;
}
/*
@@ -1621,8 +1615,8 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
*/
static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size)
{
- i_size_write(&vnode->vfs_inode, size);
- vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1;
+ i_size_write(&vnode->netfs.inode, size);
+ vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1;
}
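
afs_set_i_size() above carries an easy-to-miss unit conversion: i_blocks counts 512-byte sectors, so ((size + 1023) >> 10) rounds the byte size up to whole KiB and the trailing << 1 converts KiB to sectors. Worked through once:

    /* size = 1500 bytes:
     *   (1500 + 1023) >> 10 = 2523 >> 10 = 2     KiB, rounded up
     *   2 << 1              = 4                  512-byte sectors
     * so i_blocks = 4 and the file accounts as 2 KiB.
     */
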
/*
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 1d1a8debe472..805328ca5428 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -69,6 +69,7 @@ int afs_abort_to_error(u32 abort_code)
/* Unified AFS error table */
case UAEPERM: return -EPERM;
case UAENOENT: return -ENOENT;
+ case UAEAGAIN: return -EAGAIN;
case UAEACCES: return -EACCES;
case UAEBUSY: return -EBUSY;
case UAEEXIST: return -EEXIST;
@@ -163,8 +164,11 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
return;
case -ECONNABORTED:
+ error = afs_abort_to_error(abort_code);
+ fallthrough;
+ case -ENETRESET: /* Responded, but we seem to have changed address */
e->responded = true;
- e->error = afs_abort_to_error(abort_code);
+ e->error = error;
return;
}
}
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index bbb2c210d139..97f50e9fd9eb 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -132,12 +132,6 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
if (IS_ERR(page))
return PTR_ERR(page);
- if (PageError(page)) {
- ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt);
- put_page(page);
- return ret;
- }
-
buf = kmap(page);
ret = -EINVAL;
if (buf[size - 1] == '.')
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index e1b863449296..2a0c83d71565 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -47,7 +47,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
/* display one cell per line on subsequent lines */
seq_printf(m, "%3u %3u %6lld %2u %2u %s\n",
- atomic_read(&cell->ref),
+ refcount_read(&cell->ref),
atomic_read(&cell->active),
cell->dns_expiry - ktime_get_real_seconds(),
vllist ? vllist->nr_servers : 0,
@@ -217,7 +217,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
}
seq_printf(m, "%3d %08llx %s %s\n",
- atomic_read(&vol->usage), vol->vid,
+ refcount_read(&vol->ref), vol->vid,
afs_vol_types[vol->type],
vol->name);
@@ -388,7 +388,7 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
alist = rcu_dereference(server->addresses);
seq_printf(m, "%pU %3d %3d\n",
&server->uuid,
- atomic_read(&server->ref),
+ refcount_read(&server->ref),
atomic_read(&server->active));
seq_printf(m, " - info: fl=%lx rtt=%u brk=%x\n",
server->flags, server->rtt, server->cb_s_break);
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 79e1a5f6701b..a840c3588ebb 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -292,6 +292,10 @@ bool afs_select_fileserver(struct afs_operation *op)
op->error = error;
goto iterate_address;
+ case -ENETRESET:
+ pr_warn("kAFS: Peer reset %s (op=%x)\n",
+ op->type ? op->type->name : "???", op->debug_id);
+ fallthrough;
case -ECONNRESET:
_debug("call reset");
op->error = error;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 23a1a92d64bb..eccc3cd0cb70 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -145,14 +145,14 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
call->type = type;
call->net = net;
call->debug_id = atomic_inc_return(&rxrpc_debug_id);
- atomic_set(&call->usage, 1);
+ refcount_set(&call->ref, 1);
INIT_WORK(&call->async_work, afs_process_async_call);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->state_lock);
call->iter = &call->def_iter;
o = atomic_inc_return(&net->nr_outstanding_calls);
- trace_afs_call(call, afs_call_trace_alloc, 1, o,
+ trace_afs_call(call->debug_id, afs_call_trace_alloc, 1, o,
__builtin_return_address(0));
return call;
}
@@ -163,14 +163,16 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
void afs_put_call(struct afs_call *call)
{
struct afs_net *net = call->net;
- int n = atomic_dec_return(&call->usage);
- int o = atomic_read(&net->nr_outstanding_calls);
+ unsigned int debug_id = call->debug_id;
+ bool zero;
+ int r, o;
- trace_afs_call(call, afs_call_trace_put, n, o,
+ zero = __refcount_dec_and_test(&call->ref, &r);
+ o = atomic_read(&net->nr_outstanding_calls);
+ trace_afs_call(debug_id, afs_call_trace_put, r - 1, o,
__builtin_return_address(0));
- ASSERTCMP(n, >=, 0);
- if (n == 0) {
+ if (zero) {
ASSERT(!work_pending(&call->async_work));
ASSERT(call->type->name != NULL);
@@ -185,7 +187,7 @@ void afs_put_call(struct afs_call *call)
afs_put_addrlist(call->alist);
kfree(call->request);
- trace_afs_call(call, afs_call_trace_free, 0, o,
+ trace_afs_call(call->debug_id, afs_call_trace_free, 0, o,
__builtin_return_address(0));
kfree(call);
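
afs_put_call() above also shows why debug_id is copied to a local before the reference is dropped: once __refcount_dec_and_test() returns, a concurrent putter may already have freed the call, so the trace emitted after the decrement must not dereference it. The same defensive shape, reduced to essentials with illustrative names:

    static void example_put(struct example_obj *obj)
    {
            unsigned int debug_id = obj->debug_id;  /* capture while a ref is held */
            bool zero;
            int r;

            zero = __refcount_dec_and_test(&obj->ref, &r);
            trace_example(debug_id, r - 1);         /* never touches *obj here */
            if (zero)
                    kfree(obj);                     /* we held the last reference */
    }
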
@@ -198,9 +200,11 @@ void afs_put_call(struct afs_call *call)
static struct afs_call *afs_get_call(struct afs_call *call,
enum afs_call_trace why)
{
- int u = atomic_inc_return(&call->usage);
+ int r;
- trace_afs_call(call, why, u,
+ __refcount_inc(&call->ref, &r);
+
+ trace_afs_call(call->debug_id, why, r + 1,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
return call;
@@ -347,6 +351,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
if (call->max_lifespan)
rxrpc_kernel_set_max_life(call->net->socket, rxcall,
call->max_lifespan);
+ call->issue_time = ktime_get_real();
/* send the request */
iov[0].iov_base = call->request;
@@ -497,12 +502,6 @@ static void afs_deliver_to_call(struct afs_call *call)
return;
}
- if (!call->have_reply_time &&
- rxrpc_kernel_get_reply_time(call->net->socket,
- call->rxcall,
- &call->reply_time))
- call->have_reply_time = true;
-
ret = call->type->deliver(call);
state = READ_ONCE(call->state);
if (ret == 0 && call->unmarshalling_error)
@@ -537,6 +536,8 @@ static void afs_deliver_to_call(struct afs_call *call)
case -ENODATA:
case -EBADMSG:
case -EMSGSIZE:
+ case -ENOMEM:
+ case -EFAULT:
abort_code = RXGEN_CC_UNMARSHAL;
if (state != AFS_CALL_CL_AWAIT_REPLY)
abort_code = RXGEN_SS_UNMARSHAL;
@@ -544,7 +545,7 @@ static void afs_deliver_to_call(struct afs_call *call)
abort_code, ret, "KUM");
goto local_abort;
default:
- abort_code = RX_USER_ABORT;
+ abort_code = RX_CALL_DEAD;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
abort_code, ret, "KER");
goto local_abort;
@@ -666,14 +667,13 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
unsigned long call_user_ID)
{
struct afs_call *call = (struct afs_call *)call_user_ID;
- int u;
+ int r;
trace_afs_notify_call(rxcall, call);
call->need_attention = true;
- u = atomic_fetch_add_unless(&call->usage, 1, 0);
- if (u != 0) {
- trace_afs_call(call, afs_call_trace_wake, u + 1,
+ if (__refcount_inc_not_zero(&call->ref, &r)) {
+ trace_afs_call(call->debug_id, afs_call_trace_wake, r + 1,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
@@ -836,7 +836,7 @@ void afs_send_empty_reply(struct afs_call *call)
case -ENOMEM:
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
- RX_USER_ABORT, -ENOMEM, "KOO");
+ RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
fallthrough;
default:
_leave(" [error]");
@@ -878,7 +878,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
if (n == -ENOMEM) {
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
- RX_USER_ABORT, -ENOMEM, "KOO");
+ RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
}
_leave(" [error]");
}
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3c7a8fc4f93f..7c6a63a30394 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -219,8 +219,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
* yet.
*/
size++;
- new = kzalloc(sizeof(struct afs_permits) +
- sizeof(struct afs_permit) * size, GFP_NOFS);
+ new = kzalloc(struct_size(new, permits, size), GFP_NOFS);
if (!new)
goto out_put;
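
struct_size() (from <linux/overflow.h>) computes sizeof(*new) plus size
trailing elements of new->permits in one expression, saturating to SIZE_MAX
on overflow so an oversized count makes kzalloc() fail cleanly instead of
returning an undersized buffer. A sketch of the idiom; the structure here is
illustrative, not the real afs_permits:

    #include <linux/overflow.h>
    #include <linux/slab.h>

    struct perm_set {
            unsigned int nr;
            unsigned long permits[];        /* flexible array member */
    };

    static struct perm_set *perm_set_alloc(unsigned int count)
    {
            struct perm_set *p;

            /* sizeof(*p) + count * sizeof(p->permits[0]), overflow-safe */
            p = kzalloc(struct_size(p, permits, count), GFP_KERNEL);
            if (p)
                    p->nr = count;
            return p;
    }
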
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 6e5b9a19b234..4981baf97835 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -228,7 +228,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
if (!server)
goto enomem;
- atomic_set(&server->ref, 1);
+ refcount_set(&server->ref, 1);
atomic_set(&server->active, 1);
server->debug_id = atomic_inc_return(&afs_server_debug_id);
RCU_INIT_POINTER(server->addresses, alist);
@@ -243,7 +243,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
server->rtt = UINT_MAX;
afs_inc_servers_outstanding(net);
- trace_afs_server(server, 1, 1, afs_server_trace_alloc);
+ trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc);
_leave(" = %p", server);
return server;
@@ -352,9 +352,12 @@ void afs_servers_timer(struct timer_list *timer)
struct afs_server *afs_get_server(struct afs_server *server,
enum afs_server_trace reason)
{
- unsigned int u = atomic_inc_return(&server->ref);
+ unsigned int a;
+ int r;
- trace_afs_server(server, u, atomic_read(&server->active), reason);
+ __refcount_inc(&server->ref, &r);
+ a = atomic_read(&server->active);
+ trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
}
@@ -364,14 +367,14 @@ struct afs_server *afs_get_server(struct afs_server *server,
static struct afs_server *afs_maybe_use_server(struct afs_server *server,
enum afs_server_trace reason)
{
- unsigned int r = atomic_fetch_add_unless(&server->ref, 1, 0);
unsigned int a;
+ int r;
- if (r == 0)
+ if (!__refcount_inc_not_zero(&server->ref, &r))
return NULL;
a = atomic_inc_return(&server->active);
- trace_afs_server(server, r, a, reason);
+ trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
}
@@ -380,10 +383,13 @@ static struct afs_server *afs_maybe_use_server(struct afs_server *server,
*/
struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_trace reason)
{
- unsigned int r = atomic_inc_return(&server->ref);
- unsigned int a = atomic_inc_return(&server->active);
+ unsigned int a;
+ int r;
- trace_afs_server(server, r, a, reason);
+ __refcount_inc(&server->ref, &r);
+ a = atomic_inc_return(&server->active);
+
+ trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
}
@@ -393,14 +399,17 @@ struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_tra
void afs_put_server(struct afs_net *net, struct afs_server *server,
enum afs_server_trace reason)
{
- unsigned int usage;
+ unsigned int a, debug_id = server->debug_id;
+ bool zero;
+ int r;
if (!server)
return;
- usage = atomic_dec_return(&server->ref);
- trace_afs_server(server, usage, atomic_read(&server->active), reason);
- if (unlikely(usage == 0))
+ a = atomic_inc_return(&server->active);
+ zero = __refcount_dec_and_test(&server->ref, &r);
+ trace_afs_server(debug_id, r - 1, a, reason);
+ if (unlikely(zero))
__afs_put_server(net, server);
}
@@ -436,7 +445,7 @@ static void afs_server_rcu(struct rcu_head *rcu)
{
struct afs_server *server = container_of(rcu, struct afs_server, rcu);
- trace_afs_server(server, atomic_read(&server->ref),
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
atomic_read(&server->active), afs_server_trace_free);
afs_put_addrlist(rcu_access_pointer(server->addresses));
kfree(server);
@@ -487,7 +496,7 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
active = atomic_read(&server->active);
if (active == 0) {
- trace_afs_server(server, atomic_read(&server->ref),
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
active, afs_server_trace_gc);
next = rcu_dereference_protected(
server->uuid_next, lockdep_is_held(&net->fs_lock.lock));
@@ -553,7 +562,7 @@ void afs_manage_servers(struct work_struct *work)
_debug("manage %pU %u", &server->uuid, active);
if (purging) {
- trace_afs_server(server, atomic_read(&server->ref),
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
active, afs_server_trace_purging);
if (active != 0)
pr_notice("Can't purge s=%08x\n", server->debug_id);
@@ -633,7 +642,8 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
_enter("");
- trace_afs_server(server, atomic_read(&server->ref), atomic_read(&server->active),
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
+ atomic_read(&server->active),
afs_server_trace_update);
alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid);
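
afs_maybe_use_server() is the lookup half of the same conversion:
atomic_fetch_add_unless(&ref, 1, 0) becomes __refcount_inc_not_zero(), which
takes a reference only if the object hasn't already started dying. This is
the standard pattern for objects found under rcu_read_lock(), where the
final put may race with the lookup. A sketch with illustrative names:

    /* Returns the object with a reference held, or NULL if we raced with
     * the final put. The caller must hold the RCU read lock so that the
     * memory stays valid while the increment is attempted.
     */
    static struct obj *obj_maybe_get(struct obj *o)
    {
            int r;

            if (!__refcount_inc_not_zero(&o->ref, &r))
                    return NULL;
            return o;
    }
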
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 1fea195b0b27..95d713074dc8 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -659,7 +659,7 @@ static void afs_i_init_once(void *_vnode)
struct afs_vnode *vnode = _vnode;
memset(vnode, 0, sizeof(*vnode));
- inode_init_once(&vnode->vfs_inode);
+ inode_init_once(&vnode->netfs.inode);
mutex_init(&vnode->io_lock);
init_rwsem(&vnode->validate_lock);
spin_lock_init(&vnode->wb_lock);
@@ -700,8 +700,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
init_rwsem(&vnode->rmdir_lock);
INIT_WORK(&vnode->cb_work, afs_invalidate_mmap_work);
- _leave(" = %p", &vnode->vfs_inode);
- return &vnode->vfs_inode;
+ _leave(" = %p", &vnode->netfs.inode);
+ return &vnode->netfs.inode;
}
static void afs_free_inode(struct inode *inode)
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
index 38b2ba1d9ec0..acc48216136a 100644
--- a/fs/afs/vl_list.c
+++ b/fs/afs/vl_list.c
@@ -17,7 +17,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
vlserver = kzalloc(struct_size(vlserver, name, name_len + 1),
GFP_KERNEL);
if (vlserver) {
- atomic_set(&vlserver->usage, 1);
+ refcount_set(&vlserver->ref, 1);
rwlock_init(&vlserver->lock);
init_waitqueue_head(&vlserver->probe_wq);
spin_lock_init(&vlserver->probe_lock);
@@ -39,13 +39,9 @@ static void afs_vlserver_rcu(struct rcu_head *rcu)
void afs_put_vlserver(struct afs_net *net, struct afs_vlserver *vlserver)
{
- if (vlserver) {
- unsigned int u = atomic_dec_return(&vlserver->usage);
- //_debug("VL PUT %p{%u}", vlserver, u);
-
- if (u == 0)
- call_rcu(&vlserver->rcu, afs_vlserver_rcu);
- }
+ if (vlserver &&
+ refcount_dec_and_test(&vlserver->ref))
+ call_rcu(&vlserver->rcu, afs_vlserver_rcu);
}
struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
@@ -54,7 +50,7 @@ struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
vllist = kzalloc(struct_size(vllist, servers, nr_servers), GFP_KERNEL);
if (vllist) {
- atomic_set(&vllist->usage, 1);
+ refcount_set(&vllist->ref, 1);
rwlock_init(&vllist->lock);
}
@@ -64,10 +60,7 @@ struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers)
void afs_put_vlserverlist(struct afs_net *net, struct afs_vlserver_list *vllist)
{
if (vllist) {
- unsigned int u = atomic_dec_return(&vllist->usage);
-
- //_debug("VLLS PUT %p{%u}", vllist, u);
- if (u == 0) {
+ if (refcount_dec_and_test(&vllist->ref)) {
int i;
for (i = 0; i < vllist->nr_servers; i++) {
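
With refcount_dec_and_test() the put path collapses to a single conditional
feeding call_rcu(), which defers the free past a grace period so lockless
readers that found the object under rcu_read_lock() never touch freed
memory. A condensed sketch, again with illustrative names:

    #include <linux/rcupdate.h>
    #include <linux/refcount.h>
    #include <linux/slab.h>

    struct obj {
            refcount_t ref;
            struct rcu_head rcu;
    };

    static void obj_rcu_free(struct rcu_head *rcu)
    {
            kfree(container_of(rcu, struct obj, rcu));
    }

    static void obj_put(struct obj *o)
    {
            if (o && refcount_dec_and_test(&o->ref))
                    call_rcu(&o->rcu, obj_rcu_free);
    }
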
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 94a3d247924b..f4937029dcd7 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -9,8 +9,7 @@
#include <linux/slab.h>
#include "internal.h"
-unsigned __read_mostly afs_volume_gc_delay = 10;
-unsigned __read_mostly afs_volume_record_life = 60 * 60;
+static unsigned __read_mostly afs_volume_record_life = 60 * 60;
/*
* Insert a volume into a cell. If there's an existing volume record, that is
@@ -53,7 +52,7 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume)
struct afs_cell *cell = volume->cell;
if (!hlist_unhashed(&volume->proc_link)) {
- trace_afs_volume(volume->vid, atomic_read(&volume->usage),
+ trace_afs_volume(volume->vid, refcount_read(&cell->ref),
afs_volume_trace_remove);
write_seqlock(&cell->volume_lock);
hlist_del_rcu(&volume->proc_link);
@@ -88,7 +87,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
volume->type_force = params->force;
volume->name_len = vldb->name_len;
- atomic_set(&volume->usage, 1);
+ refcount_set(&volume->ref, 1);
INIT_HLIST_NODE(&volume->proc_link);
rwlock_init(&volume->servers_lock);
rwlock_init(&volume->cb_v_break_lock);
@@ -229,7 +228,7 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume)
afs_remove_volume_from_cell(volume);
afs_put_serverlist(net, rcu_access_pointer(volume->servers));
afs_put_cell(volume->cell, afs_cell_trace_put_vol);
- trace_afs_volume(volume->vid, atomic_read(&volume->usage),
+ trace_afs_volume(volume->vid, refcount_read(&volume->ref),
afs_volume_trace_free);
kfree_rcu(volume, rcu);
@@ -243,8 +242,10 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
enum afs_volume_trace reason)
{
if (volume) {
- int u = atomic_inc_return(&volume->usage);
- trace_afs_volume(volume->vid, u, reason);
+ int r;
+
+ __refcount_inc(&volume->ref, &r);
+ trace_afs_volume(volume->vid, r + 1, reason);
}
return volume;
}
@@ -258,9 +259,12 @@ void afs_put_volume(struct afs_net *net, struct afs_volume *volume,
{
if (volume) {
afs_volid_t vid = volume->vid;
- int u = atomic_dec_return(&volume->usage);
- trace_afs_volume(vid, u, reason);
- if (u == 0)
+ bool zero;
+ int r;
+
+ zero = __refcount_dec_and_test(&volume->ref, &r);
+ trace_afs_volume(vid, r - 1, reason);
+ if (zero)
afs_destroy_volume(net, volume);
}
}
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 6bcf1475511b..9ebdd36eaf2f 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -42,7 +42,7 @@ static void afs_folio_start_fscache(bool caching, struct folio *folio)
* prepare to perform part of a write to a page
*/
int afs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **_page, void **fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
@@ -60,7 +60,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
*/
- ret = netfs_write_begin(file, mapping, pos, len, flags, &folio, fsdata);
+ ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata);
if (ret < 0)
return ret;
@@ -91,7 +91,7 @@ try_again:
goto flush_conflicting_write;
}
- *_page = &folio->page;
+ *_page = folio_file_page(folio, pos / PAGE_SIZE);
_leave(" = 0");
return 0;
@@ -146,10 +146,10 @@ int afs_write_end(struct file *file, struct address_space *mapping,
write_end_pos = pos + copied;
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
if (write_end_pos > i_size) {
write_seqlock(&vnode->cb_lock);
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
if (write_end_pos > i_size)
afs_set_i_size(vnode, write_end_pos);
write_sequnlock(&vnode->cb_lock);
@@ -257,7 +257,7 @@ static void afs_redirty_pages(struct writeback_control *wbc,
*/
static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len)
{
- struct address_space *mapping = vnode->vfs_inode.i_mapping;
+ struct address_space *mapping = vnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t end;
@@ -354,7 +354,6 @@ static const struct afs_operation_ops afs_store_data_operation = {
static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos,
bool laundering)
{
- struct netfs_i_context *ictx = &vnode->netfs_ctx;
struct afs_operation *op;
struct afs_wb_key *wbk = NULL;
loff_t size = iov_iter_count(iter);
@@ -385,9 +384,9 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t
op->store.write_iter = iter;
op->store.pos = pos;
op->store.size = size;
- op->store.i_size = max(pos + size, ictx->remote_i_size);
+ op->store.i_size = max(pos + size, vnode->netfs.remote_i_size);
op->store.laundering = laundering;
- op->mtime = vnode->vfs_inode.i_mtime;
+ op->mtime = vnode->netfs.inode.i_mtime;
op->flags |= AFS_OPERATION_UNINTR;
op->ops = &afs_store_data_operation;
@@ -554,7 +553,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
struct iov_iter iter;
unsigned long priv;
unsigned int offset, to, len, max_len;
- loff_t i_size = i_size_read(&vnode->vfs_inode);
+ loff_t i_size = i_size_read(&vnode->netfs.inode);
bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode));
long count = wbc->nr_to_write;
@@ -616,8 +615,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
_debug("write discard %x @%llx [%llx]", len, start, i_size);
/* The dirty region was entirely beyond the EOF. */
- fscache_clear_page_bits(afs_vnode_cache(vnode),
- mapping, start, len, caching);
+ fscache_clear_page_bits(mapping, start, len, caching);
afs_pages_written_back(vnode, start, len);
ret = 0;
}
@@ -637,6 +635,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
case -EKEYEXPIRED:
case -EKEYREJECTED:
case -EKEYREVOKED:
+ case -ENETRESET:
afs_redirty_pages(wbc, mapping, start, len);
mapping_set_error(mapping, ret);
break;
@@ -845,7 +844,7 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
_enter("{%llx:%llu},{%zu},",
vnode->fid.vid, vnode->fid.vnode, count);
- if (IS_SWAPFILE(&vnode->vfs_inode)) {
+ if (IS_SWAPFILE(&vnode->netfs.inode)) {
printk(KERN_INFO
"AFS: Attempt to write to active swap file!\n");
return -EBUSY;
@@ -958,8 +957,8 @@ void afs_prune_wb_keys(struct afs_vnode *vnode)
/* Discard unused keys */
spin_lock(&vnode->wb_lock);
- if (!mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_WRITEBACK) &&
- !mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_DIRTY)) {
+ if (!mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_WRITEBACK) &&
+ !mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_DIRTY)) {
list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) {
if (refcount_read(&wbk->usage) == 1)
list_move(&wbk->vnode_link, &graveyard);
@@ -1034,6 +1033,6 @@ static void afs_write_to_cache(struct afs_vnode *vnode,
bool caching)
{
fscache_write_to_cache(afs_vnode_cache(vnode),
- vnode->vfs_inode.i_mapping, start, len, i_size,
+ vnode->netfs.inode.i_mapping, start, len, i_size,
afs_write_to_cache_done, vnode, caching);
}
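
The vfs_inode to netfs.inode renames throughout this file follow from a
layout change: struct afs_vnode now embeds a struct netfs_inode, whose first
member is the VFS inode, and the fields of the old netfs_i_context (such as
remote_i_size) live inside it. A condensed sketch of the arrangement (the
real structures carry many more fields):

    struct netfs_inode {
            struct inode inode;             /* the VFS inode; must be first */
            loff_t remote_i_size;           /* size of the file on the server */
    };

    struct afs_vnode {
            struct netfs_inode netfs;       /* must be first */
            /* ... afs-specific fields ... */
    };

    /* inode-to-vnode conversion remains a simple container_of() */
    static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
    {
            return container_of(inode, struct afs_vnode, netfs.inode);
    }
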
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index fdc7d675b4b0..11571cca86c1 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -232,8 +232,7 @@ static void xdr_decode_YFSCallBack(const __be32 **_bp,
struct afs_callback *cb = &scb->callback;
ktime_t cb_expiry;
- cb_expiry = call->reply_time;
- cb_expiry = ktime_add(cb_expiry, xdr_to_u64(x->expiration_time) * 100);
+ cb_expiry = ktime_add(call->issue_time, xdr_to_u64(x->expiration_time) * 100);
cb->expires_at = ktime_divns(cb_expiry, NSEC_PER_SEC);
scb->have_cb = true;
*_bp += xdr_size(x);
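
Switching the callback-expiry base from call->reply_time to call->issue_time
(recorded in afs_make_call() just before the request is transmitted, per the
rxrpc.c hunk above) changes which end of the RPC the expiry is measured
from. The issue time always precedes the reply time, so the computed expiry
can only move earlier: the client errs toward revalidating too soon rather
than trusting a callback promise past its real lifetime. A sketch of the
arithmetic, with the tick scaling taken from the decoder above:

    static time64_t cb_expiry_sketch(ktime_t issue_time, u64 expiration)
    {
            /* expiration is in the server's tick units; the * 100 scaling
             * mirrors xdr_decode_YFSCallBack() above
             */
            ktime_t cb_expiry = ktime_add(issue_time, expiration * 100);

            return ktime_divns(cb_expiry, NSEC_PER_SEC);
    }
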
diff --git a/fs/aio.c b/fs/aio.c
index 3c249b938632..606613e9d1f4 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -400,8 +400,8 @@ static const struct file_operations aio_ring_fops = {
};
#if IS_ENABLED(CONFIG_MIGRATION)
-static int aio_migratepage(struct address_space *mapping, struct page *new,
- struct page *old, enum migrate_mode mode)
+static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode)
{
struct kioctx *ctx;
unsigned long flags;
@@ -435,10 +435,10 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
goto out;
}
- idx = old->index;
+ idx = src->index;
if (idx < (pgoff_t)ctx->nr_pages) {
- /* Make sure the old page hasn't already been changed */
- if (ctx->ring_pages[idx] != old)
+ /* Make sure the old folio hasn't already been changed */
+ if (ctx->ring_pages[idx] != &src->page)
rc = -EAGAIN;
} else
rc = -EINVAL;
@@ -447,27 +447,27 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
goto out_unlock;
/* Writeback must be complete */
- BUG_ON(PageWriteback(old));
- get_page(new);
+ BUG_ON(folio_test_writeback(src));
+ folio_get(dst);
- rc = migrate_page_move_mapping(mapping, new, old, 1);
+ rc = folio_migrate_mapping(mapping, dst, src, 1);
if (rc != MIGRATEPAGE_SUCCESS) {
- put_page(new);
+ folio_put(dst);
goto out_unlock;
}
/* Take completion_lock to prevent other writes to the ring buffer
- * while the old page is copied to the new. This prevents new
+ * while the old folio is copied to the new. This prevents new
* events from being lost.
*/
spin_lock_irqsave(&ctx->completion_lock, flags);
- migrate_page_copy(new, old);
- BUG_ON(ctx->ring_pages[idx] != old);
- ctx->ring_pages[idx] = new;
+ folio_migrate_copy(dst, src);
+ BUG_ON(ctx->ring_pages[idx] != &src->page);
+ ctx->ring_pages[idx] = &dst->page;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- /* The old page is no longer accessible. */
- put_page(old);
+ /* The old folio is no longer accessible. */
+ folio_put(src);
out_unlock:
mutex_unlock(&ctx->ring_lock);
@@ -475,13 +475,13 @@ out:
spin_unlock(&mapping->private_lock);
return rc;
}
+#else
+#define aio_migrate_folio NULL
#endif
static const struct address_space_operations aio_ctx_aops = {
.dirty_folio = noop_dirty_folio,
-#if IS_ENABLED(CONFIG_MIGRATION)
- .migratepage = aio_migratepage,
-#endif
+ .migrate_folio = aio_migrate_folio,
};
static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
@@ -1475,7 +1475,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
- req->ki_flags = iocb_flags(req->ki_filp);
+ req->ki_flags = req->ki_filp->f_iocb_flags;
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e0c3e33c4177..24192a7667ed 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -32,7 +32,7 @@ static struct inode *anon_inode_inode;
*/
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
- return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s",
+ return dynamic_dname(buffer, buflen, "anon_inode:%s",
dentry->d_name.name);
}
diff --git a/fs/attr.c b/fs/attr.c
index 66899b6e9bd8..1552a5f23d6b 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -22,7 +22,7 @@
* chown_ok - verify permissions to chown inode
* @mnt_userns: user namespace of the mount @inode was found from
* @inode: inode to check permissions on
- * @uid: uid to chown @inode to
+ * @ia_vfsuid: uid to chown @inode to
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
@@ -31,15 +31,15 @@
* performed on the raw inode simply pass init_user_ns.
*/
static bool chown_ok(struct user_namespace *mnt_userns,
- const struct inode *inode,
- kuid_t uid)
+ const struct inode *inode, vfsuid_t ia_vfsuid)
{
- kuid_t kuid = i_uid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid))
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid()) &&
+ vfsuid_eq(ia_vfsuid, vfsuid))
return true;
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
- if (uid_eq(kuid, INVALID_UID) &&
+ if (!vfsuid_valid(vfsuid) &&
ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
return true;
return false;
@@ -49,7 +49,7 @@ static bool chown_ok(struct user_namespace *mnt_userns,
* chgrp_ok - verify permissions to chgrp inode
* @mnt_userns: user namespace of the mount @inode was found from
* @inode: inode to check permissions on
- * @gid: gid to chown @inode to
+ * @ia_vfsgid: gid to chown @inode to
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
@@ -58,15 +58,19 @@ static bool chown_ok(struct user_namespace *mnt_userns,
* performed on the raw inode simply pass init_user_ns.
*/
static bool chgrp_ok(struct user_namespace *mnt_userns,
- const struct inode *inode, kgid_t gid)
+ const struct inode *inode, vfsgid_t ia_vfsgid)
{
- kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) &&
- (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
- return true;
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) {
+ if (vfsgid_eq(ia_vfsgid, vfsgid))
+ return true;
+ if (vfsgid_in_group_p(ia_vfsgid))
+ return true;
+ }
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
- if (gid_eq(kgid, INVALID_GID) &&
+ if (!vfsgid_valid(vfsgid) &&
ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
return true;
return false;
@@ -114,21 +118,30 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
goto kill_priv;
/* Make sure a caller can chown. */
- if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid))
+ if ((ia_valid & ATTR_UID) &&
+ !chown_ok(mnt_userns, inode, attr->ia_vfsuid))
return -EPERM;
/* Make sure caller can chgrp. */
- if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid))
+ if ((ia_valid & ATTR_GID) &&
+ !chgrp_ok(mnt_userns, inode, attr->ia_vfsgid))
return -EPERM;
/* Make sure a caller can chmod. */
if (ia_valid & ATTR_MODE) {
+ vfsgid_t vfsgid;
+
if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
+
+ if (ia_valid & ATTR_GID)
+ vfsgid = attr->ia_vfsgid;
+ else
+ vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+
/* Also check the setgid bit! */
- if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
- i_gid_into_mnt(mnt_userns, inode)) &&
- !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ if (!vfsgid_in_group_p(vfsgid) &&
+ !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
attr->ia_mode &= ~S_ISGID;
}
@@ -170,6 +183,8 @@ EXPORT_SYMBOL(setattr_prepare);
*/
int inode_newsize_ok(const struct inode *inode, loff_t offset)
{
+ if (offset < 0)
+ return -EINVAL;
if (inode->i_size < offset) {
unsigned long limit;
@@ -205,9 +220,7 @@ EXPORT_SYMBOL(inode_newsize_ok);
* setattr_copy must be called with i_mutex held.
*
* setattr_copy updates the inode's metadata with that specified
- * in attr on idmapped mounts. If file ownership is changed setattr_copy
- * doesn't map ia_uid and ia_gid. It will asssume the caller has already
- * provided the intended values. Necessary permission checks to determine
+ * in attr on idmapped mounts. Necessary permission checks to determine
* whether or not the S_ISGID property needs to be removed are performed with
* the correct idmapped mount permission helpers.
* Noticeably missing is inode size update, which is more complex
@@ -228,10 +241,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
{
unsigned int ia_valid = attr->ia_valid;
- if (ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(mnt_userns, attr, inode);
+ i_gid_update(mnt_userns, attr, inode);
if (ia_valid & ATTR_ATIME)
inode->i_atime = attr->ia_atime;
if (ia_valid & ATTR_MTIME)
@@ -240,8 +251,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
inode->i_ctime = attr->ia_ctime;
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (!in_group_p(kgid) &&
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+ if (!vfsgid_in_group_p(vfsgid) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
inode->i_mode = mode;
@@ -292,9 +303,6 @@ EXPORT_SYMBOL(may_setattr);
* retry. Because breaking a delegation may take a long time, the
* caller should drop the i_mutex before doing so.
*
- * If file ownership is changed notify_change() doesn't map ia_uid and
- * ia_gid. It will asssume the caller has already provided the intended values.
- *
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported. Also, passing NULL is fine for callers holding
@@ -383,23 +391,25 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
* namespace of the superblock.
*/
if (ia_valid & ATTR_UID &&
- !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid))
+ !vfsuid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns,
+ attr->ia_vfsuid))
return -EOVERFLOW;
if (ia_valid & ATTR_GID &&
- !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
+ !vfsgid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns,
+ attr->ia_vfsgid))
return -EOVERFLOW;
/* Don't allow modifications of files with invalid uids or
* gids unless those uids & gids are being made valid.
*/
if (!(ia_valid & ATTR_UID) &&
- !uid_valid(i_uid_into_mnt(mnt_userns, inode)))
+ !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)))
return -EOVERFLOW;
if (!(ia_valid & ATTR_GID) &&
- !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
+ !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode)))
return -EOVERFLOW;
- error = security_inode_setattr(dentry, attr);
+ error = security_inode_setattr(mnt_userns, dentry, attr);
if (error)
return error;
error = try_break_deleg(inode, delegated_inode);
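
The vfsuid_t/vfsgid_t types driving this conversion exist for type safety: a
vfsuid is a kuid that has already been mapped through the mount's idmapping,
and wrapping it in a distinct structure type makes any accidental comparison
of mapped and unmapped ids a compile error rather than a subtle permission
bug. A condensed sketch of the idea (the real definitions in
<linux/mnt_idmapping.h> differ in detail):

    typedef struct {
            uid_t val;
    } vfsuid_t;     /* a uid as seen through the mount's idmapping */

    static inline bool vfsuid_eq(vfsuid_t a, vfsuid_t b)
    {
            return a.val == b.val;
    }

    /* uid_eq(kuid, vfsuid) no longer compiles; the caller must state which
     * representation it means, e.g. vfsuid_eq_kuid() as in chown_ok() above
     */
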
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 918826eaceea..d5a44fa88acf 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -51,8 +51,6 @@ extern struct file_system_type autofs_fs_type;
*/
struct autofs_info {
struct dentry *dentry;
- struct inode *inode;
-
int flags;
struct completion expire_complete;
@@ -148,6 +146,11 @@ static inline int autofs_oz_mode(struct autofs_sb_info *sbi)
task_pgrp(current) == sbi->oz_pgrp);
}
+static inline bool autofs_empty(struct autofs_info *ino)
+{
+ return ino->count < 2;
+}
+
struct inode *autofs_get_inode(struct super_block *, umode_t);
void autofs_free_ino(struct autofs_info *);
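
autofs_empty() works because each autofs_info now carries an explicit count:
autofs_new_ino() (below) starts it at 1 for the object itself, and the
root.c changes later in this series make mkdir/symlink bump only the
parent's count while rmdir/unlink drop it. Emptiness then becomes a single
comparison that is safe even in RCU-walk mode, unlike simple_empty(), which
walks d_subdirs under a spinlock. A worked example of the invariant
(count == 1 + number of children):

    /*
     *   autofs_new_ino() for dir     dir->count == 1   autofs_empty() true
     *   mkdir dir/a                  dir->count == 2   autofs_empty() false
     *   mkdir dir/b                  dir->count == 3   autofs_empty() false
     *   rmdir dir/a; rmdir dir/b     dir->count == 1   autofs_empty() true
     */
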
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index b3fefd6237c3..038b3d2d9f57 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -371,7 +371,7 @@ static struct dentry *should_expire(struct dentry *dentry,
return NULL;
}
- if (simple_empty(dentry))
+ if (autofs_empty(ino))
return NULL;
/* Case 2: tree mount, expire iff entire tree is not busy */
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 9edf243713eb..affa70360b1f 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -20,6 +20,7 @@ struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi)
INIT_LIST_HEAD(&ino->expiring);
ino->last_used = jiffies;
ino->sbi = sbi;
+ ino->count = 1;
}
return ino;
}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 91fe4548c256..ca03c1cae2be 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -10,6 +10,7 @@
#include "autofs_i.h"
+static int autofs_dir_permission(struct user_namespace *, struct inode *, int);
static int autofs_dir_symlink(struct user_namespace *, struct inode *,
struct dentry *, const char *);
static int autofs_dir_unlink(struct inode *, struct dentry *);
@@ -50,6 +51,7 @@ const struct file_operations autofs_dir_operations = {
const struct inode_operations autofs_dir_inode_operations = {
.lookup = autofs_lookup,
+ .permission = autofs_dir_permission,
.unlink = autofs_dir_unlink,
.symlink = autofs_dir_symlink,
.mkdir = autofs_dir_mkdir,
@@ -77,6 +79,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file)
{
struct dentry *dentry = file->f_path.dentry;
struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
@@ -93,7 +96,7 @@ static int autofs_dir_open(struct inode *inode, struct file *file)
* it.
*/
spin_lock(&sbi->lookup_lock);
- if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) {
+ if (!path_is_mountpoint(&file->f_path) && autofs_empty(ino)) {
spin_unlock(&sbi->lookup_lock);
return -ENOENT;
}
@@ -288,9 +291,26 @@ static struct dentry *autofs_mountpoint_changed(struct path *path)
struct dentry *dentry = path->dentry;
struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
- /*
- * If this is an indirect mount the dentry could have gone away
- * as a result of an expire and a new one created.
+ /* If this is an indirect mount the dentry could have gone away
+ * and a new one created.
+ *
+ * This is unusual and I can't now remember the case for which
+ * it was originally added. But an example of how this can
+ * happen is an autofs indirect mount that has the "browse"
+ * option set and also has the "symlink" option in the autofs
+ * map entry. In this case the daemon will remove the browse
+ * directory and create a symlink as the mount, leaving the
+ * struct path stale.
+ *
+ * Another not so obvious case is when a mount in an autofs
+ * indirect mount that uses the "nobrowse" option is being
+ * expired at the same time as a path walk. If the mount has
+ * been umounted, but the mount point directory was seen before
+ * becoming unhashed (during a lockless path walk), then when a
+ * stat family system call is made the mount won't be re-mounted
+ * as it should be. In this case the mount point that's been
+ * removed (by the daemon) will be stale and a new mount point
+ * dentry created.
*/
if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
struct dentry *parent = dentry->d_parent;
@@ -362,7 +382,7 @@ static struct vfsmount *autofs_d_automount(struct path *path)
* the mount never trigger mounts themselves (they have an
* autofs trigger mount mounted on them). But v4 pseudo direct
* mounts do need the leaves to trigger mounts. In this case
- * we have no choice but to use the list_empty() check and
+ * we have no choice but to use the autofs_empty() check and
* require user space behave.
*/
if (sbi->version > 4) {
@@ -371,7 +391,7 @@ static struct vfsmount *autofs_d_automount(struct path *path)
goto done;
}
} else {
- if (!simple_empty(dentry)) {
+ if (!autofs_empty(ino)) {
spin_unlock(&sbi->fs_lock);
goto done;
}
@@ -426,9 +446,8 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
if (rcu_walk) {
/* We don't need fs_lock in rcu_walk mode,
- * just testing 'AUTOFS_INFO_NO_RCU' is enough.
- * simple_empty() takes a spinlock, so leave it
- * to last.
+ * just testing 'AUTOFS_INF_WANT_EXPIRE' is enough.
+ *
* We only return -EISDIR when certain this isn't
* a mount-trap.
*/
@@ -441,9 +460,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
inode = d_inode_rcu(dentry);
if (inode && S_ISLNK(inode->i_mode))
return -EISDIR;
- if (list_empty(&dentry->d_subdirs))
- return 0;
- if (!simple_empty(dentry))
+ if (!autofs_empty(ino))
return -EISDIR;
return 0;
}
@@ -463,7 +480,7 @@ static int autofs_d_manage(const struct path *path, bool rcu_walk)
* we can avoid needless calls ->d_automount() and avoid
* an incorrect ELOOP error return.
*/
- if ((!path_is_mountpoint(path) && !simple_empty(dentry)) ||
+ if ((!path_is_mountpoint(path) && !autofs_empty(ino)) ||
(d_really_is_positive(dentry) && d_is_symlink(dentry)))
status = -EISDIR;
}
@@ -526,11 +543,30 @@ static struct dentry *autofs_lookup(struct inode *dir,
return NULL;
}
+static int autofs_dir_permission(struct user_namespace *mnt_userns,
+ struct inode *inode, int mask)
+{
+ if (mask & MAY_WRITE) {
+ struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
+
+ if (!autofs_oz_mode(sbi))
+ return -EACCES;
+
+ /* autofs_oz_mode() needs to allow path walks when the
+ * autofs mount is catatonic but the state of an autofs
+ * file system needs to be preserved over restarts.
+ */
+ if (sbi->flags & AUTOFS_SBI_CATATONIC)
+ return -EACCES;
+ }
+
+ return generic_permission(mnt_userns, inode, mask);
+}
+
static int autofs_dir_symlink(struct user_namespace *mnt_userns,
struct inode *dir, struct dentry *dentry,
const char *symname)
{
- struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
struct autofs_info *ino = autofs_dentry_ino(dentry);
struct autofs_info *p_ino;
struct inode *inode;
@@ -539,16 +575,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns,
pr_debug("%s <- %pd\n", symname, dentry);
- if (!autofs_oz_mode(sbi))
- return -EACCES;
-
- /* autofs_oz_mode() needs to allow path walks when the
- * autofs mount is catatonic but the state of an autofs
- * file system needs to be preserved over restarts.
- */
- if (sbi->flags & AUTOFS_SBI_CATATONIC)
- return -EACCES;
-
BUG_ON(!ino);
autofs_clean_ino(ino);
@@ -571,7 +597,6 @@ static int autofs_dir_symlink(struct user_namespace *mnt_userns,
d_add(dentry, inode);
dget(dentry);
- ino->count++;
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
@@ -601,17 +626,6 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
struct autofs_info *ino = autofs_dentry_ino(dentry);
struct autofs_info *p_ino;
- if (!autofs_oz_mode(sbi))
- return -EACCES;
-
- /* autofs_oz_mode() needs to allow path walks when the
- * autofs mount is catatonic but the state of an autofs
- * file system needs to be preserved over restarts.
- */
- if (sbi->flags & AUTOFS_SBI_CATATONIC)
- return -EACCES;
-
- ino->count--;
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count--;
dput(ino->dentry);
@@ -683,16 +697,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
pr_debug("dentry %p, removing %pd\n", dentry, dentry);
- if (!autofs_oz_mode(sbi))
- return -EACCES;
-
- /* autofs_oz_mode() needs to allow path walks when the
- * autofs mount is catatonic but the state of an autofs
- * file system needs to be preserved over restarts.
- */
- if (sbi->flags & AUTOFS_SBI_CATATONIC)
- return -EACCES;
-
if (ino->count != 1)
return -ENOTEMPTY;
@@ -704,7 +708,6 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
if (sbi->version < 5)
autofs_clear_leaf_automount_flags(dentry);
- ino->count--;
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count--;
dput(ino->dentry);
@@ -726,16 +729,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns,
struct autofs_info *p_ino;
struct inode *inode;
- if (!autofs_oz_mode(sbi))
- return -EACCES;
-
- /* autofs_oz_mode() needs to allow path walks when the
- * autofs mount is catatonic but the state of an autofs
- * file system needs to be preserved over restarts.
- */
- if (sbi->flags & AUTOFS_SBI_CATATONIC)
- return -EACCES;
-
pr_debug("dentry %p, creating %pd\n", dentry, dentry);
BUG_ON(!ino);
@@ -753,7 +746,6 @@ static int autofs_dir_mkdir(struct user_namespace *mnt_userns,
autofs_set_leaf_automount_flags(dentry);
dget(dentry);
- ino->count++;
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
inc_nlink(dir);
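
Centralizing the daemon-only check in ->permission works because every
directory-modifying syscall passes through inode_permission() on the parent
directory, with MAY_WRITE set, before the corresponding inode operation
(mkdir, rmdir, unlink, symlink) is invoked; one hook thus replaces four
copies of the same test. The general shape of the pattern, with the foo_*
names illustrative:

    static int foo_dir_permission(struct user_namespace *mnt_userns,
                                  struct inode *inode, int mask)
    {
            /* gate write-class access on a filesystem-specific policy,
             * then fall through to normal mode-bit checking
             */
            if ((mask & MAY_WRITE) && !foo_caller_may_modify(inode))
                    return -EACCES;

            return generic_permission(mnt_userns, inode, mask);
    }
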
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b4b3567ac655..32749fcee090 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -40,7 +40,7 @@ MODULE_LICENSE("GPL");
static int befs_readdir(struct file *, struct dir_context *);
static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
-static int befs_readpage(struct file *file, struct page *page);
+static int befs_read_folio(struct file *file, struct folio *folio);
static sector_t befs_bmap(struct address_space *mapping, sector_t block);
static struct dentry *befs_lookup(struct inode *, struct dentry *,
unsigned int);
@@ -48,7 +48,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long);
static struct inode *befs_alloc_inode(struct super_block *sb);
static void befs_free_inode(struct inode *inode);
static void befs_destroy_inodecache(void);
-static int befs_symlink_readpage(struct file *, struct page *);
+static int befs_symlink_read_folio(struct file *, struct folio *);
static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
char **out, int *out_len);
static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
@@ -87,12 +87,12 @@ static const struct inode_operations befs_dir_inode_operations = {
};
static const struct address_space_operations befs_aops = {
- .readpage = befs_readpage,
+ .read_folio = befs_read_folio,
.bmap = befs_bmap,
};
static const struct address_space_operations befs_symlink_aops = {
- .readpage = befs_symlink_readpage,
+ .read_folio = befs_symlink_read_folio,
};
static const struct export_operations befs_export_operations = {
@@ -102,16 +102,15 @@ static const struct export_operations befs_export_operations = {
};
/*
- * Called by generic_file_read() to read a page of data
+ * Called by generic_file_read() to read a folio of data
*
* In turn, simply calls a generic block read function and
* passes it the address of befs_get_block, for mapping file
* positions to disk blocks.
*/
-static int
-befs_readpage(struct file *file, struct page *page)
+static int befs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, befs_get_block);
+ return block_read_full_folio(folio, befs_get_block);
}
static sector_t
@@ -468,14 +467,14 @@ befs_destroy_inodecache(void)
* The data stream becomes the link name, unless the LONG_SYMLINK
* flag is set.
*/
-static int befs_symlink_readpage(struct file *unused, struct page *page)
+static int befs_symlink_read_folio(struct file *unused, struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct super_block *sb = inode->i_sb;
struct befs_inode_info *befs_ino = BEFS_I(inode);
befs_data_stream *data = &befs_ino->i_data.ds;
befs_off_t len = data->size;
- char *link = page_address(page);
+ char *link = folio_address(folio);
if (len == 0 || len > PAGE_SIZE) {
befs_error(sb, "Long symlink with illegal length");
@@ -488,12 +487,12 @@ static int befs_symlink_readpage(struct file *unused, struct page *page)
goto fail;
}
link[len - 1] = '\0';
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
fail:
- SetPageError(page);
- unlock_page(page);
+ folio_set_error(folio);
+ folio_unlock(folio);
return -EIO;
}
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 03139344568f..57ae5ee6deec 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -155,9 +155,9 @@ static int bfs_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, bfs_get_block, wbc);
}
-static int bfs_readpage(struct file *file, struct page *page)
+static int bfs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, bfs_get_block);
+ return block_read_full_folio(folio, bfs_get_block);
}
static void bfs_write_failed(struct address_space *mapping, loff_t to)
@@ -169,13 +169,12 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to)
}
static int bfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep,
- bfs_get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, bfs_get_block);
if (unlikely(ret))
bfs_write_failed(mapping, pos + len);
@@ -190,7 +189,7 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations bfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = bfs_readpage,
+ .read_folio = bfs_read_folio,
.writepage = bfs_writepage,
.write_begin = bfs_write_begin,
.write_end = generic_write_end,
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
deleted file mode 100644
index 0dcfc691e7e2..000000000000
--- a/fs/binfmt_aout.c
+++ /dev/null
@@ -1,342 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/fs/binfmt_aout.c
- *
- * Copyright (C) 1991, 1992, 1996 Linus Torvalds
- */
-
-#include <linux/module.h>
-
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/a.out.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/binfmts.h>
-#include <linux/personality.h>
-#include <linux/init.h>
-#include <linux/coredump.h>
-#include <linux/slab.h>
-#include <linux/sched/task_stack.h>
-
-#include <linux/uaccess.h>
-#include <asm/cacheflush.h>
-
-static int load_aout_binary(struct linux_binprm *);
-static int load_aout_library(struct file*);
-
-static struct linux_binfmt aout_format = {
- .module = THIS_MODULE,
- .load_binary = load_aout_binary,
- .load_shlib = load_aout_library,
-};
-
-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
-
-static int set_brk(unsigned long start, unsigned long end)
-{
- start = PAGE_ALIGN(start);
- end = PAGE_ALIGN(end);
- if (end > start)
- return vm_brk(start, end - start);
- return 0;
-}
-
-/*
- * create_aout_tables() parses the env- and arg-strings in new user
- * memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
- */
-static unsigned long __user *create_aout_tables(char __user *p, struct linux_binprm * bprm)
-{
- char __user * __user *argv;
- char __user * __user *envp;
- unsigned long __user *sp;
- int argc = bprm->argc;
- int envc = bprm->envc;
-
- sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p);
-#ifdef __alpha__
-/* whee.. test-programs are so much fun. */
- put_user(0, --sp);
- put_user(0, --sp);
- if (bprm->loader) {
- put_user(0, --sp);
- put_user(1003, --sp);
- put_user(bprm->loader, --sp);
- put_user(1002, --sp);
- }
- put_user(bprm->exec, --sp);
- put_user(1001, --sp);
-#endif
- sp -= envc+1;
- envp = (char __user * __user *) sp;
- sp -= argc+1;
- argv = (char __user * __user *) sp;
-#ifndef __alpha__
- put_user((unsigned long) envp,--sp);
- put_user((unsigned long) argv,--sp);
-#endif
- put_user(argc,--sp);
- current->mm->arg_start = (unsigned long) p;
- while (argc-->0) {
- char c;
- put_user(p,argv++);
- do {
- get_user(c,p++);
- } while (c);
- }
- put_user(NULL,argv);
- current->mm->arg_end = current->mm->env_start = (unsigned long) p;
- while (envc-->0) {
- char c;
- put_user(p,envp++);
- do {
- get_user(c,p++);
- } while (c);
- }
- put_user(NULL,envp);
- current->mm->env_end = (unsigned long) p;
- return sp;
-}
-
-/*
- * These are the functions used to load a.out style executables and shared
- * libraries. There is no binary dependent code anywhere else.
- */
-
-static int load_aout_binary(struct linux_binprm * bprm)
-{
- struct pt_regs *regs = current_pt_regs();
- struct exec ex;
- unsigned long error;
- unsigned long fd_offset;
- unsigned long rlim;
- int retval;
-
- ex = *((struct exec *) bprm->buf); /* exec-header */
- if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
- N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
- N_TRSIZE(ex) || N_DRSIZE(ex) ||
- i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
- return -ENOEXEC;
- }
-
- /*
- * Requires a mmap handler. This prevents people from using a.out
- * as part of an exploit attack against /proc-related vulnerabilities.
- */
- if (!bprm->file->f_op->mmap)
- return -ENOEXEC;
-
- fd_offset = N_TXTOFF(ex);
-
- /* Check initial limits. This avoids letting people circumvent
- * size limits imposed on them by creating programs with large
- * arrays in the data or bss.
- */
- rlim = rlimit(RLIMIT_DATA);
- if (rlim >= RLIM_INFINITY)
- rlim = ~0;
- if (ex.a_data + ex.a_bss > rlim)
- return -ENOMEM;
-
- /* Flush all traces of the currently running executable */
- retval = begin_new_exec(bprm);
- if (retval)
- return retval;
-
- /* OK, This is the point of no return */
-#ifdef __alpha__
- SET_AOUT_PERSONALITY(bprm, ex);
-#else
- set_personality(PER_LINUX);
-#endif
- setup_new_exec(bprm);
-
- current->mm->end_code = ex.a_text +
- (current->mm->start_code = N_TXTADDR(ex));
- current->mm->end_data = ex.a_data +
- (current->mm->start_data = N_DATADDR(ex));
- current->mm->brk = ex.a_bss +
- (current->mm->start_brk = N_BSSADDR(ex));
-
- retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
- if (retval < 0)
- return retval;
-
-
- if (N_MAGIC(ex) == OMAGIC) {
- unsigned long text_addr, map_size;
- loff_t pos;
-
- text_addr = N_TXTADDR(ex);
-
-#ifdef __alpha__
- pos = fd_offset;
- map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1;
-#else
- pos = 32;
- map_size = ex.a_text+ex.a_data;
-#endif
- error = vm_brk(text_addr & PAGE_MASK, map_size);
- if (error)
- return error;
-
- error = read_code(bprm->file, text_addr, pos,
- ex.a_text+ex.a_data);
- if ((signed long)error < 0)
- return error;
- } else {
- if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
- (N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
- {
- printk(KERN_NOTICE "executable not page aligned\n");
- }
-
- if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit())
- {
- printk(KERN_WARNING
- "fd_offset is not page aligned. Please convert program: %pD\n",
- bprm->file);
- }
-
- if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
- error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
- if (error)
- return error;
-
- read_code(bprm->file, N_TXTADDR(ex), fd_offset,
- ex.a_text + ex.a_data);
- goto beyond_if;
- }
-
- error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
- PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE,
- fd_offset);
-
- if (error != N_TXTADDR(ex))
- return error;
-
- error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE,
- fd_offset + ex.a_text);
- if (error != N_DATADDR(ex))
- return error;
- }
-beyond_if:
- set_binfmt(&aout_format);
-
- retval = set_brk(current->mm->start_brk, current->mm->brk);
- if (retval < 0)
- return retval;
-
- current->mm->start_stack =
- (unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
-#ifdef __alpha__
- regs->gp = ex.a_gpvalue;
-#endif
- finalize_exec(bprm);
- start_thread(regs, ex.a_entry, current->mm->start_stack);
- return 0;
-}
-
-static int load_aout_library(struct file *file)
-{
- struct inode * inode;
- unsigned long bss, start_addr, len;
- unsigned long error;
- int retval;
- struct exec ex;
- loff_t pos = 0;
-
- inode = file_inode(file);
-
- retval = -ENOEXEC;
- error = kernel_read(file, &ex, sizeof(ex), &pos);
- if (error != sizeof(ex))
- goto out;
-
- /* We come in here for the regular a.out style of shared libraries */
- if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
- N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
- i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
- goto out;
- }
-
- /*
- * Requires a mmap handler. This prevents people from using a.out
- * as part of an exploit attack against /proc-related vulnerabilities.
- */
- if (!file->f_op->mmap)
- goto out;
-
- if (N_FLAGS(ex))
- goto out;
-
- /* For QMAGIC, the starting address is 0x20 into the page. We mask
- this off to get the starting address for the page */
-
- start_addr = ex.a_entry & 0xfffff000;
-
- if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
- if (printk_ratelimit())
- {
- printk(KERN_WARNING
- "N_TXTOFF is not page aligned. Please convert library: %pD\n",
- file);
- }
- retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
- if (retval)
- goto out;
-
- read_code(file, start_addr, N_TXTOFF(ex),
- ex.a_text + ex.a_data);
- retval = 0;
- goto out;
- }
- /* Now use mmap to map the library into memory. */
- error = vm_mmap(file, start_addr, ex.a_text + ex.a_data,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE,
- N_TXTOFF(ex));
- retval = error;
- if (error != start_addr)
- goto out;
-
- len = PAGE_ALIGN(ex.a_text + ex.a_data);
- bss = ex.a_text + ex.a_data + ex.a_bss;
- if (bss > len) {
- retval = vm_brk(start_addr + len, bss - len);
- if (retval)
- goto out;
- }
- retval = 0;
-out:
- return retval;
-}
-
-static int __init init_aout_binfmt(void)
-{
- register_binfmt(&aout_format);
- return 0;
-}
-
-static void __exit exit_aout_binfmt(void)
-{
- unregister_binfmt(&aout_format);
-}
-
-core_initcall(init_aout_binfmt);
-module_exit(exit_aout_binfmt);
-MODULE_LICENSE("GPL");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6556e13ed95f..63c7ebb0da89 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1117,11 +1117,11 @@ out_free_interp:
* independently randomized mmap region (0 load_bias
* without MAP_FIXED nor MAP_FIXED_NOREPLACE).
*/
- alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
- if (interpreter || alignment > ELF_MIN_ALIGN) {
+ if (interpreter) {
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
+ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
if (alignment)
load_bias &= ~(alignment - 1);
elf_flags |= MAP_FIXED_NOREPLACE;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 626898150011..c26545d71d39 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -37,7 +37,6 @@
#include <linux/flat.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
-#include <linux/coredump.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>
@@ -69,11 +68,7 @@
#define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */
#define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */
-#ifdef CONFIG_BINFMT_SHARED_FLAT
-#define MAX_SHARED_LIBS (4)
-#else
-#define MAX_SHARED_LIBS (1)
-#endif
+#define MAX_SHARED_LIBS (1)
#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET
#define DATA_START_OFFSET_WORDS (0)
@@ -93,38 +88,13 @@ struct lib_info {
} lib_list[MAX_SHARED_LIBS];
};
-#ifdef CONFIG_BINFMT_SHARED_FLAT
-static int load_flat_shared_library(int id, struct lib_info *p);
-#endif
-
static int load_flat_binary(struct linux_binprm *);
-#ifdef CONFIG_COREDUMP
-static int flat_core_dump(struct coredump_params *cprm);
-#endif
static struct linux_binfmt flat_format = {
.module = THIS_MODULE,
.load_binary = load_flat_binary,
-#ifdef CONFIG_COREDUMP
- .core_dump = flat_core_dump,
- .min_coredump = PAGE_SIZE
-#endif
};
-/****************************************************************************/
-/*
- * Routine writes a core dump image in the current directory.
- * Currently only a stub-function.
- */
-
-#ifdef CONFIG_COREDUMP
-static int flat_core_dump(struct coredump_params *cprm)
-{
- pr_warn("Process %s:%d received signr %d and should have core dumped\n",
- current->comm, current->pid, cprm->siginfo->si_signo);
- return 1;
-}
-#endif
/****************************************************************************/
/*
@@ -329,51 +299,18 @@ out_free:
/****************************************************************************/
static unsigned long
-calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
+calc_reloc(unsigned long r, struct lib_info *p)
{
unsigned long addr;
- int id;
unsigned long start_brk;
unsigned long start_data;
unsigned long text_len;
unsigned long start_code;
-#ifdef CONFIG_BINFMT_SHARED_FLAT
- if (r == 0)
- id = curid; /* Relocs of 0 are always self referring */
- else {
- id = (r >> 24) & 0xff; /* Find ID for this reloc */
- r &= 0x00ffffff; /* Trim ID off here */
- }
- if (id >= MAX_SHARED_LIBS) {
- pr_err("reference 0x%lx to shared library %d", r, id);
- goto failed;
- }
- if (curid != id) {
- if (internalp) {
- pr_err("reloc address 0x%lx not in same module "
- "(%d != %d)", r, curid, id);
- goto failed;
- } else if (!p->lib_list[id].loaded &&
- load_flat_shared_library(id, p) < 0) {
- pr_err("failed to load library %d", id);
- goto failed;
- }
- /* Check versioning information (i.e. time stamps) */
- if (p->lib_list[id].build_date && p->lib_list[curid].build_date &&
- p->lib_list[curid].build_date < p->lib_list[id].build_date) {
- pr_err("library %d is younger than %d", id, curid);
- goto failed;
- }
- }
-#else
- id = 0;
-#endif
-
- start_brk = p->lib_list[id].start_brk;
- start_data = p->lib_list[id].start_data;
- start_code = p->lib_list[id].start_code;
- text_len = p->lib_list[id].text_len;
+ start_brk = p->lib_list[0].start_brk;
+ start_data = p->lib_list[0].start_data;
+ start_code = p->lib_list[0].start_code;
+ text_len = p->lib_list[0].text_len;
if (r > start_brk - start_data + text_len) {
pr_err("reloc outside program 0x%lx (0 - 0x%lx/0x%lx)",
@@ -440,8 +377,32 @@ static void old_reloc(unsigned long rl)
/****************************************************************************/
+static inline u32 __user *skip_got_header(u32 __user *rp)
+{
+ if (IS_ENABLED(CONFIG_RISCV)) {
+ /*
+ * RISC-V has a 16 byte GOT PLT header for elf64-riscv
+ * and 8 byte GOT PLT header for elf32-riscv.
+ * Skip the whole GOT PLT header, since it is reserved
+ * for the dynamic linker (ld.so).
+ */
+ u32 rp_val0, rp_val1;
+
+ if (get_user(rp_val0, rp))
+ return rp;
+ if (get_user(rp_val1, rp + 1))
+ return rp;
+
+ if (rp_val0 == 0xffffffff && rp_val1 == 0xffffffff)
+ rp += 4;
+ else if (rp_val0 == 0xffffffff)
+ rp += 2;
+ }
+ return rp;
+}
+
static int load_flat_file(struct linux_binprm *bprm,
- struct lib_info *libinfo, int id, unsigned long *extra_stack)
+ struct lib_info *libinfo, unsigned long *extra_stack)
{
struct flat_hdr *hdr;
unsigned long textpos, datapos, realdatastart;
@@ -493,14 +454,6 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
- /* Don't allow old format executables to use shared libraries */
- if (rev == OLD_FLAT_VERSION && id != 0) {
- pr_err("shared libraries are not available before rev 0x%lx\n",
- FLAT_VERSION);
- ret = -ENOEXEC;
- goto err;
- }
-
/*
* fix up the flags for the older format, there were all kinds
* of endian hacks, this only works for the simple cases
@@ -551,15 +504,13 @@ static int load_flat_file(struct linux_binprm *bprm,
}
/* Flush all traces of the currently running executable */
- if (id == 0) {
- ret = begin_new_exec(bprm);
- if (ret)
- goto err;
+ ret = begin_new_exec(bprm);
+ if (ret)
+ goto err;
- /* OK, This is the point of no return */
- set_personality(PER_LINUX_32BIT);
- setup_new_exec(bprm);
- }
+ /* OK, This is the point of no return */
+ set_personality(PER_LINUX_32BIT);
+ setup_new_exec(bprm);
/*
* calculate the extra space we need to map in
@@ -739,42 +690,40 @@ static int load_flat_file(struct linux_binprm *bprm,
text_len -= sizeof(struct flat_hdr); /* the real code len */
/* The main program needs a little extra setup in the task structure */
- if (id == 0) {
- current->mm->start_code = start_code;
- current->mm->end_code = end_code;
- current->mm->start_data = datapos;
- current->mm->end_data = datapos + data_len;
- /*
- * set up the brk stuff, uses any slack left in data/bss/stack
- * allocation. We put the brk after the bss (between the bss
- * and stack) like other platforms.
- * Userspace code relies on the stack pointer starting out at
- * an address right at the end of a page.
- */
- current->mm->start_brk = datapos + data_len + bss_len;
- current->mm->brk = (current->mm->start_brk + 3) & ~3;
+ current->mm->start_code = start_code;
+ current->mm->end_code = end_code;
+ current->mm->start_data = datapos;
+ current->mm->end_data = datapos + data_len;
+ /*
+ * set up the brk stuff, uses any slack left in data/bss/stack
+ * allocation. We put the brk after the bss (between the bss
+ * and stack) like other platforms.
+ * Userspace code relies on the stack pointer starting out at
+ * an address right at the end of a page.
+ */
+ current->mm->start_brk = datapos + data_len + bss_len;
+ current->mm->brk = (current->mm->start_brk + 3) & ~3;
#ifndef CONFIG_MMU
- current->mm->context.end_brk = memp + memp_size - stack_len;
+ current->mm->context.end_brk = memp + memp_size - stack_len;
#endif
- }
if (flags & FLAT_FLAG_KTRACE) {
pr_info("Mapping is %lx, Entry point is %x, data_start is %x\n",
textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start));
pr_info("%s %s: TEXT=%lx-%lx DATA=%lx-%lx BSS=%lx-%lx\n",
- id ? "Lib" : "Load", bprm->filename,
+ "Load", bprm->filename,
start_code, end_code, datapos, datapos + data_len,
datapos + data_len, (datapos + data_len + bss_len + 3) & ~3);
}
/* Store the current module values into the global library structure */
- libinfo->lib_list[id].start_code = start_code;
- libinfo->lib_list[id].start_data = datapos;
- libinfo->lib_list[id].start_brk = datapos + data_len + bss_len;
- libinfo->lib_list[id].text_len = text_len;
- libinfo->lib_list[id].loaded = 1;
- libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos;
- libinfo->lib_list[id].build_date = ntohl(hdr->build_date);
+ libinfo->lib_list[0].start_code = start_code;
+ libinfo->lib_list[0].start_data = datapos;
+ libinfo->lib_list[0].start_brk = datapos + data_len + bss_len;
+ libinfo->lib_list[0].text_len = text_len;
+ libinfo->lib_list[0].loaded = 1;
+ libinfo->lib_list[0].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos;
+ libinfo->lib_list[0].build_date = ntohl(hdr->build_date);
/*
* We just load the allocations into some temporary memory to
@@ -789,14 +738,15 @@ static int load_flat_file(struct linux_binprm *bprm,
* image.
*/
if (flags & FLAT_FLAG_GOTPIC) {
- for (rp = (u32 __user *)datapos; ; rp++) {
+ rp = skip_got_header((u32 __user *) datapos);
+ for (; ; rp++) {
u32 addr, rp_val;
if (get_user(rp_val, rp))
return -EFAULT;
if (rp_val == 0xffffffff)
break;
if (rp_val) {
- addr = calc_reloc(rp_val, libinfo, id, 0);
+ addr = calc_reloc(rp_val, libinfo);
if (addr == RELOC_FAILED) {
ret = -ENOEXEC;
goto err;
@@ -832,7 +782,7 @@ static int load_flat_file(struct linux_binprm *bprm,
return -EFAULT;
relval = ntohl(tmp);
addr = flat_get_relocate_addr(relval);
- rp = (u32 __user *)calc_reloc(addr, libinfo, id, 1);
+ rp = (u32 __user *)calc_reloc(addr, libinfo);
if (rp == (u32 __user *)RELOC_FAILED) {
ret = -ENOEXEC;
goto err;
@@ -855,7 +805,7 @@ static int load_flat_file(struct linux_binprm *bprm,
*/
addr = ntohl((__force __be32)addr);
}
- addr = calc_reloc(addr, libinfo, id, 0);
+ addr = calc_reloc(addr, libinfo);
if (addr == RELOC_FAILED) {
ret = -ENOEXEC;
goto err;
@@ -883,7 +833,7 @@ static int load_flat_file(struct linux_binprm *bprm,
/* zero the BSS, BRK and stack areas */
if (clear_user((void __user *)(datapos + data_len), bss_len +
(memp + memp_size - stack_len - /* end brk */
- libinfo->lib_list[id].start_brk) + /* start brk */
+ libinfo->lib_list[0].start_brk) + /* start brk */
stack_len))
return -EFAULT;
@@ -894,49 +844,6 @@ err:
/****************************************************************************/
-#ifdef CONFIG_BINFMT_SHARED_FLAT
-
-/*
- * Load a shared library into memory. The library gets its own data
- * segment (including bss) but not argv/argc/environ.
- */
-
-static int load_flat_shared_library(int id, struct lib_info *libs)
-{
- /*
- * This is a fake bprm struct; only the members "buf", "file" and
- * "filename" are actually used.
- */
- struct linux_binprm bprm;
- int res;
- char buf[16];
- loff_t pos = 0;
-
- memset(&bprm, 0, sizeof(bprm));
-
- /* Create the file name */
- sprintf(buf, "/lib/lib%d.so", id);
-
- /* Open the file up */
- bprm.filename = buf;
- bprm.file = open_exec(bprm.filename);
- res = PTR_ERR(bprm.file);
- if (IS_ERR(bprm.file))
- return res;
-
- res = kernel_read(bprm.file, bprm.buf, BINPRM_BUF_SIZE, &pos);
-
- if (res >= 0)
- res = load_flat_file(&bprm, libs, id, NULL);
-
- allow_write_access(bprm.file);
- fput(bprm.file);
-
- return res;
-}
-
-#endif /* CONFIG_BINFMT_SHARED_FLAT */
-/****************************************************************************/
/*
* These are the functions used to load flat style executables and shared
@@ -968,7 +875,7 @@ static int load_flat_binary(struct linux_binprm *bprm)
stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
stack_len = ALIGN(stack_len, FLAT_STACK_ALIGN);
- res = load_flat_file(bprm, &libinfo, 0, &stack_len);
+ res = load_flat_file(bprm, &libinfo, &stack_len);
if (res < 0)
return res;
@@ -1013,20 +920,6 @@ static int load_flat_binary(struct linux_binprm *bprm)
*/
start_addr = libinfo.lib_list[0].entry;
-#ifdef CONFIG_BINFMT_SHARED_FLAT
- for (i = MAX_SHARED_LIBS-1; i > 0; i--) {
- if (libinfo.lib_list[i].loaded) {
- /* Push previous first to call address */
- unsigned long __user *sp;
- current->mm->start_stack -= sizeof(unsigned long);
- sp = (unsigned long __user *)current->mm->start_stack;
- if (put_user(start_addr, sp))
- return -EFAULT;
- start_addr = libinfo.lib_list[i].entry;
- }
- }
-#endif
-
#ifdef FLAT_PLAT_INIT
FLAT_PLAT_INIT(regs);
#endif
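
Taken together, the binfmt_flat hunks above drop shared-library (CONFIG_BINFMT_SHARED_FLAT) support for good: calc_reloc() loses its id/internalp parameters, only lib_list slot 0 remains, and the GOT walk now begins past any RISC-V GOT PLT header. A minimal sketch of the sentinel test that skip_got_header() applies, using plain pointers instead of __user accessors (illustrative only, not kernel code):

#include <stdint.h>

/* Sketch: mirrors skip_got_header()'s sentinel test on a plain buffer. */
static uint32_t *skip_got_header_sketch(uint32_t *rp)
{
	if (rp[0] == 0xffffffff && rp[1] == 0xffffffff)
		return rp + 4;	/* elf64-riscv: 16-byte GOT PLT header */
	if (rp[0] == 0xffffffff)
		return rp + 2;	/* elf32-riscv: 8-byte GOT PLT header */
	return rp;		/* no header; GOT entries start here */
}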
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 99f9995670ea..fa9ddcc9eb0b 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
- subpage.o tree-mod-log.o
+ subpage.o tree-mod-log.o extent-io-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0a0d0eccee4e..548d6a5477b4 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -55,9 +55,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
return acl;
}
-static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
- struct user_namespace *mnt_userns,
- struct inode *inode, struct posix_acl *acl, int type)
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type)
{
int ret, size = 0;
const char *name;
@@ -123,40 +122,8 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
if (ret)
return ret;
}
- ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type);
+ ret = __btrfs_set_acl(NULL, inode, acl, type);
if (ret)
inode->i_mode = old_mode;
return ret;
}
-
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
-{
- struct posix_acl *default_acl, *acl;
- int ret = 0;
-
- /* this happens with subvols */
- if (!dir)
- return 0;
-
- ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
- if (ret)
- return ret;
-
- if (default_acl) {
- ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl,
- ACL_TYPE_DEFAULT);
- posix_acl_release(default_acl);
- }
-
- if (acl) {
- if (!ret)
- ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl,
- ACL_TYPE_ACCESS);
- posix_acl_release(acl);
- }
-
- if (!default_acl && !acl)
- cache_no_acl(inode);
- return ret;
-}
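
With __btrfs_set_acl() now exported and stripped of its user_namespace argument, a caller that used to go through the deleted btrfs_init_acl() follows the same posix_acl_create() flow with the shorter signature. A sketch modelled directly on the removed body (not necessarily where the hunk relocates it):

/* Sketch based on the btrfs_init_acl() body removed above,
 * updated for the new __btrfs_set_acl() signature. */
static int init_acl_sketch(struct btrfs_trans_handle *trans,
			   struct inode *inode, struct inode *dir)
{
	struct posix_acl *default_acl, *acl;
	int ret;

	if (!dir)	/* this happens with subvols */
		return 0;
	ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
	if (ret)
		return ret;
	if (default_acl) {
		ret = __btrfs_set_acl(trans, inode, default_acl, ACL_TYPE_DEFAULT);
		posix_acl_release(default_acl);
	}
	if (acl) {
		if (!ret)
			ret = __btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
		posix_acl_release(acl);
	}
	if (!default_acl && !acl)
		cache_no_acl(inode);
	return ret;
}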
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 43c89952b7d2..aac240430efe 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -15,13 +15,12 @@
enum {
WORK_DONE_BIT,
WORK_ORDER_DONE_BIT,
- WORK_HIGH_PRIO_BIT,
};
#define NO_THRESHOLD (-1)
#define DFT_THRESHOLD (32)
-struct __btrfs_workqueue {
+struct btrfs_workqueue {
struct workqueue_struct *normal_wq;
/* File system this workqueue services */
@@ -48,12 +47,7 @@ struct __btrfs_workqueue {
spinlock_t thres_lock;
};
-struct btrfs_workqueue {
- struct __btrfs_workqueue *normal;
- struct __btrfs_workqueue *high;
-};
-
-struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq)
{
return wq->fs_info;
}
@@ -66,22 +60,22 @@ struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
{
/*
- * We could compare wq->normal->pending with num_online_cpus()
+ * We could compare wq->pending with num_online_cpus()
* to support "thresh == NO_THRESHOLD" case, but it requires
* moving up atomic_inc/dec in thresh_queue/exec_hook. Let's
* postpone it until someone needs the support of that case.
*/
- if (wq->normal->thresh == NO_THRESHOLD)
+ if (wq->thresh == NO_THRESHOLD)
return false;
- return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
+ return atomic_read(&wq->pending) > wq->thresh * 2;
}
-static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
- unsigned int flags, int limit_active, int thresh)
+struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
+ const char *name, unsigned int flags,
+ int limit_active, int thresh)
{
- struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
@@ -105,12 +99,8 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
ret->thresh = thresh;
}
- if (flags & WQ_HIGHPRI)
- ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
- ret->current_active, name);
- else
- ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
- ret->current_active, name);
+ ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active,
+ name);
if (!ret->normal_wq) {
kfree(ret);
return NULL;
@@ -119,41 +109,7 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
INIT_LIST_HEAD(&ret->ordered_list);
spin_lock_init(&ret->list_lock);
spin_lock_init(&ret->thres_lock);
- trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
- return ret;
-}
-
-static inline void
-__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
-
-struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
- const char *name,
- unsigned int flags,
- int limit_active,
- int thresh)
-{
- struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
-
- if (!ret)
- return NULL;
-
- ret->normal = __btrfs_alloc_workqueue(fs_info, name,
- flags & ~WQ_HIGHPRI,
- limit_active, thresh);
- if (!ret->normal) {
- kfree(ret);
- return NULL;
- }
-
- if (flags & WQ_HIGHPRI) {
- ret->high = __btrfs_alloc_workqueue(fs_info, name, flags,
- limit_active, thresh);
- if (!ret->high) {
- __btrfs_destroy_workqueue(ret->normal);
- kfree(ret);
- return NULL;
- }
- }
+ trace_btrfs_workqueue_alloc(ret, name);
return ret;
}
@@ -162,7 +118,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
* This hook WILL be called in IRQ handler context,
* so workqueue_set_max_active MUST NOT be called in this hook
*/
-static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
+static inline void thresh_queue_hook(struct btrfs_workqueue *wq)
{
if (wq->thresh == NO_THRESHOLD)
return;
@@ -174,7 +130,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
* This hook is called in kthread content.
* So workqueue_set_max_active is called here.
*/
-static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
+static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
{
int new_current_active;
long pending;
@@ -217,7 +173,7 @@ out:
}
}
-static void run_ordered_work(struct __btrfs_workqueue *wq,
+static void run_ordered_work(struct btrfs_workqueue *wq,
struct btrfs_work *self)
{
struct list_head *list = &wq->ordered_list;
@@ -305,7 +261,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
{
struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
normal_work);
- struct __btrfs_workqueue *wq;
+ struct btrfs_workqueue *wq = work->wq;
int need_order = 0;
/*
@@ -318,7 +274,6 @@ static void btrfs_work_helper(struct work_struct *normal_work)
*/
if (work->ordered_func)
need_order = 1;
- wq = work->wq;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
@@ -350,8 +305,7 @@ void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
work->flags = 0;
}
-static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
- struct btrfs_work *work)
+void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work)
{
unsigned long flags;
@@ -366,54 +320,22 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
queue_work(wq->normal_wq, &work->normal_work);
}
-void btrfs_queue_work(struct btrfs_workqueue *wq,
- struct btrfs_work *work)
-{
- struct __btrfs_workqueue *dest_wq;
-
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
- dest_wq = wq->high;
- else
- dest_wq = wq->normal;
- __btrfs_queue_work(dest_wq, work);
-}
-
-static inline void
-__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
-{
- destroy_workqueue(wq->normal_wq);
- trace_btrfs_workqueue_destroy(wq);
- kfree(wq);
-}
-
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
if (!wq)
return;
- if (wq->high)
- __btrfs_destroy_workqueue(wq->high);
- __btrfs_destroy_workqueue(wq->normal);
+ destroy_workqueue(wq->normal_wq);
+ trace_btrfs_workqueue_destroy(wq);
kfree(wq);
}
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
{
- if (!wq)
- return;
- wq->normal->limit_active = limit_active;
- if (wq->high)
- wq->high->limit_active = limit_active;
-}
-
-void btrfs_set_work_high_priority(struct btrfs_work *work)
-{
- set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+ if (wq)
+ wq->limit_active = limit_active;
}
void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
{
- if (wq->high)
- flush_workqueue(wq->high->normal_wq);
-
- flush_workqueue(wq->normal->normal_wq);
+ flush_workqueue(wq->normal_wq);
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 3204daa51b95..6e2596ddae10 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -11,11 +11,8 @@
struct btrfs_fs_info;
struct btrfs_workqueue;
-/* Internal use only */
-struct __btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
-typedef void (*btrfs_work_func_t)(struct work_struct *arg);
struct btrfs_work {
btrfs_func_t func;
@@ -25,7 +22,7 @@ struct btrfs_work {
/* Don't touch things below */
struct work_struct normal_work;
struct list_head ordered_list;
- struct __btrfs_workqueue *wq;
+ struct btrfs_workqueue *wq;
unsigned long flags;
};
@@ -40,9 +37,8 @@ void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work);
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
-void btrfs_set_work_high_priority(struct btrfs_work *work);
struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);
-struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
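
The high/normal split is gone, so every user sees a single btrfs_workqueue and one queue call. The full lifecycle under the flattened API, as a sketch (the "example" name, the limit of 8 and the threshold of 32 are arbitrary placeholders):

/* Sketch of the flattened workqueue lifecycle. */
static int workqueue_sketch(struct btrfs_fs_info *fs_info,
			    struct btrfs_work *work, btrfs_func_t func)
{
	struct btrfs_workqueue *wq;

	wq = btrfs_alloc_workqueue(fs_info, "example", 0, 8, 32);
	if (!wq)
		return -ENOMEM;
	btrfs_init_work(work, func, NULL, NULL);	/* no ordered hooks */
	btrfs_queue_work(wq, work);	/* one queue; WORK_HIGH_PRIO_BIT is gone */
	btrfs_flush_workqueue(wq);
	btrfs_destroy_workqueue(wq);
	return 0;
}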
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ebc392ea1d74..dce3a16996b9 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1511,16 +1511,118 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
return ret;
}
-/**
- * Check if an extent is shared or not
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
+ struct btrfs_root *root,
+ u64 bytenr, int level, bool *is_shared)
+{
+ struct btrfs_backref_shared_cache_entry *entry;
+
+ if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ return false;
+
+ /*
+ * Level -1 is used for the data extent, which is not reliable to cache
+ * because its reference count can increase or decrease without us
+ * realizing. We cache results only for extent buffers that lead from
+ * the root node down to the leaf with the file extent item.
+ */
+ ASSERT(level >= 0);
+
+ entry = &cache->entries[level];
+
+ /* Unused cache entry or being used for some other extent buffer. */
+ if (entry->bytenr != bytenr)
+ return false;
+
+ /*
+ * We cached a false result, but the last snapshot generation of the
+ * root changed, so we now have a snapshot. Don't trust the result.
+ */
+ if (!entry->is_shared &&
+ entry->gen != btrfs_root_last_snapshot(&root->root_item))
+ return false;
+
+ /*
+ * If we cached a true result and the last generation used for dropping
+ * a root changed, we can not trust the result, because the dropped root
+ * could be a snapshot sharing this extent buffer.
+ */
+ if (entry->is_shared &&
+ entry->gen != btrfs_get_last_root_drop_gen(root->fs_info))
+ return false;
+
+ *is_shared = entry->is_shared;
+
+ return true;
+}
+
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
+ struct btrfs_root *root,
+ u64 bytenr, int level, bool is_shared)
+{
+ struct btrfs_backref_shared_cache_entry *entry;
+ u64 gen;
+
+ if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+ return;
+
+ /*
+ * Level -1 is used for the data extent, which is not reliable to cache
+ * because its reference count can increase or decrease without us
+ * realizing. We cache results only for extent buffers that lead from
+ * the root node down to the leaf with the file extent item.
+ */
+ ASSERT(level >= 0);
+
+ if (is_shared)
+ gen = btrfs_get_last_root_drop_gen(root->fs_info);
+ else
+ gen = btrfs_root_last_snapshot(&root->root_item);
+
+ entry = &cache->entries[level];
+ entry->bytenr = bytenr;
+ entry->is_shared = is_shared;
+ entry->gen = gen;
+
+ /*
+ * If we find that an extent buffer is shared, set the cache result for
+ * all extent buffers below it to true. As nodes in the path are COWed,
+ * their sharedness is moved to their children, and if a leaf is COWed,
+ * the sharedness of a data extent becomes direct, as the refcount of
+ * the data extent is increased in its extent item in the extent tree.
+ */
+ if (is_shared) {
+ for (int i = 0; i < level; i++) {
+ entry = &cache->entries[i];
+ entry->is_shared = is_shared;
+ entry->gen = gen;
+ }
+ }
+}
+
+/*
+ * Check if a data extent is shared or not.
*
- * @root: root inode belongs to
- * @inum: inode number of the inode whose extent we are checking
- * @bytenr: logical bytenr of the extent we are checking
- * @roots: list of roots this extent is shared among
- * @tmp: temporary list used for iteration
+ * @root: The root the inode belongs to.
+ * @inum: Number of the inode whose extent we are checking.
+ * @bytenr: Logical bytenr of the extent we are checking.
+ * @extent_gen: Generation of the extent (file extent item) or 0 if it is
+ * not known.
+ * @roots: List of roots this extent is shared among.
+ * @tmp: Temporary list used for iteration.
+ * @cache: A backref lookup result cache.
*
- * btrfs_check_shared uses the backref walking code but will short
+ * btrfs_is_data_extent_shared uses the backref walking code but will short
* circuit as soon as it finds a root or inode that doesn't match the
* one passed in. This provides a significant performance benefit for
* callers (such as fiemap) which want to know whether the extent is
@@ -1531,8 +1633,10 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
*
* Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
*/
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
- struct ulist *roots, struct ulist *tmp)
+int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp,
+ struct btrfs_backref_shared_cache *cache)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
@@ -1545,6 +1649,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
.inum = inum,
.share_count = 0,
};
+ int level;
ulist_init(roots);
ulist_init(tmp);
@@ -1561,22 +1666,52 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
btrfs_get_tree_mod_seq(fs_info, &elem);
}
+ /* -1 means we are in the bytenr of the data extent. */
+ level = -1;
ULIST_ITER_INIT(&uiter);
while (1) {
+ bool is_shared;
+ bool cached;
+
ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
roots, NULL, &shared, false);
if (ret == BACKREF_FOUND_SHARED) {
/* this is the only condition under which we return 1 */
ret = 1;
+ if (level >= 0)
+ store_backref_shared_cache(cache, root, bytenr,
+ level, true);
break;
}
if (ret < 0 && ret != -ENOENT)
break;
ret = 0;
+ /*
+ * If our data extent is not shared through reflinks and it was
+ * created in a generation after the last one used to create a
+ * snapshot of the inode's root, then it can not be shared
+ * indirectly through subtrees, as that can only happen with
+ * snapshots. In this case bail out, no need to check for the
+ * sharedness of extent buffers.
+ */
+ if (level == -1 &&
+ extent_gen > btrfs_root_last_snapshot(&root->root_item))
+ break;
+
+ if (level >= 0)
+ store_backref_shared_cache(cache, root, bytenr,
+ level, false);
node = ulist_next(tmp, &uiter);
if (!node)
break;
bytenr = node->val;
+ level++;
+ cached = lookup_backref_shared_cache(cache, root, bytenr, level,
+ &is_shared);
+ if (cached) {
+ ret = (is_shared ? 1 : 0);
+ break;
+ }
shared.share_count = 0;
cond_resched();
}
@@ -2028,10 +2163,29 @@ out:
return ret;
}
+static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ struct btrfs_data_container *inodes = ctx;
+ const size_t c = 3 * sizeof(u64);
+
+ if (inodes->bytes_left >= c) {
+ inodes->bytes_left -= c;
+ inodes->val[inodes->elem_cnt] = inum;
+ inodes->val[inodes->elem_cnt + 1] = offset;
+ inodes->val[inodes->elem_cnt + 2] = root;
+ inodes->elem_cnt += 3;
+ } else {
+ inodes->bytes_missing += c - inodes->bytes_left;
+ inodes->bytes_left = 0;
+ inodes->elem_missed += 3;
+ }
+
+ return 0;
+}
+
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
- iterate_extent_inodes_t *iterate, void *ctx,
- bool ignore_offset)
+ void *ctx, bool ignore_offset)
{
int ret;
u64 extent_item_pos;
@@ -2049,17 +2203,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
extent_item_pos = logical - found_key.objectid;
ret = iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, search_commit_root,
- iterate, ctx, ignore_offset);
+ build_ino_list, ctx, ignore_offset);
return ret;
}
-typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
- struct extent_buffer *eb, void *ctx);
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, struct inode_fs_paths *ipath);
-static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath)
{
int ret = 0;
int slot;
@@ -2068,6 +2220,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
u32 name_len;
u64 parent = 0;
int found = 0;
+ struct btrfs_root *fs_root = ipath->fs_root;
+ struct btrfs_path *path = ipath->btrfs_path;
struct extent_buffer *eb;
struct btrfs_inode_ref *iref;
struct btrfs_key found_key;
@@ -2103,8 +2257,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
"following ref at offset %u for inode %llu in tree %llu",
cur, found_key.objectid,
fs_root->root_key.objectid);
- ret = iterate(parent, name_len,
- (unsigned long)(iref + 1), eb, ctx);
+ ret = inode_to_path(parent, name_len,
+ (unsigned long)(iref + 1), eb, ipath);
if (ret)
break;
len = sizeof(*iref) + name_len;
@@ -2118,15 +2272,15 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
return ret;
}
-static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath)
{
int ret;
int slot;
u64 offset = 0;
u64 parent;
int found = 0;
+ struct btrfs_root *fs_root = ipath->fs_root;
+ struct btrfs_path *path = ipath->btrfs_path;
struct extent_buffer *eb;
struct btrfs_inode_extref *extref;
u32 item_size;
@@ -2162,8 +2316,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
parent = btrfs_inode_extref_parent(eb, extref);
name_len = btrfs_inode_extref_name_len(eb, extref);
- ret = iterate(parent, name_len,
- (unsigned long)&extref->name, eb, ctx);
+ ret = inode_to_path(parent, name_len,
+ (unsigned long)&extref->name, eb, ipath);
if (ret)
break;
@@ -2180,34 +2334,13 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
return ret;
}
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path, iterate_irefs_t *iterate,
- void *ctx)
-{
- int ret;
- int found_refs = 0;
-
- ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
- if (!ret)
- ++found_refs;
- else if (ret != -ENOENT)
- return ret;
-
- ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
- if (ret == -ENOENT && found_refs)
- return 0;
-
- return ret;
-}
-
/*
* returns 0 if the path could be dumped (probably truncated)
* returns <0 in case of an error
*/
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
- struct extent_buffer *eb, void *ctx)
+ struct extent_buffer *eb, struct inode_fs_paths *ipath)
{
- struct inode_fs_paths *ipath = ctx;
char *fspath;
char *fspath_min;
int i = ipath->fspath->elem_cnt;
@@ -2248,8 +2381,20 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
*/
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
{
- return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
- inode_to_path, ipath);
+ int ret;
+ int found_refs = 0;
+
+ ret = iterate_inode_refs(inum, ipath);
+ if (!ret)
+ ++found_refs;
+ else if (ret != -ENOENT)
+ return ret;
+
+ ret = iterate_inode_extrefs(inum, ipath);
+ if (ret == -ENOENT && found_refs)
+ return 0;
+
+ return ret;
}
struct btrfs_data_container *init_data_container(u32 total_bytes)
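
Because the cache is indexed by tree level, a fiemap-style caller allocates one btrfs_backref_shared_cache per walk and reuses it for every extent of the inode. A hedged sketch of the calling pattern (inode, bytenr, extent_gen, the roots/tmp ulists and flags all stand in for the caller's own state):

/* Sketch: one shared cache reused across all extents of an inode. */
struct btrfs_backref_shared_cache cache = { 0 };

int shared = btrfs_is_data_extent_shared(root, btrfs_ino(BTRFS_I(inode)),
					 bytenr, extent_gen,
					 roots, tmp, &cache);
if (shared < 0)
	return shared;			/* error */
if (shared)
	flags |= FIEMAP_EXTENT_SHARED;	/* shared == 1 */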
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index ba454032dbe2..52ae6957b414 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -17,6 +17,20 @@ struct inode_fs_paths {
struct btrfs_data_container *fspath;
};
+struct btrfs_backref_shared_cache_entry {
+ u64 bytenr;
+ u64 gen;
+ bool is_shared;
+};
+
+struct btrfs_backref_shared_cache {
+ /*
+ * A path from a root to a leaf that has a file extent item pointing to
+ * a given data extent should never exceed the maximum b+tree height.
+ */
+ struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL];
+};
+
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
void *ctx);
@@ -35,8 +49,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
bool ignore_offset);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- iterate_extent_inodes_t *iterate, void *ctx,
+ struct btrfs_path *path, void *ctx,
bool ignore_offset);
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
@@ -63,8 +76,10 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
- struct ulist *roots, struct ulist *tmp_ulist);
+int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp,
+ struct btrfs_backref_shared_cache *cache);
int __init btrfs_prelim_ref_init(void);
void __cold btrfs_prelim_ref_exit(void);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index c22d287e020b..32c415cfbdfe 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -168,11 +168,12 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
struct rb_node **p;
struct rb_node *parent = NULL;
struct btrfs_block_group *cache;
+ bool leftmost = true;
ASSERT(block_group->length != 0);
- spin_lock(&info->block_group_cache_lock);
- p = &info->block_group_cache_tree.rb_node;
+ write_lock(&info->block_group_cache_lock);
+ p = &info->block_group_cache_tree.rb_root.rb_node;
while (*p) {
parent = *p;
@@ -181,20 +182,18 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
p = &(*p)->rb_left;
} else if (block_group->start > cache->start) {
p = &(*p)->rb_right;
+ leftmost = false;
} else {
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
return -EEXIST;
}
}
rb_link_node(&block_group->cache_node, parent, p);
- rb_insert_color(&block_group->cache_node,
- &info->block_group_cache_tree);
+ rb_insert_color_cached(&block_group->cache_node,
+ &info->block_group_cache_tree, leftmost);
- if (info->first_logical_byte > block_group->start)
- info->first_logical_byte = block_group->start;
-
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
return 0;
}
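
The move from rb_root to rb_root_cached follows the stock pattern: remember whether the insertion ever descended right, then let rb_insert_color_cached() maintain the cached leftmost node so the first block group can be found in O(1), which is why the separate first_logical_byte tracking can go. The same shape in isolation (entry_key() is a hypothetical accessor):

/* Generic rb_root_cached insertion shape; entry_key() is hypothetical. */
bool leftmost = true;
struct rb_node **p = &tree->rb_root.rb_node;
struct rb_node *parent = NULL;

while (*p) {
	parent = *p;
	if (key < entry_key(parent)) {
		p = &(*p)->rb_left;
	} else {
		p = &(*p)->rb_right;
		leftmost = false;	/* went right at least once */
	}
}
rb_link_node(node, parent, p);
rb_insert_color_cached(node, tree, leftmost);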
@@ -210,8 +209,8 @@ static struct btrfs_block_group *block_group_cache_tree_search(
struct rb_node *n;
u64 end, start;
- spin_lock(&info->block_group_cache_lock);
- n = info->block_group_cache_tree.rb_node;
+ read_lock(&info->block_group_cache_lock);
+ n = info->block_group_cache_tree.rb_root.rb_node;
while (n) {
cache = rb_entry(n, struct btrfs_block_group, cache_node);
@@ -233,12 +232,9 @@ static struct btrfs_block_group *block_group_cache_tree_search(
break;
}
}
- if (ret) {
+ if (ret)
btrfs_get_block_group(ret);
- if (bytenr == 0 && info->first_logical_byte > ret->start)
- info->first_logical_byte = ret->start;
- }
- spin_unlock(&info->block_group_cache_lock);
+ read_unlock(&info->block_group_cache_lock);
return ret;
}
@@ -267,15 +263,15 @@ struct btrfs_block_group *btrfs_next_block_group(
struct btrfs_fs_info *fs_info = cache->fs_info;
struct rb_node *node;
- spin_lock(&fs_info->block_group_cache_lock);
+ read_lock(&fs_info->block_group_cache_lock);
/* If our block group was removed, we need a full search. */
if (RB_EMPTY_NODE(&cache->cache_node)) {
const u64 next_bytenr = cache->start + cache->length;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_unlock(&fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
- cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
+ return btrfs_lookup_first_block_group(fs_info, next_bytenr);
}
node = rb_next(&cache->cache_node);
btrfs_put_block_group(cache);
@@ -284,46 +280,70 @@ struct btrfs_block_group *btrfs_next_block_group(
btrfs_get_block_group(cache);
} else
cache = NULL;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_unlock(&fs_info->block_group_cache_lock);
return cache;
}
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+/**
+ * Check if we can do a NOCOW write for a given extent.
+ *
+ * @fs_info: The filesystem information object.
+ * @bytenr: Logical start address of the extent.
+ *
+ * Check if we can do a NOCOW write for the given extent and, if so, increment the
+ * number of NOCOW writers in the block group that contains the extent, as long
+ * as the block group exists and it's currently not in read-only mode.
+ *
+ * Returns: A non-NULL block group pointer if we can do a NOCOW write; the caller
+ * is then responsible for calling btrfs_dec_nocow_writers() later.
+ *
+ * Or NULL if we cannot do a NOCOW write.
+ */
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr)
{
struct btrfs_block_group *bg;
- bool ret = true;
+ bool can_nocow = true;
bg = btrfs_lookup_block_group(fs_info, bytenr);
if (!bg)
- return false;
+ return NULL;
spin_lock(&bg->lock);
if (bg->ro)
- ret = false;
+ can_nocow = false;
else
atomic_inc(&bg->nocow_writers);
spin_unlock(&bg->lock);
- /* No put on block group, done by btrfs_dec_nocow_writers */
- if (!ret)
+ if (!can_nocow) {
btrfs_put_block_group(bg);
+ return NULL;
+ }
- return ret;
+ /* No put on block group, done by btrfs_dec_nocow_writers(). */
+ return bg;
}
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+/**
+ * Decrement the number of NOCOW writers in a block group.
+ *
+ * @bg: The block group.
+ *
+ * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
+ * and on the block group returned by that call. Typically this is called after
+ * creating an ordered extent for a NOCOW write, to prevent races with scrub and
+ * relocation.
+ *
+ * After this call, the caller should not use the block group anymore. If it wants
+ * to use it, then it should get a reference on it before calling this function.
+ */
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{
- struct btrfs_block_group *bg;
-
- bg = btrfs_lookup_block_group(fs_info, bytenr);
- ASSERT(bg);
if (atomic_dec_and_test(&bg->nocow_writers))
wake_up_var(&bg->nocow_writers);
- /*
- * Once for our lookup and once for the lookup done by a previous call
- * to btrfs_inc_nocow_writers()
- */
- btrfs_put_block_group(bg);
+
+ /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
btrfs_put_block_group(bg);
}
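
Returning the block group from the inc side lets the dec side skip a second rbtree lookup. The pairing a NOCOW writer now uses, sketched (the ordered-extent step stands in for the caller's real work, and the ENOSPC fallback is an illustrative policy, not prescribed by the patch):

/* Sketch: bg carries the block group reference across the NOCOW write. */
struct btrfs_block_group *bg;

bg = btrfs_inc_nocow_writers(fs_info, bytenr);
if (!bg)
	return -ENOSPC;	/* e.g. fall back to a COW write instead */

/* ... create the ordered extent for the NOCOW write here ... */

btrfs_dec_nocow_writers(bg);	/* also drops the lookup reference */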
@@ -420,39 +440,26 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
btrfs_put_caching_control(caching_ctl);
}
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
+static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
+ struct btrfs_caching_control *caching_ctl)
+{
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
+ return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
+}
+
+static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
struct btrfs_caching_control *caching_ctl;
- int ret = 0;
+ int ret;
caching_ctl = btrfs_get_caching_control(cache);
if (!caching_ctl)
return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
-
- wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
- if (cache->cached == BTRFS_CACHE_ERROR)
- ret = -EIO;
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
btrfs_put_caching_control(caching_ctl);
return ret;
}
-static bool space_cache_v1_done(struct btrfs_block_group *cache)
-{
- bool ret;
-
- spin_lock(&cache->lock);
- ret = cache->cached != BTRFS_CACHE_FAST;
- spin_unlock(&cache->lock);
-
- return ret;
-}
-
-void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
- struct btrfs_caching_control *caching_ctl)
-{
- wait_event(caching_ctl->wait, space_cache_v1_done(cache));
-}
-
#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
@@ -586,8 +593,6 @@ next:
if (need_resched() ||
rwsem_is_contended(&fs_info->commit_root_sem)) {
- if (wakeup)
- caching_ctl->progress = last;
btrfs_release_path(path);
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
@@ -611,9 +616,6 @@ next:
key.objectid = last;
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
-
- if (wakeup)
- caching_ctl->progress = last;
btrfs_release_path(path);
goto next;
}
@@ -648,7 +650,6 @@ next:
total_found += add_new_free_space(block_group, last,
block_group->start + block_group->length);
- caching_ctl->progress = (u64)-1;
out:
btrfs_free_path(path);
@@ -718,8 +719,6 @@ done:
}
#endif
- caching_ctl->progress = (u64)-1;
-
up_read(&fs_info->commit_root_sem);
btrfs_free_excluded_extents(block_group);
mutex_unlock(&caching_ctl->mutex);
@@ -730,9 +729,8 @@ done:
btrfs_put_block_group(block_group);
}
-int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
{
- DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_caching_control *caching_ctl = NULL;
int ret = 0;
@@ -749,7 +747,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
mutex_init(&caching_ctl->mutex);
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
- caching_ctl->progress = cache->start;
refcount_set(&caching_ctl->count, 2);
btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
@@ -765,24 +762,22 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
}
WARN_ON(cache->caching_ctl);
cache->caching_ctl = caching_ctl;
- if (btrfs_test_opt(fs_info, SPACE_CACHE))
- cache->cached = BTRFS_CACHE_FAST;
- else
- cache->cached = BTRFS_CACHE_STARTED;
- cache->has_caching_ctl = 1;
+ cache->cached = BTRFS_CACHE_STARTED;
spin_unlock(&cache->lock);
- spin_lock(&fs_info->block_group_cache_lock);
+ write_lock(&fs_info->block_group_cache_lock);
refcount_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
btrfs_get_block_group(cache);
btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
- if (load_cache_only && caching_ctl)
- btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
+ if (wait && caching_ctl)
+ ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
if (caching_ctl)
btrfs_put_caching_control(caching_ctl);
@@ -957,17 +952,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- spin_lock(&fs_info->block_group_cache_lock);
- rb_erase(&block_group->cache_node,
- &fs_info->block_group_cache_tree);
+ write_lock(&fs_info->block_group_cache_lock);
+ rb_erase_cached(&block_group->cache_node,
+ &fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
/* Once for the block groups rbtree */
btrfs_put_block_group(block_group);
- if (fs_info->first_logical_byte == block_group->start)
- fs_info->first_logical_byte = (u64)-1;
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
/*
@@ -987,32 +980,31 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
kobject_put(kobj);
}
- if (block_group->has_caching_ctl)
- caching_ctl = btrfs_get_caching_control(block_group);
if (block_group->cached == BTRFS_CACHE_STARTED)
btrfs_wait_block_group_cache_done(block_group);
- if (block_group->has_caching_ctl) {
- spin_lock(&fs_info->block_group_cache_lock);
- if (!caching_ctl) {
- struct btrfs_caching_control *ctl;
-
- list_for_each_entry(ctl,
- &fs_info->caching_block_groups, list)
- if (ctl->block_group == block_group) {
- caching_ctl = ctl;
- refcount_inc(&caching_ctl->count);
- break;
- }
- }
- if (caching_ctl)
- list_del_init(&caching_ctl->list);
- spin_unlock(&fs_info->block_group_cache_lock);
- if (caching_ctl) {
- /* Once for the caching bgs list and once for us. */
- btrfs_put_caching_control(caching_ctl);
- btrfs_put_caching_control(caching_ctl);
+
+ write_lock(&fs_info->block_group_cache_lock);
+ caching_ctl = btrfs_get_caching_control(block_group);
+ if (!caching_ctl) {
+ struct btrfs_caching_control *ctl;
+
+ list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
+ if (ctl->block_group == block_group) {
+ caching_ctl = ctl;
+ refcount_inc(&caching_ctl->count);
+ break;
+ }
}
}
+ if (caching_ctl)
+ list_del_init(&caching_ctl->list);
+ write_unlock(&fs_info->block_group_cache_lock);
+
+ if (caching_ctl) {
+ /* Once for the caching bgs list and once for us. */
+ btrfs_put_caching_control(caching_ctl);
+ btrfs_put_caching_control(caching_ctl);
+ }
spin_lock(&trans->transaction->dirty_bgs_lock);
WARN_ON(!list_empty(&block_group->dirty_list));
@@ -1033,8 +1025,14 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
< block_group->zone_unusable);
WARN_ON(block_group->space_info->disk_total
< block_group->length * factor);
+ WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags) &&
+ block_group->space_info->active_total_bytes
+ < block_group->length);
}
block_group->space_info->total_bytes -= block_group->length;
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
+ block_group->space_info->active_total_bytes -= block_group->length;
block_group->space_info->bytes_readonly -=
(block_group->length - block_group->zone_unusable);
block_group->space_info->bytes_zone_unusable -=
@@ -1063,7 +1061,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
goto out;
spin_lock(&block_group->lock);
- block_group->removed = 1;
+ set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
+
/*
* At this point trimming or scrub can't start on this block group,
* because we removed the block group from the rbtree
@@ -1298,6 +1297,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ if (btrfs_fs_closing(fs_info))
+ return;
+
/*
* Long running balances can keep us blocked here for eternity, so
* simply skip deletion if we're unable to get the mutex.
@@ -1367,6 +1369,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
goto next;
}
+ ret = btrfs_zone_finish(block_group);
+ if (ret < 0) {
+ btrfs_dec_block_group_ro(block_group);
+ if (ret == -EAGAIN)
+ ret = 0;
+ goto next;
+ }
+
/*
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
@@ -1512,6 +1522,13 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
return bg1->used > bg2->used;
}
+static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_is_zoned(fs_info))
+ return btrfs_zoned_should_reclaim(fs_info);
+ return true;
+}
+
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
@@ -1522,6 +1539,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ if (btrfs_fs_closing(fs_info))
+ return;
+
+ if (!btrfs_should_reclaim(fs_info))
+ return;
+
sb_start_write(fs_info->sb);
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
@@ -1599,9 +1622,11 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
ret = btrfs_relocate_chunk(fs_info, bg->start);
- if (ret)
+ if (ret) {
+ btrfs_dec_block_group_ro(bg);
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
+ }
next:
btrfs_put_block_group(bg);
@@ -1692,35 +1717,13 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_root *root = btrfs_block_group_root(fs_info);
int ret;
struct btrfs_key found_key;
- struct extent_buffer *leaf;
- int slot;
-
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- return ret;
-
- while (1) {
- slot = path->slots[0];
- leaf = path->nodes[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
- break;
- }
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ btrfs_for_each_slot(root, key, &found_key, path, ret) {
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- ret = read_bg_from_eb(fs_info, &found_key, path);
- break;
+ return read_bg_from_eb(fs_info, &found_key, path);
}
-
- path->slots[0]++;
}
-out:
return ret;
}
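
btrfs_for_each_slot() hides the search-slot/next-leaf boilerplate that the deleted lines open-coded. Its general shape, sketched (interesting() and handle() are hypothetical helpers):

/* Shape of the iteration macro used above; helpers are hypothetical. */
btrfs_for_each_slot(root, &min_key, &found_key, path, ret) {
	if (!interesting(&found_key))
		continue;
	ret = handle(&found_key, path);
	break;
}
/* After the loop: ret < 0 on error, > 0 once the tree is exhausted. */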
@@ -1802,11 +1805,10 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
stripe_nr = physical - map->stripes[i].physical;
stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
- if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
stripe_nr = stripe_nr * map->num_stripes + i;
stripe_nr = div_u64(stripe_nr, map->sub_stripes);
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_nr = stripe_nr * map->num_stripes + i;
}
/*
* The remaining case would be for RAID56, multiply by
@@ -1887,16 +1889,6 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
return 0;
}
-static void link_block_group(struct btrfs_block_group *cache)
-{
- struct btrfs_space_info *space_info = cache->space_info;
- int index = btrfs_bg_flags_to_raid_index(cache->flags);
-
- down_write(&space_info->groups_sem);
- list_add_tail(&cache->list, &space_info->block_groups[index]);
- up_write(&space_info->groups_sem);
-}
-
static struct btrfs_block_group *btrfs_create_block_group_cache(
struct btrfs_fs_info *fs_info, u64 start)
{
@@ -1934,7 +1926,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
- btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
+ cache->full_stripe_locks_root.root = RB_ROOT;
+ mutex_init(&cache->full_stripe_locks_root.lock);
return cache;
}
@@ -1999,7 +1992,6 @@ static int read_one_block_group(struct btrfs_fs_info *info,
int need_clear)
{
struct btrfs_block_group *cache;
- struct btrfs_space_info *space_info;
const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
int ret;
@@ -2075,11 +2067,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
/* Should not have any excluded extents. Just in case, though. */
btrfs_free_excluded_extents(cache);
} else if (cache->length == cache->used) {
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
btrfs_free_excluded_extents(cache);
} else if (cache->used == 0) {
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
add_new_free_space(cache, cache->start,
cache->start + cache->length);
@@ -2092,13 +2082,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
goto error;
}
trace_btrfs_add_block_group(info, cache, 0);
- btrfs_update_space_info(info, cache->flags, cache->length,
- cache->used, cache->bytes_super,
- cache->zone_unusable, &space_info);
-
- cache->space_info = space_info;
-
- link_block_group(cache);
+ btrfs_add_bg_to_space_info(info, cache);
set_avail_alloc_bits(info, cache->flags);
if (btrfs_chunk_writeable(info, cache->start)) {
@@ -2122,7 +2106,6 @@ error:
static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct btrfs_space_info *space_info;
struct rb_node *node;
int ret = 0;
@@ -2142,7 +2125,6 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
/* Fill dummy cache as FULL */
bg->length = em->len;
bg->flags = map->type;
- bg->last_byte_to_unpin = (u64)-1;
bg->cached = BTRFS_CACHE_FINISHED;
bg->used = em->len;
bg->flags = map->type;
@@ -2163,10 +2145,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
break;
}
- btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
- 0, 0, &space_info);
- bg->space_info = space_info;
- link_block_group(bg);
+ btrfs_add_bg_to_space_info(fs_info, bg);
set_avail_alloc_bits(fs_info, bg->flags);
}
@@ -2186,7 +2165,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
int need_clear = 0;
u64 cache_gen;
- if (!root)
+ /*
+ * Either no extent root (with ibadroots rescue option) or we have
+ * unsupported RO options. The fs can never be mounted read-write, so no
+ * need to waste time searching block group items.
+ *
+ * This also allows new extent tree related changes to be RO compat,
+ * no need for a full incompat flag.
+ */
+ if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
+ ~BTRFS_FEATURE_COMPAT_RO_SUPP))
return fill_dummy_bgs(info);
key.objectid = 0;
@@ -2421,7 +2409,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
- if (!block_group->chunk_item_inserted) {
+ if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+ &block_group->runtime_flags)) {
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
mutex_unlock(&fs_info->chunk_mutex);
@@ -2490,7 +2479,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
set_free_space_tree_thresholds(cache);
cache->used = bytes_used;
cache->flags = type;
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
@@ -2503,12 +2491,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
return ERR_PTR(ret);
}
- /*
- * New block group is likely to be used soon. Try to activate it now.
- * Failure is OK for now.
- */
- btrfs_zone_activate(cache);
-
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case */
@@ -2521,14 +2503,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
btrfs_free_excluded_extents(cache);
-#ifdef CONFIG_BTRFS_DEBUG
- if (btrfs_should_fragment_free_space(cache)) {
- u64 new_bytes_used = size - bytes_used;
-
- bytes_used += new_bytes_used >> 1;
- fragment_free_space(cache);
- }
-#endif
/*
* Ensure the corresponding space_info object is created and
* assigned to our block group. We want our bg to be added to the rbtree
@@ -2549,12 +2523,17 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
* the rbtree, update the space info's counters.
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
- btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, cache->zone_unusable,
- &cache->space_info);
+ btrfs_add_bg_to_space_info(fs_info, cache);
btrfs_update_global_block_rsv(fs_info);
- link_block_group(cache);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(cache)) {
+ u64 new_bytes_used = size - bytes_used;
+
+ cache->space_info->bytes_used += new_bytes_used >> 1;
+ fragment_free_space(cache);
+ }
+#endif
list_add_tail(&cache->bg_list, &trans->new_bgs);
trans->delayed_ref_updates++;
@@ -2651,6 +2630,14 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
+ /*
+ * We have allocated a new chunk. We also need to activate that chunk to
+ * grant metadata tickets for zoned filesystem.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
+ if (ret < 0)
+ goto out;
+
ret = inc_block_group_ro(cache, 0);
if (ret == -ETXTBSY)
goto unlock_out;
@@ -2863,7 +2850,7 @@ again:
cache_size *= fs_info->sectorsize;
ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
- cache_size);
+ cache_size, false);
if (ret)
goto out_put;
@@ -2946,7 +2933,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_path *path = NULL;
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
int loops = 0;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -3012,7 +2998,6 @@ again:
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
/*
@@ -3113,7 +3098,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
int should_put;
struct btrfs_path *path;
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
path = btrfs_alloc_path();
if (!path)
@@ -3171,7 +3155,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
list_add_tail(&cache->io_list, io);
} else {
@@ -3230,6 +3213,31 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
return ret;
}
+static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
+ u64 bytes_freed)
+{
+ const struct btrfs_space_info *space_info = bg->space_info;
+ const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+ const u64 new_val = bg->used;
+ const u64 old_val = new_val + bytes_freed;
+ u64 thresh;
+
+ if (reclaim_thresh == 0)
+ return false;
+
+ thresh = div_factor_fine(bg->length, reclaim_thresh);
+
+ /*
+ * If we were below the threshold before, don't reclaim: we are likely a
+ * brand new block group and we don't want to relocate new block groups.
+ */
+ if (old_val < thresh)
+ return false;
+ if (new_val >= thresh)
+ return false;
+ return true;
+}
+
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc)
{
@@ -3252,6 +3260,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&info->delalloc_root_lock);
while (total) {
+ bool reclaim;
+
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache) {
ret = -ENOENT;
@@ -3266,7 +3276,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* space back to the block group, otherwise we will leak space.
*/
if (!alloc && !btrfs_block_group_done(cache))
- btrfs_cache_block_group(cache, 1);
+ btrfs_cache_block_group(cache, true);
byte_in_group = bytenr - cache->start;
WARN_ON(byte_in_group > cache->length);
@@ -3297,6 +3307,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
cache->space_info, num_bytes);
cache->space_info->bytes_used -= num_bytes;
cache->space_info->disk_used -= num_bytes * factor;
+
+ reclaim = should_reclaim_block_group(cache, num_bytes);
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
@@ -3323,6 +3335,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
if (!alloc && old_val == 0) {
if (!btrfs_test_opt(info, DISCARD_ASYNC))
btrfs_mark_bg_unused(cache);
+ } else if (!alloc && reclaim) {
+ btrfs_mark_bg_to_reclaim(cache);
}
btrfs_put_block_group(cache);
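
should_reclaim_block_group() only fires on a downward crossing of the threshold, which is what the reclaim branch above reacts to. A worked instance of the arithmetic (all values illustrative):

/* Worked example for should_reclaim_block_group(); values illustrative. */
u64 length = SZ_1G;		/* block group size */
int reclaim_thresh = 75;	/* space_info->bg_reclaim_threshold */
u64 thresh = div_factor_fine(length, reclaim_thresh);	/* 75% => 768 MiB */

/*
 * A free taking used from 800 MiB down to 700 MiB crosses the threshold
 * (old_val >= thresh, new_val < thresh), so the group is queued for
 * reclaim. A group already below 768 MiB before the free is treated as
 * brand new and left alone.
 */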
@@ -3455,7 +3469,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
}
-static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
+static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
{
struct btrfs_block_group *bg;
int ret;
@@ -3542,7 +3556,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
out:
btrfs_trans_release_chunk_metadata(trans);
- return ret;
+ if (ret)
+ return ERR_PTR(ret);
+
+ btrfs_get_block_group(bg);
+ return bg;
}
/*
@@ -3657,10 +3675,17 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *space_info;
+ struct btrfs_block_group *ret_bg;
bool wait_for_alloc = false;
bool should_alloc = false;
+ bool from_extent_allocation = false;
int ret = 0;
+ if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
+ from_extent_allocation = true;
+ force = CHUNK_ALLOC_FORCE;
+ }
+
/* Don't re-enter if we're already allocating a chunk */
if (trans->allocating_chunk)
return -ENOSPC;
@@ -3715,6 +3740,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
* attempt.
*/
wait_for_alloc = true;
+ force = CHUNK_ALLOC_NO_FORCE;
spin_unlock(&space_info->lock);
mutex_lock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->chunk_mutex);
@@ -3750,9 +3776,22 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
force_metadata_allocation(fs_info);
}
- ret = do_chunk_alloc(trans, flags);
+ ret_bg = do_chunk_alloc(trans, flags);
trans->allocating_chunk = false;
+ if (IS_ERR(ret_bg)) {
+ ret = PTR_ERR(ret_bg);
+ } else if (from_extent_allocation) {
+ /*
+ * New block group is likely to be used soon. Try to activate
+ * it now. Failure is OK for now.
+ */
+ btrfs_zone_activate(ret_bg);
+ }
+
+ if (!ret)
+ btrfs_put_block_group(ret_bg);
+
spin_lock(&space_info->lock);
if (ret < 0) {
if (ret == -ENOSPC)
@@ -3825,6 +3864,14 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
ret = PTR_ERR(bg);
} else {
/*
+ * We have a new chunk. We also need to activate it for
+ * zoned filesystem.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
+ if (ret < 0)
+ return;
+
+ /*
* If we fail to add the chunk item here, we end up
* trying again at phase 2 of chunk allocation, at
* btrfs_create_pending_block_groups(). So ignore
@@ -3899,35 +3946,24 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
- u64 last = 0;
- while (1) {
- struct inode *inode;
+ block_group = btrfs_lookup_first_block_group(info, 0);
+ while (block_group) {
+ btrfs_wait_block_group_cache_done(block_group);
+ spin_lock(&block_group->lock);
+ if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
+ &block_group->runtime_flags)) {
+ struct inode *inode = block_group->inode;
- block_group = btrfs_lookup_first_block_group(info, last);
- while (block_group) {
- btrfs_wait_block_group_cache_done(block_group);
- spin_lock(&block_group->lock);
- if (block_group->iref)
- break;
+ block_group->inode = NULL;
spin_unlock(&block_group->lock);
- block_group = btrfs_next_block_group(block_group);
- }
- if (!block_group) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
- inode = block_group->inode;
- block_group->iref = 0;
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- ASSERT(block_group->io_ctl.inode == NULL);
- iput(inode);
- last = block_group->start + block_group->length;
- btrfs_put_block_group(block_group);
+ ASSERT(block_group->io_ctl.inode == NULL);
+ iput(inode);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+ block_group = btrfs_next_block_group(block_group);
}
}
@@ -3943,14 +3979,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
- spin_lock(&info->block_group_cache_lock);
+ write_lock(&info->block_group_cache_lock);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
}
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->unused_bgs)) {
@@ -3980,14 +4016,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->zone_active_bgs_lock);
- spin_lock(&info->block_group_cache_lock);
- while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+ write_lock(&info->block_group_cache_lock);
+ while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
cache_node);
- rb_erase(&block_group->cache_node,
- &info->block_group_cache_tree);
+ rb_erase_cached(&block_group->cache_node,
+ &info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
list_del(&block_group->list);
@@ -4010,9 +4046,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
ASSERT(block_group->swap_extents == 0);
btrfs_put_block_group(block_group);
- spin_lock(&info->block_group_cache_lock);
+ write_lock(&info->block_group_cache_lock);
}
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
btrfs_release_global_block_rsv(info);
@@ -4063,7 +4099,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
spin_lock(&block_group->lock);
cleanup = (atomic_dec_and_test(&block_group->frozen) &&
- block_group->removed);
+ test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
spin_unlock(&block_group->lock);
if (cleanup) {
@@ -4084,7 +4120,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
* tasks trimming this block group have left 1 entry each one.
* Free them if any.
*/
- __btrfs_remove_free_space_cache(block_group->free_space_ctl);
+ btrfs_remove_free_space_cache(block_group);
}
}
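Note on the do_chunk_alloc() change in the hunks above: its return type moves from int to a pointer that carries either the allocated block group (with a reference taken for the caller) or a negative errno. Below is a minimal userspace sketch of that ERR_PTR/IS_ERR convention, with the kernel helpers reimplemented locally; all names are illustrative, not the kernel's.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct block_group { uint64_t start; int refs; };

static struct block_group *alloc_chunk(int simulate_enospc)
{
	struct block_group *bg;

	if (simulate_enospc)
		return ERR_PTR(-ENOSPC);	/* the error travels in the pointer */
	bg = calloc(1, sizeof(*bg));
	if (!bg)
		return ERR_PTR(-ENOMEM);
	bg->refs = 1;				/* the caller receives a reference */
	return bg;
}

int main(void)
{
	struct block_group *bg = alloc_chunk(0);

	if (IS_ERR(bg))
		return (int)-PTR_ERR(bg);
	printf("got block group, refs=%d\n", bg->refs);
	free(bg);				/* stands in for btrfs_put_block_group() */
	return 0;
}

A single return value covers both outcomes, which is why the callers above only need an IS_ERR() check before using the block group.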
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 93aabc68bb6a..8fb14b99a1d1 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -35,11 +35,33 @@ enum btrfs_discard_state {
* the FS with empty chunks
*
* CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
+ * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from
+ * find_free_extent() that also activates the zone
*/
enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_NO_FORCE,
CHUNK_ALLOC_LIMITED,
CHUNK_ALLOC_FORCE,
+ CHUNK_ALLOC_FORCE_FOR_EXTENT,
+};
+
+/* Block group flags set at runtime */
+enum btrfs_block_group_flags {
+ BLOCK_GROUP_FLAG_IREF,
+ BLOCK_GROUP_FLAG_REMOVED,
+ BLOCK_GROUP_FLAG_TO_COPY,
+ BLOCK_GROUP_FLAG_RELOCATING_REPAIR,
+ BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+ BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+};
+
+enum btrfs_caching_type {
+ BTRFS_CACHE_NO,
+ BTRFS_CACHE_STARTED,
+ BTRFS_CACHE_FINISHED,
+ BTRFS_CACHE_ERROR,
};
struct btrfs_caching_control {
@@ -48,13 +70,20 @@ struct btrfs_caching_control {
wait_queue_head_t wait;
struct btrfs_work work;
struct btrfs_block_group *block_group;
- u64 progress;
refcount_t count;
};
/* Once caching_thread() finds this much free space, it will wake up waiters. */
#define CACHING_CTL_WAKE_UP SZ_2M
+/*
+ * Tree to record all locked full stripes of a RAID5/6 block group
+ */
+struct btrfs_full_stripe_locks_tree {
+ struct rb_root root;
+ struct mutex lock;
+};
+
struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
struct inode *inode;
@@ -91,22 +120,15 @@ struct btrfs_block_group {
/* For raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
+ unsigned long runtime_flags;
unsigned int ro;
- unsigned int iref:1;
- unsigned int has_caching_ctl:1;
- unsigned int removed:1;
- unsigned int to_copy:1;
- unsigned int relocating_repair:1;
- unsigned int chunk_item_inserted:1;
- unsigned int zone_is_active:1;
int disk_cache_state;
/* Cache tracking stuff */
int cached;
struct btrfs_caching_control *caching_ctl;
- u64 last_byte_to_unpin;
struct btrfs_space_info *space_info;
@@ -208,6 +230,8 @@ struct btrfs_block_group {
u64 meta_write_pointer;
struct map_lookup *physical_map;
struct list_head active_bg_list;
+ struct work_struct zone_finish_work;
+ struct extent_buffer *last_eb;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -250,14 +274,13 @@ void btrfs_put_block_group(struct btrfs_block_group *cache);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr);
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache);
-int btrfs_cache_block_group(struct btrfs_block_group *cache,
- int load_cache_only);
+int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
struct btrfs_caching_control *btrfs_get_caching_control(
struct btrfs_block_group *cache);
@@ -299,8 +322,6 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
-void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
- struct btrfs_caching_control *caching_ctl);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
struct block_device *bdev, u64 physical, u64 **logical,
int *naddrs, int *stripe_len);
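The new btrfs_block_group_flags enum plus the single unsigned long runtime_flags field replaces the old one-bit bitfields (iref, removed, to_copy, ...). The point of the conversion is that numbered bits can be flipped with atomic bitops, so a flag change no longer needs a full read-modify-write under the block group's spinlock. A reduced userspace model, assuming C11 atomics as a stand-in for the kernel's set_bit()/test_and_clear_bit(); names are illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum bg_flags { BG_FLAG_IREF, BG_FLAG_REMOVED };

struct block_group { atomic_ulong runtime_flags; };

static bool test_and_clear_flag(struct block_group *bg, int bit)
{
	unsigned long mask = 1UL << bit;

	/* atomically clear the bit and report whether it was set */
	return atomic_fetch_and(&bg->runtime_flags, ~mask) & mask;
}

int main(void)
{
	struct block_group bg = { .runtime_flags = 1UL << BG_FLAG_IREF };

	if (test_and_clear_flag(&bg, BG_FLAG_IREF))
		printf("had an inode reference, dropping it\n");
	/* the second call sees the bit already cleared */
	printf("again: %d\n", test_and_clear_flag(&bg, BG_FLAG_IREF));
	return 0;
}

This is exactly the shape of the btrfs_put_block_group_cache() hunk above, where test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, ...) replaces the locked check of the old iref bitfield.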
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index b3ee49b0b1e8..ec96285357e0 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -118,7 +118,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
if (block_rsv->reserved >= block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
block_rsv->reserved = block_rsv->size;
- block_rsv->full = 1;
+ block_rsv->full = true;
} else {
num_bytes = 0;
}
@@ -142,7 +142,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
bytes_to_add = min(num_bytes, bytes_to_add);
dest->reserved += bytes_to_add;
if (dest->reserved >= dest->size)
- dest->full = 1;
+ dest->full = true;
num_bytes -= bytes_to_add;
}
spin_unlock(&dest->lock);
@@ -171,7 +171,7 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
return 0;
}
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type)
{
memset(rsv, 0, sizeof(*rsv));
spin_lock_init(&rsv->lock);
@@ -180,7 +180,7 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv,
- unsigned short type)
+ enum btrfs_rsv_type type)
{
btrfs_init_block_rsv(rsv, type);
rsv->space_info = btrfs_find_space_info(fs_info,
@@ -188,7 +188,7 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
}
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
- unsigned short type)
+ enum btrfs_rsv_type type)
{
struct btrfs_block_rsv *block_rsv;
@@ -286,7 +286,7 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
*/
if (block_rsv == delayed_rsv)
target = global_rsv;
- else if (block_rsv != global_rsv && !delayed_rsv->full)
+ else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv))
target = delayed_rsv;
if (target && block_rsv->space_info != target->space_info)
@@ -304,7 +304,7 @@ int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
if (block_rsv->reserved >= num_bytes) {
block_rsv->reserved -= num_bytes;
if (block_rsv->reserved < block_rsv->size)
- block_rsv->full = 0;
+ block_rsv->full = false;
ret = 0;
}
spin_unlock(&block_rsv->lock);
@@ -319,7 +319,7 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
if (update_size)
block_rsv->size += num_bytes;
else if (block_rsv->reserved >= block_rsv->size)
- block_rsv->full = 1;
+ block_rsv->full = true;
spin_unlock(&block_rsv->lock);
}
@@ -341,7 +341,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
}
global_rsv->reserved -= num_bytes;
if (global_rsv->reserved < global_rsv->size)
- global_rsv->full = 0;
+ global_rsv->full = false;
spin_unlock(&global_rsv->lock);
btrfs_block_rsv_add_bytes(dest, num_bytes, true);
@@ -408,10 +408,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
btrfs_try_granting_tickets(fs_info, sinfo);
}
- if (block_rsv->reserved == block_rsv->size)
- block_rsv->full = 1;
- else
- block_rsv->full = 0;
+ block_rsv->full = (block_rsv->reserved == block_rsv->size);
if (block_rsv->size >= sinfo->total_bytes)
sinfo->force_alloc = CHUNK_ALLOC_FORCE;
@@ -427,6 +424,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
case BTRFS_CSUM_TREE_OBJECTID:
case BTRFS_EXTENT_TREE_OBJECTID:
case BTRFS_FREE_SPACE_TREE_OBJECTID:
+ case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
root->block_rsv = &fs_info->delayed_refs_rsv;
break;
case BTRFS_ROOT_TREE_OBJECTID:
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 3b67ff08d434..578c3497a455 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -9,7 +9,7 @@ enum btrfs_reserve_flush_enum;
/*
* Types of block reserves
*/
-enum {
+enum btrfs_rsv_type {
BTRFS_BLOCK_RSV_GLOBAL,
BTRFS_BLOCK_RSV_DELALLOC,
BTRFS_BLOCK_RSV_TRANS,
@@ -25,9 +25,10 @@ struct btrfs_block_rsv {
u64 reserved;
struct btrfs_space_info *space_info;
spinlock_t lock;
- unsigned short full;
- unsigned short type;
- unsigned short failfast;
+ bool full;
+ bool failfast;
+ /* Block reserve type, one of BTRFS_BLOCK_RSV_* */
+ enum btrfs_rsv_type type:8;
/*
* Qgroup equivalent for @size @reserved
@@ -49,13 +50,13 @@ struct btrfs_block_rsv {
u64 qgroup_rsv_reserved;
};
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type);
void btrfs_init_root_block_rsv(struct btrfs_root *root);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
- unsigned short type);
+ enum btrfs_rsv_type type);
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv,
- unsigned short type);
+ enum btrfs_rsv_type type);
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
@@ -91,4 +92,13 @@ static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL);
}
+/*
+ * Fast path to check if the reserve is full, may be carefully used outside of
+ * locks.
+ */
+static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
+{
+ return data_race(rsv->full);
+}
+
#endif /* BTRFS_BLOCK_RSV_H */
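btrfs_block_rsv_full() reads rsv->full without taking rsv->lock; in the kernel, data_race() marks that read as intentionally racy so KCSAN does not flag it. The closest portable analogue is a relaxed atomic load, sketched below with illustrative names: a stale answer is acceptable on this fast path, a torn one is not.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct block_rsv {
	/* writers update this under a lock; readers may peek racily */
	atomic_bool full;
};

static bool block_rsv_full(struct block_rsv *rsv)
{
	/* relaxed: no ordering needed, we only want an untorn value */
	return atomic_load_explicit(&rsv->full, memory_order_relaxed);
}

int main(void)
{
	struct block_rsv rsv = { .full = true };

	printf("full? %d\n", block_rsv_full(&rsv));
	return 0;
}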
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 47e72d72f7d0..54c2ccb36b61 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -65,6 +65,8 @@ enum {
* on the same file.
*/
BTRFS_INODE_VERITY_IN_PROGRESS,
+ /* Set when this inode is a free space inode. */
+ BTRFS_INODE_FREE_SPACE_INODE,
};
/* in memory btrfs inode */
@@ -94,7 +96,8 @@ struct btrfs_inode {
/* special utility tree used to record which mirrors have already been
* tried when checksums fail for a given block
*/
- struct extent_io_tree io_failure_tree;
+ struct rb_root io_failure_tree;
+ spinlock_t io_failure_lock;
/*
* Keep track of where the inode has extent items mapped in order to
@@ -250,11 +253,6 @@ struct btrfs_inode {
struct inode vfs_inode;
};
-static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
-{
- return inode->root->fs_info->sectorsize;
-}
-
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -272,26 +270,31 @@ static inline unsigned long btrfs_inode_hash(u64 objectid,
return (unsigned long)h;
}
-static inline void btrfs_insert_inode_hash(struct inode *inode)
-{
- unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
-
- __insert_inode_hash(inode, h);
-}
+#if BITS_PER_LONG == 32
+/*
+ * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so
+ * we use the inode's location objectid which is a u64 to avoid truncation.
+ */
static inline u64 btrfs_ino(const struct btrfs_inode *inode)
{
u64 ino = inode->location.objectid;
- /*
- * !ino: btree_inode
- * type == BTRFS_ROOT_ITEM_KEY: subvol dir
- */
- if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY)
+ /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */
+ if (inode->location.type == BTRFS_ROOT_ITEM_KEY)
ino = inode->vfs_inode.i_ino;
return ino;
}
+#else
+
+static inline u64 btrfs_ino(const struct btrfs_inode *inode)
+{
+ return inode->vfs_inode.i_ino;
+}
+
+#endif
+
static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
{
i_size_write(&inode->vfs_inode, size);
@@ -300,14 +303,7 @@ static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
{
- struct btrfs_root *root = inode->root;
-
- if (root == root->fs_info->tree_root &&
- btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
- return true;
- if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID)
- return true;
- return false;
+ return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags);
}
static inline bool is_data_inode(struct inode *inode)
@@ -384,30 +380,16 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
return ret;
}
-struct btrfs_dio_private {
- struct inode *inode;
-
- /*
- * Since DIO can use anonymous page, we cannot use page_offset() to
- * grab the file offset, thus need a dedicated member for file offset.
- */
- u64 file_offset;
- u64 disk_bytenr;
- /* Used for bio::bi_size */
- u32 bytes;
-
- /*
- * References to this structure. There is one reference per in-flight
- * bio plus one while we're still setting up.
- */
- refcount_t refs;
-
- /* dio_bio came from fs/direct-io.c */
- struct bio *dio_bio;
-
- /* Array of checksums */
- u8 csums[];
-};
+/*
+ * Check if the inode has flags compatible with compression
+ */
+static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
+{
+ if (inode->flags & BTRFS_INODE_NODATACOW ||
+ inode->flags & BTRFS_INODE_NODATASUM)
+ return false;
+ return true;
+}
/*
* btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
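The BITS_PER_LONG == 32 split in btrfs_ino() earlier in this file exists because i_ino is an unsigned long, which is only 32 bits wide on 32-bit targets; assigning a 64-bit objectid to it silently drops the high word. A small standalone demonstration of the truncation, with hypothetical values:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t objectid = 0x100000001ULL;		/* needs more than 32 bits */
	unsigned long i_ino = (unsigned long)objectid;

	printf("objectid=%" PRIu64 " i_ino=%lu\n", objectid, i_ino);
	/*
	 * On a 32-bit build i_ino prints 1: the high word is gone, which
	 * is why the inode keeps the full u64 location.objectid around.
	 */
	return 0;
}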
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index abac86a75840..98c6e5feab19 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -152,7 +152,7 @@ struct btrfsic_block {
struct btrfsic_block *next_in_same_bio;
void *orig_bio_private;
bio_end_io_t *orig_bio_end_io;
- int submit_bio_bh_rw;
+ blk_opf_t submit_bio_bh_rw;
u64 flush_gen; /* only valid if !never_written */
};
@@ -1552,21 +1552,18 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return -ENOMEM;
block_ctx->datav = block_ctx->mem_to_free;
block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
- for (i = 0; i < num_pages; i++) {
- block_ctx->pagev[i] = alloc_page(GFP_NOFS);
- if (!block_ctx->pagev[i])
- return -1;
- }
+ ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev);
+ if (ret)
+ return ret;
dev_bytenr = block_ctx->dev_bytenr;
for (i = 0; i < num_pages;) {
struct bio *bio;
unsigned int j;
- bio = btrfs_bio_alloc(num_pages - i);
- bio_set_dev(bio, block_ctx->dev->bdev);
+ bio = bio_alloc(block_ctx->dev->bdev, num_pages - i,
+ REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = dev_bytenr >> 9;
- bio->bi_opf = REQ_OP_READ;
for (j = i; j < num_pages; j++) {
ret = bio_add_page(bio, block_ctx->pagev[j],
@@ -1684,7 +1681,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
u64 dev_bytenr, char **mapped_datav,
unsigned int num_pages,
struct bio *bio, int *bio_is_patched,
- int submit_bio_bh_rw)
+ blk_opf_t submit_bio_bh_rw)
{
int is_metadata;
struct btrfsic_block *block;
@@ -2033,7 +2030,7 @@ continue_loop:
static void btrfsic_bio_end_io(struct bio *bp)
{
- struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+ struct btrfsic_block *block = bp->bi_private;
int iodone_w_error;
/* mutex is not held! This is not safe if IO is not yet completed
@@ -2635,100 +2632,93 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
&btrfsic_dev_state_hashtable);
}
-static void __btrfsic_submit_bio(struct bio *bio)
+static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
{
- struct btrfsic_dev_state *dev_state;
+ unsigned int segs = bio_segments(bio);
+ u64 dev_bytenr = 512 * bio->bi_iter.bi_sector;
+ u64 cur_bytenr = dev_bytenr;
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+ char **mapped_datav;
+ int bio_is_patched = 0;
+ int i = 0;
+
+ if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ pr_info(
+"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, segs,
+ bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
- if (!btrfsic_is_initialized)
+ mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS);
+ if (!mapped_datav)
return;
- mutex_lock(&btrfsic_mutex);
- /* since btrfsic_submit_bio() is also called before
- * btrfsic_mount(), this might return NULL */
- dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
- if (NULL != dev_state &&
- (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
- int i = 0;
- u64 dev_bytenr;
- u64 cur_bytenr;
- struct bio_vec bvec;
- struct bvec_iter iter;
- int bio_is_patched;
- char **mapped_datav;
- unsigned int segs = bio_segments(bio);
-
- dev_bytenr = 512 * bio->bi_iter.bi_sector;
- bio_is_patched = 0;
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
- bio_op(bio), bio->bi_opf, segs,
- bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
-
- mapped_datav = kmalloc_array(segs,
- sizeof(*mapped_datav), GFP_NOFS);
- if (!mapped_datav)
- goto leave;
- cur_bytenr = dev_bytenr;
-
- bio_for_each_segment(bvec, bio, iter) {
- BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = page_address(bvec.bv_page);
- i++;
-
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
- pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
- i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
- cur_bytenr += bvec.bv_len;
- }
- btrfsic_process_written_block(dev_state, dev_bytenr,
- mapped_datav, segs,
- bio, &bio_is_patched,
- bio->bi_opf);
- kfree(mapped_datav);
- } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_bdev);
- if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
- if ((dev_state->state->print_mask &
- (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE)))
- pr_info(
-"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->bdev);
- } else {
- struct btrfsic_block *const block =
- &dev_state->dummy_block_for_bio_bh_flush;
+ bio_for_each_segment(bvec, bio, iter) {
+ BUG_ON(bvec.bv_len != PAGE_SIZE);
+ mapped_datav[i] = page_address(bvec.bv_page);
+ i++;
- block->is_iodone = 0;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = bio->bi_opf;
- block->orig_bio_private = bio->bi_private;
- block->orig_bio_end_io = bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- }
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
+ pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
+ i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
+ cur_bytenr += bvec.bv_len;
}
-leave:
- mutex_unlock(&btrfsic_mutex);
+
+ btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs,
+ bio, &bio_is_patched, bio->bi_opf);
+ kfree(mapped_datav);
}
-void btrfsic_submit_bio(struct bio *bio)
+static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
{
- __btrfsic_submit_bio(bio);
- submit_bio(bio);
+ if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, bio->bi_bdev);
+
+ if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = bio->bi_opf;
+ block->orig_bio_private = bio->bi_private;
+ block->orig_bio_end_io = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ } else if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE))) {
+ pr_info(
+"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
+ dev_state->bdev);
+ }
}
-int btrfsic_submit_bio_wait(struct bio *bio)
+void btrfsic_check_bio(struct bio *bio)
{
- __btrfsic_submit_bio(bio);
- return submit_bio_wait(bio);
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized)
+ return;
+
+ /*
+ * We can be called before btrfsic_mount, so there might not be a
+ * dev_state.
+ */
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
+ mutex_lock(&btrfsic_mutex);
+ if (dev_state) {
+ if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio))
+ btrfsic_check_write_bio(bio, dev_state);
+ else if (bio->bi_opf & REQ_PREFLUSH)
+ btrfsic_check_flush_bio(bio, dev_state);
+ }
+ mutex_unlock(&btrfsic_mutex);
}
int btrfsic_mount(struct btrfs_fs_info *fs_info,
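The rewrite above replaces one deeply nested __btrfsic_submit_bio() with a thin classifier that dispatches to btrfsic_check_write_bio() or btrfsic_check_flush_bio(). A reduced sketch of that shape, with illustrative types and no kernel dependencies:

#include <stdio.h>

enum op { OP_WRITE, OP_FLUSH, OP_OTHER };

struct bio { enum op op; int has_data; };

static void check_write(struct bio *b) { printf("check write bio, data=%d\n", b->has_data); }
static void check_flush(struct bio *b) { printf("check flush bio, op=%d\n", (int)b->op); }

static void check_bio(struct bio *b)
{
	if (b->op == OP_WRITE && b->has_data)
		check_write(b);
	else if (b->op == OP_FLUSH)
		check_flush(b);
	/* everything else passes through unchecked */
}

int main(void)
{
	struct bio w = { OP_WRITE, 1 }, f = { OP_FLUSH, 0 };

	check_bio(&w);
	check_bio(&f);
	return 0;
}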
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index bcc730a06cb5..e4c8aed7996f 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -7,11 +7,9 @@
#define BTRFS_CHECK_INTEGRITY_H
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-void btrfsic_submit_bio(struct bio *bio);
-int btrfsic_submit_bio_wait(struct bio *bio);
+void btrfsic_check_bio(struct bio *bio);
#else
-#define btrfsic_submit_bio submit_bio
-#define btrfsic_submit_bio_wait submit_bio_wait
+static inline void btrfsic_check_bio(struct bio *bio) { }
#endif
int btrfsic_mount(struct btrfs_fs_info *fs_info,
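This header keeps call sites identical in both configurations: with CONFIG_BTRFS_FS_CHECK_INTEGRITY set, the prototype resolves to the real function; without it, an empty static inline compiles away to nothing. A standalone illustration of the pattern, where CONFIG_CHECKER is a made-up macro standing in for the real config option:

#include <stdio.h>

#ifdef CONFIG_CHECKER
void check_bio(int bio) { printf("checking bio %d\n", bio); }
#else
static inline void check_bio(int bio) { (void)bio; }	/* compiles away */
#endif

int main(void)
{
	check_bio(42);	/* no #ifdef needed at the call site */
	return 0;
}

The static inline stub is preferred over the old #define aliasing because it type-checks its arguments even in the disabled configuration.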
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index be476f094300..c0a08064b0a7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -15,6 +15,7 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
+#include <linux/psi.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
@@ -136,109 +137,14 @@ static int compression_decompress(int type, struct list_head *ws,
static int btrfs_decompress_bio(struct compressed_bio *cb);
-static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
- unsigned long disk_size)
-{
- return sizeof(struct compressed_bio) +
- (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
-}
-
-static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
- u64 disk_start)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- const u32 csum_size = fs_info->csum_size;
- const u32 sectorsize = fs_info->sectorsize;
- struct page *page;
- unsigned int i;
- char *kaddr;
- u8 csum[BTRFS_CSUM_SIZE];
- struct compressed_bio *cb = bio->bi_private;
- u8 *cb_sum = cb->sums;
-
- if ((inode->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
- return 0;
-
- shash->tfm = fs_info->csum_shash;
-
- for (i = 0; i < cb->nr_pages; i++) {
- u32 pg_offset;
- u32 bytes_left = PAGE_SIZE;
- page = cb->compressed_pages[i];
-
- /* Determine the remaining bytes inside the page first */
- if (i == cb->nr_pages - 1)
- bytes_left = cb->compressed_len - i * PAGE_SIZE;
-
- /* Hash through the page sector by sector */
- for (pg_offset = 0; pg_offset < bytes_left;
- pg_offset += sectorsize) {
- kaddr = kmap_atomic(page);
- crypto_shash_digest(shash, kaddr + pg_offset,
- sectorsize, csum);
- kunmap_atomic(kaddr);
-
- if (memcmp(&csum, cb_sum, csum_size) != 0) {
- btrfs_print_data_csum_error(inode, disk_start,
- csum, cb_sum, cb->mirror_num);
- if (btrfs_bio(bio)->device)
- btrfs_dev_stat_inc_and_print(
- btrfs_bio(bio)->device,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- return -EIO;
- }
- cb_sum += csum_size;
- disk_start += sectorsize;
- }
- }
- return 0;
-}
-
-/*
- * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
- *
- * Return true if there is no pending bio nor io.
- * Return false otherwise.
- */
-static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
- unsigned int bi_size = 0;
- bool last_io = false;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * At endio time, bi_iter.bi_size doesn't represent the real bio size.
- * Thus here we have to iterate through all segments to grab correct
- * bio size.
- */
- bio_for_each_segment_all(bvec, bio, iter_all)
- bi_size += bvec->bv_len;
-
- if (bio->bi_status)
- cb->status = bio->bi_status;
-
- ASSERT(bi_size && bi_size <= cb->compressed_len);
- last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
- &cb->pending_sectors);
- /*
- * Here we must wake up the possible error handler after all other
- * operations on @cb finished, or we can race with
- * finish_compressed_bio_*() which may free @cb.
- */
- wake_up_var(cb);
-
- return last_io;
-}
-
static void finish_compressed_bio_read(struct compressed_bio *cb)
{
unsigned int index;
struct page *page;
+ if (cb->status == BLK_STS_OK)
+ cb->status = errno_to_blk_status(btrfs_decompress_bio(cb));
+
/* Release the compressed pages */
for (index = 0; index < cb->nr_pages; index++) {
page = cb->compressed_pages[index];
@@ -247,86 +153,60 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
}
/* Do io completion on the original bio */
- if (cb->status != BLK_STS_OK) {
- cb->orig_bio->bi_status = cb->status;
- bio_endio(cb->orig_bio);
- } else {
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * We have verified the checksum already, set page checked so
- * the end_io handlers know about it
- */
- ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
- u64 bvec_start = page_offset(bvec->bv_page) +
- bvec->bv_offset;
-
- btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
- bvec->bv_page, bvec_start,
- bvec->bv_len);
- }
-
- bio_endio(cb->orig_bio);
- }
+ btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status);
/* Finally free the cb struct */
kfree(cb->compressed_pages);
kfree(cb);
}
-/* when we finish reading compressed pages from the disk, we
- * decompress them and then run the bio end_io routines on the
- * decompressed pages (in the inode address space).
- *
- * This allows the checksumming and other IO error handling routines
- * to work normally
- *
- * The compressed pages are freed here, and it must be run
- * in process context
+/*
+ * Verify the checksums and kick off repair if needed on the uncompressed data
+ * before decompressing it into the original bio and freeing the uncompressed
+ * pages.
*/
-static void end_compressed_bio_read(struct bio *bio)
+static void end_compressed_bio_read(struct btrfs_bio *bbio)
{
- struct compressed_bio *cb = bio->bi_private;
- struct inode *inode;
- unsigned int mirror = btrfs_bio(bio)->mirror_num;
- int ret = 0;
-
- if (!dec_and_test_compressed_bio(cb, bio))
- goto out;
-
- /*
- * Record the correct mirror_num in cb->orig_bio so that
- * read-repair can work properly.
- */
- btrfs_bio(cb->orig_bio)->mirror_num = mirror;
- cb->mirror_num = mirror;
-
- /*
- * Some IO in this cb have failed, just skip checksum as there
- * is no way it could be correct.
- */
- if (cb->status != BLK_STS_OK)
- goto csum_failed;
-
- inode = cb->inode;
- ret = check_compressed_csum(BTRFS_I(inode), bio,
- bio->bi_iter.bi_sector << 9);
- if (ret)
- goto csum_failed;
+ struct compressed_bio *cb = bbio->private;
+ struct inode *inode = cb->inode;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *bi = BTRFS_I(inode);
+ bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) &&
+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ blk_status_t status = bbio->bio.bi_status;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ u32 offset;
+
+ btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
+ u64 start = bbio->file_offset + offset;
+
+ if (!status &&
+ (!csum || !btrfs_check_data_csum(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset))) {
+ btrfs_clean_io_failure(bi, start, bv.bv_page,
+ bv.bv_offset);
+ } else {
+ int ret;
+
+ refcount_inc(&cb->pending_ios);
+ ret = btrfs_repair_one_sector(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset,
+ btrfs_submit_data_read_bio);
+ if (ret) {
+ refcount_dec(&cb->pending_ios);
+ status = errno_to_blk_status(ret);
+ }
+ }
+ }
- /* ok, we're the last bio for this extent, lets start
- * the decompression.
- */
- ret = btrfs_decompress_bio(cb);
+ if (status)
+ cb->status = status;
-csum_failed:
- if (ret)
- cb->status = errno_to_blk_status(ret);
- finish_compressed_bio_read(cb);
-out:
- bio_put(bio);
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_read(cb);
+ btrfs_bio_free_csum(bbio);
+ bio_put(&bbio->bio);
}
/*
@@ -403,6 +283,14 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
kfree(cb);
}
+static void btrfs_finish_compressed_write_work(struct work_struct *work)
+{
+ struct compressed_bio *cb =
+ container_of(work, struct compressed_bio, write_end_work);
+
+ finish_compressed_bio_write(cb);
+}
+
/*
* Do the cleanup once all the compressed pages hit the disk. This will clear
* writeback on the file pages and free the compressed pages.
@@ -410,32 +298,20 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
* This also calls the writeback end hooks for the file pages so that metadata
* and checksums can be updated in the file.
*/
-static void end_compressed_bio_write(struct bio *bio)
+static void end_compressed_bio_write(struct btrfs_bio *bbio)
{
- struct compressed_bio *cb = bio->bi_private;
+ struct compressed_bio *cb = bbio->private;
- if (!dec_and_test_compressed_bio(cb, bio))
- goto out;
+ if (bbio->bio.bi_status)
+ cb->status = bbio->bio.bi_status;
- btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+ if (refcount_dec_and_test(&cb->pending_ios)) {
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
- finish_compressed_bio_write(cb);
-out:
- bio_put(bio);
-}
-
-static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
- struct compressed_bio *cb,
- struct bio *bio, int mirror_num)
-{
- blk_status_t ret;
-
- ASSERT(bio->bi_iter.bi_size);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret)
- return ret;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- return ret;
+ btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio);
+ queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
+ }
+ bio_put(&bbio->bio);
}
/*
@@ -456,7 +332,8 @@ static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
- unsigned int opf, bio_end_io_t endio_func,
+ blk_opf_t opf,
+ btrfs_bio_end_io_t endio_func,
u64 *next_stripe_start)
{
struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
@@ -465,12 +342,8 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte
struct bio *bio;
int ret;
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
-
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb);
bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
- bio->bi_opf = opf;
- bio->bi_private = cb;
- bio->bi_end_io = endio_func;
em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
if (IS_ERR(em)) {
@@ -488,7 +361,7 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte
return ERR_PTR(ret);
}
*next_stripe_start = disk_bytenr + geom.len;
-
+ refcount_inc(&cb->pending_ios);
return bio;
}
@@ -506,7 +379,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned int compressed_len,
struct page **compressed_pages,
unsigned int nr_pages,
- unsigned int write_flags,
+ blk_opf_t write_flags,
struct cgroup_subsys_state *blkcg_css,
bool writeback)
{
@@ -515,28 +388,30 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct compressed_bio *cb;
u64 cur_disk_bytenr = disk_start;
u64 next_stripe_start;
- blk_status_t ret;
+ blk_status_t ret = BLK_STS_OK;
int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
const bool use_append = btrfs_use_zone_append(inode, disk_start);
- const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
+ const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(len, fs_info->sectorsize));
- cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
+ cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
if (!cb)
return BLK_STS_RESOURCE;
- refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
+ refcount_set(&cb->pending_ios, 1);
cb->status = BLK_STS_OK;
cb->inode = &inode->vfs_inode;
cb->start = start;
cb->len = len;
- cb->mirror_num = 0;
cb->compressed_pages = compressed_pages;
cb->compressed_len = compressed_len;
cb->writeback = writeback;
- cb->orig_bio = NULL;
+ INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
cb->nr_pages = nr_pages;
+ if (blkcg_css)
+ kthread_associate_blkcg(blkcg_css);
+
while (cur_disk_bytenr < disk_start + compressed_len) {
u64 offset = cur_disk_bytenr - disk_start;
unsigned int index = offset >> PAGE_SHIFT;
@@ -552,9 +427,10 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
&next_stripe_start);
if (IS_ERR(bio)) {
ret = errno_to_blk_status(PTR_ERR(bio));
- bio = NULL;
- goto finish_cb;
+ break;
}
+ if (blkcg_css)
+ bio->bi_opf |= REQ_CGROUP_PUNT;
}
/*
* We should never reach next_stripe_start start as we will
@@ -595,41 +471,24 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
if (submit) {
if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, start, true);
- if (ret)
- goto finish_cb;
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ break;
+ }
}
- ret = submit_compressed_bio(fs_info, cb, bio, 0);
- if (ret)
- goto finish_cb;
+ ASSERT(bio->bi_iter.bi_size);
+ btrfs_submit_bio(fs_info, bio, 0);
bio = NULL;
}
cond_resched();
}
+
if (blkcg_css)
kthread_associate_blkcg(NULL);
- return 0;
-
-finish_cb:
- if (bio) {
- bio->bi_status = ret;
- bio_endio(bio);
- }
- /* Last byte of @cb is submitted, endio will free @cb */
- if (cur_disk_bytenr == disk_start + compressed_len)
- return ret;
-
- wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
- (disk_start + compressed_len - cur_disk_bytenr) >>
- fs_info->sectorsize_bits);
- /*
- * Even with previous bio ended, we should still have io not yet
- * submitted, thus need to finish manually.
- */
- ASSERT(refcount_read(&cb->pending_sectors));
- /* Now we are the only one referring @cb, can finish it safely. */
- finish_compressed_bio_write(cb);
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_write(cb);
return ret;
}
@@ -653,7 +512,8 @@ static u64 bio_end_offset(struct bio *bio)
*/
static noinline int add_ra_bio_pages(struct inode *inode,
u64 compressed_end,
- struct compressed_bio *cb)
+ struct compressed_bio *cb,
+ unsigned long *pflags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long end_index;
@@ -722,6 +582,9 @@ static noinline int add_ra_bio_pages(struct inode *inode,
continue;
}
+ if (PageWorkingset(page))
+ psi_memstall_enter(pflags);
+
ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_page(page);
@@ -730,7 +593,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
- lock_extent(tree, cur, page_end);
+ lock_extent(tree, cur, page_end, NULL);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
read_unlock(&em_tree->lock);
@@ -744,7 +607,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
(cur + fs_info->sectorsize > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
- unlock_extent(tree, cur, page_end);
+ unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
break;
@@ -758,14 +621,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
int zeros;
zeros = PAGE_SIZE - zero_offset;
memzero_page(page, zero_offset, zeros);
- flush_dcache_page(page);
}
}
add_size = min(em->start + em->len, page_end + 1) - cur;
ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
if (ret != add_size) {
- unlock_extent(tree, cur, page_end);
+ unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
break;
@@ -794,15 +656,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* After the compressed pages are read, we copy the bytes into the
* bio we were passed and then call the bio end_io calls
*/
-blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
unsigned int compressed_len;
- unsigned int nr_pages;
- unsigned int pg_index;
struct bio *comp_bio = NULL;
const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 cur_disk_byte = disk_bytenr;
@@ -811,9 +671,11 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_len;
u64 em_start;
struct extent_map *em;
+ /* Initialize to 1 so we skip psi_memstall_leave() unless needed */
+ unsigned long pflags = 1;
blk_status_t ret;
- int faili = 0;
- u8 *sums;
+ int ret2;
+ int i;
em_tree = &BTRFS_I(inode)->extent_tree;
@@ -831,50 +693,42 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
compressed_len = em->block_len;
- cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
+ cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
if (!cb) {
ret = BLK_STS_RESOURCE;
goto out;
}
- refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
+ refcount_set(&cb->pending_ios, 1);
cb->status = BLK_STS_OK;
cb->inode = inode;
- cb->mirror_num = mirror_num;
- sums = cb->sums;
cb->start = em->orig_start;
em_len = em->len;
em_start = em->start;
- free_extent_map(em);
- em = NULL;
-
cb->len = bio->bi_iter.bi_size;
cb->compressed_len = compressed_len;
- cb->compress_type = extent_compress_type(bio_flags);
+ cb->compress_type = em->compress_type;
cb->orig_bio = bio;
- nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
- cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
- GFP_NOFS);
+ free_extent_map(em);
+ em = NULL;
+
+ cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
+ cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
if (!cb->compressed_pages) {
ret = BLK_STS_RESOURCE;
- goto fail1;
+ goto fail;
}
- for (pg_index = 0; pg_index < nr_pages; pg_index++) {
- cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
- if (!cb->compressed_pages[pg_index]) {
- faili = pg_index - 1;
- ret = BLK_STS_RESOURCE;
- goto fail2;
- }
+ ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
+ if (ret2) {
+ ret = BLK_STS_RESOURCE;
+ goto fail;
}
- faili = nr_pages - 1;
- cb->nr_pages = nr_pages;
- add_ra_bio_pages(inode, em_start + em_len, cb);
+ add_ra_bio_pages(inode, em_start + em_len, cb, &pflags);
/* include any pages we added in add_ra_bio_pages() */
cb->len = bio->bi_iter.bi_size;
@@ -893,9 +747,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
REQ_OP_READ, end_compressed_bio_read,
&next_stripe_start);
if (IS_ERR(comp_bio)) {
- ret = errno_to_blk_status(PTR_ERR(comp_bio));
- comp_bio = NULL;
- goto finish_cb;
+ cb->status = errno_to_blk_status(PTR_ERR(comp_bio));
+ break;
}
}
/*
@@ -931,58 +784,51 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
submit = true;
if (submit) {
- unsigned int nr_sectors;
+ /* Save the original iter for read repair */
+ if (bio_op(comp_bio) == REQ_OP_READ)
+ btrfs_bio(comp_bio)->iter = comp_bio->bi_iter;
- ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
- if (ret)
- goto finish_cb;
+ /*
+ * Save the initial offset of this chunk, as there
+ * is no direct correlation between compressed pages and
+ * the original file offset. The field is only used for
+ * printing error messages.
+ */
+ btrfs_bio(comp_bio)->file_offset = file_offset;
- nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
- fs_info->sectorsize);
- sums += fs_info->csum_size * nr_sectors;
+ ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL);
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(comp_bio), ret);
+ break;
+ }
- ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
- if (ret)
- goto finish_cb;
+ ASSERT(comp_bio->bi_iter.bi_size);
+ btrfs_submit_bio(fs_info, comp_bio, mirror_num);
comp_bio = NULL;
}
}
- return BLK_STS_OK;
-fail2:
- while (faili >= 0) {
- __free_page(cb->compressed_pages[faili]);
- faili--;
+ if (!pflags)
+ psi_memstall_leave(&pflags);
+
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_read(cb);
+ return;
+
+fail:
+ if (cb->compressed_pages) {
+ for (i = 0; i < cb->nr_pages; i++) {
+ if (cb->compressed_pages[i])
+ __free_page(cb->compressed_pages[i]);
+ }
}
kfree(cb->compressed_pages);
-fail1:
kfree(cb);
out:
free_extent_map(em);
- bio->bi_status = ret;
- bio_endio(bio);
- return ret;
-finish_cb:
- if (comp_bio) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
- }
- /* All bytes of @cb is submitted, endio will free @cb */
- if (cur_disk_byte == disk_bytenr + compressed_len)
- return ret;
-
- wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
- (disk_bytenr + compressed_len - cur_disk_byte) >>
- fs_info->sectorsize_bits);
- /*
- * Even with previous bio ended, we should still have io not yet
- * submitted, thus need to finish @cb manually.
- */
- ASSERT(refcount_read(&cb->pending_sectors));
- /* Now we are the only one referring @cb, can finish it safely. */
- finish_compressed_bio_read(cb);
- return ret;
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
}
/*
@@ -1481,7 +1327,6 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
ASSERT(copy_start - decompressed < buf_len);
memcpy_to_page(bvec.bv_page, bvec.bv_offset,
buf + copy_start - decompressed, copy_len);
- flush_dcache_page(bvec.bv_page);
cur_offset += copy_len;
bio_advance(orig_bio, copy_len);
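The compression rework above replaces per-sector accounting (pending_sectors) with per-bio accounting (pending_ios): the submitter holds one reference from the start, every allocated bio takes another, and whoever drops the count to zero finishes the compressed_bio. The submitter's own final drop also covers the partial-submission error paths that previously needed the wait_var_event() dance. A userspace model with illustrative names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct cbio { atomic_int pending_ios; };

static void finish(struct cbio *cb)
{
	(void)cb;
	printf("finish compressed bio\n");
}

static bool dec_and_test(struct cbio *cb)
{
	/* fetch_sub returns the old value; 1 means we just hit zero */
	return atomic_fetch_sub(&cb->pending_ios, 1) == 1;
}

static void bio_done(struct cbio *cb)	/* runs once per completed bio */
{
	if (dec_and_test(cb))
		finish(cb);
}

int main(void)
{
	struct cbio cb = { .pending_ios = 1 };	/* the submitter's reference */
	int i;

	for (i = 0; i < 3; i++)
		atomic_fetch_add(&cb.pending_ios, 1);	/* one per bio */
	for (i = 0; i < 3; i++)
		bio_done(&cb);			/* endio for each bio */
	if (dec_and_test(&cb))			/* submitter drops its own ref */
		finish(&cb);
	return 0;
}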
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index ac5b20731d2a..1aa02903de69 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -30,8 +30,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
struct compressed_bio {
- /* Number of sectors with unfinished IO (unsubmitted or unfinished) */
- refcount_t pending_sectors;
+ /* Number of outstanding bios */
+ refcount_t pending_ios;
/* Number of compressed pages in the array */
unsigned int nr_pages;
@@ -59,16 +59,12 @@ struct compressed_bio {
/* IO errors */
blk_status_t status;
- int mirror_num;
- /* for reads, this is the bio we are copying the data into */
- struct bio *orig_bio;
-
- /*
- * the start of a variable length array of checksums only
- * used by reads
- */
- u8 sums[];
+ union {
+ /* For reads, this is the bio we are copying the data into */
+ struct bio *orig_bio;
+ struct work_struct write_end_work;
+ };
};
static inline unsigned int btrfs_compress_type(unsigned int type_level)
@@ -99,11 +95,11 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned int compressed_len,
struct page **compressed_pages,
unsigned int nr_pages,
- unsigned int write_flags,
+ blk_opf_t write_flags,
struct cgroup_subsys_state *blkcg_css,
bool writeback);
-blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
+void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+ int mirror_num);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
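The union introduced in struct compressed_bio is safe because a compressed_bio is either a read, which needs the original bio to copy decompressed data into, or a write, which needs a work item for deferred completion, never both at once, so the two members can share storage. A reduced illustration with made-up types:

#include <stdio.h>

struct work { void (*fn)(struct work *); };
struct bio { int id; };

struct cbio {
	int is_write;
	union {
		struct bio *orig_bio;	/* reads only */
		struct work end_work;	/* writes only */
	};
};

static void write_done(struct work *w)
{
	(void)w;
	printf("write end work\n");
}

int main(void)
{
	struct bio b = { .id = 1 };
	struct cbio r = { .is_write = 0, .orig_bio = &b };
	struct cbio w = { .is_write = 1, .end_work = { .fn = write_done } };

	printf("read copies into bio %d\n", r.orig_bio->id);
	w.end_work.fn(&w.end_work);
	return 0;
}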
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0eecf98d0abb..b39b339fbf96 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -16,6 +16,7 @@
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
+#include "tree-checker.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -342,7 +343,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
int level = btrfs_header_level(buf);
ret = btrfs_set_disk_extent_flags(trans, buf,
- new_flags, level, 0);
+ new_flags, level);
if (ret)
return ret;
}
@@ -1390,12 +1391,13 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
}
/*
- * helper function for btrfs_search_slot. The goal is to find a block
- * in cache without setting the path to blocking. If we find the block
- * we return zero and the path is unchanged.
+ * Helper function for btrfs_search_slot() and other functions that do a search
+ * on a btree. The goal is to find a tree block in the cache (the radix tree at
+ * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
+ * its pages from disk.
*
- * If we can't find the block, we set the path blocking and do some
- * reada. -EAGAIN is returned and the search must be repeated.
+ * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
+ * whole btree search, starting again from the current root node.
*/
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
@@ -1409,12 +1411,21 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
struct btrfs_key first_key;
int ret;
int parent_level;
+ bool unlock_up;
+ unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
blocknr = btrfs_node_blockptr(*eb_ret, slot);
gen = btrfs_node_ptr_generation(*eb_ret, slot);
parent_level = btrfs_header_level(*eb_ret);
btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
+ /*
+ * If we need to read an extent buffer from disk and we are holding locks
+ * on upper level nodes, we unlock all the upper nodes before reading the
+ * extent buffer, and then return -EAGAIN to the caller as it needs to
+ * restart the search. We don't release the lock on the current level
+ * because we need to walk this node to figure out which blocks to read.
+ */
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
if (p->reada == READA_FORWARD_ALWAYS)
@@ -1436,30 +1447,45 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
return 0;
}
+ if (p->nowait) {
+ free_extent_buffer(tmp);
+ return -EAGAIN;
+ }
+
+ if (unlock_up)
+ btrfs_unlock_up_safe(p, level + 1);
+
/* now we're allowed to do a blocking uptodate check */
- ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
if (ret) {
free_extent_buffer(tmp);
btrfs_release_path(p);
return -EIO;
}
- *eb_ret = tmp;
- return 0;
+ if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EUCLEAN;
+ }
+
+ if (unlock_up)
+ ret = -EAGAIN;
+
+ goto out;
+ } else if (p->nowait) {
+ return -EAGAIN;
}
- /*
- * reduce lock contention at high levels
- * of the btree by dropping locks before
- * we read. Don't release the lock on the current
- * level because we need to walk this node to figure
- * out which blocks to read.
- */
- btrfs_unlock_up_safe(p, level + 1);
+ if (unlock_up) {
+ btrfs_unlock_up_safe(p, level + 1);
+ ret = -EAGAIN;
+ } else {
+ ret = 0;
+ }
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
- ret = -EAGAIN;
tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
gen, parent_level - 1, &first_key);
if (IS_ERR(tmp)) {
@@ -1474,9 +1500,15 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
*/
if (!extent_buffer_uptodate(tmp))
ret = -EIO;
- free_extent_buffer(tmp);
- btrfs_release_path(p);
+out:
+ if (ret == 0) {
+ *eb_ret = tmp;
+ } else {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ }
+
return ret;
}
@@ -1609,7 +1641,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = btrfs_read_lock_root_node(root);
+ if (p->nowait) {
+ b = btrfs_try_read_lock_root_node(root);
+ if (IS_ERR(b))
+ return b;
+ } else {
+ b = btrfs_read_lock_root_node(root);
+ }
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -1885,6 +1923,13 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
WARN_ON(p->nodes[0] != NULL);
BUG_ON(!cow && ins_len);
+ /*
+ * For now only allow nowait for read only operations. There's no
+ * strict reason why we can't, we just only need it for reads so it's
+ * only implemented for reads.
+ */
+ ASSERT(!p->nowait || !cow);
+
if (ins_len < 0) {
lowest_unlock = 2;
@@ -1911,7 +1956,12 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (p->need_commit_sem) {
ASSERT(p->search_commit_root);
- down_read(&fs_info->commit_root_sem);
+ if (p->nowait) {
+ if (!down_read_trylock(&fs_info->commit_root_sem))
+ return -EAGAIN;
+ } else {
+ down_read(&fs_info->commit_root_sem);
+ }
}
again:
@@ -2050,11 +2100,22 @@ cow_done:
if (!p->skip_locking) {
level = btrfs_header_level(b);
+
+ btrfs_maybe_reset_lockdep_class(root, b);
+
if (level <= write_lock_level) {
btrfs_tree_lock(b);
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
- btrfs_tree_read_lock(b);
+ if (p->nowait) {
+ if (!btrfs_try_tree_read_lock(b)) {
+ free_extent_buffer(b);
+ ret = -EAGAIN;
+ goto done;
+ }
+ } else {
+ btrfs_tree_read_lock(b);
+ }
p->locks[level] = BTRFS_READ_LOCK;
}
p->nodes[level] = b;
@@ -2103,6 +2164,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
lowest_level = p->lowest_level;
WARN_ON(p->nodes[0] != NULL);
+ ASSERT(!p->nowait);
if (p->search_commit_root) {
BUG_ON(time_seq);
@@ -2279,6 +2341,43 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
return ret;
}
+/**
+ * Search for a valid slot for the given path.
+ *
+ * @root: The root node of the tree.
+ * @key: Will contain a valid item if found.
+ * @path: The starting point to validate the slot.
+ *
+ * Return: 0 if the item is valid
+ * 1 if not found
+ * <0 if error.
+ */
+int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path)
+{
+ while (1) {
+ int ret;
+ const int slot = path->slots[0];
+ const struct extent_buffer *leaf = path->nodes[0];
+
+ /* This is where we start walking the path. */
+ if (slot >= btrfs_header_nritems(leaf)) {
+ /*
+ * If we've reached the last slot in this leaf we need
+ * to go to the next leaf and reset the path.
+ */
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ return ret;
+ continue;
+ }
+ /* Store the found, valid item in @key. */
+ btrfs_item_key_to_cpu(leaf, key, slot);
+ break;
+ }
+ return 0;
+}
+
/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node points to 'key'.
@@ -4367,6 +4466,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
int ret = 1;
int keep_locks = path->keep_locks;
+ ASSERT(!path->nowait);
path->keep_locks = 1;
again:
cur = btrfs_read_lock_root_node(root);
@@ -4547,6 +4647,8 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int ret;
int i;
+ ASSERT(!path->nowait);
+
nritems = btrfs_header_nritems(path->nodes[0]);
if (nritems == 0)
return 1;
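The path->nowait plumbing added to btrfs_search_slot() in the hunks above converts every blocking lock acquisition into a trylock that bubbles -EAGAIN back to the caller, which can then retry or fall back to a blocking search. A userspace analogue using a pthread rwlock (compile with -pthread); names are illustrative:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;

static int search_slot(int nowait)
{
	if (nowait) {
		if (pthread_rwlock_tryrdlock(&tree_lock) != 0)
			return -EAGAIN;	/* caller retries without blocking */
	} else {
		pthread_rwlock_rdlock(&tree_lock);
	}
	/* ... walk the tree under the read lock ... */
	pthread_rwlock_unlock(&tree_lock);
	return 0;
}

int main(void)
{
	pthread_rwlock_wrlock(&tree_lock);	/* simulate a writer holding it */
	printf("nowait search: %d\n", search_slot(1));	/* -EAGAIN */
	pthread_rwlock_unlock(&tree_lock);
	printf("blocking search: %d\n", search_slot(0));	/* 0 */
	return 0;
}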
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b7631b88426e..727595eee973 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -42,7 +42,6 @@ struct btrfs_delayed_ref_root;
struct btrfs_space_info;
struct btrfs_block_group;
extern struct kmem_cache *btrfs_trans_handle_cachep;
-extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
@@ -50,6 +49,11 @@ struct btrfs_ordered_sum;
struct btrfs_ref;
struct btrfs_bio;
struct btrfs_ioctl_encoded_io_args;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_balance_control;
+struct btrfs_delayed_root;
+struct reloc_control;
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
@@ -107,14 +111,6 @@ struct btrfs_ioctl_encoded_io_args;
#define BTRFS_STAT_CURR 0
#define BTRFS_STAT_PREV 1
-/*
- * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
- */
-static inline u32 count_max_extents(u64 size)
-{
- return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
-}
-
static inline unsigned long btrfs_chunk_item_size(int num_stripes)
{
BUG_ON(num_stripes == 0);
@@ -230,6 +226,13 @@ struct btrfs_root_backup {
#define BTRFS_SUPER_INFO_SIZE 4096
/*
+ * The reserved space at the beginning of each device.
+ * It covers the primary super block and leaves space for potential use by other
+ * tools like bootloaders, or to lower the potential damage from an
+ * accidental overwrite.
+ */
+#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M)
+
+/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
*/
@@ -248,8 +251,12 @@ struct btrfs_super_block {
__le64 chunk_root;
__le64 log_root;
- /* this will help find the new super based on the log root */
- __le64 log_root_transid;
+ /*
+ * This member has never been used since the very beginning, so it is
+ * always 0 regardless of kernel version. We always use generation + 1
+ * to read the log tree root, so mark the field as deprecated here.
+ */
+ __le64 __unused_log_root_transid;
__le64 total_bytes;
__le64 bytes_used;
__le64 root_dir_objectid;
@@ -277,14 +284,9 @@ struct btrfs_super_block {
/* the UUID written into btree blocks */
u8 metadata_uuid[BTRFS_FSID_SIZE];
- /* Extent tree v2 */
- __le64 block_group_root;
- __le64 block_group_root_generation;
- u8 block_group_root_level;
-
/* future expansion */
- u8 reserved8[7];
- __le64 reserved[25];
+ u8 reserved8[8];
+ __le64 reserved[27];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
@@ -304,7 +306,8 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
#define BTRFS_FEATURE_COMPAT_RO_SUPP \
(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \
BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
- BTRFS_FEATURE_COMPAT_RO_VERITY)
+ BTRFS_FEATURE_COMPAT_RO_VERITY | \
+ BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE)
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
@@ -440,9 +443,10 @@ struct btrfs_path {
* header (ie. sizeof(struct btrfs_item) is not included).
*/
unsigned int search_for_extension:1;
+ /* Stop search if any locks need to be taken (for read) */
+ unsigned int nowait:1;
};
-#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
- sizeof(struct btrfs_item))
+
struct btrfs_dev_replace {
u64 replace_state; /* see #define above */
time64_t time_started; /* seconds since 1-Jan-1970 */
@@ -499,22 +503,6 @@ struct btrfs_free_cluster {
struct list_head block_group_list;
};
-enum btrfs_caching_type {
- BTRFS_CACHE_NO,
- BTRFS_CACHE_STARTED,
- BTRFS_CACHE_FAST,
- BTRFS_CACHE_FINISHED,
- BTRFS_CACHE_ERROR,
-};
-
-/*
- * Tree to record all locked full stripes of a RAID5/6 block group
- */
-struct btrfs_full_stripe_locks_tree {
- struct rb_root root;
- struct mutex lock;
-};
-
/* Discard control. */
/*
* Async discard uses multiple lists to differentiate the discard filter
@@ -546,42 +534,6 @@ struct btrfs_discard_ctl {
atomic64_t discard_bytes_saved;
};
-void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
-
-/* fs_info */
-struct reloc_control;
-struct btrfs_device;
-struct btrfs_fs_devices;
-struct btrfs_balance_control;
-struct btrfs_delayed_root;
-
-/*
- * Block group or device which contains an active swapfile. Used for preventing
- * unsafe operations while a swapfile is active.
- *
- * These are sorted on (ptr, inode) (note that a block group or device can
- * contain more than one swapfile). We compare the pointer values because we
- * don't actually care what the object is, we just need a quick check whether
- * the object exists in the rbtree.
- */
-struct btrfs_swapfile_pin {
- struct rb_node node;
- void *ptr;
- struct inode *inode;
- /*
- * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
- * points to a struct btrfs_device.
- */
- bool is_block_group;
- /*
- * Only used when 'is_block_group' is true and it is the number of
- * extents used by a swapfile for this block group ('ptr' field).
- */
- int bg_extent_count;
-};
-
-bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
-
enum {
BTRFS_FS_CLOSING_START,
BTRFS_FS_CLOSING_DONE,
@@ -635,6 +587,9 @@ enum {
/* Indicate we have half completed snapshot deletions pending. */
BTRFS_FS_UNFINISHED_DROPS,
+ /* Indicate we have to finish a zone to do next allocation. */
+ BTRFS_FS_NEED_ZONE_FINISH,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
@@ -656,6 +611,18 @@ enum btrfs_exclusive_operation {
BTRFS_EXCLOP_SWAP_ACTIVATE,
};
+/* Store data about transaction commits, exported via sysfs. */
+struct btrfs_commit_stats {
+ /* Total number of commits */
+ u64 commit_count;
+ /* The maximum commit duration so far in ns */
+ u64 max_commit_dur;
+ /* The last commit duration in ns */
+ u64 last_commit_dur;
+ /* The total commit duration in ns */
+ u64 total_commit_dur;
+};
+
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
@@ -679,9 +646,8 @@ struct btrfs_fs_info {
struct radix_tree_root fs_roots_radix;
/* block group cache stuff */
- spinlock_t block_group_cache_lock;
- u64 first_logical_byte;
- struct rb_root block_group_cache_tree;
+ rwlock_t block_group_cache_lock;
+ struct rb_root_cached block_group_cache_tree;
/* keep track of unallocated space */
atomic64_t free_chunk_space;
@@ -848,13 +814,14 @@ struct btrfs_fs_info {
* two
*/
struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *hipri_workers;
struct btrfs_workqueue *delalloc_workers;
struct btrfs_workqueue *flush_workers;
- struct btrfs_workqueue *endio_workers;
- struct btrfs_workqueue *endio_meta_workers;
- struct btrfs_workqueue *endio_raid56_workers;
- struct btrfs_workqueue *rmw_workers;
- struct btrfs_workqueue *endio_meta_write_workers;
+ struct workqueue_struct *endio_workers;
+ struct workqueue_struct *endio_meta_workers;
+ struct workqueue_struct *endio_raid56_workers;
+ struct workqueue_struct *rmw_workers;
+ struct workqueue_struct *compressed_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
struct btrfs_workqueue *caching_workers;
@@ -873,6 +840,7 @@ struct btrfs_fs_info {
struct kobject *space_info_kobj;
struct kobject *qgroups_kobj;
+ struct kobject *discard_kobj;
/* used to keep from writing metadata until there is a nice batch */
struct percpu_counter dirty_metadata_bytes;
@@ -946,9 +914,9 @@ struct btrfs_fs_info {
* running.
*/
refcount_t scrub_workers_refcnt;
- struct btrfs_workqueue *scrub_workers;
- struct btrfs_workqueue *scrub_wr_completion_workers;
- struct btrfs_workqueue *scrub_parity_workers;
+ struct workqueue_struct *scrub_workers;
+ struct workqueue_struct *scrub_wr_completion_workers;
+ struct workqueue_struct *scrub_parity_workers;
struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
@@ -988,6 +956,7 @@ struct btrfs_fs_info {
struct completion qgroup_rescan_completion;
struct btrfs_work qgroup_rescan_work;
bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */
+ u8 qgroup_drop_subtree_thres;
/* filesystem state */
unsigned long fs_state;
@@ -1032,6 +1001,12 @@ struct btrfs_fs_info {
u32 csums_per_leaf;
u32 stripesize;
+ /*
+ * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular
+ * filesystem, on zoned it depends on the device constraints.
+ */
+ u64 max_extent_size;
+
/* Block groups and devices containing active swapfiles. */
spinlock_t swapfile_pins_lock;
struct rb_root swapfile_pins;
@@ -1045,11 +1020,10 @@ struct btrfs_fs_info {
* Zone size > 0 when in ZONED mode, otherwise it's used for a check
* if the mode is enabled
*/
- union {
- u64 zone_size;
- u64 zoned;
- };
+ u64 zone_size;
+ /* Max size to emit ZONE_APPEND write command */
+ u64 max_zone_append_size;
struct mutex zoned_meta_io_lock;
spinlock_t treelog_bg_lock;
u64 treelog_bg;
@@ -1060,12 +1034,33 @@ struct btrfs_fs_info {
*/
spinlock_t relocation_bg_lock;
u64 data_reloc_bg;
+ struct mutex zoned_data_reloc_io_lock;
u64 nr_global_roots;
spinlock_t zone_active_bgs_lock;
struct list_head zone_active_bgs;
+ /* Updates are not protected by any lock */
+ struct btrfs_commit_stats commit_stats;
+
+ /*
+ * Last generation where we dropped a non-relocation root.
+ * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen()
+ * to change it and to read it, respectively.
+ */
+ u64 last_root_drop_gen;
+
+ /*
+ * Annotations for transaction events (structures are empty when
+ * compiled without lockdep).
+ */
+ struct lockdep_map btrfs_trans_num_writers_map;
+ struct lockdep_map btrfs_trans_num_extwriters_map;
+ struct lockdep_map btrfs_state_change_map[4];
+ struct lockdep_map btrfs_trans_pending_ordered_map;
+ struct lockdep_map btrfs_ordered_extent_map;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1073,7 +1068,6 @@ struct btrfs_fs_info {
#ifdef CONFIG_BTRFS_DEBUG
struct kobject *debug_kobj;
- struct kobject *discard_debug_kobj;
struct list_head allocated_roots;
spinlock_t eb_leak_lock;
@@ -1081,12 +1075,85 @@ struct btrfs_fs_info {
#endif
};
+static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
+ u64 gen)
+{
+ WRITE_ONCE(fs_info->last_root_drop_gen, gen);
+}
+
+static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info)
+{
+ return READ_ONCE(fs_info->last_root_drop_gen);
+}
+
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
}
/*
+ * Take the number of bytes to be checksummed and figure out how many leaves
+ * it would require to store the csums for that many bytes.
+ */
+static inline u64 btrfs_csum_bytes_to_leaves(
+ const struct btrfs_fs_info *fs_info, u64 csum_bytes)
+{
+ const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
+
+ return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
+}
+
+/*
+ * Use this if we would be adding new items, as we could split nodes as we cow
+ * down the tree.
+ */
+static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
+{
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
+}
+
+/*
+ * Doing a truncate or a modification won't result in new nodes or leaves, just
+ * what we need for COW.
+ */
+static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
+ unsigned num_items)
+{
+ return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
+}
+
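
As a rough illustration of the reservation math in the three helpers above, here is a small user-space sketch; the 16 KiB nodesize, 4 KiB sectorsize and the csums-per-leaf figure are assumptions for the example, not values read from a real filesystem:

	/* Hypothetical stand-alone model of btrfs_csum_bytes_to_leaves() and
	 * btrfs_calc_{insert_,}metadata_size(); all constants are assumed. */
	#include <stdio.h>
	#include <stdint.h>

	#define BTRFS_MAX_LEVEL 8

	static uint64_t div_round_up(uint64_t n, uint64_t d)
	{
		return (n + d - 1) / d;
	}

	int main(void)
	{
		const uint64_t nodesize = 16 * 1024;	/* assumed 16 KiB nodes */
		const unsigned sectorsize_bits = 12;	/* assumed 4 KiB sectors */
		const uint64_t csums_per_leaf = 431;	/* assumed, fs-dependent */
		const uint64_t csum_bytes = 8ULL * 1024 * 1024;

		/* One csum per sector, packed into leaves. */
		printf("csum leaves: %llu\n", (unsigned long long)
		       div_round_up(csum_bytes >> sectorsize_bits, csums_per_leaf));

		/* Insertions may split a node at every level while COWing down,
		 * hence the extra factor of 2 over plain modifications. */
		printf("insert 1 item: %llu bytes\n",
		       (unsigned long long)(nodesize * BTRFS_MAX_LEVEL * 2));
		printf("modify 1 item: %llu bytes\n",
		       (unsigned long long)(nodesize * BTRFS_MAX_LEVEL));
		return 0;
	}

With these assumptions, 8 MiB of data needs 5 csum leaves, inserting one item reserves 256 KiB and modifying one reserves 128 KiB.
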
+#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
+ sizeof(struct btrfs_item))
+
+static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
+{
+ return fs_info->zone_size > 0;
+}
+
+/*
+ * Count how many extents of size fs_info->max_extent_size are needed to
+ * cover @size
+ */
+static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
+{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (!fs_info)
+ return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
+#endif
+
+ return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
+}
+
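
For a quick sanity check of count_max_extents() on a regular (non-zoned) filesystem, where max_extent_size is BTRFS_MAX_EXTENT_SIZE, a 128 MiB value is assumed below for illustration:

	/* Sketch: a 300 MiB delalloc range counts as 3 extents when the
	 * maximum extent size is 128 MiB (assumed value). */
	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		const uint64_t max_extent_size = 128ULL << 20;
		const uint64_t size = 300ULL << 20;

		printf("%llu\n", (unsigned long long)
		       ((size + max_extent_size - 1) / max_extent_size)); /* 3 */
		return 0;
	}
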
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation op);
+
+/*
* The state of btrfs root
*/
enum {
@@ -1144,8 +1211,86 @@ enum {
BTRFS_ROOT_ORPHAN_CLEANUP,
/* This root has a drop operation that was started previously. */
BTRFS_ROOT_UNFINISHED_DROP,
+ /* This reloc root needs to have its buffers lockdep class reset. */
+ BTRFS_ROOT_RESET_LOCKDEP_CLASS,
};
+enum btrfs_lockdep_trans_states {
+ BTRFS_LOCKDEP_TRANS_COMMIT_START,
+ BTRFS_LOCKDEP_TRANS_UNBLOCKED,
+ BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED,
+ BTRFS_LOCKDEP_TRANS_COMPLETED,
+};
+
+/*
+ * Lockdep annotation for wait events.
+ *
+ * @owner: The struct where the lockdep map is defined
+ * @lock: The lockdep map corresponding to a wait event
+ *
+ * This macro is used to annotate a wait event. In this case a thread acquires
+ * the lockdep map as writer (exclusive lock) because it has to block until all
+ * the threads that hold the lock as readers signal the condition for the wait
+ * event and release their locks.
+ */
+#define btrfs_might_wait_for_event(owner, lock) \
+ do { \
+ rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \
+ rwsem_release(&owner->lock##_map, _THIS_IP_); \
+ } while (0)
+
+/*
+ * Protection for the resource/condition of a wait event.
+ *
+ * @owner: The struct where the lockdep map is defined
+ * @lock: The lockdep map corresponding to a wait event
+ *
+ * Many threads can modify the condition for the wait event at the same time
+ * and signal the threads that block on the wait event. The threads that modify
+ * the condition and do the signaling acquire the lock as readers (shared
+ * lock).
+ */
+#define btrfs_lockdep_acquire(owner, lock) \
+ rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_)
+
+/*
+ * Used after signaling the condition for a wait event to release the lockdep
+ * map held by a reader thread.
+ */
+#define btrfs_lockdep_release(owner, lock) \
+ rwsem_release(&owner->lock##_map, _THIS_IP_)
+
+/*
+ * Macros for the transaction states wait events, similar to the generic wait
+ * event macros.
+ */
+#define btrfs_might_wait_for_state(owner, i) \
+ do { \
+ rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \
+ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \
+ } while (0)
+
+#define btrfs_trans_state_lockdep_acquire(owner, i) \
+ rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_)
+
+#define btrfs_trans_state_lockdep_release(owner, i) \
+ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_)
+
+/* Initialization of the lockdep map */
+#define btrfs_lockdep_init_map(owner, lock) \
+ do { \
+ static struct lock_class_key lock##_key; \
+ lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \
+ } while (0)
+
+/* Initialization of the transaction states lockdep maps. */
+#define btrfs_state_lockdep_init_map(owner, lock, state) \
+ do { \
+ static struct lock_class_key lock##_key; \
+ lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \
+ &lock##_key, 0); \
+ } while (0)
+
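
A minimal sketch of how the wait-event annotations above pair up, assuming the map was initialized with btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent); the wait queue and the done flag are hypothetical stand-ins, not kernel names:

	/* Waiter side: declare we may block until all readers release the map. */
	btrfs_might_wait_for_event(fs_info, btrfs_ordered_extent);
	wait_event(waitq, done);

	/* Signaler side: hold the map as a reader while changing the condition. */
	btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
	done = true;
	wake_up(&waitq);
	btrfs_lockdep_release(fs_info, btrfs_ordered_extent);

Lockdep can then flag inversions where a potential waiter already holds a lock that one of the signalers might need.
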
static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
{
clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
@@ -1330,6 +1475,8 @@ struct btrfs_replace_extent_info {
* existing extent into a file range.
*/
bool is_new_extent;
+ /* Indicate if we should update the inode's mtime and ctime. */
+ bool update_times;
/* Meaningful only if is_new_extent is true. */
int qgroup_reserved;
/*
@@ -2361,17 +2508,6 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
num_devices, 64);
-/*
- * For extent tree v2 we overload the extent root with the block group root, as
- * we will have multiple extent roots.
- */
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup,
- extent_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup,
- extent_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level,
- struct btrfs_root_backup, extent_root_level, 8);
-
/* struct btrfs_balance_item */
BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
@@ -2475,8 +2611,6 @@ BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
chunk_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
log_root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
- log_root_transid, 64);
BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
log_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
@@ -2506,13 +2640,6 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
uuid_tree_generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block,
- block_group_root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation,
- struct btrfs_super_block,
- block_group_root_generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block,
- block_group_root_level, 8);
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
@@ -2733,37 +2860,6 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
-/*
- * Take the number of bytes to be checksummmed and figure out how many leaves
- * it would require to store the csums for that many bytes.
- */
-static inline u64 btrfs_csum_bytes_to_leaves(
- const struct btrfs_fs_info *fs_info, u64 csum_bytes)
-{
- const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
-
- return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
-}
-
-/*
- * Use this if we would be adding new items, as we could split nodes as we cow
- * down the tree.
- */
-static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
- unsigned num_items)
-{
- return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
-}
-
-/*
- * Doing a truncate or a modification won't result in new nodes or leaves, just
- * what we need for COW.
- */
-static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
- unsigned num_items)
-{
- return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
-}
int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes);
@@ -2783,7 +2879,8 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
- u64 objectid, u64 offset, u64 bytenr, bool strict);
+ u64 objectid, u64 offset, u64 bytenr, bool strict,
+ struct btrfs_path *path);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
@@ -2810,8 +2907,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- struct extent_buffer *eb, u64 flags,
- int level, int is_data);
+ struct extent_buffer *eb, u64 flags, int level);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
@@ -2891,7 +2987,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
- u64 disk_num_bytes);
+ u64 disk_num_bytes, bool noflush);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end);
@@ -3038,6 +3134,35 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
struct btrfs_path *path);
+int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
+ struct btrfs_path *path);
+
+/*
+ * Search in @root for a given @key, and store the key of the found slot in
+ * @found_key.
+ *
+ * @root: The root node of the tree.
+ * @key: The key we are looking for.
+ * @found_key: Will hold the found item.
+ * @path: Holds the current slot/leaf.
+ * @iter_ret: Contains the value returned from btrfs_search_slot or
+ * btrfs_get_next_valid_item, whichever was executed last.
+ *
+ * The @iter_ret is an output variable that will contain the return value of
+ * btrfs_search_slot, if it encountered an error, or the value returned from
+ * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid
+ * slot was found, 1 if there were no more leaves, and <0 if there was an error.
+ *
+ * It's recommended to use a separate variable for iter_ret and then use it to
+ * set the function return value, so there's no confusion about the 0/1/errno
+ * values stemming from btrfs_search_slot.
+ */
+#define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \
+ for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \
+ (iter_ret) >= 0 && \
+ (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \
+ (path)->slots[0]++ \
+ )
+
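
For illustration, a hypothetical caller following the recommendation above, keeping @iter_ret separate from the function's own return value:

	static int walk_all_items(struct btrfs_root *root)
	{
		struct btrfs_key key = { 0 };	/* start from the smallest key */
		struct btrfs_key found_key;
		struct btrfs_path *path;
		int iter_ret = 0;
		int ret = 0;

		path = btrfs_alloc_path();
		if (!path)
			return -ENOMEM;

		btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
			/* Use found_key and path->nodes[0] / path->slots[0] here. */
		}
		/* 1 just means we ran out of leaves; <0 is a real error. */
		if (iter_ret < 0)
			ret = iter_ret;

		btrfs_free_path(path);
		return ret;
	}
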
static inline int btrfs_next_old_item(struct btrfs_root *root,
struct btrfs_path *p, u64 time_seq)
{
@@ -3189,16 +3314,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
/* file-item.c */
-struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
-int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 pos,
- u64 disk_offset, u64 disk_num_bytes,
- u64 num_bytes, u64 offset, u64 ram_bytes,
- u8 compression, u8 encryption, u16 other_encoding);
+int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid, u64 pos,
+ u64 num_bytes);
int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
@@ -3209,7 +3330,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
u64 offset, bool one_ordered);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit);
+ struct list_head *list, int search_commit,
+ bool nowait);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
struct btrfs_file_extent_item *fi,
@@ -3223,16 +3345,21 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
-blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num);
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type);
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+ u32 pgoff, u8 *csum, const u8 * const csum_expected);
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff);
unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 bio_offset, struct page *page,
u64 start, u64 end);
-struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- u64 start, u64 len);
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool strict);
+ u64 *ram_bytes, bool nowait, bool strict);
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct btrfs_inode *inode);
@@ -3254,23 +3381,39 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
-int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- struct user_namespace *mnt_userns);
+struct btrfs_new_inode_args {
+ /* Input */
+ struct inode *dir;
+ struct dentry *dentry;
+ struct inode *inode;
+ bool orphan;
+ bool subvol;
+
+ /*
+ * Output from btrfs_new_inode_prepare(), input to
+ * btrfs_create_new_inode().
+ */
+ struct posix_acl *default_acl;
+ struct posix_acl *acl;
+};
+int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
+ unsigned int *trans_num_items);
+int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_new_inode_args *args);
+void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
+struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
+ struct inode *dir);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
- unsigned *bits);
+ u32 bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
- struct extent_state *state, unsigned *bits);
+ struct extent_state *state, u32 bits);
void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
struct extent_state *other);
void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
-int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
-int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
void btrfs_free_inode(struct inode *inode);
@@ -3308,14 +3451,20 @@ int btrfs_writepage_cow_fixup(struct page *page);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate);
+int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
+ int compress_type);
+int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset, u64 disk_bytenr,
+ u64 disk_io_size,
+ struct page **pages);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
struct btrfs_ioctl_encoded_io_args *encoded);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
+ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before);
+
extern const struct dentry_operations btrfs_dentry_operations;
-extern const struct iomap_ops btrfs_dio_iomap_ops;
-extern const struct iomap_dio_ops btrfs_dio_ops;
/* Inode locking type flags, by default the exclusive lock is taken */
#define BTRFS_ILOCK_SHARED (1U << 0)
@@ -3327,6 +3476,7 @@ void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
void btrfs_update_inode_bytes(struct btrfs_inode *inode,
const u64 add_bytes,
const u64 del_bytes);
+void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3344,15 +3494,6 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
-bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation type);
-bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation type);
-void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
-void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
-void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
- enum btrfs_exclusive_operation op);
-
/* file.c */
int __init btrfs_auto_defrag_init(void);
@@ -3362,8 +3503,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
-void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
- int skip_pinned);
extern const struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
@@ -3383,8 +3522,10 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
struct extent_state **cached, bool noreserve);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes);
+ size_t *write_bytes, bool nowait);
void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -3402,11 +3543,29 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
}
-#ifdef CONFIG_PRINTK
+#ifdef CONFIG_PRINTK_INDEX
+
+#define btrfs_printk(fs_info, fmt, args...) \
+do { \
+ printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \
+ _btrfs_printk(fs_info, fmt, ##args); \
+} while (0)
+
__printf(2, 3)
__cold
-void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+
+#elif defined(CONFIG_PRINTK)
+
+#define btrfs_printk(fs_info, fmt, args...) \
+ _btrfs_printk(fs_info, fmt, ##args)
+
+__printf(2, 3)
+__cold
+void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+
#else
+
#define btrfs_printk(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, fmt, ##args)
#endif
@@ -3632,7 +3791,7 @@ const char * __attribute_const__ btrfs_decode_error(int errno);
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno);
+ unsigned int line, int errno, bool first_hit);
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
@@ -3640,9 +3799,11 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
*/
#define btrfs_abort_transaction(trans, errno) \
do { \
+ bool first = false; \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
+ first = true; \
if ((errno) != -EIO && (errno) != -EROFS) { \
WARN(1, KERN_DEBUG \
"BTRFS: Transaction aborted (error %d)\n", \
@@ -3654,15 +3815,28 @@ do { \
} \
} \
__btrfs_abort_transaction((trans), __func__, \
- __LINE__, (errno)); \
+ __LINE__, (errno), first); \
} while (0)
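
For reference, an illustrative call site: the macro is invoked with the transaction handle and the errno as soon as an unrecoverable error is detected, for example after a failed deletion:

	ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
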
+#ifdef CONFIG_PRINTK_INDEX
+
#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
-do { \
- __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
- (errno), fmt, ##args); \
+do { \
+ printk_index_subsys_emit( \
+ "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \
+ KERN_CRIT, fmt); \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args); \
} while (0)
+#else
+
+#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
+ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
+ (errno), fmt, ##args)
+
+#endif
+
#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
&(fs_info)->fs_state)))
#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
@@ -3815,15 +3989,16 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir);
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type);
#else
#define btrfs_get_acl NULL
#define btrfs_set_acl NULL
-static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
+static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct posix_acl *acl,
+ int type)
{
- return 0;
+ return -EOPNOTSUPP;
}
#endif
@@ -3857,16 +4032,9 @@ int btrfs_scrub_cancel(struct btrfs_fs_info *info);
int btrfs_scrub_cancel_dev(struct btrfs_device *dev);
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress);
-static inline void btrfs_init_full_stripe_locks_tree(
- struct btrfs_full_stripe_locks_tree *locks_root)
-{
- locks_root->root = RB_ROOT;
- mutex_init(&locks_root->lock);
-}
/* dev-replace.c */
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
-void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
@@ -3893,6 +4061,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
extern const struct fsverity_operations btrfs_verityops;
int btrfs_drop_verity_items(struct btrfs_inode *inode);
+int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size);
BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
encryption, 8);
@@ -3910,6 +4079,12 @@ static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
return 0;
}
+static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
+ size_t buf_size)
+{
+ return -EPERM;
+}
+
#endif
/* Sanity test specific functions */
@@ -3926,11 +4101,6 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
}
#endif
-static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
-{
- return fs_info->zoned != 0;
-}
-
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
{
return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index bd8267c4687d..118b2e20b2e1 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -127,9 +127,11 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
}
int btrfs_check_data_free_space(struct btrfs_inode *inode,
- struct extent_changeset **reserved, u64 start, u64 len)
+ struct extent_changeset **reserved, u64 start,
+ u64 len, bool noflush)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
int ret;
/* align the range */
@@ -137,7 +139,12 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode,
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
- ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+ if (noflush)
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ else if (btrfs_is_free_space_inode(inode))
+ flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
+
+ ret = btrfs_reserve_data_bytes(fs_info, len, flush);
if (ret < 0)
return ret;
@@ -273,7 +280,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
u64 num_bytes, u64 disk_num_bytes,
u64 *meta_reserve, u64 *qgroup_reserve)
{
- u64 nr_extents = count_max_extents(num_bytes);
+ u64 nr_extents = count_max_extents(fs_info, num_bytes);
u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
@@ -289,7 +296,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
}
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
- u64 disk_num_bytes)
+ u64 disk_num_bytes, bool noflush)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -308,7 +315,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
* If we have a transaction open (can happen if we call truncate_block
* from truncate), then we need FLUSH_LIMIT so we don't deadlock.
*/
- if (btrfs_is_free_space_inode(inode)) {
+ if (noflush || btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
} else {
if (current->journal_info)
@@ -333,7 +340,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
*/
calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
&meta_reserve, &qgroup_reserve);
- ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
+ noflush);
if (ret)
return ret;
ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
@@ -349,7 +357,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
* needs to free the reservation we just made.
*/
spin_lock(&inode->lock);
- nr_extents = count_max_extents(num_bytes);
+ nr_extents = count_max_extents(fs_info, num_bytes);
btrfs_mod_outstanding_extents(inode, nr_extents);
inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
@@ -412,7 +420,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
unsigned num_extents;
spin_lock(&inode->lock);
- num_extents = count_max_extents(num_bytes);
+ num_extents = count_max_extents(fs_info, num_bytes);
btrfs_mod_outstanding_extents(inode, -num_extents);
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
@@ -453,10 +461,10 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
{
int ret;
- ret = btrfs_check_data_free_space(inode, reserved, start, len);
+ ret = btrfs_check_data_free_space(inode, reserved, start, len, false);
if (ret < 0)
return ret;
- ret = btrfs_delalloc_reserve_metadata(inode, len, len);
+ ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
if (ret < 0) {
btrfs_free_reserved_data_space(inode, *reserved, start, len);
extent_changeset_free(*reserved);
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index 28bf5c3ef430..e07d46043455 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -7,7 +7,8 @@ struct extent_changeset;
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
int btrfs_check_data_free_space(struct btrfs_inode *inode,
- struct extent_changeset **reserved, u64 start, u64 len);
+ struct extent_changeset **reserved, u64 start, u64 len,
+ bool noflush);
void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 748bf6b0d860..cac5169eaf8d 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -52,18 +52,6 @@ static inline void btrfs_init_delayed_node(
INIT_LIST_HEAD(&delayed_node->p_list);
}
-static inline int btrfs_is_continuous_delayed_item(
- struct btrfs_delayed_item *item1,
- struct btrfs_delayed_item *item2)
-{
- if (item1->key.type == BTRFS_DIR_INDEX_KEY &&
- item1->key.objectid == item2->key.objectid &&
- item1->key.type == item2->key.type &&
- item1->key.offset + 1 == item2->key.offset)
- return 1;
- return 0;
-}
-
static struct btrfs_delayed_node *btrfs_get_delayed_node(
struct btrfs_inode *btrfs_inode)
{
@@ -314,15 +302,21 @@ static inline void btrfs_release_prepared_delayed_node(
__btrfs_release_delayed_node(node, 1);
}
-static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
+ struct btrfs_delayed_node *node,
+ enum btrfs_delayed_item_type type)
{
struct btrfs_delayed_item *item;
+
item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
if (item) {
item->data_len = data_len;
- item->ins_or_del = 0;
+ item->type = type;
item->bytes_reserved = 0;
- item->delayed_node = NULL;
+ item->delayed_node = node;
+ RB_CLEAR_NODE(&item->rb_node);
+ INIT_LIST_HEAD(&item->log_list);
+ item->logged = false;
refcount_set(&item->refs, 1);
}
return item;
@@ -331,89 +325,46 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
/*
 * __btrfs_lookup_delayed_item - look up the delayed item by index
* @delayed_node: pointer to the delayed node
- * @key: the key to look up
- * @prev: used to store the prev item if the right item isn't found
- * @next: used to store the next item if the right item isn't found
+ * @index: the dir index value to lookup (offset of a dir index key)
*
 * Note: if the item is not found, NULL is returned.
*/
static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
struct rb_root *root,
- struct btrfs_key *key,
- struct btrfs_delayed_item **prev,
- struct btrfs_delayed_item **next)
+ u64 index)
{
- struct rb_node *node, *prev_node = NULL;
+ struct rb_node *node = root->rb_node;
struct btrfs_delayed_item *delayed_item = NULL;
- int ret = 0;
-
- node = root->rb_node;
while (node) {
delayed_item = rb_entry(node, struct btrfs_delayed_item,
rb_node);
- prev_node = node;
- ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
- if (ret < 0)
+ if (delayed_item->index < index)
node = node->rb_right;
- else if (ret > 0)
+ else if (delayed_item->index > index)
node = node->rb_left;
else
return delayed_item;
}
- if (prev) {
- if (!prev_node)
- *prev = NULL;
- else if (ret < 0)
- *prev = delayed_item;
- else if ((node = rb_prev(prev_node)) != NULL) {
- *prev = rb_entry(node, struct btrfs_delayed_item,
- rb_node);
- } else
- *prev = NULL;
- }
-
- if (next) {
- if (!prev_node)
- *next = NULL;
- else if (ret > 0)
- *next = delayed_item;
- else if ((node = rb_next(prev_node)) != NULL) {
- *next = rb_entry(node, struct btrfs_delayed_item,
- rb_node);
- } else
- *next = NULL;
- }
return NULL;
}
-static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
- struct btrfs_delayed_node *delayed_node,
- struct btrfs_key *key)
-{
- return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key,
- NULL, NULL);
-}
-
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
- struct btrfs_delayed_item *ins,
- int action)
+ struct btrfs_delayed_item *ins)
{
struct rb_node **p, *node;
struct rb_node *parent_node = NULL;
struct rb_root_cached *root;
struct btrfs_delayed_item *item;
- int cmp;
bool leftmost = true;
- if (action == BTRFS_DELAYED_INSERTION_ITEM)
+ if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
- else if (action == BTRFS_DELAYED_DELETION_ITEM)
- root = &delayed_node->del_root;
else
- BUG();
+ root = &delayed_node->del_root;
+
p = &root->rb_root.rb_node;
node = &ins->rb_node;
@@ -422,11 +373,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
item = rb_entry(parent_node, struct btrfs_delayed_item,
rb_node);
- cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
- if (cmp < 0) {
+ if (item->index < ins->index) {
p = &(*p)->rb_right;
leftmost = false;
- } else if (cmp > 0) {
+ } else if (item->index > ins->index) {
p = &(*p)->rb_left;
} else {
return -EEXIST;
@@ -435,33 +385,16 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
rb_link_node(node, parent_node, p);
rb_insert_color_cached(node, root, leftmost);
- ins->delayed_node = delayed_node;
- ins->ins_or_del = action;
- if (ins->key.type == BTRFS_DIR_INDEX_KEY &&
- action == BTRFS_DELAYED_INSERTION_ITEM &&
- ins->key.offset >= delayed_node->index_cnt)
- delayed_node->index_cnt = ins->key.offset + 1;
+ if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&
+ ins->index >= delayed_node->index_cnt)
+ delayed_node->index_cnt = ins->index + 1;
delayed_node->count++;
atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
return 0;
}
-static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node,
- struct btrfs_delayed_item *item)
-{
- return __btrfs_add_delayed_item(node, item,
- BTRFS_DELAYED_INSERTION_ITEM);
-}
-
-static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
- struct btrfs_delayed_item *item)
-{
- return __btrfs_add_delayed_item(node, item,
- BTRFS_DELAYED_DELETION_ITEM);
-}
-
static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
int seq = atomic_inc_return(&delayed_root->items_seq);
@@ -477,21 +410,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
struct rb_root_cached *root;
struct btrfs_delayed_root *delayed_root;
- /* Not associated with any delayed_node */
- if (!delayed_item->delayed_node)
+ /* Not inserted, ignore it. */
+ if (RB_EMPTY_NODE(&delayed_item->rb_node))
return;
+
delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
BUG_ON(!delayed_root);
- BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
- delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);
- if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
+ if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_item->delayed_node->ins_root;
else
root = &delayed_item->delayed_node->del_root;
rb_erase_cached(&delayed_item->rb_node, root);
+ RB_CLEAR_NODE(&delayed_item->rb_node);
delayed_item->delayed_node->count--;
finish_one_item(delayed_root);
@@ -546,12 +479,11 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
}
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_delayed_item *item)
{
struct btrfs_block_rsv *src_rsv;
struct btrfs_block_rsv *dst_rsv;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
u64 num_bytes;
int ret;
@@ -571,9 +503,15 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_item",
- item->key.objectid,
+ item->delayed_node->inode_id,
num_bytes, 1);
- item->bytes_reserved = num_bytes;
+ /*
+ * For insertions we track reserved metadata space by accounting
+ * for the number of leaves that will be used, based on the delayed
+	 * node's curr_index_batch_size and index_item_leaves fields.
+ */
+ if (item->type == BTRFS_DELAYED_DELETION_ITEM)
+ item->bytes_reserved = num_bytes;
}
return ret;
@@ -594,11 +532,26 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
* to release/reserve qgroup space.
*/
trace_btrfs_space_reservation(fs_info, "delayed_item",
- item->key.objectid, item->bytes_reserved,
- 0);
+ item->delayed_node->inode_id,
+ item->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
}
+static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node,
+ unsigned int num_leaves)
+{
+ struct btrfs_fs_info *fs_info = node->root->fs_info;
+ const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves);
+
+ /* There are no space reservations during log replay, bail out. */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ return;
+
+ trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id,
+ bytes, 0);
+ btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL);
+}
+
static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -672,36 +625,78 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
}
/*
- * Insert a single delayed item or a batch of delayed items that have consecutive
- * keys if they exist.
+ * Insert a single delayed item or a batch of delayed items, as many as possible
+ * that fit in a leaf. The delayed items (dir index keys) are sorted by their key
+ * in the rbtree, and if there's a gap between two consecutive dir index items,
+ * then it means at some point we had delayed dir indexes to add but they got
+ * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them
+ * into the subvolume tree. Dir index keys also have their offsets coming from a
+ * monotonically increasing counter, so we can't get new keys with an offset that
+ * fits within a gap between delayed dir index items.
*/
static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_delayed_item *first_item)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_delayed_node *node = first_item->delayed_node;
LIST_HEAD(item_list);
struct btrfs_delayed_item *curr;
struct btrfs_delayed_item *next;
- const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+ const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_item_batch batch;
+ struct btrfs_key first_key;
+ const u32 first_data_size = first_item->data_len;
int total_size;
char *ins_data = NULL;
int ret;
+ bool continuous_keys_only = false;
+
+ lockdep_assert_held(&node->mutex);
+
+ /*
+ * During normal operation the delayed index offset is continuously
+ * increasing, so we can batch insert all items as there will not be any
+ * overlapping keys in the tree.
+ *
+ * The exception to this is log replay, where we may have interleaved
+ * offsets in the tree, so our batch needs to be continuous keys only in
+ * order to ensure we do not end up with out of order items in our leaf.
+ */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ continuous_keys_only = true;
+
+ /*
+ * For delayed items to insert, we track reserved metadata bytes based
+ * on the number of leaves that we will use.
+ * See btrfs_insert_delayed_dir_index() and
+	 * btrfs_delayed_item_reserve_metadata().
+ */
+ ASSERT(first_item->bytes_reserved == 0);
list_add_tail(&first_item->tree_list, &item_list);
- batch.total_data_size = first_item->data_len;
+ batch.total_data_size = first_data_size;
batch.nr = 1;
- total_size = first_item->data_len + sizeof(struct btrfs_item);
+ total_size = first_data_size + sizeof(struct btrfs_item);
curr = first_item;
while (true) {
int next_size;
next = __btrfs_next_delayed_item(curr);
- if (!next || !btrfs_is_continuous_delayed_item(curr, next))
+ if (!next)
break;
+ /*
+ * We cannot allow gaps in the key space if we're doing log
+ * replay.
+ */
+ if (continuous_keys_only && (next->index != curr->index + 1))
+ break;
+
+ ASSERT(next->bytes_reserved == 0);
+
next_size = next->data_len + sizeof(struct btrfs_item);
if (total_size + next_size > max_size)
break;
@@ -714,8 +709,11 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
}
if (batch.nr == 1) {
- batch.keys = &first_item->key;
- batch.data_sizes = &first_item->data_len;
+ first_key.objectid = node->inode_id;
+ first_key.type = BTRFS_DIR_INDEX_KEY;
+ first_key.offset = first_item->index;
+ batch.keys = &first_key;
+ batch.data_sizes = &first_data_size;
} else {
struct btrfs_key *ins_keys;
u32 *ins_sizes;
@@ -732,7 +730,9 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
batch.keys = ins_keys;
batch.data_sizes = ins_sizes;
list_for_each_entry(curr, &item_list, tree_list) {
- ins_keys[i] = curr->key;
+ ins_keys[i].objectid = node->inode_id;
+ ins_keys[i].type = BTRFS_DIR_INDEX_KEY;
+ ins_keys[i].offset = curr->index;
ins_sizes[i] = curr->data_len;
i++;
}
@@ -758,9 +758,41 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
*/
btrfs_release_path(path);
+ ASSERT(node->index_item_leaves > 0);
+
+ /*
+ * For normal operations we will batch an entire leaf's worth of delayed
+ * items, so if there are more items to process we can decrement
+ * index_item_leaves by 1 as we inserted 1 leaf's worth of items.
+ *
+ * However for log replay we may not have inserted an entire leaf's
+	 * worth of items, as we may not have had continuous items, so decrementing
+ * here would mess up the index_item_leaves accounting. For this case
+ * only clean up the accounting when there are no items left.
+ */
+ if (next && !continuous_keys_only) {
+ /*
+		 * We inserted one batch of items into a leaf and there are
+		 * more items to flush in a future batch. Now release one unit
+		 * of metadata space from the delayed block reserve,
+		 * corresponding to the leaf we just flushed.
+ */
+ btrfs_delayed_item_release_leaves(node, 1);
+ node->index_item_leaves--;
+ } else if (!next) {
+ /*
+ * There are no more items to insert. We can have a number of
+ * reserved leaves > 1 here - this happens when many dir index
+		 * items are added and then removed before they are flushed (file
+		 * names with a very short life that never span a transaction). So
+ * release all remaining leaves.
+ */
+ btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
+ node->index_item_leaves = 0;
+ }
+
list_for_each_entry_safe(curr, next, &item_list, tree_list) {
list_del(&curr->tree_list);
- btrfs_delayed_item_release_metadata(root, curr);
btrfs_release_delayed_item(curr);
}
out:
@@ -796,62 +828,77 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *item)
{
+ const u64 ino = item->delayed_node->inode_id;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_delayed_item *curr, *next;
- struct extent_buffer *leaf;
- struct btrfs_key key;
- struct list_head head;
- int nitems, i, last_item;
- int ret = 0;
+ struct extent_buffer *leaf = path->nodes[0];
+ LIST_HEAD(batch_list);
+ int nitems, slot, last_slot;
+ int ret;
+ u64 total_reserved_size = item->bytes_reserved;
- BUG_ON(!path->nodes[0]);
+ ASSERT(leaf != NULL);
- leaf = path->nodes[0];
+ slot = path->slots[0];
+ last_slot = btrfs_header_nritems(leaf) - 1;
+ /*
+ * Our caller always gives us a path pointing to an existing item, so
+ * this can not happen.
+ */
+ ASSERT(slot <= last_slot);
+ if (WARN_ON(slot > last_slot))
+ return -ENOENT;
- i = path->slots[0];
- last_item = btrfs_header_nritems(leaf) - 1;
- if (i > last_item)
- return -ENOENT; /* FIXME: Is errno suitable? */
+ nitems = 1;
+ curr = item;
+ list_add_tail(&curr->tree_list, &batch_list);
- next = item;
- INIT_LIST_HEAD(&head);
- btrfs_item_key_to_cpu(leaf, &key, i);
- nitems = 0;
/*
- * count the number of the dir index items that we can delete in batch
+ * Keep checking if the next delayed item matches the next item in the
+ * leaf - if so, we can add it to the batch of items to delete from the
+ * leaf.
*/
- while (btrfs_comp_cpu_keys(&next->key, &key) == 0) {
- list_add_tail(&next->tree_list, &head);
- nitems++;
+ while (slot < last_slot) {
+ struct btrfs_key key;
- curr = next;
next = __btrfs_next_delayed_item(curr);
if (!next)
break;
- if (!btrfs_is_continuous_delayed_item(curr, next))
- break;
-
- i++;
- if (i > last_item)
+ slot++;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ key.type != BTRFS_DIR_INDEX_KEY ||
+ key.offset != next->index)
break;
- btrfs_item_key_to_cpu(leaf, &key, i);
+ nitems++;
+ curr = next;
+ list_add_tail(&curr->tree_list, &batch_list);
+ total_reserved_size += curr->bytes_reserved;
}
- if (!nitems)
- return 0;
-
ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
if (ret)
- goto out;
+ return ret;
- list_for_each_entry_safe(curr, next, &head, tree_list) {
- btrfs_delayed_item_release_metadata(root, curr);
+	/* In case of BTRFS_FS_LOG_RECOVERING, items won't have reserved space */
+ if (total_reserved_size > 0) {
+ /*
+ * Check btrfs_delayed_item_reserve_metadata() to see why we
+ * don't need to release/reserve qgroup space.
+ */
+ trace_btrfs_space_reservation(fs_info, "delayed_item", ino,
+ total_reserved_size, 0);
+ btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
+ total_reserved_size, NULL);
+ }
+
+ list_for_each_entry_safe(curr, next, &batch_list, tree_list) {
list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
}
-out:
- return ret;
+ return 0;
}
static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
@@ -859,43 +906,57 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_delayed_node *node)
{
- struct btrfs_delayed_item *curr, *prev;
+ struct btrfs_key key;
int ret = 0;
-do_again:
- mutex_lock(&node->mutex);
- curr = __btrfs_first_delayed_deletion_item(node);
- if (!curr)
- goto delete_fail;
+ key.objectid = node->inode_id;
+ key.type = BTRFS_DIR_INDEX_KEY;
+
+ while (ret == 0) {
+ struct btrfs_delayed_item *item;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_first_delayed_deletion_item(node);
+ if (!item) {
+ mutex_unlock(&node->mutex);
+ break;
+ }
+
+ key.offset = item->index;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret > 0) {
+ /*
+ * There's no matching item in the leaf. This means we
+ * have already deleted this item in a past run of the
+ * delayed items. We ignore errors when running delayed
+ * items from an async context, through a work queue job
+ * running btrfs_async_run_delayed_root(), and don't
+ * release delayed items that failed to complete. This
+ * is because we will retry later, and at transaction
+ * commit time we always run delayed items and will
+ * then deal with errors if they fail to run again.
+ *
+ * So just release delayed items for which we can't find
+ * an item in the tree, and move to the next item.
+ */
+ btrfs_release_path(path);
+ btrfs_release_delayed_item(item);
+ ret = 0;
+ } else if (ret == 0) {
+ ret = btrfs_batch_delete_items(trans, root, path, item);
+ btrfs_release_path(path);
+ }
- ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
- if (ret < 0)
- goto delete_fail;
- else if (ret > 0) {
/*
- * can't find the item which the node points to, so this node
- * is invalid, just drop it.
+		 * We unlock and relock on each iteration to prevent blocking
+		 * other tasks for too long while we are being run from
+ * the async context (work queue job). Those tasks are typically
+ * running system calls like creat/mkdir/rename/unlink/etc which
+ * need to add delayed items to this delayed node.
*/
- prev = curr;
- curr = __btrfs_next_delayed_item(prev);
- btrfs_release_delayed_item(prev);
- ret = 0;
- btrfs_release_path(path);
- if (curr) {
- mutex_unlock(&node->mutex);
- goto do_again;
- } else
- goto delete_fail;
+ mutex_unlock(&node->mutex);
}
- btrfs_batch_delete_items(trans, root, path, curr);
- btrfs_release_path(path);
- mutex_unlock(&node->mutex);
- goto do_again;
-
-delete_fail:
- btrfs_release_path(path);
- mutex_unlock(&node->mutex);
return ret;
}
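
The unlock-per-iteration pattern described in the comment above can be sketched in plain pthreads; this toy drain (illustrative names, not kernel code) gives producers a window to take the mutex between every unit of work:

#include <pthread.h>
#include <stdlib.h>

/* Toy delayed node: a mutex-protected singly linked list of items. */
struct item { struct item *next; };
struct node { pthread_mutex_t mutex; struct item *head; };

/*
 * Drain items one at a time, dropping the mutex between iterations so
 * concurrent producers (think creat/mkdir/unlink adding delayed items)
 * never wait for the whole drain to finish.
 */
static void drain(struct node *n)
{
    for (;;) {
        struct item *it;

        pthread_mutex_lock(&n->mutex);
        it = n->head;
        if (!it) {
            pthread_mutex_unlock(&n->mutex);
            break;
        }
        n->head = it->next;              /* one bounded unit of work */
        pthread_mutex_unlock(&n->mutex); /* window for other tasks */
        free(it);
    }
}
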
@@ -1354,24 +1415,28 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
struct btrfs_disk_key *disk_key, u8 type,
u64 index)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_item *delayed_item;
struct btrfs_dir_item *dir_item;
+ bool reserve_leaf_space;
+ u32 data_len;
int ret;
delayed_node = btrfs_get_or_create_delayed_node(dir);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
- delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
+ delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len,
+ delayed_node,
+ BTRFS_DELAYED_INSERTION_ITEM);
if (!delayed_item) {
ret = -ENOMEM;
goto release_node;
}
- delayed_item->key.objectid = btrfs_ino(dir);
- delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
- delayed_item->key.offset = index;
+ delayed_item->index = index;
dir_item = (struct btrfs_dir_item *)delayed_item->data;
dir_item->location = *disk_key;
@@ -1381,15 +1446,51 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
btrfs_set_stack_dir_type(dir_item, type);
memcpy((char *)(dir_item + 1), name, name_len);
- ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item);
- /*
- * we have reserved enough space when we start a new transaction,
- * so reserving metadata failure is impossible
- */
- BUG_ON(ret);
+ data_len = delayed_item->data_len + sizeof(struct btrfs_item);
mutex_lock(&delayed_node->mutex);
- ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
+
+ if (delayed_node->index_item_leaves == 0 ||
+ delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
+ delayed_node->curr_index_batch_size = data_len;
+ reserve_leaf_space = true;
+ } else {
+ delayed_node->curr_index_batch_size += data_len;
+ reserve_leaf_space = false;
+ }
+
+ if (reserve_leaf_space) {
+ ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item);
+ /*
+ * Space was reserved for a dir index item insertion when we
+ * started the transaction, so getting a failure here should be
+ * impossible.
+ */
+ if (WARN_ON(ret)) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_item(delayed_item);
+ goto release_node;
+ }
+
+ delayed_node->index_item_leaves++;
+ } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
+ const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ /*
+ * Adding the new dir index item does not require touching another
+ * leaf, so we can release 1 unit of metadata that was previously
+ * reserved when starting the transaction. This applies only to
+ * the case where we had a transaction start and excludes the
+ * transaction join case (when replaying log trees).
+ */
+ trace_btrfs_space_reservation(fs_info, "transaction",
+ trans->transid, bytes, 0);
+ btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
+ ASSERT(trans->bytes_reserved >= bytes);
+ trans->bytes_reserved -= bytes;
+ }
+
+ ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
@@ -1406,19 +1507,48 @@ release_node:
static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_node *node,
- struct btrfs_key *key)
+ u64 index)
{
struct btrfs_delayed_item *item;
mutex_lock(&node->mutex);
- item = __btrfs_lookup_delayed_insertion_item(node, key);
+ item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
if (!item) {
mutex_unlock(&node->mutex);
return 1;
}
- btrfs_delayed_item_release_metadata(node->root, item);
+ /*
+ * For delayed items to insert, we track reserved metadata bytes based
+ * on the number of leaves that we will use.
+ * See btrfs_insert_delayed_dir_index() and
+ * btrfs_delayed_item_reserve_metadata().
+ */
+ ASSERT(item->bytes_reserved == 0);
+ ASSERT(node->index_item_leaves > 0);
+
+ /*
+ * If there's only one leaf reserved, we can decrement this item from the
+ * current batch, otherwise we can not because we don't know which leaf
+ * it belongs to. With the current limit on delayed items, we rarely
+ * accumulate enough dir index items to fill more than one leaf (even
+ * when using a leaf size of 4K).
+ */
+ if (node->index_item_leaves == 1) {
+ const u32 data_len = item->data_len + sizeof(struct btrfs_item);
+
+ ASSERT(node->curr_index_batch_size >= data_len);
+ node->curr_index_batch_size -= data_len;
+ }
+
btrfs_release_delayed_item(item);
+
+ /* If we now have no more dir index items, we can release all leaves. */
+ if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) {
+ btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
+ node->index_item_leaves = 0;
+ }
+
mutex_unlock(&node->mutex);
return 0;
}
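
A minimal model of this leaf accounting, with made-up field names standing in for the delayed node's counters:

#include <assert.h>

/* Illustrative model of the index-item leaf accounting (not kernel code). */
struct dir_node {
    unsigned int index_item_leaves;     /* leaves reserved for insertions */
    unsigned int curr_index_batch_size; /* bytes pending in the last leaf */
    unsigned int nr_items;              /* delayed insertion items queued */
};

static void drop_insertion_item(struct dir_node *n, unsigned int data_len)
{
    assert(n->index_item_leaves > 0);

    /*
     * With one leaf we know the item belongs to the current batch; with
     * more we cannot tell which leaf it was accounted against, so leave
     * the batch size alone (it is an upper bound anyway).
     */
    if (n->index_item_leaves == 1) {
        assert(n->curr_index_batch_size >= data_len);
        n->curr_index_batch_size -= data_len;
    }

    if (--n->nr_items == 0) {
        /* No items left: all reserved leaves can be released. */
        n->index_item_leaves = 0;
        n->curr_index_batch_size = 0;
    }
}
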
@@ -1428,31 +1558,25 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
{
struct btrfs_delayed_node *node;
struct btrfs_delayed_item *item;
- struct btrfs_key item_key;
int ret;
node = btrfs_get_or_create_delayed_node(dir);
if (IS_ERR(node))
return PTR_ERR(node);
- item_key.objectid = btrfs_ino(dir);
- item_key.type = BTRFS_DIR_INDEX_KEY;
- item_key.offset = index;
-
- ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node,
- &item_key);
+ ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index);
if (!ret)
goto end;
- item = btrfs_alloc_delayed_item(0);
+ item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
if (!item) {
ret = -ENOMEM;
goto end;
}
- item->key = item_key;
+ item->index = index;
- ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item);
+ ret = btrfs_delayed_item_reserve_metadata(trans, item);
/*
* we have reserved enough space when we start a new transaction,
* so reserving metadata failure is impossible.
@@ -1465,7 +1589,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
}
mutex_lock(&node->mutex);
- ret = __btrfs_add_delayed_deletion_item(node, item);
+ ret = __btrfs_add_delayed_item(node, item);
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
@@ -1581,9 +1705,9 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
int ret = 0;
list_for_each_entry(curr, del_list, readdir_list) {
- if (curr->key.offset > index)
+ if (curr->index > index)
break;
- if (curr->key.offset == index) {
+ if (curr->index == index) {
ret = 1;
break;
}
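
Because the deletion list is sorted by index, the scan above can stop early; a self-contained sketch of the same check:

#include <stdbool.h>

struct del_item { unsigned long long index; struct del_item *next; };

/*
 * The list is kept sorted by index, so the walk can stop at the first
 * entry greater than the target, the same early break as above.
 */
static bool should_delete(const struct del_item *head, unsigned long long index)
{
    for (const struct del_item *it = head; it; it = it->next) {
        if (it->index > index)
            break;
        if (it->index == index)
            return true;
    }
    return false;
}
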
@@ -1617,13 +1741,13 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
list_del(&curr->readdir_list);
- if (curr->key.offset < ctx->pos) {
+ if (curr->index < ctx->pos) {
if (refcount_dec_and_test(&curr->refs))
kfree(curr);
continue;
}
- ctx->pos = curr->key.offset;
+ ctx->pos = curr->index;
di = (struct btrfs_dir_item *)curr->data;
name = (char *)(di + 1);
@@ -1833,12 +1957,17 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
mutex_lock(&delayed_node->mutex);
curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
while (curr_item) {
- btrfs_delayed_item_release_metadata(root, curr_item);
prev_item = curr_item;
curr_item = __btrfs_next_delayed_item(prev_item);
btrfs_release_delayed_item(prev_item);
}
+ if (delayed_node->index_item_leaves > 0) {
+ btrfs_delayed_item_release_leaves(delayed_node,
+ delayed_node->index_item_leaves);
+ delayed_node->index_item_leaves = 0;
+ }
+
curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
while (curr_item) {
btrfs_delayed_item_release_metadata(root, curr_item);
@@ -1918,3 +2047,113 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
}
}
+void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+
+ node = btrfs_get_delayed_node(inode);
+ if (!node)
+ return;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_first_delayed_insertion_item(node);
+ while (item) {
+ /*
+ * It's possible that the item is already in a log list. This
+ * can happen in case two tasks are trying to log the same
+ * directory. For example if we have tasks A and task B:
+ *
+ * Task A collected the delayed items into a log list while
+ * under the inode's log_mutex (at btrfs_log_inode()), but it
+ * only releases the items after logging the inodes they point
+ * to (if they are new inodes), which happens after unlocking
+ * the log mutex;
+ *
+ * Task B enters btrfs_log_inode() and acquires the log_mutex
+ * of the same directory inode, before task B releases the
+ * delayed items. This can happen for example when logging some
+ * inode we need to trigger logging of its parent directory, so
+ * logging two files that have the same parent directory can
+ * lead to this.
+ *
+ * If this happens, just ignore delayed items already in a log
+ * list. All the tasks logging the directory are under a log
+ * transaction and whichever finishes first can not sync the log
+ * before the other completes and leaves the log transaction.
+ */
+ if (!item->logged && list_empty(&item->log_list)) {
+ refcount_inc(&item->refs);
+ list_add_tail(&item->log_list, ins_list);
+ }
+ item = __btrfs_next_delayed_item(item);
+ }
+
+ item = __btrfs_first_delayed_deletion_item(node);
+ while (item) {
+ /* It may be non-empty, for the same reason mentioned above. */
+ if (!item->logged && list_empty(&item->log_list)) {
+ refcount_inc(&item->refs);
+ list_add_tail(&item->log_list, del_list);
+ }
+ item = __btrfs_next_delayed_item(item);
+ }
+ mutex_unlock(&node->mutex);
+
+ /*
+ * We are called during inode logging, which means the inode is in use
+ * and cannot be evicted before we finish logging the inode. So we never
+ * have the last reference on the delayed inode.
+ * Also, we don't use btrfs_release_delayed_node() because that would
+ * requeue the delayed inode (change its order in the list of prepared
+ * nodes) and we don't want to do such change because we don't create or
+ * delete delayed items.
+ */
+ ASSERT(refcount_read(&node->refs) > 1);
+ refcount_dec(&node->refs);
+}
+
+void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+ struct btrfs_delayed_item *next;
+
+ node = btrfs_get_delayed_node(inode);
+ if (!node)
+ return;
+
+ mutex_lock(&node->mutex);
+
+ list_for_each_entry_safe(item, next, ins_list, log_list) {
+ item->logged = true;
+ list_del_init(&item->log_list);
+ if (refcount_dec_and_test(&item->refs))
+ kfree(item);
+ }
+
+ list_for_each_entry_safe(item, next, del_list, log_list) {
+ item->logged = true;
+ list_del_init(&item->log_list);
+ if (refcount_dec_and_test(&item->refs))
+ kfree(item);
+ }
+
+ mutex_unlock(&node->mutex);
+
+ /*
+ * We are called during inode logging, which means the inode is in use
+ * and cannot be evicted before we finish logging the inode. So we never
+ * have the last reference on the delayed inode.
+ * Also, we don't use btrfs_release_delayed_node() because that would
+ * requeue the delayed inode (change its order in the list of prepared
+ * nodes) and we don't want to do such change because we don't create or
+ * delete delayed items.
+ */
+ ASSERT(refcount_read(&node->refs) > 1);
+ refcount_dec(&node->refs);
+}
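
The get/put pairing can be modeled with a toy refcounted list; this sketch (not the kernel API) shows why the "get" side takes a reference and the "put" side both marks items logged and may free them:

#include <stdbool.h>
#include <stdlib.h>

/* Toy refcounted log-list entry; illustrative only. */
struct litem {
    int refs;
    bool logged;
    struct litem *next;
};

/* "Get" side: pin every unlogged item so it survives the log attempt. */
static void log_get(struct litem *head)
{
    for (struct litem *it = head; it; it = it->next)
        if (!it->logged)
            it->refs++;
}

/* "Put" side: mark items logged and drop the references taken above. */
static void log_put(struct litem *head)
{
    struct litem *it = head;

    while (it) {
        /* Save the link first, like list_for_each_entry_safe(). */
        struct litem *next = it->next;

        it->logged = true;
        if (--it->refs == 0)
            free(it);
        it = next;
    }
}
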
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index b2412160c5bc..0163ca637a96 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -16,9 +16,10 @@
#include <linux/refcount.h>
#include "ctree.h"
-/* types of the delayed item */
-#define BTRFS_DELAYED_INSERTION_ITEM 1
-#define BTRFS_DELAYED_DELETION_ITEM 2
+enum btrfs_delayed_item_type {
+ BTRFS_DELAYED_INSERTION_ITEM,
+ BTRFS_DELAYED_DELETION_ITEM
+};
struct btrfs_delayed_root {
spinlock_t lock;
@@ -58,18 +59,42 @@ struct btrfs_delayed_node {
u64 index_cnt;
unsigned long flags;
int count;
+ /*
+ * The size of the next batch of dir index items to insert (if this
+ * node is from a directory inode). Protected by @mutex.
+ */
+ u32 curr_index_batch_size;
+ /*
+ * Number of leaves reserved for inserting dir index items (if this
+ * node belongs to a directory inode). This may be larger than the
+ * actual number of leaves we end up using. Protected by @mutex.
+ */
+ u32 index_item_leaves;
};
struct btrfs_delayed_item {
struct rb_node rb_node;
- struct btrfs_key key;
+ /* Offset value of the corresponding dir index key. */
+ u64 index;
struct list_head tree_list; /* used for batch insert/delete items */
struct list_head readdir_list; /* used for readdir items */
+ /*
+ * Used when logging a directory.
+ * Insertions and deletions to this list are protected by the parent
+ * delayed node's mutex.
+ */
+ struct list_head log_list;
u64 bytes_reserved;
struct btrfs_delayed_node *delayed_node;
refcount_t refs;
- int ins_or_del;
- u32 data_len;
+ enum btrfs_delayed_item_type type:8;
+ /*
+ * Track if this delayed item was already logged.
+ * Protected by the mutex of the parent delayed inode.
+ */
+ bool logged;
+ /* The maximum leaf size is 64K, so u16 is more than enough. */
+ u16 data_len;
char data[];
};
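
The 'char data[]' member above is a C flexible array member; a hedged sketch of how such an item is typically allocated in one shot, with userspace malloc standing in for the kernel allocator:

#include <stdlib.h>
#include <string.h>

/*
 * Simplified delayed item: fixed header plus inline payload, as with the
 * 'char data[]' flexible array member above.
 */
struct ditem {
    unsigned long long index;
    unsigned short data_len; /* leaves are <= 64K, so u16 suffices */
    char data[];
};

static struct ditem *alloc_ditem(const void *payload, unsigned short len)
{
    /* One allocation covers both the header and the payload. */
    struct ditem *it = malloc(sizeof(*it) + len);

    if (!it)
        return NULL;
    it->data_len = len;
    memcpy(it->data, payload, len);
    return it;
}
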
@@ -133,6 +158,14 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
struct list_head *ins_list);
+/* Used during directory logging. */
+void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
+ struct list_head *ins_list,
+ struct list_head *del_list);
+
/* for init */
int __init btrfs_delayed_inode_init(void);
void __cold btrfs_delayed_inode_exit(void);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 4176df149d04..36a3debe9493 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -132,7 +132,7 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
spin_lock(&delayed_rsv->lock);
delayed_rsv->size += num_bytes;
- delayed_rsv->full = 0;
+ delayed_rsv->full = false;
spin_unlock(&delayed_rsv->lock);
trans->delayed_ref_updates = 0;
}
@@ -175,7 +175,7 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
if (num_bytes)
delayed_refs_rsv->reserved += num_bytes;
if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
- delayed_refs_rsv->full = 1;
+ delayed_refs_rsv->full = true;
spin_unlock(&delayed_refs_rsv->lock);
if (num_bytes)
@@ -930,7 +930,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
- BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
@@ -1103,8 +1102,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
return -ENOMEM;
init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
- BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
- false);
+ BTRFS_UPDATE_DELAYED_HEAD, false, false);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 91a3aabad150..d6304b690ec4 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -58,7 +58,6 @@ struct btrfs_delayed_extent_op {
u8 level;
bool update_key;
bool update_flags;
- bool is_data;
u64 flags_to_set;
};
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 71fd99b48283..61e58066b5fd 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -165,7 +165,7 @@ no_valid_dev_replace_entry_found:
*/
if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
- "replace devid present without an active replace item");
+"replace without active item, run 'device scan --forget' on the target device");
ret = -EUCLEAN;
} else {
dev_replace->srcdev = NULL;
@@ -474,6 +474,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_block_group *cache;
struct btrfs_trans_handle *trans;
+ int iter_ret = 0;
int ret = 0;
u64 chunk_offset;
@@ -524,29 +525,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto free_path;
- if (ret > 0) {
- if (path->slots[0] >=
- btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto free_path;
- if (ret > 0) {
- ret = 0;
- goto free_path;
- }
- } else {
- ret = 0;
- }
- }
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
-
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != src_dev->devid)
break;
@@ -557,30 +537,20 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
if (found_key.offset < key.offset)
break;
- dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
+ dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!cache)
- goto skip;
-
- spin_lock(&cache->lock);
- cache->to_copy = 1;
- spin_unlock(&cache->lock);
+ continue;
+ set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
btrfs_put_block_group(cache);
-
-skip:
- ret = btrfs_next_item(root, path);
- if (ret != 0) {
- if (ret > 0)
- ret = 0;
- break;
- }
}
+ if (iter_ret < 0)
+ ret = iter_ret;
-free_path:
btrfs_free_path(path);
unlock:
mutex_unlock(&fs_info->chunk_mutex);
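
btrfs_for_each_slot() replaces the open-coded search/next-leaf loop; a simplified userspace analogue of such an iterator macro (not the kernel's actual definition) shows the shape the callers are left with:

#include <stddef.h>

/* Toy "leaf": an array of keys; iteration state is just an index. */
struct key { unsigned long long objectid, offset; int type; };

/*
 * Simplified analogue of a slot iterator: it hides positioning and
 * advancing, leaving the body with only the per-key logic.
 */
#define for_each_slot(keys, nritems, i, found) \
    for ((i) = 0; (i) < (nritems) && ((found) = &(keys)[(i)], 1); (i)++)

static unsigned long long sum_offsets(const struct key *keys, size_t nritems,
                                      unsigned long long objectid)
{
    unsigned long long total = 0;
    const struct key *found;
    size_t i;

    for_each_slot(keys, nritems, i, found) {
        if (found->objectid != objectid)
            break; /* keys are sorted, nothing more to see */
        total += found->offset;
    }
    return total;
}
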
@@ -604,7 +574,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
return true;
spin_lock(&cache->lock);
- if (cache->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
spin_unlock(&cache->lock);
return true;
}
@@ -614,7 +584,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
ASSERT(!IS_ERR(em));
map = em->map_lookup;
- num_extents = cur_extent = 0;
+ num_extents = 0;
+ cur_extent = 0;
for (i = 0; i < map->num_stripes; i++) {
/* We have more device extent to copy */
if (srcdev != map->stripes[i].dev)
@@ -636,9 +607,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
}
/* Last stripe on this device */
- spin_lock(&cache->lock);
- cache->to_copy = 0;
- spin_unlock(&cache->lock);
+ clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
return true;
}
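
The runtime_flags conversion packs independent booleans into one word updated with atomic bit operations; a C11 sketch of the same pattern (the kernel uses set_bit()/test_bit()/clear_bit() instead):

#include <stdatomic.h>
#include <stdbool.h>

/*
 * Independent single-bit flags packed into one atomically updated word,
 * removing the need for a spinlock around each boolean.
 */
enum { FLAG_TO_COPY, FLAG_REMOVED };

struct group { atomic_ulong flags; };

static void set_flag(struct group *g, int bit)
{
    atomic_fetch_or(&g->flags, 1UL << bit);
}

static void clear_flag(struct group *g, int bit)
{
    atomic_fetch_and(&g->flags, ~(1UL << bit));
}

static bool test_flag(struct group *g, int bit)
{
    return atomic_load(&g->flags) & (1UL << bit);
}
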
@@ -734,7 +703,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- /* Commit dev_replace state and reserve 1 item for it. */
+ /*
+ * Commit dev_replace state and reserve 1 item for it.
+ * This is crucial to ensure we won't miss copying extents for new block
+ * groups that are allocated after we started the device replace, and
+ * must be done after setting up the device replace state.
+ */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -876,6 +850,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *tgt_device;
struct btrfs_device *src_device;
struct btrfs_root *root = fs_info->tree_root;
@@ -925,12 +900,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
WARN_ON(ret);
/* Prevent write_all_supers() during the finishing procedure */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
/* Prevent new chunks being allocated on the source device */
mutex_lock(&fs_info->chunk_mutex);
if (!list_empty(&src_device->post_commit_list)) {
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
mutex_unlock(&fs_info->chunk_mutex);
} else {
break;
@@ -967,7 +942,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
@@ -996,8 +971,8 @@ error:
btrfs_assign_next_active_device(src_device, tgt_device);
- list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
- fs_info->fs_devices->rw_devices++;
+ list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
+ fs_devices->rw_devices++;
up_write(&dev_replace->rwsem);
btrfs_rm_dev_replace_blocked(fs_info);
@@ -1020,7 +995,7 @@ error:
* belong to this filesystem.
*/
mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&fs_devices->device_list_mutex);
/* replace the sysfs entry */
btrfs_sysfs_remove_device(src_device);
@@ -1149,8 +1124,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
up_write(&dev_replace->rwsem);
/* Scrub for replace must not be running in suspended state */
- ret = btrfs_scrub_cancel(fs_info);
- ASSERT(ret != -ENOTCONN);
+ btrfs_scrub_cancel(fs_info);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
@@ -1309,11 +1283,6 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
return 1;
}
-void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
-{
- percpu_counter_inc(&fs_info->dev_replace.bio_counter);
-}
-
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 3911049a5f23..6084b313056a 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -7,6 +7,10 @@
#define BTRFS_DEV_REPLACE_H
struct btrfs_ioctl_dev_replace_args;
+struct btrfs_fs_info;
+struct btrfs_trans_handle;
+struct btrfs_dev_replace;
+struct btrfs_block_group;
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 3b532bab0755..72fb2c518a2b 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -325,36 +325,15 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
struct btrfs_path *path, u64 dirid,
const char *name, int name_len)
{
- struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
- u32 nritems;
int ret;
key.objectid = dirid;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- return ERR_PTR(ret);
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
-
- while (1) {
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_for_each_slot(root, &key, &key, path, ret) {
if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
break;
@@ -362,10 +341,12 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
name, name_len);
if (di)
return di;
-
- path->slots[0]++;
}
- return NULL;
+ /* Adjust return code if the key was not found in the next leaf. */
+ if (ret > 0)
+ ret = 0;
+
+ return ERR_PTR(ret);
}
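
The ERR_PTR()/PTR_ERR() convention used in the new return path encodes a negative errno directly in the pointer value; a userspace model of the kernel's scheme (MAX_ERRNO is 4095):

#include <stdbool.h>
#include <stdint.h>

static inline void *ERR_PTR(long err)
{
    return (void *)(intptr_t)err;
}

static inline long PTR_ERR(const void *ptr)
{
    return (long)(intptr_t)ptr;
}

static inline bool IS_ERR(const void *ptr)
{
    /* Errnos occupy the top 4095 values of the address space. */
    return (uintptr_t)ptr >= (uintptr_t)-4095;
}

This lets a single return value carry either a valid btrfs_dir_item pointer or an error code, so callers no longer have to distinguish NULL from a separate ret variable.
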
struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b30309f187cf..a2da9313c694 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -51,7 +51,6 @@
BTRFS_SUPER_FLAG_METADUMP |\
BTRFS_SUPER_FLAG_METADUMP_V2)
-static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
@@ -64,40 +63,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
-/*
- * btrfs_end_io_wq structs are used to do processing in task context when an IO
- * is complete. This is used during reads to verify checksums, and it is used
- * by writes to insert metadata for new file extents after IO is complete.
- */
-struct btrfs_end_io_wq {
- struct bio *bio;
- bio_end_io_t *end_io;
- void *private;
- struct btrfs_fs_info *info;
- blk_status_t status;
- enum btrfs_wq_endio_type metadata;
- struct btrfs_work work;
-};
-
-static struct kmem_cache *btrfs_end_io_wq_cache;
-
-int __init btrfs_end_io_wq_init(void)
-{
- btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
- sizeof(struct btrfs_end_io_wq),
- 0,
- SLAB_MEM_SPREAD,
- NULL);
- if (!btrfs_end_io_wq_cache)
- return -ENOMEM;
- return 0;
-}
-
-void __cold btrfs_end_io_wq_exit(void)
-{
- kmem_cache_destroy(btrfs_end_io_wq_cache);
-}
-
static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
if (fs_info->csum_shash)
@@ -122,88 +87,6 @@ struct async_submit_bio {
};
/*
- * Lockdep class keys for extent_buffer->lock's in this root. For a given
- * eb, the lockdep key is determined by the btrfs_root it belongs to and
- * the level the eb occupies in the tree.
- *
- * Different roots are used for different purposes and may nest inside each
- * other and they require separate keysets. As lockdep keys should be
- * static, assign keysets according to the purpose of the root as indicated
- * by btrfs_root->root_key.objectid. This ensures that all special purpose
- * roots have separate keysets.
- *
- * Lock-nesting across peer nodes is always done with the immediate parent
- * node locked thus preventing deadlock. As lockdep doesn't know this, use
- * subclass to avoid triggering lockdep warning in such cases.
- *
- * The key is set by the readpage_end_io_hook after the buffer has passed
- * csum validation but before the pages are unlocked. It is also set by
- * btrfs_init_new_buffer on freshly allocated blocks.
- *
- * We also add a check to make sure the highest level of the tree is the
- * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
- * needs update as well.
- */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# if BTRFS_MAX_LEVEL != 8
-# error
-# endif
-
-#define DEFINE_LEVEL(stem, level) \
- .names[level] = "btrfs-" stem "-0" #level,
-
-#define DEFINE_NAME(stem) \
- DEFINE_LEVEL(stem, 0) \
- DEFINE_LEVEL(stem, 1) \
- DEFINE_LEVEL(stem, 2) \
- DEFINE_LEVEL(stem, 3) \
- DEFINE_LEVEL(stem, 4) \
- DEFINE_LEVEL(stem, 5) \
- DEFINE_LEVEL(stem, 6) \
- DEFINE_LEVEL(stem, 7)
-
-static struct btrfs_lockdep_keyset {
- u64 id; /* root objectid */
- /* Longest entry: btrfs-free-space-00 */
- char names[BTRFS_MAX_LEVEL][20];
- struct lock_class_key keys[BTRFS_MAX_LEVEL];
-} btrfs_lockdep_keysets[] = {
- { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") },
- { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") },
- { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") },
- { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") },
- { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") },
- { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") },
- { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") },
- { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") },
- { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") },
- { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
- { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
- { .id = 0, DEFINE_NAME("tree") },
-};
-
-#undef DEFINE_LEVEL
-#undef DEFINE_NAME
-
-void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
- int level)
-{
- struct btrfs_lockdep_keyset *ks;
-
- BUG_ON(level >= ARRAY_SIZE(ks->keys));
-
- /* find the matching keyset, id 0 is the default entry */
- for (ks = btrfs_lockdep_keysets; ks->id; ks++)
- if (ks->id == objectid)
- break;
-
- lockdep_set_class_and_name(&eb->lock,
- &ks->keys[level], ks->names[level]);
-}
-
-#endif
-
-/*
* Compute the csum of a btree block and store the result to provided buffer.
*/
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
@@ -248,22 +131,21 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (atomic)
return -EAGAIN;
- lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
+ lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state);
if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
goto out;
}
btrfs_err_rl(eb->fs_info,
- "parent transid verify failed on %llu wanted %llu found %llu",
- eb->start,
+"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
+ eb->start, eb->read_mirror,
parent_transid, btrfs_header_generation(eb));
ret = 1;
clear_extent_buffer_uptodate(eb);
out:
- unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
- &cached_state);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+ &cached_state);
return ret;
}
@@ -374,9 +256,9 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
* @level: expected level, mandatory check
* @first_key: expected key of first slot, skip check if NULL
*/
-static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
- u64 parent_transid, int level,
- struct btrfs_key *first_key)
+int btrfs_read_extent_buffer(struct extent_buffer *eb,
+ u64 parent_transid, int level,
+ struct btrfs_key *first_key)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
@@ -519,7 +401,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
u64 found_start;
struct extent_buffer *eb;
- if (fs_info->sectorsize < PAGE_SIZE)
+ if (fs_info->nodesize < PAGE_SIZE)
return csum_dirty_subpage_buffers(fs_info, bvec);
eb = (struct extent_buffer *)page->private;
@@ -587,21 +469,23 @@ static int validate_extent_buffer(struct extent_buffer *eb)
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
- eb->start, found_start);
+ btrfs_err_rl(fs_info,
+ "bad tree block start, mirror %u want %llu have %llu",
+ eb->read_mirror, eb->start, found_start);
ret = -EIO;
goto out;
}
if (check_tree_block_fsid(eb)) {
- btrfs_err_rl(fs_info, "bad fsid on block %llu",
- eb->start);
+ btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
+ eb->start, eb->read_mirror);
ret = -EIO;
goto out;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
- btrfs_err(fs_info, "bad tree block level %d on %llu",
- (int)btrfs_header_level(eb), eb->start);
+ btrfs_err(fs_info,
+ "bad tree block level, mirror %u level %d on logical %llu",
+ eb->read_mirror, btrfs_header_level(eb), eb->start);
ret = -EIO;
goto out;
}
@@ -612,8 +496,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
- "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
- eb->start,
+"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
+ eb->start, eb->read_mirror,
CSUM_FMT_VALUE(csum_size, header_csum),
CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb));
@@ -638,8 +522,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
set_extent_buffer_uptodate(eb);
else
btrfs_err(fs_info,
- "block=%llu read time tree block corruption detected",
- eb->start);
+ "read time tree block corruption detected on logical %llu mirror %u",
+ eb->start, eb->read_mirror);
out:
return ret;
}
@@ -704,7 +588,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
ASSERT(page->private);
- if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return validate_subpage_buffer(page, start, end, mirror);
eb = (struct extent_buffer *)page->private;
@@ -740,58 +624,6 @@ err:
return ret;
}
-static void end_workqueue_bio(struct bio *bio)
-{
- struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
- struct btrfs_fs_info *fs_info;
- struct btrfs_workqueue *wq;
-
- fs_info = end_io_wq->info;
- end_io_wq->status = bio->bi_status;
-
- if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
- wq = fs_info->endio_meta_write_workers;
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
- wq = fs_info->endio_freespace_worker;
- else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- wq = fs_info->endio_raid56_workers;
- else
- wq = fs_info->endio_write_workers;
- } else {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- wq = fs_info->endio_raid56_workers;
- else if (end_io_wq->metadata)
- wq = fs_info->endio_meta_workers;
- else
- wq = fs_info->endio_workers;
- }
-
- btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
- btrfs_queue_work(wq, &end_io_wq->work);
-}
-
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- enum btrfs_wq_endio_type metadata)
-{
- struct btrfs_end_io_wq *end_io_wq;
-
- end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
- if (!end_io_wq)
- return BLK_STS_RESOURCE;
-
- end_io_wq->private = bio->bi_private;
- end_io_wq->end_io = bio->bi_end_io;
- end_io_wq->info = info;
- end_io_wq->status = 0;
- end_io_wq->bio = bio;
- end_io_wq->metadata = metadata;
-
- bio->bi_private = end_io_wq;
- bio->bi_end_io = end_workqueue_bio;
- return 0;
-}
-
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
@@ -814,17 +646,14 @@ static void run_one_async_start(struct btrfs_work *work)
*/
static void run_one_async_done(struct btrfs_work *work)
{
- struct async_submit_bio *async;
- struct inode *inode;
- blk_status_t ret;
-
- async = container_of(work, struct async_submit_bio, work);
- inode = async->inode;
+ struct async_submit_bio *async =
+ container_of(work, struct async_submit_bio, work);
+ struct inode *inode = async->inode;
+ struct btrfs_bio *bbio = btrfs_bio(async->bio);
/* If an error occurred we just want to clean up the bio and move on */
if (async->status) {
- async->bio->bi_status = async->status;
- bio_endio(async->bio);
+ btrfs_bio_end_io(bbio, async->status);
return;
}
@@ -834,11 +663,7 @@ static void run_one_async_done(struct btrfs_work *work)
* This changes nothing when cgroups aren't in use.
*/
async->bio->bi_opf |= REQ_CGROUP_PUNT;
- ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
- if (ret) {
- async->bio->bi_status = ret;
- bio_endio(async->bio);
- }
+ btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -849,17 +674,23 @@ static void run_one_async_free(struct btrfs_work *work)
kfree(async);
}
-blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 dio_file_offset,
- extent_submit_bio_start_t *submit_bio_start)
+/*
+ * Submit bio to an async queue.
+ *
+ * Return:
+ * - true if the work has been successfully submitted
+ * - false in case of error
+ */
+bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset,
+ extent_submit_bio_start_t *submit_bio_start)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
if (!async)
- return BLK_STS_RESOURCE;
+ return false;
async->inode = inode;
async->bio = bio;
@@ -874,10 +705,10 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
async->status = 0;
if (op_is_sync(bio->bi_opf))
- btrfs_set_work_high_priority(&async->work);
-
- btrfs_queue_work(fs_info->workers, &async->work);
- return 0;
+ btrfs_queue_work(fs_info->hipri_workers, &async->work);
+ else
+ btrfs_queue_work(fs_info->workers, &async->work);
+ return true;
}
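
The bool return lets callers treat async submission as best-effort; a placeholder sketch of the resulting "try async, otherwise do it inline" call shape (names are hypothetical, not btrfs functions):

#include <stdbool.h>
#include <stdio.h>

static bool try_async_submit(int work)
{
    (void)work;
    return false; /* e.g. allocation of the async context failed */
}

static void submit_sync(int work)
{
    printf("submitted %d synchronously\n", work);
}

static void submit(int work)
{
    /* On success the async worker owns the work; nothing more to do. */
    if (try_async_submit(work))
        return;
    submit_sync(work);
}

int main(void)
{
    submit(42);
    return 0;
}
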
static blk_status_t btree_csum_one_bio(struct bio *bio)
@@ -903,7 +734,7 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
{
/*
* when we're called for a write, we're already in the async
- * submission context. Just jump into btrfs_map_bio
+ * submission context. Just jump into btrfs_submit_bio.
*/
return btree_csum_one_bio(bio);
}
@@ -920,69 +751,59 @@ static bool should_async_write(struct btrfs_fs_info *fs_info,
return true;
}
-blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_bio *bbio = btrfs_bio(bio);
blk_status_t ret;
+ bio->bi_opf |= REQ_META;
+
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
- /*
- * called for a read, do the setup so that checksum validation
- * can happen in the async kernel threads
- */
- ret = btrfs_bio_wq_end_io(fs_info, bio,
- BTRFS_WQ_ENDIO_METADATA);
- if (ret)
- goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- } else if (!should_async_write(fs_info, BTRFS_I(inode))) {
- ret = btree_csum_one_bio(bio);
- if (ret)
- goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- } else {
- /*
- * kthread helpers are used to submit writes so that
- * checksumming can happen in parallel across all CPUs
- */
- ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
- 0, btree_submit_bio_start);
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+ return;
}
- if (ret)
- goto out_w_error;
- return 0;
+ /*
+ * Kthread helpers are used to submit writes so that checksumming can
+ * happen in parallel across all CPUs.
+ */
+ if (should_async_write(fs_info, BTRFS_I(inode)) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start))
+ return;
-out_w_error:
- bio->bi_status = ret;
- bio_endio(bio);
- return ret;
+ ret = btree_csum_one_bio(bio);
+ if (ret) {
+ btrfs_bio_end_io(bbio, ret);
+ return;
+ }
+
+ btrfs_submit_bio(fs_info, bio, mirror_num);
}
#ifdef CONFIG_MIGRATION
-static int btree_migratepage(struct address_space *mapping,
- struct page *newpage, struct page *page,
- enum migrate_mode mode)
+static int btree_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
{
/*
* we can't safely write a btree page from here,
* we haven't done the locking hook
*/
- if (PageDirty(page))
+ if (folio_test_dirty(src))
return -EAGAIN;
/*
* Buffers may be managed in a filesystem specific way.
* We must have no buffers or drop them.
*/
- if (page_has_private(page) &&
- !try_to_release_page(page, GFP_KERNEL))
+ if (folio_get_private(src) &&
+ !filemap_release_folio(src, GFP_KERNEL))
return -EAGAIN;
- return migrate_page(mapping, newpage, page, mode);
+ return migrate_folio(mapping, dst, src, mode);
}
+#else
+#define btree_migrate_folio NULL
#endif
-
static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -1005,12 +826,12 @@ static int btree_writepages(struct address_space *mapping,
return btree_write_cache_pages(mapping, wbc);
}
-static int btree_releasepage(struct page *page, gfp_t gfp_flags)
+static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- if (PageWriteback(page) || PageDirty(page))
- return 0;
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return false;
- return try_release_extent_buffer(page);
+ return try_release_extent_buffer(&folio->page);
}
static void btree_invalidate_folio(struct folio *folio, size_t offset,
@@ -1019,7 +840,7 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset,
struct extent_io_tree *tree;
tree = &BTRFS_I(folio->mapping->host)->io_tree;
extent_invalidate_folio(tree, folio, offset);
- btree_releasepage(&folio->page, GFP_NOFS);
+ btree_release_folio(folio, GFP_NOFS);
if (folio_get_private(folio)) {
btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
"folio private not zero on folio %llu",
@@ -1080,12 +901,10 @@ static bool btree_dirty_folio(struct address_space *mapping,
static const struct address_space_operations btree_aops = {
.writepages = btree_writepages,
- .releasepage = btree_releasepage,
+ .release_folio = btree_release_folio,
.invalidate_folio = btree_invalidate_folio,
-#ifdef CONFIG_MIGRATION
- .migratepage = btree_migratepage,
-#endif
- .dirty_folio = btree_dirty_folio,
+ .migrate_folio = btree_migrate_folio,
+ .dirty_folio = btree_dirty_folio,
};
struct extent_buffer *btrfs_find_create_tree_block(
@@ -1118,12 +937,15 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
if (IS_ERR(buf))
return buf;
- ret = btree_read_extent_buffer_pages(buf, parent_transid,
- level, first_key);
+ ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
if (ret) {
free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
+ if (btrfs_check_eb_owner(buf, owner_root)) {
+ free_extent_buffer_stale(buf);
+ return ERR_PTR(-EUCLEAN);
+ }
return buf;
}
@@ -1563,6 +1385,23 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
ret = -EIO;
goto fail;
}
+
+ /*
+ * For real fs, and not log/reloc trees, root owner must
+ * match its root node owner
+ */
+ if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
+ root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+ root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ root->root_key.objectid != btrfs_header_owner(root->node)) {
+ btrfs_crit(fs_info,
+"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
+ root->root_key.objectid, root->node->start,
+ btrfs_header_owner(root->node),
+ root->root_key.objectid);
+ ret = -EUCLEAN;
+ goto fail;
+ }
root->commit_root = btrfs_root_node(root);
return root;
fail:
@@ -1682,6 +1521,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
if (objectid == BTRFS_UUID_TREE_OBJECTID)
return btrfs_grab_root(fs_info->uuid_root) ?
fs_info->uuid_root : ERR_PTR(-ENOENT);
+ if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID)
+ return btrfs_grab_root(fs_info->block_group_root) ?
+ fs_info->block_group_root : ERR_PTR(-ENOENT);
if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
struct btrfs_root *root = btrfs_global_root(fs_info, &key);
@@ -1850,16 +1692,17 @@ again:
ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
- btrfs_put_root(root);
- if (ret == -EEXIST)
+ if (ret == -EEXIST) {
+ btrfs_put_root(root);
goto again;
+ }
goto fail;
}
return root;
fail:
/*
* If our caller provided us an anonymous device, then it's his
- * responsability to free it in case we fail. So we have to set our
+ * responsibility to free it in case we fail. So we have to set our
* root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
* and once again by our caller.
*/
@@ -1942,28 +1785,9 @@ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
return root;
}
-/*
- * called by the kthread helper functions to finally call the bio end_io
- * functions. This is where read checksum verification actually happens
- */
-static void end_workqueue_fn(struct btrfs_work *work)
-{
- struct bio *bio;
- struct btrfs_end_io_wq *end_io_wq;
-
- end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
- bio = end_io_wq->bio;
-
- bio->bi_status = end_io_wq->status;
- bio->bi_private = end_io_wq->private;
- bio->bi_end_io = end_io_wq->end_io;
- bio_endio(bio);
- kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
-}
-
static int cleaner_kthread(void *arg)
{
- struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg;
+ struct btrfs_fs_info *fs_info = arg;
int again;
while (1) {
@@ -2156,14 +1980,7 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_chunk_root_level(root_backup,
btrfs_header_level(info->chunk_root->node));
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
- btrfs_set_backup_block_group_root(root_backup,
- info->block_group_root->node->start);
- btrfs_set_backup_block_group_root_gen(root_backup,
- btrfs_header_generation(info->block_group_root->node));
- btrfs_set_backup_block_group_root_level(root_backup,
- btrfs_header_level(info->block_group_root->node));
- } else {
+ if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
@@ -2265,10 +2082,16 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
btrfs_destroy_workqueue(fs_info->fixup_workers);
btrfs_destroy_workqueue(fs_info->delalloc_workers);
+ btrfs_destroy_workqueue(fs_info->hipri_workers);
btrfs_destroy_workqueue(fs_info->workers);
- btrfs_destroy_workqueue(fs_info->endio_workers);
- btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
- btrfs_destroy_workqueue(fs_info->rmw_workers);
+ if (fs_info->endio_workers)
+ destroy_workqueue(fs_info->endio_workers);
+ if (fs_info->endio_raid56_workers)
+ destroy_workqueue(fs_info->endio_raid56_workers);
+ if (fs_info->rmw_workers)
+ destroy_workqueue(fs_info->rmw_workers);
+ if (fs_info->compressed_write_workers)
+ destroy_workqueue(fs_info->compressed_write_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
@@ -2282,8 +2105,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
* the queues used for metadata I/O, since tasks from those other work
* queues can do metadata I/O operations.
*/
- btrfs_destroy_workqueue(fs_info->endio_meta_workers);
- btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+ if (fs_info->endio_meta_workers)
+ destroy_workqueue(fs_info->endio_meta_workers);
}
static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2395,6 +2218,8 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
{
struct inode *inode = fs_info->btree_inode;
+ unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
+ fs_info->tree_root);
inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
set_nlink(inode, 1);
@@ -2408,14 +2233,15 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_BTREE_INODE_IO, inode);
- BTRFS_I(inode)->io_tree.track_uptodate = false;
+ IO_TREE_BTREE_INODE_IO, NULL);
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
- memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
+ BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID;
+ BTRFS_I(inode)->location.type = 0;
+ BTRFS_I(inode)->location.offset = 0;
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
- btrfs_insert_inode_hash(inode);
+ __insert_inode_hash(inode, hash);
}
static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
@@ -2434,6 +2260,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->qgroup_seq = 1;
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_rescan_running = false;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
mutex_init(&fs_info->qgroup_rescan_lock);
}
@@ -2443,7 +2270,9 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
fs_info->workers =
- btrfs_alloc_workqueue(fs_info, "worker",
+ btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
+ fs_info->hipri_workers =
+ btrfs_alloc_workqueue(fs_info, "worker-high",
flags | WQ_HIGHPRI, max_active, 16);
fs_info->delalloc_workers =
@@ -2460,26 +2289,18 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->fixup_workers =
btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
- /*
- * endios are largely parallel and should have a very
- * low idle thresh
- */
fs_info->endio_workers =
- btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
+ alloc_workqueue("btrfs-endio", flags, max_active);
fs_info->endio_meta_workers =
- btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
- max_active, 4);
- fs_info->endio_meta_write_workers =
- btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
- max_active, 2);
+ alloc_workqueue("btrfs-endio-meta", flags, max_active);
fs_info->endio_raid56_workers =
- btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
- max_active, 4);
- fs_info->rmw_workers =
- btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
+ alloc_workqueue("btrfs-endio-raid56", flags, max_active);
+ fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
max_active, 2);
+ fs_info->compressed_write_workers =
+ alloc_workqueue("btrfs-compressed-write", flags, max_active);
fs_info->endio_freespace_worker =
btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
max_active, 0);
@@ -2491,10 +2312,10 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->discard_ctl.discard_workers =
alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
- if (!(fs_info->workers && fs_info->delalloc_workers &&
- fs_info->flush_workers &&
+ if (!(fs_info->workers && fs_info->hipri_workers &&
+ fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
- fs_info->endio_meta_write_workers &&
+ fs_info->compressed_write_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->fixup_workers &&
@@ -2521,6 +2342,9 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
fs_info->csum_shash = csum_shash;
+ btrfs_info(fs_info, "using %s (%s) checksum algorithm",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(csum_shash));
return 0;
}
@@ -2700,10 +2524,24 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
if (ret)
return ret;
- location.objectid = BTRFS_DEV_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = 0;
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
+ location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->block_group_root = root;
+ }
+ }
+
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
@@ -2771,8 +2609,8 @@ out:
* 1, 2 2nd and 3rd backup copy
* -1 skip bytenr check
*/
-static int validate_super(struct btrfs_fs_info *fs_info,
- struct btrfs_super_block *sb, int mirror_num)
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num)
{
u64 nodesize = btrfs_super_nodesize(sb);
u64 sectorsize = btrfs_super_sectorsize(sb);
@@ -2814,12 +2652,14 @@ static int validate_super(struct btrfs_fs_info *fs_info,
}
/*
- * For 4K page size, we only support 4K sector size.
- * For 64K page size, we support 64K and 4K sector sizes.
+ * We only support at most two sectorsizes: 4K and PAGE_SIZE.
+ *
+ * We can support 16K sectorsize with 64K page size without problem,
+ * but such sectorsize/pagesize combination doesn't make much sense.
+ * 4K will be our future standard, while PAGE_SIZE has been supported
+ * from the very beginning.
*/
- if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
- (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
- sectorsize != SZ_64K))) {
+ if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
btrfs_err(fs_info,
"sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
@@ -2872,6 +2712,18 @@ static int validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ /*
+ * Artificial requirement for block-group-tree to force newer features
+ * (free-space-tree, no-holes) so the test matrix is smaller.
+ */
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+ (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
+ !btrfs_fs_incompat(fs_info, NO_HOLES))) {
+ btrfs_err(fs_info,
+ "block-group-tree feature requires fres-space-tree and no-holes");
+ ret = -EINVAL;
+ }
+
if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
@@ -2954,7 +2806,7 @@ static int validate_super(struct btrfs_fs_info *fs_info,
*/
static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
{
- return validate_super(fs_info, fs_info->super_copy, 0);
+ return btrfs_validate_super(fs_info, fs_info->super_copy, 0);
}
/*
@@ -2968,7 +2820,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
{
int ret;
- ret = validate_super(fs_info, sb, -1);
+ ret = btrfs_validate_super(fs_info, sb, -1);
if (ret < 0)
goto out;
if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
@@ -3029,17 +2881,7 @@ static int load_important_roots(struct btrfs_fs_info *fs_info)
btrfs_warn(fs_info, "couldn't read tree root");
return ret;
}
-
- if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
- return 0;
-
- bytenr = btrfs_super_block_group_root(sb);
- gen = btrfs_super_block_group_root_generation(sb);
- level = btrfs_super_block_group_root_level(sb);
- ret = load_super_root(fs_info->block_group_root, bytenr, gen, level);
- if (ret)
- btrfs_warn(fs_info, "couldn't read block group root");
- return ret;
+ return 0;
}
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
@@ -3051,16 +2893,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
int ret = 0;
int i;
- if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
- struct btrfs_root *root;
-
- root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID,
- GFP_KERNEL);
- if (!root)
- return -ENOMEM;
- fs_info->block_group_root = root;
- }
-
for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
if (handle_error) {
if (!IS_ERR(tree_root->node))
@@ -3156,8 +2988,22 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->zoned_meta_io_lock);
+ mutex_init(&fs_info->zoned_data_reloc_io_lock);
seqlock_init(&fs_info->profiles_lock);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
+ btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start,
+ BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
+ BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
+ BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
+ BTRFS_LOCKDEP_TRANS_COMPLETED);
+
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
@@ -3207,9 +3053,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
- spin_lock_init(&fs_info->block_group_cache_lock);
- fs_info->block_group_cache_tree = RB_ROOT;
- fs_info->first_logical_byte = (u64)-1;
+ rwlock_init(&fs_info->block_group_cache_lock);
+ fs_info->block_group_cache_tree = RB_ROOT_CACHED;
extent_io_tree_init(fs_info, &fs_info->excluded_extents,
IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
@@ -3244,6 +3089,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->sectorsize_bits = ilog2(4096);
fs_info->stripesize = 4096;
+ fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
+
spin_lock_init(&fs_info->swapfile_pins_lock);
fs_info->swapfile_pins = RB_ROOT;
@@ -3293,7 +3140,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
static int btrfs_uuid_rescan_kthread(void *data)
{
- struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
+ struct btrfs_fs_info *fs_info = data;
int ret;
/*
@@ -3446,6 +3293,112 @@ out:
return ret;
}
+/*
+ * Do various sanity and dependency checks of different features.
+ *
+ * This is the place for less strict checks (like for subpage or artificial
+ * feature dependencies).
+ *
+ * For strict checks or possible corruption detection, see
+ * btrfs_validate_super().
+ *
+ * This should be called after btrfs_parse_options(), as some mount options
+ * (space cache related) can modify the on-disk format (e.g. the free space
+ * tree) and break certain feature dependencies.
+ */
+int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb)
+{
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ u64 incompat = btrfs_super_incompat_flags(disk_super);
+ const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
+ const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);
+
+ if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+ btrfs_err(fs_info,
+ "cannot mount because of unknown incompat features (0x%llx)",
+ incompat);
+ return -EINVAL;
+ }
+
+ /* Runtime limitation for mixed block groups. */
+ if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ (fs_info->sectorsize != fs_info->nodesize)) {
+ btrfs_err(fs_info,
+"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
+ fs_info->nodesize, fs_info->sectorsize);
+ return -EINVAL;
+ }
+
+ /* Mixed backref is an always-enabled feature. */
+ incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+
+ /* Set compression related flags just in case. */
+ if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
+ incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+ else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
+ incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
+
+ /*
+ * An ancient flag, which should really be marked deprecated.
+ * Such a runtime limitation doesn't really need an incompat flag.
+ */
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
+ incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+
+ if (compat_ro_unsupp && !sb_rdonly(sb)) {
+ btrfs_err(fs_info,
+ "cannot mount read-write because of unknown compat_ro features (0x%llx)",
+ compat_ro);
+ return -EINVAL;
+ }
+
+ /*
+ * If we have unsupported RO compat features, then even when mounted
+ * read-only we must not cause any metadata writes, including log
+ * replay, or we could corrupt whatever the new feature requires.
+ */
+ if (compat_ro_unsupp && btrfs_super_log_root(disk_super) &&
+ !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
+ btrfs_err(fs_info,
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
+ compat_ro);
+ return -EINVAL;
+ }
+
+ /*
+ * Artificial limitations for block group tree, to force
+ * block-group-tree to rely on no-holes and free-space-tree.
+ */
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+ (!btrfs_fs_incompat(fs_info, NO_HOLES) ||
+ !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
+ btrfs_err(fs_info,
+"block-group-tree feature requires no-holes and free-space-tree features");
+ return -EINVAL;
+ }
+
+ /*
+ * Subpage runtime limitation on v1 cache.
+ *
+ * V1 space cache still has some hardcoded PAGE_SIZE usage, and since
+ * we already default to the v2 cache there is no need to bother with
+ * v1, as it's going to be deprecated anyway.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ btrfs_warn(fs_info,
+ "v1 space cache is not supported for page size %lu with sectorsize %u",
+ PAGE_SIZE, fs_info->sectorsize);
+ return -EINVAL;
+ }
+
+ /* This can be called by remount, we need to protect the super block. */
+ spin_lock(&fs_info->super_lock);
+ btrfs_set_super_incompat_flags(disk_super, incompat);
+ spin_unlock(&fs_info->super_lock);
+
+ return 0;
+}
+
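The unknown-feature checks above all reduce to masking the on-disk flag word
against the compile-time supported set. As a minimal standalone sketch of that
pattern (the feature bits below are invented for illustration, not the real
btrfs definitions):

	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical feature bits, for illustration only. */
	#define FEAT_NO_HOLES        (1ULL << 0)
	#define FEAT_FREE_SPACE_TREE (1ULL << 1)
	#define FEAT_SUPP            (FEAT_NO_HOLES | FEAT_FREE_SPACE_TREE)

	static int check_incompat(uint64_t on_disk)
	{
		/* Any bit outside the supported mask means a newer format. */
		uint64_t unknown = on_disk & ~FEAT_SUPP;

		if (unknown) {
			fprintf(stderr, "unknown incompat features (0x%llx)\n",
				(unsigned long long)unknown);
			return -1; /* refuse to mount */
		}
		return 0;
	}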
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
char *options)
{
@@ -3575,16 +3528,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
- /*
- * Flag our filesystem as having big metadata blocks if they are bigger
- * than the page size.
- */
- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
- if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
- btrfs_info(fs_info,
- "flagging fs with big metadata feature");
- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
- }
/* Set up fs_info before parsing mount options */
nodesize = btrfs_super_nodesize(disk_super);
@@ -3605,68 +3548,29 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
- features = btrfs_super_incompat_flags(disk_super) &
- ~BTRFS_FEATURE_INCOMPAT_SUPP;
- if (features) {
- btrfs_err(fs_info,
- "cannot mount because of unsupported optional features (%llx)",
- features);
- err = -EINVAL;
- goto fail_alloc;
- }
-
- features = btrfs_super_incompat_flags(disk_super);
- features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
- if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
- features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
- else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
- features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
-
- if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
- btrfs_info(fs_info, "has skinny extents");
-
- /*
- * mixed block groups end up with duplicate but slightly offset
- * extent buffers for the same range. It leads to corruptions
- */
- if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
- (sectorsize != nodesize)) {
- btrfs_err(fs_info,
-"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
- nodesize, sectorsize);
- goto fail_alloc;
- }
-
- /*
- * Needn't use the lock because there is no other task which will
- * update the flag.
- */
- btrfs_set_super_incompat_flags(disk_super, features);
-
- features = btrfs_super_compat_ro_flags(disk_super) &
- ~BTRFS_FEATURE_COMPAT_RO_SUPP;
- if (!sb_rdonly(sb) && features) {
- btrfs_err(fs_info,
- "cannot mount read-write because of unsupported optional features (%llx)",
- features);
- err = -EINVAL;
+ ret = btrfs_check_features(fs_info, sb);
+ if (ret < 0) {
+ err = ret;
goto fail_alloc;
}
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
+ /*
+ * V1 space cache has some hardcoded PAGE_SIZE usage, and is
+ * going to be deprecated.
+ *
+ * Force the v2 cache for the subpage case.
+ */
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
+ "forcing free space tree for sector size %u with page size %lu",
+ sectorsize, PAGE_SIZE);
+
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
- if (btrfs_super_incompat_flags(fs_info->super_copy) &
- BTRFS_FEATURE_INCOMPAT_RAID56) {
- btrfs_err(fs_info,
- "RAID56 is not yet supported for sector size %u with page size %lu",
- sectorsize, PAGE_SIZE);
- err = -EINVAL;
- goto fail_alloc;
- }
subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
if (!subpage_info)
goto fail_alloc;
@@ -3989,7 +3893,7 @@ static void btrfs_end_super_write(struct bio *bio)
}
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num)
+ int copy_num, bool drop_cache)
{
struct btrfs_super_block *super;
struct page *page;
@@ -4007,6 +3911,19 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
+ if (drop_cache) {
+ /* This should only be called with the primary sb. */
+ ASSERT(copy_num == 0);
+
+ /*
+ * Drop the page of the primary superblock, so a later read will
+ * always come from the device.
+ */
+ invalidate_inode_pages2_range(mapping,
+ bytenr >> PAGE_SHIFT,
+ (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
+ }
+
page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
if (IS_ERR(page))
return ERR_CAST(page);
@@ -4038,7 +3955,7 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
for (i = 0; i < 1; i++) {
- super = btrfs_read_dev_one_super(bdev, i);
+ super = btrfs_read_dev_one_super(bdev, i, false);
if (IS_ERR(super))
continue;
@@ -4144,7 +4061,8 @@ static int write_dev_supers(struct btrfs_device *device,
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
bio->bi_opf |= REQ_FUA;
- btrfsic_submit_bio(bio);
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
if (btrfs_advance_sb_log(device, i))
errors++;
@@ -4225,6 +4143,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
*/
static void btrfs_end_empty_barrier(struct bio *bio)
{
+ bio_uninit(bio);
complete(bio->bi_private);
}
@@ -4234,7 +4153,7 @@ static void btrfs_end_empty_barrier(struct bio *bio)
*/
static void write_dev_flush(struct btrfs_device *device)
{
- struct bio *bio = device->flush_bio;
+ struct bio *bio = &device->flush_bio;
#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
/*
@@ -4247,17 +4166,18 @@ static void write_dev_flush(struct btrfs_device *device)
* of simplicity, since this is a debug tool and not meant for use in
* non-debug builds.
*/
- struct request_queue *q = bdev_get_queue(device->bdev);
- if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+ if (!bdev_write_cache(device->bdev))
return;
#endif
- bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
+ bio_init(bio, device->bdev, NULL, 0,
+ REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
bio->bi_end_io = btrfs_end_empty_barrier;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
- btrfsic_submit_bio(bio);
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}
@@ -4266,7 +4186,7 @@ static void write_dev_flush(struct btrfs_device *device)
*/
static blk_status_t wait_dev_flush(struct btrfs_device *device)
{
- struct bio *bio = device->flush_bio;
+ struct bio *bio = &device->flush_bio;
if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
return BLK_STS_OK;
@@ -4626,6 +4546,28 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
int ret;
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
+
+ /*
+ * If we had UNFINISHED_DROPS we could still be processing them, so
+ * clear that bit and wake up relocation so it can stop.
+ * We must do this before stopping the block group reclaim task, because
+ * at btrfs_relocate_block_group() we wait for this bit, and after the
+ * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
+ * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
+ * return 1.
+ */
+ btrfs_wake_unfinished_drop(fs_info);
+
+ /*
+ * We may have the reclaim task running and relocating a data block group,
+ * in which case it may create delayed iputs. So stop it before we park
+ * the cleaner kthread, otherwise we can get new delayed iputs after
+ * parking the cleaner, and that can make the async reclaim task hang
+ * if it's waiting for delayed iputs to complete, since the cleaner is
+ * parked and cannot run delayed iputs - this will make us hang when
+ * trying to stop the async reclaim task.
+ */
+ cancel_work_sync(&fs_info->reclaim_bgs_work);
/*
* We don't want the cleaner to start new transactions, add more delayed
* iputs, etc. while we're closing. We can't use kthread_stop() yet
@@ -4634,12 +4576,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*/
kthread_park(fs_info->cleaner_kthread);
- /*
- * If we had UNFINISHED_DROPS we could still be processing them, so
- * clear that bit and wake up relocation so it can stop.
- */
- btrfs_wake_unfinished_drop(fs_info);
-
/* wait for the qgroup rescan worker to stop */
btrfs_qgroup_wait_for_completion(fs_info, false);
@@ -4662,12 +4598,35 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
/* clear out the rbtree of defraggable inodes */
btrfs_cleanup_defrag_inodes(fs_info);
+ /*
+ * After we parked the cleaner kthread, ordered extents may have
+ * completed and created new delayed iputs. If one of the async reclaim
+ * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
+ * can hang forever trying to stop it, because if a delayed iput is
+ * added after it ran btrfs_run_delayed_iputs() and before it called
+ * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
+ * no one else to run iputs.
+ *
+ * So wait for all ongoing ordered extents to complete and then run
+ * delayed iputs. This works because once we reach this point no one
+ * can either create new ordered extents nor create delayed iputs
+ * through some other means.
+ *
+ * Also note that btrfs_wait_ordered_roots() is not safe here, because
+ * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
+ * but the delayed iput for the respective inode is made only when doing
+ * the final btrfs_put_ordered_extent() (which must happen at
+ * btrfs_finish_ordered_io() when we are unmounting).
+ */
+ btrfs_flush_workqueue(fs_info->endio_write_workers);
+ /* Ordered extents for free space inodes. */
+ btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+ btrfs_run_delayed_iputs(fs_info);
+
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
- cancel_work_sync(&fs_info->reclaim_bgs_work);
-
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4849,13 +4808,6 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
__btrfs_btree_balance_dirty(fs_info, 0);
}
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
- struct btrfs_key *first_key)
-{
- return btree_read_extent_buffer_pages(buf, parent_transid,
- level, first_key);
-}
-
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
/* cleanup FS via transaction */
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2e10514ecda8..c67c15d4d20b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -17,13 +17,6 @@
*/
#define BTRFS_BDEV_BLOCKSIZE (4096)
-enum btrfs_wq_endio_type {
- BTRFS_WQ_ENDIO_DATA,
- BTRFS_WQ_ENDIO_METADATA,
- BTRFS_WQ_ENDIO_FREE_SPACE,
- BTRFS_WQ_ENDIO_RAID56,
-};
-
static inline u64 btrfs_sb_offset(int mirror)
{
u64 start = SZ_16K;
@@ -53,10 +46,13 @@ int __cold open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+ struct btrfs_super_block *sb, int mirror_num);
+int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num);
+ int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
struct btrfs_key *key);
@@ -87,8 +83,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror);
-blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags);
+void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -111,7 +106,7 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
{
- if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
return fs_info->block_group_root;
return btrfs_extent_root(fs_info, 0);
}
@@ -120,14 +115,11 @@ void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
- struct btrfs_key *first_key);
-blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
- enum btrfs_wq_endio_type metadata);
-blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 dio_file_offset,
- extent_submit_bio_start_t *submit_bio_start);
+int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
+ int level, struct btrfs_key *first_key);
+bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
+ u64 dio_file_offset,
+ extent_submit_bio_start_t *submit_bio_start);
blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
@@ -147,17 +139,5 @@ int btree_lock_page_hook(struct page *page, void *data,
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
int btrfs_init_root_free_objectid(struct btrfs_root *root);
-int __init btrfs_end_io_wq_init(void);
-void __cold btrfs_end_io_wq_exit(void);
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void btrfs_set_buffer_lockdep_class(u64 objectid,
- struct extent_buffer *eb, int level);
-#else
-static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
- struct extent_buffer *eb, int level)
-{
-}
-#endif
#endif
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
new file mode 100644
index 000000000000..618275af19c4
--- /dev/null
+++ b/fs/btrfs/extent-io-tree.c
@@ -0,0 +1,1673 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/slab.h>
+#include <trace/events/btrfs.h>
+#include "ctree.h"
+#include "extent-io-tree.h"
+#include "btrfs_inode.h"
+#include "misc.h"
+
+static struct kmem_cache *extent_state_cache;
+
+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+ return !RB_EMPTY_NODE(&state->rb_node);
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static LIST_HEAD(states);
+static DEFINE_SPINLOCK(leak_lock);
+
+static inline void btrfs_leak_debug_add_state(struct extent_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_add(&state->leak_list, &states);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline void btrfs_leak_debug_del_state(struct extent_state *state)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&leak_lock, flags);
+ list_del(&state->leak_list);
+ spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline void btrfs_extent_state_leak_debug_check(void)
+{
+ struct extent_state *state;
+
+ while (!list_empty(&states)) {
+ state = list_entry(states.next, struct extent_state, leak_list);
+ pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
+ state->start, state->end, state->state,
+ extent_state_in_tree(state),
+ refcount_read(&state->refs));
+ list_del(&state->leak_list);
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+#define btrfs_debug_check_extent_io_range(tree, start, end) \
+ __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+ struct extent_io_tree *tree,
+ u64 start, u64 end)
+{
+ struct inode *inode = tree->private_data;
+ u64 isize;
+
+ if (!inode)
+ return;
+
+ isize = i_size_read(inode);
+ if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+ btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+ "%s: ino %llu isize %llu odd range [%llu,%llu]",
+ caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+ }
+}
+#else
+#define btrfs_leak_debug_add_state(state) do {} while (0)
+#define btrfs_leak_debug_del_state(state) do {} while (0)
+#define btrfs_extent_state_leak_debug_check() do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
+#endif
+
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we look up
+ * and update the disk_i_size, but lockdep will complain because for the
+ * io_tree we hold the tree lock and then take the inode lock when setting
+ * delalloc. These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
+struct tree_entry {
+ u64 start;
+ u64 end;
+ struct rb_node rb_node;
+};
+
+void extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner,
+ void *private_data)
+{
+ tree->fs_info = fs_info;
+ tree->state = RB_ROOT;
+ spin_lock_init(&tree->lock);
+ tree->private_data = private_data;
+ tree->owner = owner;
+ if (owner == IO_TREE_INODE_FILE_EXTENT)
+ lockdep_set_class(&tree->lock, &file_extent_tree_class);
+}
+
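As a usage sketch (not part of the patch), the typical lifecycle of an io tree
pairs this init with bit updates and a final release. set_extent_bit() is
defined later in this file and clear_extent_bit() is the wrapper used by
try_lock_extent() below; the owner constant and range are illustrative:

	struct extent_io_tree tree;

	extent_io_tree_init(fs_info, &tree, IO_TREE_FS_EXCLUDED_EXTENTS, NULL);

	/* Mark a range dirty, then drop the bit again (range is illustrative). */
	set_extent_bit(&tree, 0, SZ_64K - 1, EXTENT_DIRTY, NULL, GFP_NOFS);
	clear_extent_bit(&tree, 0, SZ_64K - 1, EXTENT_DIRTY, NULL);

	/* Frees any extent_state records still in the tree. */
	extent_io_tree_release(&tree);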
+void extent_io_tree_release(struct extent_io_tree *tree)
+{
+ spin_lock(&tree->lock);
+ /*
+ * Do a single barrier for the waitqueue_active check here, the state
+ * of the waitqueue should not change once extent_io_tree_release is
+ * called.
+ */
+ smp_mb();
+ while (!RB_EMPTY_ROOT(&tree->state)) {
+ struct rb_node *node;
+ struct extent_state *state;
+
+ node = rb_first(&tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ /*
+ * btree io trees aren't supposed to have tasks waiting for
+ * changes in the flags of extent states ever.
+ */
+ ASSERT(!waitqueue_active(&state->wq));
+ free_extent_state(state);
+
+ cond_resched_lock(&tree->lock);
+ }
+ spin_unlock(&tree->lock);
+}
+
+static struct extent_state *alloc_extent_state(gfp_t mask)
+{
+ struct extent_state *state;
+
+ /*
+ * The given mask might not be appropriate for the slab allocator,
+ * drop the unsupported bits.
+ */
+ mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
+ state = kmem_cache_alloc(extent_state_cache, mask);
+ if (!state)
+ return state;
+ state->state = 0;
+ RB_CLEAR_NODE(&state->rb_node);
+ btrfs_leak_debug_add_state(state);
+ refcount_set(&state->refs, 1);
+ init_waitqueue_head(&state->wq);
+ trace_alloc_extent_state(state, mask, _RET_IP_);
+ return state;
+}
+
+static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
+{
+ if (!prealloc)
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+
+ return prealloc;
+}
+
+void free_extent_state(struct extent_state *state)
+{
+ if (!state)
+ return;
+ if (refcount_dec_and_test(&state->refs)) {
+ WARN_ON(extent_state_in_tree(state));
+ btrfs_leak_debug_del_state(state);
+ trace_free_extent_state(state, _RET_IP_);
+ kmem_cache_free(extent_state_cache, state);
+ }
+}
+
+static int add_extent_changeset(struct extent_state *state, u32 bits,
+ struct extent_changeset *changeset,
+ int set)
+{
+ int ret;
+
+ if (!changeset)
+ return 0;
+ if (set && (state->state & bits) == bits)
+ return 0;
+ if (!set && (state->state & bits) == 0)
+ return 0;
+ changeset->bytes_changed += state->end - state->start + 1;
+ ret = ulist_add(&changeset->range_changed, state->start, state->end,
+ GFP_ATOMIC);
+ return ret;
+}
+
+static inline struct extent_state *next_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_next(&state->rb_node);
+
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
+static inline struct extent_state *prev_state(struct extent_state *state)
+{
+ struct rb_node *next = rb_prev(&state->rb_node);
+
+ if (next)
+ return rb_entry(next, struct extent_state, rb_node);
+ else
+ return NULL;
+}
+
+/*
+ * Search @tree for an entry that contains @offset. Such entry would have
+ * entry->start <= offset && entry->end >= offset.
+ *
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @node_ret: pointer where new node should be anchored (used when inserting an
+ * entry in the tree)
+ * @parent_ret: points to the entry which would have been the parent of the
+ * entry containing @offset
+ *
+ * Return a pointer to the entry that contains the @offset byte address, and
+ * don't change @node_ret and @parent_ret.
+ *
+ * If no such entry exists, return a pointer to the first entry that starts
+ * after @offset (or NULL if no entry does) and fill @node_ret and @parent_ret.
+ */
+static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
+ u64 offset,
+ struct rb_node ***node_ret,
+ struct rb_node **parent_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node **node = &root->rb_node;
+ struct rb_node *prev = NULL;
+ struct extent_state *entry = NULL;
+
+ while (*node) {
+ prev = *node;
+ entry = rb_entry(prev, struct extent_state, rb_node);
+
+ if (offset < entry->start)
+ node = &(*node)->rb_left;
+ else if (offset > entry->end)
+ node = &(*node)->rb_right;
+ else
+ return entry;
+ }
+
+ if (node_ret)
+ *node_ret = node;
+ if (parent_ret)
+ *parent_ret = prev;
+
+ /* Search neighbors until we find the first one past the end */
+ while (entry && offset > entry->end)
+ entry = next_state(entry);
+
+ return entry;
+}
+
+/*
+ * Search offset in the tree or fill neighbor rbtree node pointers.
+ *
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @next_ret: pointer to the first entry whose range ends after @offset
+ * @prev_ret: pointer to the first entry whose range begins before @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address. If no
+ * such entry exists, then return NULL and fill @prev_ret and @next_ret.
+ * Otherwise return the found entry and other pointers are left untouched.
+ */
+static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
+ u64 offset,
+ struct extent_state **prev_ret,
+ struct extent_state **next_ret)
+{
+ struct rb_root *root = &tree->state;
+ struct rb_node **node = &root->rb_node;
+ struct extent_state *orig_prev;
+ struct extent_state *entry = NULL;
+
+ ASSERT(prev_ret);
+ ASSERT(next_ret);
+
+ while (*node) {
+ entry = rb_entry(*node, struct extent_state, rb_node);
+
+ if (offset < entry->start)
+ node = &(*node)->rb_left;
+ else if (offset > entry->end)
+ node = &(*node)->rb_right;
+ else
+ return entry;
+ }
+
+ orig_prev = entry;
+ while (entry && offset > entry->end)
+ entry = next_state(entry);
+ *next_ret = entry;
+ entry = orig_prev;
+
+ while (entry && offset < entry->start)
+ entry = prev_state(entry);
+ *prev_ret = entry;
+
+ return NULL;
+}
+
+/*
+ * Inexact rb-tree search, return the next entry if @offset is not found
+ */
+static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
+{
+ return tree_search_for_insert(tree, offset, NULL, NULL);
+}
+
+static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+ btrfs_panic(tree->fs_info, err,
+ "locking error: extent tree was modified by another thread while locked");
+}
+
+/*
+ * Utility function to look for merge candidates inside a given range. Any
+ * extents with matching state are merged together into a single extent in the
+ * tree. Extents with EXTENT_LOCKED or EXTENT_BOUNDARY set are not merged
+ * because the end_io handlers need to be able to do operations on them without
+ * sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ */
+static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+ struct extent_state *other;
+
+ if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ return;
+
+ other = prev_state(state);
+ if (other && other->end == state->start - 1 &&
+ other->state == state->state) {
+ if (tree->private_data)
+ btrfs_merge_delalloc_extent(tree->private_data,
+ state, other);
+ state->start = other->start;
+ rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
+ free_extent_state(other);
+ }
+ other = next_state(state);
+ if (other && other->start == state->end + 1 &&
+ other->state == state->state) {
+ if (tree->private_data)
+ btrfs_merge_delalloc_extent(tree->private_data, state,
+ other);
+ state->end = other->end;
+ rb_erase(&other->rb_node, &tree->state);
+ RB_CLEAR_NODE(&other->rb_node);
+ free_extent_state(other);
+ }
+}
+
+static void set_state_bits(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, struct extent_changeset *changeset)
+{
+ u32 bits_to_set = bits & ~EXTENT_CTLBITS;
+ int ret;
+
+ if (tree->private_data)
+ btrfs_set_delalloc_extent(tree->private_data, state, bits);
+
+ ret = add_extent_changeset(state, bits_to_set, changeset, 1);
+ BUG_ON(ret < 0);
+ state->state |= bits_to_set;
+}
+
+/*
+ * Insert an extent_state struct into the tree. 'bits' are set on the
+ * struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed.
+ *
+ * The tree lock is not taken internally. This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, struct extent_changeset *changeset)
+{
+ struct rb_node **node;
+ struct rb_node *parent;
+ const u64 end = state->end;
+
+ set_state_bits(tree, state, bits, changeset);
+
+ node = &tree->state.rb_node;
+ while (*node) {
+ struct extent_state *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct extent_state, rb_node);
+
+ if (end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ btrfs_err(tree->fs_info,
+ "found node %llu %llu on insert of %llu %llu",
+ entry->start, entry->end, state->start, end);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+
+ merge_state(tree, state);
+ return 0;
+}
+
+/*
+ * Insert state to @tree to the location given by @node and @parent.
+ */
+static void insert_state_fast(struct extent_io_tree *tree,
+ struct extent_state *state, struct rb_node **node,
+ struct rb_node *parent, unsigned bits,
+ struct extent_changeset *changeset)
+{
+ set_state_bits(tree, state, bits, changeset);
+ rb_link_node(&state->rb_node, parent, node);
+ rb_insert_color(&state->rb_node, &tree->state);
+ merge_state(tree, state);
+}
+
+/*
+ * Split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half. 'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling, the tree has 'orig' at [orig->start, orig->end]. After
+ * calling, there are two extent state structs in the tree:
+ *   prealloc: [orig->start, split - 1]
+ *   orig:     [split, orig->end]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+ struct extent_state *prealloc, u64 split)
+{
+ struct rb_node *parent = NULL;
+ struct rb_node **node;
+
+ if (tree->private_data)
+ btrfs_split_delalloc_extent(tree->private_data, orig, split);
+
+ prealloc->start = orig->start;
+ prealloc->end = split - 1;
+ prealloc->state = orig->state;
+ orig->start = split;
+
+ parent = &orig->rb_node;
+ node = &parent;
+ while (*node) {
+ struct extent_state *entry;
+
+ parent = *node;
+ entry = rb_entry(parent, struct extent_state, rb_node);
+
+ if (prealloc->end < entry->start) {
+ node = &(*node)->rb_left;
+ } else if (prealloc->end > entry->end) {
+ node = &(*node)->rb_right;
+ } else {
+ free_extent_state(prealloc);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&prealloc->rb_node, parent, node);
+ rb_insert_color(&prealloc->rb_node, &tree->state);
+
+ return 0;
+}
+
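To make the split invariants concrete, a hedged sketch of a single split;
split_state() is an internal static helper, so this is illustrative only, and
assumes the caller holds tree->lock with the state preallocated:

	/* Suppose 'orig' covers [0, 8191] and we split at 4096. */
	prealloc = alloc_extent_state(GFP_NOFS); /* done before taking the lock */

	spin_lock(&tree->lock);
	split_state(tree, orig, prealloc, 4096);
	/*
	 * Now prealloc covers [0, 4095] and orig covers [4096, 8191]; both
	 * are linked in the rb-tree and carry the same state bits as before.
	 */
	spin_unlock(&tree->lock);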
+/*
+ * Utility function to clear some bits in an extent state struct. It will
+ * optionally wake up anyone waiting on this state (wake == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
+ struct extent_state *state,
+ u32 bits, int wake,
+ struct extent_changeset *changeset)
+{
+ struct extent_state *next;
+ u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
+ int ret;
+
+ if (tree->private_data)
+ btrfs_clear_delalloc_extent(tree->private_data, state, bits);
+
+ ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
+ BUG_ON(ret < 0);
+ state->state &= ~bits_to_clear;
+ if (wake)
+ wake_up(&state->wq);
+ if (state->state == 0) {
+ next = next_state(state);
+ if (extent_state_in_tree(state)) {
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ free_extent_state(state);
+ } else {
+ WARN_ON(1);
+ }
+ } else {
+ merge_state(tree, state);
+ next = next_state(state);
+ }
+ return next;
+}
+
+/*
+ * Clear some bits on a range in the tree. This may require splitting or
+ * inserting elements in the tree, so the gfp mask is used to indicate which
+ * allocations or sleeping are allowed.
+ *
+ * Waiters are woken if @bits includes EXTENT_LOCKED, and passing
+ * EXTENT_CLEAR_ALL_BITS removes the given range from the tree regardless of
+ * state (ie. for truncate).
+ *
+ * The range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
+ */
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state,
+ gfp_t mask, struct extent_changeset *changeset)
+{
+ struct extent_state *state;
+ struct extent_state *cached;
+ struct extent_state *prealloc = NULL;
+ u64 last_end;
+ int err;
+ int clear = 0;
+ int wake;
+ int delete = (bits & EXTENT_CLEAR_ALL_BITS);
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
+
+ if (delete)
+ bits |= ~EXTENT_CTLBITS;
+
+ if (bits & EXTENT_DELALLOC)
+ bits |= EXTENT_NORESERVE;
+
+ wake = (bits & EXTENT_LOCKED) ? 1 : 0;
+ if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ clear = 1;
+again:
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if the tree only has extent states that cover our
+ * input range and don't extend beyond it.
+ * If we end up needing a new extent state we allocate it later.
+ */
+ prealloc = alloc_extent_state(mask);
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state) {
+ cached = *cached_state;
+
+ if (clear) {
+ *cached_state = NULL;
+ cached_state = NULL;
+ }
+
+ if (cached && extent_state_in_tree(cached) &&
+ cached->start <= start && cached->end > start) {
+ if (clear)
+ refcount_dec(&cached->refs);
+ state = cached;
+ goto hit_next;
+ }
+ if (clear)
+ free_extent_state(cached);
+ }
+
+ /* This search will find the extents that end after our range starts. */
+ state = tree_search(tree, start);
+ if (!state)
+ goto out;
+hit_next:
+ if (state->start > end)
+ goto out;
+ WARN_ON(state->end < start);
+ last_end = state->end;
+
+ /* The state doesn't have the wanted bits, go ahead. */
+ if (!(state->state & bits)) {
+ state = next_state(state);
+ goto next;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state | or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we clear the desired bit
+ * on it.
+ */
+
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ state = clear_state_bit(tree, state, bits, wake, changeset);
+ goto next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * We need to split the extent, and clear the bit on the first half.
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ if (wake)
+ wake_up(&state->wq);
+
+ clear_state_bit(tree, prealloc, bits, wake, changeset);
+
+ prealloc = NULL;
+ goto out;
+ }
+
+ state = clear_state_bit(tree, state, bits, wake, changeset);
+next:
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start <= end && state && !need_resched())
+ goto hit_next;
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (gfpflags_allow_blocking(mask))
+ cond_resched();
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return 0;
+
+}
+
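A hedged example of the 'delete' path described above: a truncate-style caller
can drop every record in a range by including EXTENT_CLEAR_ALL_BITS (the
io_tree field is assumed from struct btrfs_inode; range values are the
caller's):

	/* Remove all state in [start, end], regardless of which bits are set. */
	__clear_extent_bit(&inode->io_tree, start, end,
			   EXTENT_LOCKED | EXTENT_CLEAR_ALL_BITS,
			   NULL, GFP_NOFS, NULL);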
+static void wait_on_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+ __releases(tree->lock)
+ __acquires(tree->lock)
+{
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&tree->lock);
+ schedule();
+ spin_lock(&tree->lock);
+ finish_wait(&state->wq, &wait);
+}
+
+/*
+ * Wait for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.
+ * The tree lock is taken by this function.
+ */
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits)
+{
+ struct extent_state *state;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+
+ spin_lock(&tree->lock);
+again:
+ while (1) {
+ /*
+ * This search will find all the extents that end after our
+ * range starts.
+ */
+ state = tree_search(tree, start);
+process_node:
+ if (!state)
+ break;
+ if (state->start > end)
+ goto out;
+
+ if (state->state & bits) {
+ start = state->start;
+ refcount_inc(&state->refs);
+ wait_on_state(tree, state);
+ free_extent_state(state);
+ goto again;
+ }
+ start = state->end + 1;
+
+ if (start > end)
+ break;
+
+ if (!cond_resched_lock(&tree->lock)) {
+ state = next_state(state);
+ goto process_node;
+ }
+ }
+out:
+ spin_unlock(&tree->lock);
+}
+
+static void cache_state_if_flags(struct extent_state *state,
+ struct extent_state **cached_ptr,
+ unsigned flags)
+{
+ if (cached_ptr && !(*cached_ptr)) {
+ if (!flags || (state->state & flags)) {
+ *cached_ptr = state;
+ refcount_inc(&state->refs);
+ }
+ }
+}
+
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr)
+{
+ return cache_state_if_flags(state, cached_ptr,
+ EXTENT_LOCKED | EXTENT_BOUNDARY);
+}
+
+/*
+ * Find the first state struct with 'bits' set after 'start', and return it.
+ * tree->lock must be held. NULL will be returned if nothing was found after
+ * 'start'.
+ */
+static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+ u64 start, u32 bits)
+{
+ struct extent_state *state;
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, start);
+ while (state) {
+ if (state->end >= start && (state->state & bits))
+ return state;
+ state = next_state(state);
+ }
+ return NULL;
+}
+
+/*
+ * Find the first offset in the io tree with one or more @bits set.
+ *
+ * Note: If there are multiple bits set in @bits, any of them will match.
+ *
+ * Return 0 if we find something, and update @start_ret and @end_ret.
+ * Return 1 if we found nothing.
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->end == start - 1 && extent_state_in_tree(state)) {
+ while ((state = next_state(state)) != NULL) {
+ if (state->state & bits)
+ goto got_it;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ goto out;
+ }
+ free_extent_state(*cached_state);
+ *cached_state = NULL;
+ }
+
+ state = find_first_extent_bit_state(tree, start, bits);
+got_it:
+ if (state) {
+ cache_state_if_flags(state, cached_state, 0);
+ *start_ret = state->start;
+ *end_ret = state->end;
+ ret = 0;
+ }
+out:
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
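The 0-on-found convention makes this helper a natural driver for a walk over
all matching ranges; a sketch of that loop (the bit choice is illustrative):

	u64 start = 0;
	u64 found_start, found_end;

	while (!find_first_extent_bit(tree, start, &found_start, &found_end,
				      EXTENT_DIRTY, NULL)) {
		/* Process the inclusive range [found_start, found_end]. */
		if (found_end == (u64)-1)
			break;	/* avoid wrapping past the last range */
		start = found_end + 1;
	}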
+/*
+ * Find a contiguous area of bits
+ *
+ * @tree: io tree to check
+ * @start: offset to start the search from
+ * @start_ret: the first offset we found with the bits set
+ * @end_ret: the final contiguous range of the bits that were set
+ * @bits: bits to look for
+ *
+ * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
+ * to set bits appropriately, and then merge them again. During this time it
+ * will drop the tree->lock, so use this helper if you want to find the actual
+ * contiguous area for given bits. We will search to the first bit we find, and
+ * then walk down the tree until we find a non-contiguous area. The area
+ * returned will be the full contiguous area with the bits set.
+ */
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, start, bits);
+ if (state) {
+ *start_ret = state->start;
+ *end_ret = state->end;
+ while ((state = next_state(state)) != NULL) {
+ if (state->start > (*end_ret + 1))
+ break;
+ *end_ret = state->end;
+ }
+ ret = 0;
+ }
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/*
+ * Find a contiguous range of bytes in the file marked as delalloc, not more
+ * than 'max_bytes'. 'start' and 'end' are used to return the range.
+ *
+ * True is returned if we find something, false if nothing was in the tree.
+ */
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+ u64 *end, u64 max_bytes,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ u64 cur_start = *start;
+ bool found = false;
+ u64 total_bytes = 0;
+
+ spin_lock(&tree->lock);
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, cur_start);
+ if (!state) {
+ *end = (u64)-1;
+ goto out;
+ }
+
+ while (state) {
+ if (found && (state->start != cur_start ||
+ (state->state & EXTENT_BOUNDARY))) {
+ goto out;
+ }
+ if (!(state->state & EXTENT_DELALLOC)) {
+ if (!found)
+ *end = state->end;
+ goto out;
+ }
+ if (!found) {
+ *start = state->start;
+ *cached_state = state;
+ refcount_inc(&state->refs);
+ }
+ found = true;
+ *end = state->end;
+ cur_start = state->end + 1;
+ total_bytes += state->end - state->start + 1;
+ if (total_bytes >= max_bytes)
+ break;
+ state = next_state(state);
+ }
+out:
+ spin_unlock(&tree->lock);
+ return found;
+}
+
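A hedged caller sketch: find up to BTRFS_MAX_EXTENT_SIZE of contiguous
delalloc at or after an offset. On success the returned cached state holds a
reference that must be dropped; the starting offset here is illustrative:

	struct extent_state *cached = NULL;
	u64 delalloc_start = 0;	/* illustrative starting offset */
	u64 delalloc_end;

	if (btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
				      BTRFS_MAX_EXTENT_SIZE, &cached)) {
		/* [delalloc_start, delalloc_end] is delalloc; lock or flush it. */
		free_extent_state(cached);
	}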
+/*
+ * Set some bits on a range in the tree. This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set. The start of the
+ * existing range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive. This takes the tree lock.
+ */
+static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u64 *failed_start,
+ struct extent_state **cached_state,
+ struct extent_changeset *changeset, gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+ u32 exclusive_bits = (bits & EXTENT_LOCKED);
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
+
+ if (exclusive_bits)
+ ASSERT(failed_start);
+ else
+ ASSERT(failed_start == NULL);
+again:
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if the tree only has extent states that cover our
+ * input range and don't extend beyond it.
+ * If we end up needing a new extent state we allocate it later.
+ */
+ prealloc = alloc_extent_state(mask);
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ extent_state_in_tree(state))
+ goto hit_next;
+ }
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search_for_insert(tree, start, &p, &parent);
+ if (!state) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, changeset);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ goto out;
+ }
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = state->start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ set_state_bits(tree, state, bits, changeset);
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we set the desired bit
+ * on it.
+ */
+ if (state->start < start) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ /*
+ * If this extent already has all the bits we want set, then
+ * skip it, not necessary to split it or do anything with it.
+ */
+ if ((state->state & bits) == bits) {
+ start = state->end + 1;
+ cache_state(state, cached_state);
+ goto search_again;
+ }
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, bits, changeset);
+ cache_state(state, cached_state);
+ merge_state(tree, state);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ state = next_state(state);
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and ignore the
+ * extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+
+ /*
+ * Avoid freeing 'prealloc' if it can be merged with the later
+ * extent.
+ */
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, changeset);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * We need to split the extent, and set the bit on the first half
+ */
+ if (state->start <= end && state->end > end) {
+ if (state->state & exclusive_bits) {
+ *failed_start = start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, bits, changeset);
+ cache_state(prealloc, cached_state);
+ merge_state(tree, prealloc);
+ prealloc = NULL;
+ goto out;
+ }
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ if (gfpflags_allow_blocking(mask))
+ cond_resched();
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+
+}
+
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state, gfp_t mask)
+{
+ return __set_extent_bit(tree, start, end, bits, NULL, cached_state,
+ NULL, mask);
+}
+
+/*
+ * Convert all bits in a given range from one bit to another
+ *
+ * @tree: the io tree to search
+ * @start: the start offset in bytes
+ * @end: the end offset in bytes (inclusive)
+ * @bits: the bits to set in this range
+ * @clear_bits: the bits to clear in this range
+ * @cached_state: state that we're going to cache
+ *
+ * This will go through and set bits for the given range. If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits. This is only meant to be used by things that are mergeable, ie.
+ * converting from say DELALLOC to DIRTY. This is not meant to be used with
+ * boundary bits like LOCK.
+ *
+ * All allocations are done with GFP_NOFS.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u32 clear_bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+ struct extent_state *prealloc = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+ bool first_iteration = true;
+
+ btrfs_debug_check_extent_io_range(tree, start, end);
+ trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
+ clear_bits);
+
+again:
+ if (!prealloc) {
+ /*
+ * Best effort, don't worry if extent state allocation fails
+ * here for the first iteration. We might have a cached state
+ * that matches exactly the target range, in which case no
+ * extent state allocations are needed. We'll only know this
+ * after locking the tree.
+ */
+ prealloc = alloc_extent_state(GFP_NOFS);
+ if (!prealloc && !first_iteration)
+ return -ENOMEM;
+ }
+
+ spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ extent_state_in_tree(state))
+ goto hit_next;
+ }
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search_for_insert(tree, start, &p, &parent);
+ if (!state) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ prealloc->start = start;
+ prealloc->end = end;
+ insert_state_fast(tree, prealloc, p, parent, bits, NULL);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ goto out;
+ }
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going.
+ */
+ if (state->start == start && state->end <= end) {
+ set_state_bits(tree, state, bits, NULL);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ goto search_again;
+ }
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ * or
+ * | ------------- state -------------- |
+ *
+ * We need to split the extent we found, and may flip bits on second
+ * half.
+ *
+ * If the extent we found extends past our range, we just split and
+ * search again. It'll get split again the next time though.
+ *
+ * If the extent we found is inside our range, we set the desired bit
+ * on it.
+ */
+ if (state->start < start) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = split_state(tree, state, prealloc, start);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ prealloc = NULL;
+ if (err)
+ goto out;
+ if (state->end <= end) {
+ set_state_bits(tree, state, bits, NULL);
+ cache_state(state, cached_state);
+ state = clear_state_bit(tree, state, clear_bits, 0, NULL);
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start < end && state && state->start == start &&
+ !need_resched())
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state | or | state |
+ *
+ * There's a hole, we need to insert something in it and ignore the
+ * extent we found.
+ */
+ if (state->start > start) {
+ u64 this_end;
+ if (end < last_start)
+ this_end = end;
+ else
+ this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Avoid freeing 'prealloc' if it can be merged with the later
+ * extent.
+ */
+ prealloc->start = start;
+ prealloc->end = this_end;
+ err = insert_state(tree, prealloc, bits, NULL);
+ if (err)
+ extent_io_tree_panic(tree, err);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
+ start = this_end + 1;
+ goto search_again;
+ }
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * We need to split the extent, and set the bit on the first half.
+ */
+ if (state->start <= end && state->end > end) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ if (!prealloc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = split_state(tree, state, prealloc, end + 1);
+ if (err)
+ extent_io_tree_panic(tree, err);
+
+ set_state_bits(tree, prealloc, bits, NULL);
+ cache_state(prealloc, cached_state);
+ clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
+ prealloc = NULL;
+ goto out;
+ }
+
+search_again:
+ if (start > end)
+ goto out;
+ spin_unlock(&tree->lock);
+ cond_resched();
+ first_iteration = false;
+ goto again;
+
+out:
+ spin_unlock(&tree->lock);
+ if (prealloc)
+ free_extent_state(prealloc);
+
+ return err;
+}
+
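Following the DELALLOC-to-DIRTY example in the comment above, a hedged
one-shot conversion looks like this (callers elsewhere in btrfs use other bit
pairs; the range comes from the caller):

	struct extent_state *cached = NULL;
	int ret;

	/* Set EXTENT_DIRTY and clear EXTENT_DELALLOC across the whole range. */
	ret = convert_extent_bit(tree, start, end, EXTENT_DIRTY,
				 EXTENT_DELALLOC, &cached);
	free_extent_state(cached);	/* safe on NULL */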
+/*
+ * Find the first range that has @bits not set. This range could start before
+ * @start.
+ *
+ * @tree: the tree to search
+ * @start: offset at/after which the found extent should start
+ * @start_ret: records the beginning of the range
+ * @end_ret: records the end of the range (inclusive)
+ * @bits: the set of bits which must be unset
+ *
+ * Since an unallocated range is also considered one that doesn't have the bits
+ * set, it's possible that @end_ret contains -1; this happens when the range
+ * spans (last_range_end, end of device]. In this case it's up to the caller to
+ * trim @end_ret to the appropriate size.
+ */
+void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
+{
+ struct extent_state *state;
+ struct extent_state *prev = NULL, *next;
+
+ spin_lock(&tree->lock);
+
+ /* Find first extent with bits cleared */
+ while (1) {
+ state = tree_search_prev_next(tree, start, &prev, &next);
+ if (!state && !next && !prev) {
+ /*
+ * Tree is completely empty, send full range and let
+ * caller deal with it
+ */
+ *start_ret = 0;
+ *end_ret = -1;
+ goto out;
+ } else if (!state && !next) {
+ /*
+ * We are past the last allocated chunk, set start at
+ * the end of the last extent.
+ */
+ *start_ret = prev->end + 1;
+ *end_ret = -1;
+ goto out;
+ } else if (!state) {
+ state = next;
+ }
+
+ /*
+ * At this point 'state' either contains 'start' or start is
+ * before 'state'
+ */
+ if (in_range(start, state->start, state->end - state->start + 1)) {
+ if (state->state & bits) {
+ /*
+ * |--range with bits set--|
+ * |
+ * start
+ */
+ start = state->end + 1;
+ } else {
+ /*
+ * 'start' falls within a range that doesn't
+ * have the bits set, so take its start as the
+ * beginning of the desired range
+ *
+ * |--range with bits cleared----|
+ * |
+ * start
+ */
+ *start_ret = state->start;
+ break;
+ }
+ } else {
+ /*
+ * |---prev range---|---hole/unset---|---node range---|
+ * |
+ * start
+ *
+ * or
+ *
+ * |---hole/unset--||--first node--|
+ * 0 |
+ * start
+ */
+ if (prev)
+ *start_ret = prev->end + 1;
+ else
+ *start_ret = 0;
+ break;
+ }
+ }
+
+ /*
+ * Find the longest stretch from start until an entry which has the
+ * bits set
+ */
+ while (state) {
+ if (state->end >= start && !(state->state & bits)) {
+ *end_ret = state->end;
+ } else {
+ *end_ret = state->start - 1;
+ break;
+ }
+ state = next_state(state);
+ }
+out:
+ spin_unlock(&tree->lock);
+}
+
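A sketch of the device-allocation style usage this helper serves elsewhere in
btrfs; the alloc_state tree and CHUNK_ALLOCATED bit are assumed from that
context rather than shown in this patch:

	u64 hole_start, hole_end;

	find_first_clear_extent_bit(&device->alloc_state, search_start,
				    &hole_start, &hole_end, CHUNK_ALLOCATED);
	/*
	 * [hole_start, hole_end] has CHUNK_ALLOCATED unset; per the comment
	 * above, hole_end may be (u64)-1 and must be clamped to the device
	 * size by the caller.
	 */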
+/*
+ * Count the number of bytes in the tree that have the given bit(s) set. This
+ * can be fairly slow, except for EXTENT_DIRTY which is cached. The total
+ * number found is returned.
+ */
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end, u64 max_bytes,
+ u32 bits, int contig)
+{
+ struct extent_state *state;
+ u64 cur_start = *start;
+ u64 total_bytes = 0;
+ u64 last = 0;
+ int found = 0;
+
+ if (WARN_ON(search_end <= cur_start))
+ return 0;
+
+ spin_lock(&tree->lock);
+
+ /*
+ * This search will find all the extents that end after our range
+ * starts.
+ */
+ state = tree_search(tree, cur_start);
+ while (state) {
+ if (state->start > search_end)
+ break;
+ if (contig && found && state->start > last + 1)
+ break;
+ if (state->end >= cur_start && (state->state & bits) == bits) {
+ total_bytes += min(search_end, state->end) + 1 -
+ max(cur_start, state->start);
+ if (total_bytes >= max_bytes)
+ break;
+ if (!found) {
+ *start = max(cur_start, state->start);
+ found = 1;
+ }
+ last = state->end;
+ } else if (contig && found) {
+ break;
+ }
+ state = next_state(state);
+ }
+ spin_unlock(&tree->lock);
+ return total_bytes;
+}
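
As a usage sketch (assumed bits and caller, for illustration only): with
contig == 1 the search stops at the first gap, so the return value is the
length of the contiguous run of matching bytes starting at the updated *start.

	/* Hypothetical: measure contiguous delalloc at/after 'offset'. */
	static u64 contig_delalloc_example(struct extent_io_tree *tree, u64 offset)
	{
		u64 start = offset;

		return count_range_bits(tree, &start, (u64)-1, (u64)-1,
					EXTENT_DELALLOC, 1);
	}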
+
+/*
+ * Search a range in the state tree for a given mask. If 'filled' == 1, this
+ * returns 1 only if every extent in the range has the bits set. Otherwise, 1
+ * is returned if any bit in the range is found set.
+ */
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, int filled, struct extent_state *cached)
+{
+ struct extent_state *state = NULL;
+ int bitset = 0;
+
+ spin_lock(&tree->lock);
+ if (cached && extent_state_in_tree(cached) && cached->start <= start &&
+ cached->end > start)
+ state = cached;
+ else
+ state = tree_search(tree, start);
+ while (state && start <= end) {
+ if (filled && state->start > start) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->start > end)
+ break;
+
+ if (state->state & bits) {
+ bitset = 1;
+ if (!filled)
+ break;
+ } else if (filled) {
+ bitset = 0;
+ break;
+ }
+
+ if (state->end == (u64)-1)
+ break;
+
+ start = state->end + 1;
+ if (start > end)
+ break;
+ state = next_state(state);
+ }
+
+ /* We ran out of states and were still inside of our range. */
+ if (filled && !state)
+ bitset = 0;
+ spin_unlock(&tree->lock);
+ return bitset;
+}
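
A short sketch of the filled semantics (hypothetical wrapper, not from this
patch): filled == 1 asks "does the whole range have the bits", while
filled == 0 asks "does any part of the range have the bits".

	/* Hypothetical: check that an entire range is still dirty. */
	static bool range_all_dirty(struct extent_io_tree *tree, u64 start, u64 end)
	{
		return test_range_bit(tree, start, end, EXTENT_DIRTY, 1, NULL) == 1;
	}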
+
+/* Wrappers around set/clear extent bit */
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
+{
+ /*
+ * We don't support EXTENT_LOCKED yet, as the current changeset will
+ * record any bits changed, so in the EXTENT_LOCKED case it would
+ * either fail with -EEXIST or the changeset would record the whole
+ * range.
+ */
+ ASSERT(!(bits & EXTENT_LOCKED));
+
+ return __set_extent_bit(tree, start, end, bits, NULL, NULL, changeset,
+ GFP_NOFS);
+}
+
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
+{
+ /*
+ * We don't support the EXTENT_LOCKED case, for the same reason as
+ * set_record_extent_bits().
+ */
+ ASSERT(!(bits & EXTENT_LOCKED));
+
+ return __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS,
+ changeset);
+}
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ int err;
+ u64 failed_start;
+
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ NULL, NULL, GFP_NOFS);
+ if (err == -EEXIST) {
+ if (failed_start > start)
+ clear_extent_bit(tree, start, failed_start - 1,
+ EXTENT_LOCKED, NULL);
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * Either insert or lock the extent state struct covering the range
+ * [start, end], waiting until any conflicting lock is released.
+ */
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state)
+{
+ int err;
+ u64 failed_start;
+
+ while (1) {
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
+ &failed_start, cached_state, NULL,
+ GFP_NOFS);
+ if (err == -EEXIST) {
+ wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+ start = failed_start;
+ } else
+ break;
+ WARN_ON(start > end);
+ }
+ return err;
+}
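
With the cached_state parameter now part of lock_extent(), a typical locked
section threads the same cached state into unlock to skip the second tree
search (an illustrative pairing, not a call site from this patch):

	static void locked_range_example(struct extent_io_tree *tree,
					 u64 start, u64 end)
	{
		struct extent_state *cached = NULL;

		lock_extent(tree, start, end, &cached);
		/* ... operate on the locked range ... */
		unlock_extent(tree, start, end, &cached);
	}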
+
+void __cold extent_state_free_cachep(void)
+{
+ btrfs_extent_state_leak_debug_check();
+ kmem_cache_destroy(extent_state_cache);
+}
+
+int __init extent_state_init_cachep(void)
+{
+ extent_state_cache = kmem_cache_create("btrfs_extent_state",
+ sizeof(struct extent_state), 0,
+ SLAB_MEM_SPREAD, NULL);
+ if (!extent_state_cache)
+ return -ENOMEM;
+
+ return 0;
+}
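
The extent state cache is now initialized separately from the extent buffer
cache (renamed later in this patch); a sketch of how module init code would
wire the two up, with teardown reversed on failure (assumed wiring, for
illustration):

	static int __init example_init(void)
	{
		int ret;

		ret = extent_state_init_cachep();
		if (ret)
			return ret;
		ret = extent_buffer_init_cachep();
		if (ret) {
			extent_state_free_cachep();
			return ret;
		}
		return 0;
	}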
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index c3eb52dbe61c..a855f40dd61d 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -17,7 +17,6 @@ struct io_failure_record;
#define EXTENT_NODATASUM (1U << 7)
#define EXTENT_CLEAR_META_RESV (1U << 8)
#define EXTENT_NEED_WAIT (1U << 9)
-#define EXTENT_DAMAGED (1U << 10)
#define EXTENT_NORESERVE (1U << 11)
#define EXTENT_QGROUP_RESERVED (1U << 12)
#define EXTENT_CLEAR_DATA_RESV (1U << 13)
@@ -35,10 +34,18 @@ struct io_failure_record;
* delalloc bytes decremented, in an atomic way to prevent races with stat(2).
*/
#define EXTENT_ADD_INODE_BYTES (1U << 15)
+
+/*
+ * Set during truncate when we're clearing an entire range and we just want the
+ * extent states to go away.
+ */
+#define EXTENT_CLEAR_ALL_BITS (1U << 16)
+
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \
- EXTENT_ADD_INODE_BYTES)
+ EXTENT_ADD_INODE_BYTES | \
+ EXTENT_CLEAR_ALL_BITS)
/*
* Redefined bits above which are used only in the device allocation tree,
@@ -56,7 +63,6 @@ enum {
IO_TREE_FS_EXCLUDED_EXTENTS,
IO_TREE_BTREE_INODE_IO,
IO_TREE_INODE_IO,
- IO_TREE_INODE_IO_FAILURE,
IO_TREE_RELOC_BLOCKS,
IO_TREE_TRANS_DIRTY_PAGES,
IO_TREE_ROOT_DIRTY_LOG_PAGES,
@@ -70,8 +76,6 @@ struct extent_io_tree {
struct rb_root state;
struct btrfs_fs_info *fs_info;
void *private_data;
- u64 dirty_bytes;
- bool track_uptodate;
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
@@ -89,33 +93,23 @@ struct extent_state {
refcount_t refs;
u32 state;
- struct io_failure_record *failrec;
-
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
#endif
};
-int __init extent_state_cache_init(void);
-void __cold extent_state_cache_exit(void);
-
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data);
void extent_io_tree_release(struct extent_io_tree *tree);
-int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached);
-
-static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return lock_extent_bits(tree, start, end, NULL);
-}
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached);
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
-int __init extent_io_init(void);
-void __cold extent_io_exit(void);
+int __init extent_state_init_cachep(void);
+void __cold extent_state_free_cachep(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
@@ -126,72 +120,66 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, int filled, struct extent_state *cached_state);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached, gfp_t mask,
- struct extent_changeset *changeset);
+ u32 bits, struct extent_state **cached, gfp_t mask,
+ struct extent_changeset *changeset);
-static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits,
+ struct extent_state **cached)
{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
+ return __clear_extent_bit(tree, start, end, bits, cached,
+ GFP_NOFS, NULL);
}
-static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_NOFS, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
+ GFP_NOFS, NULL);
}
-static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
- u64 start, u64 end, struct extent_state **cached)
+static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_ATOMIC, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
+ GFP_ATOMIC, NULL);
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits)
{
- int wake = 0;
-
- if (bits & EXTENT_LOCKED)
- wake = 1;
-
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
+ return clear_extent_bit(tree, start, end, bits, NULL);
}
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, unsigned exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask,
- struct extent_changeset *changeset);
-int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits);
+ u32 bits, struct extent_state **cached_state, gfp_t mask);
+
+static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits)
+{
+ return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT);
+}
static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits)
{
- return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
- NULL);
+ return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS);
}
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
- return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- cached_state, GFP_NOFS, NULL);
+ return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
+ cached_state, GFP_NOFS, NULL);
}
static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL,
- mask, NULL);
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask);
}
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
@@ -199,7 +187,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, cached);
+ EXTENT_DO_ACCOUNTING, cached);
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -211,30 +199,29 @@ static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
- 0, NULL, cached_state, GFP_NOFS, NULL);
+ EXTENT_DELALLOC | extra_bits,
+ cached_state, GFP_NOFS);
}
static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- 0, NULL, cached_state, GFP_NOFS, NULL);
+ EXTENT_DELALLOC | EXTENT_DEFRAG,
+ cached_state, GFP_NOFS);
}
static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
u64 end)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, NULL,
- GFP_NOFS, NULL);
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS);
}
static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
- cached_state, mask, NULL);
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE,
+ cached_state, mask);
}
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -244,24 +231,9 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
-int extent_invalidate_folio(struct extent_io_tree *tree,
- struct folio *folio, size_t offset);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
-
-/* This should be reworked in the future and put elsewhere. */
-struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start);
-int set_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record *failrec);
-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
- u64 end);
-int free_io_failure(struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree,
- struct io_failure_record *rec);
-int clean_io_failure(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree, u64 start,
- struct page *page, u64 ino, unsigned int pg_offset);
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits);
#endif /* BTRFS_EXTENT_IO_TREE_H */
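
With the wake/delete arguments gone, "delete everything" is now expressed
through the bit mask instead; a truncate-style teardown would look roughly
like this (a sketch under the new API, not a call site from this patch):

	static void drop_range_example(struct extent_io_tree *tree,
				       u64 start, u64 end)
	{
		struct extent_state *cached = NULL;

		lock_extent(tree, start, end, &cached);
		/* EXTENT_CLEAR_ALL_BITS drops every state in the range. */
		clear_extent_bit(tree, start, end,
				 EXTENT_LOCKED | EXTENT_CLEAR_ALL_BITS, &cached);
	}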
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f477035a2ac2..cd2d36580f1a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -895,7 +895,13 @@ again:
err = -ENOENT;
while (1) {
if (ptr >= end) {
- WARN_ON(ptr > end);
+ if (ptr > end) {
+ err = -EUCLEAN;
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_crit(fs_info,
+"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
+ path->slots[0], root_objectid, owner, offset, parent);
+ }
break;
}
iref = (struct btrfs_extent_inline_ref *)ptr;
@@ -1239,7 +1245,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
if (size) {
ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (!ret)
*discarded_bytes += size;
else if (ret != -EOPNOTSUPP)
@@ -1256,14 +1262,14 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
if (bytes_left) {
ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (!ret)
*discarded_bytes += bytes_left;
}
return ret;
}
-static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
+static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
{
struct btrfs_device *dev = stripe->dev;
struct btrfs_fs_info *fs_info = dev->fs_info;
@@ -1291,7 +1297,7 @@ static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
&discarded);
discarded += src_disc;
- } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) {
+ } else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
} else {
ret = 0;
@@ -1310,76 +1316,60 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 discarded_bytes = 0;
u64 end = bytenr + num_bytes;
u64 cur = bytenr;
- struct btrfs_io_context *bioc = NULL;
/*
- * Avoid races with device replace and make sure our bioc has devices
- * associated to its stripes that don't go away while we are discarding.
+ * Avoid races with device replace and make sure the devices in the
+ * stripes don't go away while we are discarding.
*/
btrfs_bio_counter_inc_blocked(fs_info);
while (cur < end) {
- struct btrfs_io_stripe *stripe;
+ struct btrfs_discard_stripe *stripes;
+ unsigned int num_stripes;
int i;
num_bytes = end - cur;
- /* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
- &num_bytes, &bioc, 0);
- /*
- * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
- * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
- * thus we can't continue anyway.
- */
- if (ret < 0)
- goto out;
+ stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
+ if (IS_ERR(stripes)) {
+ ret = PTR_ERR(stripes);
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
+ break;
+ }
- stripe = bioc->stripes;
- for (i = 0; i < bioc->num_stripes; i++, stripe++) {
+ for (i = 0; i < num_stripes; i++) {
+ struct btrfs_discard_stripe *stripe = stripes + i;
u64 bytes;
- struct btrfs_device *device = stripe->dev;
- if (!device->bdev) {
+ if (!stripe->dev->bdev) {
ASSERT(btrfs_test_opt(fs_info, DEGRADED));
continue;
}
- if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
+ &stripe->dev->dev_state))
continue;
ret = do_discard_extent(stripe, &bytes);
- if (!ret) {
- discarded_bytes += bytes;
- } else if (ret != -EOPNOTSUPP) {
+ if (ret) {
/*
- * Logic errors or -ENOMEM, or -EIO, but
- * unlikely to happen.
- *
- * And since there are two loops, explicitly
- * go to out to avoid confusion.
+ * Keep going if discard is not supported by the
+ * device.
*/
- btrfs_put_bioc(bioc);
- goto out;
+ if (ret != -EOPNOTSUPP)
+ break;
+ ret = 0;
+ } else {
+ discarded_bytes += bytes;
}
-
- /*
- * Just in case we get back EOPNOTSUPP for some reason,
- * just ignore the return value so we don't screw up
- * people calling discard_extent.
- */
- ret = 0;
}
- btrfs_put_bioc(bioc);
+ kfree(stripes);
+ if (ret)
+ break;
cur += num_bytes;
}
-out:
btrfs_bio_counter_dec(fs_info);
-
if (actual_bytes)
*actual_bytes = discarded_bytes;
-
-
- if (ret == -EOPNOTSUPP)
- ret = 0;
return ret;
}
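
Callers can treat a partial discard as success and use *actual_bytes for
reporting only; a hypothetical caller (names assumed, not from this patch):

	static void discard_example(struct btrfs_fs_info *fs_info,
				    u64 bytenr, u64 len)
	{
		u64 discarded = 0;

		if (btrfs_discard_extent(fs_info, bytenr, len, &discarded))
			btrfs_warn(fs_info, "discard failed after %llu bytes",
				   discarded);
	}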
@@ -1577,12 +1567,12 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
u32 item_size;
int ret;
int err = 0;
- int metadata = !extent_op->is_data;
+ int metadata = 1;
if (TRANS_ABORTED(trans))
return 0;
- if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
metadata = 0;
path = btrfs_alloc_path();
@@ -2180,7 +2170,7 @@ out:
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags,
- int level, int is_data)
+ int level)
{
struct btrfs_delayed_extent_op *extent_op;
int ret;
@@ -2192,7 +2182,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->flags_to_set = flags;
extent_op->update_flags = true;
extent_op->update_key = false;
- extent_op->is_data = is_data ? true : false;
extent_op->level = level;
ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
@@ -2231,6 +2220,12 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
}
if (!mutex_trylock(&head->mutex)) {
+ if (path->nowait) {
+ spin_unlock(&delayed_refs->lock);
+ btrfs_put_transaction(cur_trans);
+ return -EAGAIN;
+ }
+
refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
@@ -2357,15 +2352,10 @@ out:
}
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
- u64 bytenr, bool strict)
+ u64 bytenr, bool strict, struct btrfs_path *path)
{
- struct btrfs_path *path;
int ret;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
do {
ret = check_committed_ref(root, path, objectid,
offset, bytenr, strict);
@@ -2376,7 +2366,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
} while (ret == -EAGAIN);
out:
- btrfs_free_path(path);
+ btrfs_release_path(path);
if (btrfs_is_data_reloc_root(root))
WARN_ON(ret > 0);
return ret;
@@ -2497,24 +2487,21 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
return ret;
}
-static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
+static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
{
- struct btrfs_block_group *cache;
- u64 bytenr;
-
- spin_lock(&fs_info->block_group_cache_lock);
- bytenr = fs_info->first_logical_byte;
- spin_unlock(&fs_info->block_group_cache_lock);
+ struct rb_node *leftmost;
+ u64 bytenr = 0;
- if (bytenr < (u64)-1)
- return bytenr;
-
- cache = btrfs_lookup_first_block_group(fs_info, search_start);
- if (!cache)
- return 0;
+ read_lock(&fs_info->block_group_cache_lock);
+ /* Get the block group with the lowest logical start address. */
+ leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
+ if (leftmost) {
+ struct btrfs_block_group *bg;
- bytenr = cache->start;
- btrfs_put_block_group(cache);
+ bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
+ bytenr = bg->start;
+ }
+ read_unlock(&fs_info->block_group_cache_lock);
return bytenr;
}
@@ -2570,17 +2557,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
return -EINVAL;
/*
- * pull in the free space cache (if any) so that our pin
- * removes the free space from the cache. We have load_only set
- * to one because the slow code to read in the free extents does check
- * the pinned extents.
- */
- btrfs_cache_block_group(cache, 1);
- /*
- * Make sure we wait until the cache is completely built in case it is
- * missing or is invalid and therefore needs to be rebuilt.
+ * Fully cache the free space first so that our pin removes the free space
+ * from the cache.
*/
- ret = btrfs_wait_block_group_cache_done(cache);
+ ret = btrfs_cache_block_group(cache, true);
if (ret)
goto out;
@@ -2603,12 +2583,7 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
if (!block_group)
return -EINVAL;
- btrfs_cache_block_group(block_group, 1);
- /*
- * Make sure we wait until the cache is completely built in case it is
- * missing or is invalid and therefore needs to be rebuilt.
- */
- ret = btrfs_wait_block_group_cache_done(block_group);
+ ret = btrfs_cache_block_group(block_group, true);
if (ret)
goto out;
@@ -2717,13 +2692,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len = cache->start + cache->length - start;
len = min(len, end + 1 - start);
- down_read(&fs_info->commit_root_sem);
- if (start < cache->last_byte_to_unpin && return_free_space) {
- u64 add_len = min(len, cache->last_byte_to_unpin - start);
-
- btrfs_add_free_space(cache, start, add_len);
- }
- up_read(&fs_info->commit_root_sem);
+ if (return_free_space)
+ btrfs_add_free_space(cache, start, len);
start += len;
total_unpinned += len;
@@ -3803,8 +3773,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
/* Check RO and no space case before trying to activate it */
spin_lock(&block_group->lock);
- if (block_group->ro ||
- block_group->alloc_offset == block_group->zone_capacity) {
+ if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
ret = 1;
/*
* May need to clear fs_info->{treelog,data_reloc}_bg.
@@ -3836,7 +3805,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
block_group->start == fs_info->data_reloc_bg ||
fs_info->data_reloc_bg == 0);
- if (block_group->ro) {
+ if (block_group->ro ||
+ test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
ret = 1;
goto out;
}
@@ -3898,8 +3868,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
out:
if (ret && ffe_ctl->for_treelog)
fs_info->treelog_bg = 0;
- if (ret && ffe_ctl->for_data_reloc)
+ if (ret && ffe_ctl->for_data_reloc &&
+ fs_info->data_reloc_bg == block_group->start) {
+ /*
+ * Do not allow further allocations from this block group.
+ * Compared to setting ->ro, setting the
+ * BLOCK_GROUP_FLAG_ZONED_DATA_RELOC flag still allows nocow
+ * writers to come in. See btrfs_inc_nocow_writers().
+ *
+ * We need to disable allocations here to avoid allocating a
+ * regular (non-relocation data) extent. With a mix of relocation
+ * extents and regular extents, we could dispatch WRITE commands
+ * (for relocation extents) and ZONE APPEND commands (for
+ * regular extents) at the same time to the same zone, which
+ * would easily break the write pointer.
+ */
+ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
fs_info->data_reloc_bg = 0;
+ }
spin_unlock(&fs_info->relocation_bg_lock);
spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
@@ -3969,23 +3955,63 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
}
}
-static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
- struct find_free_extent_ctl *ffe_ctl)
+static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ /* If we can activate new zone, just allocate a chunk and use it */
+ if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
+ return 0;
+
+ /*
+ * We already reached the max active zones. Try to finish one block
+ * group to make a room for a new block group. This is only possible
+ * for a data block group because btrfs_zone_finish() may need to wait
+ * for a running transaction which can cause a deadlock for metadata
+ * allocation.
+ */
+ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+ int ret = btrfs_zone_finish_one_bg(fs_info);
+
+ if (ret == 1)
+ return 0;
+ else if (ret < 0)
+ return ret;
+ }
+
+ /*
+ * If we have enough free space left in an already active block group
+ * and we can't activate any other zone now, do not allow allocating a
+ * new chunk and let find_free_extent() retry with a smaller size.
+ */
+ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
+ return -ENOSPC;
+
+ /*
+ * Not even min_alloc_size is left in any block group. Since we cannot
+ * activate a new block group, allocating one may not help. Tell the
+ * caller to try again and hope it makes progress by writing out some
+ * parts of the region. That is only possible for data block groups,
+ * where a part of the region can be written.
+ */
+ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
+ return -EAGAIN;
+
+ /*
+ * We cannot activate a new block group and there is not enough space
+ * left in any block group. Allocating a new one may not help, but
+ * there is nothing else to do anyway, so let's go with it.
+ */
+ return 0;
+}
+
+static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
- return true;
+ return 0;
case BTRFS_EXTENT_ALLOC_ZONED:
- /*
- * If we have enough free space left in an already
- * active block group and we can't activate any other
- * zone now, do not allow allocating a new chunk and
- * let find_free_extent() retry with a smaller size.
- */
- if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
- !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
- return false;
- return true;
+ return can_allocate_chunk_zoned(fs_info, ffe_ctl);
default:
BUG();
}
@@ -4067,8 +4093,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
int exist = 0;
/* Check if the allocation policy allows creating a new chunk */
- if (!can_allocate_chunk(fs_info, ffe_ctl))
- return -ENOSPC;
+ ret = can_allocate_chunk(fs_info, ffe_ctl);
+ if (ret)
+ return ret;
trans = current->journal_info;
if (trans)
@@ -4082,7 +4109,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
}
ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
- CHUNK_ALLOC_FORCE);
+ CHUNK_ALLOC_FORCE_FOR_EXTENT);
/* Do not bail out on ENOSPC since we can do more. */
if (ret == -ENOSPC)
@@ -4272,7 +4299,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
return ret;
ffe_ctl->search_start = max(ffe_ctl->search_start,
- first_logical_byte(fs_info, 0));
+ first_logical_byte(fs_info));
ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
block_group = btrfs_lookup_block_group(fs_info,
@@ -4362,7 +4389,7 @@ have_block_group:
ffe_ctl->cached = btrfs_block_group_done(block_group);
if (unlikely(!ffe_ctl->cached)) {
ffe_ctl->have_caching_bg = true;
- ret = btrfs_cache_block_group(block_group, 0);
+ ret = btrfs_cache_block_group(block_group, false);
/*
* If we get ENOMEM here or something else we want to
@@ -4830,6 +4857,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
+ u64 lockdep_owner = owner;
buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
if (IS_ERR(buf))
@@ -4849,11 +4877,29 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/*
+ * The reloc trees are just snapshots, so we need them to appear to be
+ * just like any other fs tree WRT lockdep.
+ *
+ * The exception however is in replace_path() in relocation, where we
+ * hold the lock on the original fs root and then search for the reloc
+ * root. At that point we need to make sure any reloc root buffers are
+ * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
+ * lockdep happy.
+ */
+ if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
+ !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
+ lockdep_owner = BTRFS_FS_TREE_OBJECTID;
+
+ /* btrfs_clean_tree_block() accesses the generation field. */
+ btrfs_set_header_generation(buf, trans->transid);
+
+ /*
* This needs to stay, because we could allocate a freed block from an
* old tree into a new tree, so we need to make sure this new block is
* set to the appropriate level and owner.
*/
- btrfs_set_buffer_lockdep_class(owner, buf, level);
+ btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
+
__btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
@@ -4959,7 +5005,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->flags_to_set = flags;
extent_op->update_key = skinny_metadata ? false : true;
extent_op->update_flags = true;
- extent_op->is_data = false;
extent_op->level = level;
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
@@ -5144,7 +5189,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, eb, flag,
- btrfs_header_level(eb), 0);
+ btrfs_header_level(eb));
BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
}
@@ -5599,6 +5644,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
*/
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
+ const bool is_reloc_root = (root->root_key.objectid ==
+ BTRFS_TREE_RELOC_OBJECTID);
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -5758,6 +5805,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_end_trans;
}
+ if (!is_reloc_root)
+ btrfs_set_last_root_drop_gen(fs_info, trans->transid);
+
btrfs_end_transaction_throttle(trans);
if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
btrfs_debug(fs_info,
@@ -5792,7 +5842,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_end_trans;
}
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ if (!is_reloc_root) {
ret = btrfs_find_root(tree_root, &root->root_key, path,
NULL, NULL);
if (ret < 0) {
@@ -5824,6 +5874,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
btrfs_put_root(root);
root_dropped = true;
out_end_trans:
+ if (!is_reloc_root)
+ btrfs_set_last_root_drop_gen(fs_info, trans->transid);
+
btrfs_end_transaction_throttle(trans);
out_free:
kfree(wc);
@@ -5981,13 +6034,13 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
*/
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
- u64 start = SZ_1M, len = 0, end = 0;
+ u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
int ret;
*trimmed = 0;
/* Discard not supported = nothing to do. */
- if (!blk_queue_discard(bdev_get_queue(device->bdev)))
+ if (!bdev_max_discard_sectors(device->bdev))
return 0;
/* Not writable = nothing to do. */
@@ -6025,8 +6078,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
break;
}
- /* Ensure we skip the reserved area in the first 1M */
- start = max_t(u64, start, SZ_1M);
+ /* Ensure we skip the reserved space on each device. */
+ start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
/*
* If find_first_clear_extent_bit find a range that spans the
@@ -6117,13 +6170,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
if (end - start >= range->minlen) {
if (!btrfs_block_group_done(cache)) {
- ret = btrfs_cache_block_group(cache, 0);
- if (ret) {
- bg_failed++;
- bg_ret = ret;
- continue;
- }
- ret = btrfs_wait_block_group_cache_done(cache);
+ ret = btrfs_cache_block_group(cache, true);
if (ret) {
bg_failed++;
bg_ret = ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 724e8fe06aa0..1eae68fbae21 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -6,6 +6,7 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
+#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
@@ -28,39 +29,29 @@
#include "subpage.h"
#include "zoned.h"
#include "block-group.h"
+#include "compression.h"
-static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
-static struct bio_set btrfs_bioset;
-
-static inline bool extent_state_in_tree(const struct extent_state *state)
-{
- return !RB_EMPTY_NODE(&state->rb_node);
-}
#ifdef CONFIG_BTRFS_DEBUG
-static LIST_HEAD(states);
-static DEFINE_SPINLOCK(leak_lock);
-
-static inline void btrfs_leak_debug_add(spinlock_t *lock,
- struct list_head *new,
- struct list_head *head)
+static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
unsigned long flags;
- spin_lock_irqsave(lock, flags);
- list_add(new, head);
- spin_unlock_irqrestore(lock, flags);
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ list_add(&eb->leak_list, &fs_info->allocated_ebs);
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
-static inline void btrfs_leak_debug_del(spinlock_t *lock,
- struct list_head *entry)
+static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
unsigned long flags;
- spin_lock_irqsave(lock, flags);
- list_del(entry);
- spin_unlock_irqrestore(lock, flags);
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ list_del(&eb->leak_list);
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
@@ -75,6 +66,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
if (!fs_info->allocated_ebs.next)
return;
+ WARN_ON(!list_empty(&fs_info->allocated_ebs));
spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
while (!list_empty(&fs_info->allocated_ebs)) {
eb = list_first_entry(&fs_info->allocated_ebs,
@@ -88,51 +80,22 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
}
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
-
-static inline void btrfs_extent_state_leak_debug_check(void)
-{
- struct extent_state *state;
-
- while (!list_empty(&states)) {
- state = list_entry(states.next, struct extent_state, leak_list);
- pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
- state->start, state->end, state->state,
- extent_state_in_tree(state),
- refcount_read(&state->refs));
- list_del(&state->leak_list);
- kmem_cache_free(extent_state_cache, state);
- }
-}
-
-#define btrfs_debug_check_extent_io_range(tree, start, end) \
- __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
-static inline void __btrfs_debug_check_extent_io_range(const char *caller,
- struct extent_io_tree *tree, u64 start, u64 end)
-{
- struct inode *inode = tree->private_data;
- u64 isize;
-
- if (!inode || !is_data_inode(inode))
- return;
-
- isize = i_size_read(inode);
- if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
- "%s: ino %llu isize %llu odd range [%llu,%llu]",
- caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
- }
-}
#else
-#define btrfs_leak_debug_add(lock, new, head) do {} while (0)
-#define btrfs_leak_debug_del(lock, entry) do {} while (0)
-#define btrfs_extent_state_leak_debug_check() do {} while (0)
-#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
+#define btrfs_leak_debug_add_eb(eb) do {} while (0)
+#define btrfs_leak_debug_del_eb(eb) do {} while (0)
#endif
-struct tree_entry {
- u64 start;
- u64 end;
- struct rb_node rb_node;
+/*
+ * Structure to record info about the bio being assembled, and other info like
+ * how many bytes remain before the stripe/ordered extent boundary.
+ */
+struct btrfs_bio_ctrl {
+ struct bio *bio;
+ int mirror_num;
+ enum btrfs_compression_type compress_type;
+ u32 len_to_stripe_boundary;
+ u32 len_to_oe_boundary;
+ btrfs_bio_end_io_t end_io_func;
};
struct extent_page_data {
@@ -146,92 +109,59 @@ struct extent_page_data {
unsigned int sync_io:1;
};
-static int add_extent_changeset(struct extent_state *state, u32 bits,
- struct extent_changeset *changeset,
- int set)
+static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
- int ret;
-
- if (!changeset)
- return 0;
- if (set && (state->state & bits) == bits)
- return 0;
- if (!set && (state->state & bits) == 0)
- return 0;
- changeset->bytes_changed += state->end - state->start + 1;
- ret = ulist_add(&changeset->range_changed, state->start, state->end,
- GFP_ATOMIC);
- return ret;
-}
+ struct bio *bio;
+ struct bio_vec *bv;
+ struct inode *inode;
+ int mirror_num;
-int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
-{
- blk_status_t ret = 0;
- struct extent_io_tree *tree = bio->bi_private;
+ if (!bio_ctrl->bio)
+ return;
- bio->bi_private = NULL;
+ bio = bio_ctrl->bio;
+ bv = bio_first_bvec_all(bio);
+ inode = bv->bv_page->mapping->host;
+ mirror_num = bio_ctrl->mirror_num;
/* Caller should ensure the bio has at least some range added */
ASSERT(bio->bi_iter.bi_size);
- if (is_data_inode(tree->private_data))
- ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
- bio_flags);
- else
- ret = btrfs_submit_metadata_bio(tree->private_data, bio,
- mirror_num, bio_flags);
- return blk_status_to_errno(ret);
-}
+ btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
-/* Cleanup unsubmitted bios */
-static void end_write_bio(struct extent_page_data *epd, int ret)
-{
- struct bio *bio = epd->bio_ctrl.bio;
+ if (!is_data_inode(inode))
+ btrfs_submit_metadata_bio(inode, bio, mirror_num);
+ else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ btrfs_submit_data_write_bio(inode, bio, mirror_num);
+ else
+ btrfs_submit_data_read_bio(inode, bio, mirror_num,
+ bio_ctrl->compress_type);
- if (bio) {
- bio->bi_status = errno_to_blk_status(ret);
- bio_endio(bio);
- epd->bio_ctrl.bio = NULL;
- }
+ /* The bio is owned by the end_io handler now */
+ bio_ctrl->bio = NULL;
}
/*
- * Submit bio from extent page data via submit_one_bio
- *
- * Return 0 if everything is OK.
- * Return <0 for error.
+ * Submit or fail the current bio in an extent_page_data structure.
*/
-static int __must_check flush_write_bio(struct extent_page_data *epd)
+static void submit_write_bio(struct extent_page_data *epd, int ret)
{
- int ret = 0;
struct bio *bio = epd->bio_ctrl.bio;
- if (bio) {
- ret = submit_one_bio(bio, 0, 0);
- /*
- * Clean up of epd->bio is handled by its endio function.
- * And endio is either triggered by successful bio execution
- * or the error handler of submit bio hook.
- * So at this point, no matter what happened, we don't need
- * to clean up epd->bio.
- */
+ if (!bio)
+ return;
+
+ if (ret) {
+ ASSERT(ret < 0);
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ /* The bio is owned by the end_io handler now */
epd->bio_ctrl.bio = NULL;
+ } else {
+ submit_one_bio(&epd->bio_ctrl);
}
- return ret;
}
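
The ownership rule the two helpers enforce: once a bio is submitted or failed,
bio_ctrl->bio is NULL and the end_io handler owns it, so a repeated call is a
harmless no-op. A contrived sketch to illustrate (not a real call site):

	static void finish_writeback_example(struct extent_page_data *epd, int ret)
	{
		submit_write_bio(epd, ret);	/* submits, or fails the bio on ret < 0 */
		submit_write_bio(epd, ret);	/* bio already NULL: returns immediately */
	}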
-int __init extent_state_cache_init(void)
-{
- extent_state_cache = kmem_cache_create("btrfs_extent_state",
- sizeof(struct extent_state), 0,
- SLAB_MEM_SPREAD, NULL);
- if (!extent_state_cache)
- return -ENOMEM;
- return 0;
-}
-
-int __init extent_io_init(void)
+int __init extent_buffer_init_cachep(void)
{
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
@@ -239,32 +169,10 @@ int __init extent_io_init(void)
if (!extent_buffer_cache)
return -ENOMEM;
- if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_bio, bio),
- BIOSET_NEED_BVECS))
- goto free_buffer_cache;
-
- if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
- goto free_bioset;
-
return 0;
-
-free_bioset:
- bioset_exit(&btrfs_bioset);
-
-free_buffer_cache:
- kmem_cache_destroy(extent_buffer_cache);
- extent_buffer_cache = NULL;
- return -ENOMEM;
}
-void __cold extent_state_cache_exit(void)
-{
- btrfs_extent_state_leak_debug_check();
- kmem_cache_destroy(extent_state_cache);
-}
-
-void __cold extent_io_exit(void)
+void __cold extent_buffer_free_cachep(void)
{
/*
* Make sure all delayed rcu free are flushed before we
@@ -272,1222 +180,6 @@ void __cold extent_io_exit(void)
*/
rcu_barrier();
kmem_cache_destroy(extent_buffer_cache);
- bioset_exit(&btrfs_bioset);
-}
-
-/*
- * For the file_extent_tree, we want to hold the inode lock when we lookup and
- * update the disk_i_size, but lockdep will complain because our io_tree we hold
- * the tree lock and get the inode lock when setting delalloc. These two things
- * are unrelated, so make a class for the file_extent_tree so we don't get the
- * two locking patterns mixed up.
- */
-static struct lock_class_key file_extent_tree_class;
-
-void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner,
- void *private_data)
-{
- tree->fs_info = fs_info;
- tree->state = RB_ROOT;
- tree->dirty_bytes = 0;
- spin_lock_init(&tree->lock);
- tree->private_data = private_data;
- tree->owner = owner;
- if (owner == IO_TREE_INODE_FILE_EXTENT)
- lockdep_set_class(&tree->lock, &file_extent_tree_class);
-}
-
-void extent_io_tree_release(struct extent_io_tree *tree)
-{
- spin_lock(&tree->lock);
- /*
- * Do a single barrier for the waitqueue_active check here, the state
- * of the waitqueue should not change once extent_io_tree_release is
- * called.
- */
- smp_mb();
- while (!RB_EMPTY_ROOT(&tree->state)) {
- struct rb_node *node;
- struct extent_state *state;
-
- node = rb_first(&tree->state);
- state = rb_entry(node, struct extent_state, rb_node);
- rb_erase(&state->rb_node, &tree->state);
- RB_CLEAR_NODE(&state->rb_node);
- /*
- * btree io trees aren't supposed to have tasks waiting for
- * changes in the flags of extent states ever.
- */
- ASSERT(!waitqueue_active(&state->wq));
- free_extent_state(state);
-
- cond_resched_lock(&tree->lock);
- }
- spin_unlock(&tree->lock);
-}
-
-static struct extent_state *alloc_extent_state(gfp_t mask)
-{
- struct extent_state *state;
-
- /*
- * The given mask might be not appropriate for the slab allocator,
- * drop the unsupported bits
- */
- mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
- state = kmem_cache_alloc(extent_state_cache, mask);
- if (!state)
- return state;
- state->state = 0;
- state->failrec = NULL;
- RB_CLEAR_NODE(&state->rb_node);
- btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
- refcount_set(&state->refs, 1);
- init_waitqueue_head(&state->wq);
- trace_alloc_extent_state(state, mask, _RET_IP_);
- return state;
-}
-
-void free_extent_state(struct extent_state *state)
-{
- if (!state)
- return;
- if (refcount_dec_and_test(&state->refs)) {
- WARN_ON(extent_state_in_tree(state));
- btrfs_leak_debug_del(&leak_lock, &state->leak_list);
- trace_free_extent_state(state, _RET_IP_);
- kmem_cache_free(extent_state_cache, state);
- }
-}
-
-static struct rb_node *tree_insert(struct rb_root *root,
- struct rb_node *search_start,
- u64 offset,
- struct rb_node *node,
- struct rb_node ***p_in,
- struct rb_node **parent_in)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct tree_entry *entry;
-
- if (p_in && parent_in) {
- p = *p_in;
- parent = *parent_in;
- goto do_insert;
- }
-
- p = search_start ? &search_start : &root->rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct tree_entry, rb_node);
-
- if (offset < entry->start)
- p = &(*p)->rb_left;
- else if (offset > entry->end)
- p = &(*p)->rb_right;
- else
- return parent;
- }
-
-do_insert:
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
-}
-
-/**
- * Search @tree for an entry that contains @offset. Such entry would have
- * entry->start <= offset && entry->end >= offset.
- *
- * @tree: the tree to search
- * @offset: offset that should fall within an entry in @tree
- * @next_ret: pointer to the first entry whose range ends after @offset
- * @prev_ret: pointer to the first entry whose range begins before @offset
- * @p_ret: pointer where new node should be anchored (used when inserting an
- * entry in the tree)
- * @parent_ret: points to entry which would have been the parent of the entry,
- * containing @offset
- *
- * This function returns a pointer to the entry that contains @offset byte
- * address. If no such entry exists, then NULL is returned and the other
- * pointer arguments to the function are filled, otherwise the found entry is
- * returned and other pointers are left untouched.
- */
-static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
- struct rb_node **next_ret,
- struct rb_node **prev_ret,
- struct rb_node ***p_ret,
- struct rb_node **parent_ret)
-{
- struct rb_root *root = &tree->state;
- struct rb_node **n = &root->rb_node;
- struct rb_node *prev = NULL;
- struct rb_node *orig_prev = NULL;
- struct tree_entry *entry;
- struct tree_entry *prev_entry = NULL;
-
- while (*n) {
- prev = *n;
- entry = rb_entry(prev, struct tree_entry, rb_node);
- prev_entry = entry;
-
- if (offset < entry->start)
- n = &(*n)->rb_left;
- else if (offset > entry->end)
- n = &(*n)->rb_right;
- else
- return *n;
- }
-
- if (p_ret)
- *p_ret = n;
- if (parent_ret)
- *parent_ret = prev;
-
- if (next_ret) {
- orig_prev = prev;
- while (prev && offset > prev_entry->end) {
- prev = rb_next(prev);
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- }
- *next_ret = prev;
- prev = orig_prev;
- }
-
- if (prev_ret) {
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- while (prev && offset < prev_entry->start) {
- prev = rb_prev(prev);
- prev_entry = rb_entry(prev, struct tree_entry, rb_node);
- }
- *prev_ret = prev;
- }
- return NULL;
-}
-
-static inline struct rb_node *
-tree_search_for_insert(struct extent_io_tree *tree,
- u64 offset,
- struct rb_node ***p_ret,
- struct rb_node **parent_ret)
-{
- struct rb_node *next= NULL;
- struct rb_node *ret;
-
- ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
- if (!ret)
- return next;
- return ret;
-}
-
-static inline struct rb_node *tree_search(struct extent_io_tree *tree,
- u64 offset)
-{
- return tree_search_for_insert(tree, offset, NULL, NULL);
-}
-
-/*
- * utility function to look for merge candidates inside a given range.
- * Any extents with matching state are merged together into a single
- * extent in the tree. Extents with EXTENT_IO in their state field
- * are not merged because the end_io handlers need to be able to do
- * operations on them without sleeping (or doing allocations/splits).
- *
- * This should be called with the tree lock held.
- */
-static void merge_state(struct extent_io_tree *tree,
- struct extent_state *state)
-{
- struct extent_state *other;
- struct rb_node *other_node;
-
- if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
- return;
-
- other_node = rb_prev(&state->rb_node);
- if (other_node) {
- other = rb_entry(other_node, struct extent_state, rb_node);
- if (other->end == state->start - 1 &&
- other->state == state->state) {
- if (tree->private_data &&
- is_data_inode(tree->private_data))
- btrfs_merge_delalloc_extent(tree->private_data,
- state, other);
- state->start = other->start;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
- }
- other_node = rb_next(&state->rb_node);
- if (other_node) {
- other = rb_entry(other_node, struct extent_state, rb_node);
- if (other->start == state->end + 1 &&
- other->state == state->state) {
- if (tree->private_data &&
- is_data_inode(tree->private_data))
- btrfs_merge_delalloc_extent(tree->private_data,
- state, other);
- state->end = other->end;
- rb_erase(&other->rb_node, &tree->state);
- RB_CLEAR_NODE(&other->rb_node);
- free_extent_state(other);
- }
- }
-}
-
-static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, u32 *bits,
- struct extent_changeset *changeset);
-
-/*
- * insert an extent_state struct into the tree. 'bits' are set on the
- * struct before it is inserted.
- *
- * This may return -EEXIST if the extent is already there, in which case the
- * state struct is freed.
- *
- * The tree lock is not taken internally. This is a utility function and
- * probably isn't what you want to call (see set/clear_extent_bit).
- */
-static int insert_state(struct extent_io_tree *tree,
- struct extent_state *state, u64 start, u64 end,
- struct rb_node ***p,
- struct rb_node **parent,
- u32 *bits, struct extent_changeset *changeset)
-{
- struct rb_node *node;
-
- if (end < start) {
- btrfs_err(tree->fs_info,
- "insert state: end < start %llu %llu", end, start);
- WARN_ON(1);
- }
- state->start = start;
- state->end = end;
-
- set_state_bits(tree, state, bits, changeset);
-
- node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
- if (node) {
- struct extent_state *found;
- found = rb_entry(node, struct extent_state, rb_node);
- btrfs_err(tree->fs_info,
- "found node %llu %llu on insert of %llu %llu",
- found->start, found->end, start, end);
- return -EEXIST;
- }
- merge_state(tree, state);
- return 0;
-}
-
-/*
- * split a given extent state struct in two, inserting the preallocated
- * struct 'prealloc' as the newly created second half. 'split' indicates an
- * offset inside 'orig' where it should be split.
- *
- * Before calling,
- * the tree has 'orig' at [orig->start, orig->end]. After calling, there
- * are two extent state structs in the tree:
- * prealloc: [orig->start, split - 1]
- * orig: [ split, orig->end ]
- *
- * The tree locks are not taken by this function. They need to be held
- * by the caller.
- */
-static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
- struct extent_state *prealloc, u64 split)
-{
- struct rb_node *node;
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_split_delalloc_extent(tree->private_data, orig, split);
-
- prealloc->start = orig->start;
- prealloc->end = split - 1;
- prealloc->state = orig->state;
- orig->start = split;
-
- node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
- &prealloc->rb_node, NULL, NULL);
- if (node) {
- free_extent_state(prealloc);
- return -EEXIST;
- }
- return 0;
-}
-
-static struct extent_state *next_state(struct extent_state *state)
-{
- struct rb_node *next = rb_next(&state->rb_node);
- if (next)
- return rb_entry(next, struct extent_state, rb_node);
- else
- return NULL;
-}
-
-/*
- * utility function to clear some bits in an extent state struct.
- * it will optionally wake up anyone waiting on this state (wake == 1).
- *
- * If no bits are set on the state struct after clearing things, the
- * struct is freed and removed from the tree
- */
-static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
- struct extent_state *state,
- u32 *bits, int wake,
- struct extent_changeset *changeset)
-{
- struct extent_state *next;
- u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
- int ret;
-
- if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
- u64 range = state->end - state->start + 1;
- WARN_ON(range > tree->dirty_bytes);
- tree->dirty_bytes -= range;
- }
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_clear_delalloc_extent(tree->private_data, state, bits);
-
- ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
- BUG_ON(ret < 0);
- state->state &= ~bits_to_clear;
- if (wake)
- wake_up(&state->wq);
- if (state->state == 0) {
- next = next_state(state);
- if (extent_state_in_tree(state)) {
- rb_erase(&state->rb_node, &tree->state);
- RB_CLEAR_NODE(&state->rb_node);
- free_extent_state(state);
- } else {
- WARN_ON(1);
- }
- } else {
- merge_state(tree, state);
- next = next_state(state);
- }
- return next;
-}
-
-static struct extent_state *
-alloc_extent_state_atomic(struct extent_state *prealloc)
-{
- if (!prealloc)
- prealloc = alloc_extent_state(GFP_ATOMIC);
-
- return prealloc;
-}
-
-static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
-{
- btrfs_panic(tree->fs_info, err,
- "locking error: extent tree was modified by another thread while locked");
-}
-
-/*
- * clear some bits on a range in the tree. This may require splitting
- * or inserting elements in the tree, so the gfp mask is used to
- * indicate which allocations or sleeping are allowed.
- *
- * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
- * the given range from the tree regardless of state (ie for truncate).
- *
- * the range [start, end] is inclusive.
- *
- * This takes the tree lock, and returns 0 on success and < 0 on error.
- */
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
-{
- struct extent_state *state;
- struct extent_state *cached;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- u64 last_end;
- int err;
- int clear = 0;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
-
- if (bits & EXTENT_DELALLOC)
- bits |= EXTENT_NORESERVE;
-
- if (delete)
- bits |= ~EXTENT_CTLBITS;
-
- if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
- clear = 1;
-again:
- if (!prealloc && gfpflags_allow_blocking(mask)) {
- /*
- * Don't care for allocation failure here because we might end
- * up not needing the pre-allocated extent state at all, which
- * is the case if we only have in the tree extent states that
- * cover our input range and don't cover too any other range.
- * If we end up needing a new extent state we allocate it later.
- */
- prealloc = alloc_extent_state(mask);
- }
-
- spin_lock(&tree->lock);
- if (cached_state) {
- cached = *cached_state;
-
- if (clear) {
- *cached_state = NULL;
- cached_state = NULL;
- }
-
- if (cached && extent_state_in_tree(cached) &&
- cached->start <= start && cached->end > start) {
- if (clear)
- refcount_dec(&cached->refs);
- state = cached;
- goto hit_next;
- }
- if (clear)
- free_extent_state(cached);
- }
- /*
- * this search will find the extents that end after
- * our range starts
- */
- node = tree_search(tree, start);
- if (!node)
- goto out;
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- if (state->start > end)
- goto out;
- WARN_ON(state->end < start);
- last_end = state->end;
-
- /* the state doesn't have the wanted bits, go ahead */
- if (!(state->state & bits)) {
- state = next_state(state);
- goto next;
- }
-
- /*
- * | ---- desired range ---- |
- * | state | or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip
- * bits on second half.
- *
- * If the extent we found extends past our range, we
- * just split and search again. It'll get split again
- * the next time though.
- *
- * If the extent we found is inside our range, we clear
- * the desired bit on it.
- */
-
- if (state->start < start) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
-
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- state = clear_state_bit(tree, state, &bits, wake,
- changeset);
- goto next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and clear the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- if (wake)
- wake_up(&state->wq);
-
- clear_state_bit(tree, prealloc, &bits, wake, changeset);
-
- prealloc = NULL;
- goto out;
- }
-
- state = clear_state_bit(tree, state, &bits, wake, changeset);
-next:
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start <= end && state && !need_resched())
- goto hit_next;
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- if (gfpflags_allow_blocking(mask))
- cond_resched();
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return 0;
-
-}
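The splitting above reduces to plain interval arithmetic; here is a standalone user-space sketch (not kernel code) of which pieces of an existing state keep their bits when [start, end] is cleared, mirroring the two split_state() cases:

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };	/* inclusive, like the tree */

/* Pieces of 'state' that survive clearing [clear.start, clear.end]. */
static int surviving_pieces(struct range state, struct range clear,
			    struct range out[2])
{
	int n = 0;

	if (clear.start > state.start) {
		/* left piece, the split_state(tree, state, prealloc, start) case */
		out[n++] = (struct range){ state.start, clear.start - 1 };
	}
	if (clear.end < state.end) {
		/* right piece, the split_state(..., end + 1) case */
		out[n++] = (struct range){ clear.end + 1, state.end };
	}
	return n;
}

int main(void)
{
	struct range out[2];
	int n = surviving_pieces((struct range){ 0, 99 },
				 (struct range){ 10, 19 }, out);

	while (n--)	/* prints [20, 99] then [0, 9] */
		printf("[%llu, %llu]\n",
		       (unsigned long long)out[n].start,
		       (unsigned long long)out[n].end);
	return 0;
}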
-
-static void wait_on_state(struct extent_io_tree *tree,
- struct extent_state *state)
- __releases(tree->lock)
- __acquires(tree->lock)
-{
- DEFINE_WAIT(wait);
- prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&tree->lock);
- schedule();
- spin_lock(&tree->lock);
- finish_wait(&state->wq, &wait);
-}
-
-/*
- * Waits for one or more bits to clear on a range in the state tree.
- * The range [start, end] is inclusive.
- * The tree lock is taken by this function.
- */
-static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits)
-{
- struct extent_state *state;
- struct rb_node *node;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
-
- spin_lock(&tree->lock);
-again:
- while (1) {
- /*
- * this search will find all the extents that end after
- * our range starts
- */
- node = tree_search(tree, start);
-process_node:
- if (!node)
- break;
-
- state = rb_entry(node, struct extent_state, rb_node);
-
- if (state->start > end)
- goto out;
-
- if (state->state & bits) {
- start = state->start;
- refcount_inc(&state->refs);
- wait_on_state(tree, state);
- free_extent_state(state);
- goto again;
- }
- start = state->end + 1;
-
- if (start > end)
- break;
-
- if (!cond_resched_lock(&tree->lock)) {
- node = rb_next(node);
- goto process_node;
- }
- }
-out:
- spin_unlock(&tree->lock);
-}
-
-static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state,
- u32 *bits, struct extent_changeset *changeset)
-{
- u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
- int ret;
-
- if (tree->private_data && is_data_inode(tree->private_data))
- btrfs_set_delalloc_extent(tree->private_data, state, bits);
-
- if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
- u64 range = state->end - state->start + 1;
- tree->dirty_bytes += range;
- }
- ret = add_extent_changeset(state, bits_to_set, changeset, 1);
- BUG_ON(ret < 0);
- state->state |= bits_to_set;
-}
-
-static void cache_state_if_flags(struct extent_state *state,
- struct extent_state **cached_ptr,
- unsigned flags)
-{
- if (cached_ptr && !(*cached_ptr)) {
- if (!flags || (state->state & flags)) {
- *cached_ptr = state;
- refcount_inc(&state->refs);
- }
- }
-}
-
-static void cache_state(struct extent_state *state,
- struct extent_state **cached_ptr)
-{
- return cache_state_if_flags(state, cached_ptr,
- EXTENT_LOCKED | EXTENT_BOUNDARY);
-}
-
-/*
- * Set some bits on a range in the tree. This may require allocations or
- * sleeping, so the gfp mask is used to indicate what is allowed.
- *
- * If any of the exclusive bits are set, this will fail with -EEXIST if some
- * part of the range already has the desired bits set. The start of the
- * existing range is returned in failed_start in this case.
- *
- * [start, end] is inclusive. This takes the tree lock.
- */
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- u32 exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask,
- struct extent_changeset *changeset)
-{
- struct extent_state *state;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- struct rb_node **p;
- struct rb_node *parent;
- int err = 0;
- u64 last_start;
- u64 last_end;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
-
- if (exclusive_bits)
- ASSERT(failed_start);
- else
- ASSERT(failed_start == NULL);
-again:
- if (!prealloc && gfpflags_allow_blocking(mask)) {
- /*
- * Don't care for allocation failure here because we might end
- * up not needing the pre-allocated extent state at all, which
- * is the case if we only have in the tree extent states that
- * cover our input range and don't cover any other range.
- * If we end up needing a new extent state we allocate it later.
- */
- prealloc = alloc_extent_state(mask);
- }
-
- spin_lock(&tree->lock);
- if (cached_state && *cached_state) {
- state = *cached_state;
- if (state->start <= start && state->end > start &&
- extent_state_in_tree(state)) {
- node = &state->rb_node;
- goto hit_next;
- }
- }
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search_for_insert(tree, start, &p, &parent);
- if (!node) {
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits, changeset);
- if (err)
- extent_io_tree_panic(tree, err);
-
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- last_start = state->start;
- last_end = state->end;
-
- /*
- * | ---- desired range ---- |
- * | state |
- *
- * Just lock what we found and keep going
- */
- if (state->start == start && state->end <= end) {
- if (state->state & exclusive_bits) {
- *failed_start = state->start;
- err = -EEXIST;
- goto out;
- }
-
- set_state_bits(tree, state, &bits, changeset);
- cache_state(state, cached_state);
- merge_state(tree, state);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- goto search_again;
- }
-
- /*
- * | ---- desired range ---- |
- * | state |
- * or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip bits on
- * second half.
- *
- * If the extent we found extends past our
- * range, we just split and search again. It'll get split
- * again the next time though.
- *
- * If the extent we found is inside our range, we set the
- * desired bit on it.
- */
- if (state->start < start) {
- if (state->state & exclusive_bits) {
- *failed_start = start;
- err = -EEXIST;
- goto out;
- }
-
- /*
- * If this extent already has all the bits we want set, then
- * skip it, not necessary to split it or do anything with it.
- */
- if ((state->state & bits) == bits) {
- start = state->end + 1;
- cache_state(state, cached_state);
- goto search_again;
- }
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
-
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- set_state_bits(tree, state, &bits, changeset);
- cache_state(state, cached_state);
- merge_state(tree, state);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state | or | state |
- *
- * There's a hole, we need to insert something in it and
- * ignore the extent we found.
- */
- if (state->start > start) {
- u64 this_end;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
-
- /*
- * Avoid freeing 'prealloc' if it can be merged with
- * the later extent.
- */
- err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits, changeset);
- if (err)
- extent_io_tree_panic(tree, err);
-
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- start = this_end + 1;
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and set the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- if (state->state & exclusive_bits) {
- *failed_start = start;
- err = -EEXIST;
- goto out;
- }
-
- prealloc = alloc_extent_state_atomic(prealloc);
- BUG_ON(!prealloc);
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- set_state_bits(tree, prealloc, &bits, changeset);
- cache_state(prealloc, cached_state);
- merge_state(tree, prealloc);
- prealloc = NULL;
- goto out;
- }
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- if (gfpflags_allow_blocking(mask))
- cond_resched();
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return err;
-
-}
-
-/**
- * convert_extent_bit - convert all bits in a given range from one bit to
- * another
- * @tree: the io tree to search
- * @start: the start offset in bytes
- * @end: the end offset in bytes (inclusive)
- * @bits: the bits to set in this range
- * @clear_bits: the bits to clear in this range
- * @cached_state: state that we're going to cache
- *
- * This will go through and set bits for the given range. If any states exist
- * already in this range they are set with the given bit and cleared of the
- * clear_bits. This is only meant to be used by things that are mergeable, ie
- * converting from say DELALLOC to DIRTY. This is not meant to be used with
- * boundary bits like LOCK.
- *
- * All allocations are done with GFP_NOFS.
- */
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, u32 clear_bits,
- struct extent_state **cached_state)
-{
- struct extent_state *state;
- struct extent_state *prealloc = NULL;
- struct rb_node *node;
- struct rb_node **p;
- struct rb_node *parent;
- int err = 0;
- u64 last_start;
- u64 last_end;
- bool first_iteration = true;
-
- btrfs_debug_check_extent_io_range(tree, start, end);
- trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
- clear_bits);
-
-again:
- if (!prealloc) {
- /*
- * Best effort, don't worry if extent state allocation fails
- * here for the first iteration. We might have a cached state
- * that matches exactly the target range, in which case no
- * extent state allocations are needed. We'll only know this
- * after locking the tree.
- */
- prealloc = alloc_extent_state(GFP_NOFS);
- if (!prealloc && !first_iteration)
- return -ENOMEM;
- }
-
- spin_lock(&tree->lock);
- if (cached_state && *cached_state) {
- state = *cached_state;
- if (state->start <= start && state->end > start &&
- extent_state_in_tree(state)) {
- node = &state->rb_node;
- goto hit_next;
- }
- }
-
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search_for_insert(tree, start, &p, &parent);
- if (!node) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
- err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits, NULL);
- if (err)
- extent_io_tree_panic(tree, err);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
- last_start = state->start;
- last_end = state->end;
-
- /*
- * | ---- desired range ---- |
- * | state |
- *
- * Just lock what we found and keep going
- */
- if (state->start == start && state->end <= end) {
- set_state_bits(tree, state, &bits, NULL);
- cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- goto search_again;
- }
-
- /*
- * | ---- desired range ---- |
- * | state |
- * or
- * | ------------- state -------------- |
- *
- * We need to split the extent we found, and may flip bits on
- * second half.
- *
- * If the extent we found extends past our
- * range, we just split and search again. It'll get split
- * again the next time though.
- *
- * If the extent we found is inside our range, we set the
- * desired bit on it.
- */
- if (state->start < start) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, err);
- prealloc = NULL;
- if (err)
- goto out;
- if (state->end <= end) {
- set_state_bits(tree, state, &bits, NULL);
- cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0,
- NULL);
- if (last_end == (u64)-1)
- goto out;
- start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
- goto hit_next;
- }
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state | or | state |
- *
- * There's a hole, we need to insert something in it and
- * ignore the extent we found.
- */
- if (state->start > start) {
- u64 this_end;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
-
- /*
- * Avoid freeing 'prealloc' if it can be merged with
- * the later extent.
- */
- err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits, NULL);
- if (err)
- extent_io_tree_panic(tree, err);
- cache_state(prealloc, cached_state);
- prealloc = NULL;
- start = this_end + 1;
- goto search_again;
- }
- /*
- * | ---- desired range ---- |
- * | state |
- * We need to split the extent, and set the bit
- * on the first half
- */
- if (state->start <= end && state->end > end) {
- prealloc = alloc_extent_state_atomic(prealloc);
- if (!prealloc) {
- err = -ENOMEM;
- goto out;
- }
-
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, err);
-
- set_state_bits(tree, prealloc, &bits, NULL);
- cache_state(prealloc, cached_state);
- clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
- prealloc = NULL;
- goto out;
- }
-
-search_again:
- if (start > end)
- goto out;
- spin_unlock(&tree->lock);
- cond_resched();
- first_iteration = false;
- goto again;
-
-out:
- spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
-
- return err;
-}
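Per extent_state, the conversion boils down to one combined set-and-clear on the state word; a minimal sketch (merging, wakeups and accounting omitted):

#include <stdint.h>

/* What set_state_bits() followed by clear_state_bit() do to the mask. */
static uint32_t convert_bits(uint32_t state, uint32_t bits, uint32_t clear_bits)
{
	return (state | bits) & ~clear_bits;
}

Converting a delalloc range to dirty, for instance, is convert_bits(state, EXTENT_DIRTY, EXTENT_DELALLOC), with the tree-level work being the splitting and re-merging around it.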
-
-/* wrappers around set/clear extent bit */
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset)
-{
- /*
- * We don't support EXTENT_LOCKED yet, as the current changeset will
- * record any bits changed, so for the EXTENT_LOCKED case it will
- * either fail with -EEXIST or the changeset will record the whole
- * range.
- */
- BUG_ON(bits & EXTENT_LOCKED);
-
- return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
- changeset);
-}
-
-int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits)
-{
- return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
- GFP_NOWAIT, NULL);
-}
-
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int wake, int delete,
- struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, bits, wake, delete,
- cached, GFP_NOFS, NULL);
-}
-
-int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset)
-{
- /*
- * We don't support the EXTENT_LOCKED case, for the same reason as
- * set_record_extent_bits().
- */
- BUG_ON(bits & EXTENT_LOCKED);
-
- return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
- changeset);
-}
-
-/*
- * Either insert or lock the state struct between start and end, waiting
- * until the whole range is locked.
- */
-int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state)
-{
- int err;
- u64 failed_start;
-
- while (1) {
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
- EXTENT_LOCKED, &failed_start,
- cached_state, GFP_NOFS, NULL);
- if (err == -EEXIST) {
- wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
- start = failed_start;
- } else
- break;
- WARN_ON(start > end);
- }
- return err;
-}
-
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- int err;
- u64 failed_start;
-
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, GFP_NOFS, NULL);
- if (err == -EEXIST) {
- if (failed_start > start)
- clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, 1, 0, NULL);
- return 0;
- }
- return 1;
}
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
@@ -1521,295 +213,6 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
}
}
-/* find the first state struct with 'bits' set after 'start', and
- * return it. tree->lock must be held. NULL will be returned if
- * nothing was found after 'start'.
- */
-static struct extent_state *
-find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
-{
- struct rb_node *node;
- struct extent_state *state;
-
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node)
- goto out;
-
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->end >= start && (state->state & bits))
- return state;
-
- node = rb_next(node);
- if (!node)
- break;
- }
-out:
- return NULL;
-}
-
-/*
- * Find the first offset in the io tree with one or more @bits set.
- *
- * Note: If there are multiple bits set in @bits, any of them will match.
- *
- * Return 0 if we find something, and update @start_ret and @end_ret.
- * Return 1 if we found nothing.
- */
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits,
- struct extent_state **cached_state)
-{
- struct extent_state *state;
- int ret = 1;
-
- spin_lock(&tree->lock);
- if (cached_state && *cached_state) {
- state = *cached_state;
- if (state->end == start - 1 && extent_state_in_tree(state)) {
- while ((state = next_state(state)) != NULL) {
- if (state->state & bits)
- goto got_it;
- }
- free_extent_state(*cached_state);
- *cached_state = NULL;
- goto out;
- }
- free_extent_state(*cached_state);
- *cached_state = NULL;
- }
-
- state = find_first_extent_bit_state(tree, start, bits);
-got_it:
- if (state) {
- cache_state_if_flags(state, cached_state, 0);
- *start_ret = state->start;
- *end_ret = state->end;
- ret = 0;
- }
-out:
- spin_unlock(&tree->lock);
- return ret;
-}
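The cached_state parameter is meant to be carried across calls; an illustrative caller loop (tree, bits and the processing step are placeholders, not code from this patch):

	struct extent_state *cached = NULL;
	u64 found_start, found_end;
	u64 cur = 0;

	while (!find_first_extent_bit(tree, cur, &found_start, &found_end,
				      bits, &cached)) {
		/* process the range [found_start, found_end] here */
		cur = found_end + 1;
	}
	free_extent_state(cached);	/* handles NULL */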
-
-/**
- * Find a contiguous area of bits
- *
- * @tree: io tree to check
- * @start: offset to start the search from
- * @start_ret: the first offset we found with the bits set
- * @end_ret: the final contiguous range of the bits that were set
- * @bits: bits to look for
- *
- * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
- * to set bits appropriately, and then merge them again. During this time it
- * will drop the tree->lock, so use this helper if you want to find the actual
- * contiguous area for given bits. We will search to the first bit we find, and
- * then walk down the tree until we find a non-contiguous area. The area
- * returned will be the full contiguous area with the bits set.
- */
-int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits)
-{
- struct extent_state *state;
- int ret = 1;
-
- spin_lock(&tree->lock);
- state = find_first_extent_bit_state(tree, start, bits);
- if (state) {
- *start_ret = state->start;
- *end_ret = state->end;
- while ((state = next_state(state)) != NULL) {
- if (state->start > (*end_ret + 1))
- break;
- *end_ret = state->end;
- }
- ret = 0;
- }
- spin_unlock(&tree->lock);
- return ret;
-}
-
-/**
- * Find the first range that has @bits not set. This range could start before
- * @start.
- *
- * @tree: the tree to search
- * @start: offset at/after which the found extent should start
- * @start_ret: records the beginning of the range
- * @end_ret: records the end of the range (inclusive)
- * @bits: the set of bits which must be unset
- *
- * Since an unallocated range is also considered one which doesn't have the
- * bits set, it's possible that @end_ret contains -1; this happens when the
- * range spans (last_range_end, end of device]. In this case it's up to the
- * caller to trim @end_ret to the appropriate size.
- */
-void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits)
-{
- struct extent_state *state;
- struct rb_node *node, *prev = NULL, *next;
-
- spin_lock(&tree->lock);
-
- /* Find first extent with bits cleared */
- while (1) {
- node = __etree_search(tree, start, &next, &prev, NULL, NULL);
- if (!node && !next && !prev) {
- /*
- * Tree is completely empty, send full range and let
- * caller deal with it
- */
- *start_ret = 0;
- *end_ret = -1;
- goto out;
- } else if (!node && !next) {
- /*
- * We are past the last allocated chunk, set start at
- * the end of the last extent.
- */
- state = rb_entry(prev, struct extent_state, rb_node);
- *start_ret = state->end + 1;
- *end_ret = -1;
- goto out;
- } else if (!node) {
- node = next;
- }
- /*
- * At this point 'node' either contains 'start' or start is
- * before 'node'
- */
- state = rb_entry(node, struct extent_state, rb_node);
-
- if (in_range(start, state->start, state->end - state->start + 1)) {
- if (state->state & bits) {
- /*
- * |--range with bits set--|
- * |
- * start
- */
- start = state->end + 1;
- } else {
- /*
- * 'start' falls within a range that doesn't
- * have the bits set, so take its start as
- * the beginning of the desired range
- *
- * |--range with bits cleared----|
- * |
- * start
- */
- *start_ret = state->start;
- break;
- }
- } else {
- /*
- * |---prev range---|---hole/unset---|---node range---|
- * |
- * start
- *
- * or
- *
- * |---hole/unset--||--first node--|
- * 0 |
- * start
- */
- if (prev) {
- state = rb_entry(prev, struct extent_state,
- rb_node);
- *start_ret = state->end + 1;
- } else {
- *start_ret = 0;
- }
- break;
- }
- }
-
- /*
- * Find the longest stretch from start until an entry which has the
- * bits set
- */
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->end >= start && !(state->state & bits)) {
- *end_ret = state->end;
- } else {
- *end_ret = state->start - 1;
- break;
- }
-
- node = rb_next(node);
- if (!node)
- break;
- }
-out:
- spin_unlock(&tree->lock);
-}
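An illustrative caller, following the note about @end_ret above; the per-device allocation tree, the CHUNK_ALLOCATED bit and device_total_bytes are assumptions standing in for whatever the caller actually tracks:

	u64 free_start, free_end;

	find_first_clear_extent_bit(&device->alloc_state, search_start,
				    &free_start, &free_end, CHUNK_ALLOCATED);
	/* The reported range may run past the device, trim it ourselves. */
	if (free_end == (u64)-1)
		free_end = device_total_bytes - 1;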
-
-/*
- * Find a contiguous range of bytes in the file marked as delalloc, not
- * more than 'max_bytes'. 'start' and 'end' are used to return the range.
- *
- * true is returned if we find something, false if nothing was in the tree.
- */
-bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
- u64 *end, u64 max_bytes,
- struct extent_state **cached_state)
-{
- struct rb_node *node;
- struct extent_state *state;
- u64 cur_start = *start;
- bool found = false;
- u64 total_bytes = 0;
-
- spin_lock(&tree->lock);
-
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, cur_start);
- if (!node) {
- *end = (u64)-1;
- goto out;
- }
-
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (found && (state->start != cur_start ||
- (state->state & EXTENT_BOUNDARY))) {
- goto out;
- }
- if (!(state->state & EXTENT_DELALLOC)) {
- if (!found)
- *end = state->end;
- goto out;
- }
- if (!found) {
- *start = state->start;
- *cached_state = state;
- refcount_inc(&state->refs);
- }
- found = true;
- *end = state->end;
- cur_start = state->end + 1;
- node = rb_next(node);
- total_bytes += state->end - state->start + 1;
- if (total_bytes >= max_bytes)
- break;
- if (!node)
- break;
- }
-out:
- spin_unlock(&tree->lock);
- return found;
-}
-
/*
* Process one page for __process_pages_contig().
*
@@ -1992,10 +395,12 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
u64 *end)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
const u64 orig_start = *start;
const u64 orig_end = *end;
- u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
+ /* The sanity tests may not set a valid fs_info. */
+ u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
bool found;
@@ -2059,14 +464,14 @@ again:
}
/* step three, lock the state bits for the whole range */
- lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
+ lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
EXTENT_DELALLOC, 1, cached_state);
if (!ret) {
- unlock_extent_cached(tree, delalloc_start, delalloc_end,
- &cached_state);
+ unlock_extent(tree, delalloc_start, delalloc_end,
+ &cached_state);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
@@ -2083,210 +488,46 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 clear_bits, unsigned long page_ops)
{
- clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);
__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
start, end, page_ops, NULL);
}
-/*
- * Count the number of bytes in the tree that have the given bit(s)
- * set. This can be fairly slow, except for EXTENT_DIRTY which is
- * cached. The total number found is returned.
- */
-u64 count_range_bits(struct extent_io_tree *tree,
- u64 *start, u64 search_end, u64 max_bytes,
- u32 bits, int contig)
+static int insert_failrec(struct btrfs_inode *inode,
+ struct io_failure_record *failrec)
{
- struct rb_node *node;
- struct extent_state *state;
- u64 cur_start = *start;
- u64 total_bytes = 0;
- u64 last = 0;
- int found = 0;
+ struct rb_node *exist;
- if (WARN_ON(search_end <= cur_start))
- return 0;
+ spin_lock(&inode->io_failure_lock);
+ exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr,
+ &failrec->rb_node);
+ spin_unlock(&inode->io_failure_lock);
- spin_lock(&tree->lock);
- if (cur_start == 0 && bits == EXTENT_DIRTY) {
- total_bytes = tree->dirty_bytes;
- goto out;
- }
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, cur_start);
- if (!node)
- goto out;
-
- while (1) {
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->start > search_end)
- break;
- if (contig && found && state->start > last + 1)
- break;
- if (state->end >= cur_start && (state->state & bits) == bits) {
- total_bytes += min(search_end, state->end) + 1 -
- max(cur_start, state->start);
- if (total_bytes >= max_bytes)
- break;
- if (!found) {
- *start = max(cur_start, state->start);
- found = 1;
- }
- last = state->end;
- } else if (contig && found) {
- break;
- }
- node = rb_next(node);
- if (!node)
- break;
- }
-out:
- spin_unlock(&tree->lock);
- return total_bytes;
+ return (exist == NULL) ? 0 : -EEXIST;
}
-/*
- * Set the failrec (private) field for a given byte offset in the tree. If
- * there isn't an extent_state there already, -ENOENT is returned.
- */
-int set_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record *failrec)
+static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start)
{
struct rb_node *node;
- struct extent_state *state;
- int ret = 0;
+ struct io_failure_record *failrec = ERR_PTR(-ENOENT);
- spin_lock(&tree->lock);
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node) {
- ret = -ENOENT;
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->start != start) {
- ret = -ENOENT;
- goto out;
- }
- state->failrec = failrec;
-out:
- spin_unlock(&tree->lock);
- return ret;
-}
-
-struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
-{
- struct rb_node *node;
- struct extent_state *state;
- struct io_failure_record *failrec;
-
- spin_lock(&tree->lock);
- /*
- * this search will find all the extents that end after
- * our range starts.
- */
- node = tree_search(tree, start);
- if (!node) {
- failrec = ERR_PTR(-ENOENT);
- goto out;
- }
- state = rb_entry(node, struct extent_state, rb_node);
- if (state->start != start) {
- failrec = ERR_PTR(-ENOENT);
- goto out;
- }
-
- failrec = state->failrec;
-out:
- spin_unlock(&tree->lock);
+ spin_lock(&inode->io_failure_lock);
+ node = rb_simple_search(&inode->io_failure_tree, start);
+ if (node)
+ failrec = rb_entry(node, struct io_failure_record, rb_node);
+ spin_unlock(&inode->io_failure_lock);
return failrec;
}
-/*
- * searches a range in the state tree for a given mask.
- * If 'filled' == 1, this returns 1 only if every extent in the range
- * has the bits set. Otherwise, 1 is returned if any bit in the
- * range is found set.
- */
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, int filled, struct extent_state *cached)
+static void free_io_failure(struct btrfs_inode *inode,
+ struct io_failure_record *rec)
{
- struct extent_state *state = NULL;
- struct rb_node *node;
- int bitset = 0;
-
- spin_lock(&tree->lock);
- if (cached && extent_state_in_tree(cached) && cached->start <= start &&
- cached->end > start)
- node = &cached->rb_node;
- else
- node = tree_search(tree, start);
- while (node && start <= end) {
- state = rb_entry(node, struct extent_state, rb_node);
-
- if (filled && state->start > start) {
- bitset = 0;
- break;
- }
-
- if (state->start > end)
- break;
-
- if (state->state & bits) {
- bitset = 1;
- if (!filled)
- break;
- } else if (filled) {
- bitset = 0;
- break;
- }
-
- if (state->end == (u64)-1)
- break;
-
- start = state->end + 1;
- if (start > end)
- break;
- node = rb_next(node);
- if (!node) {
- if (filled)
- bitset = 0;
- break;
- }
- }
- spin_unlock(&tree->lock);
- return bitset;
-}
-
-int free_io_failure(struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree,
- struct io_failure_record *rec)
-{
- int ret;
- int err = 0;
-
- set_state_failrec(failure_tree, rec->start, NULL);
- ret = clear_extent_bits(failure_tree, rec->start,
- rec->start + rec->len - 1,
- EXTENT_LOCKED | EXTENT_DIRTY);
- if (ret)
- err = ret;
-
- ret = clear_extent_bits(io_tree, rec->start,
- rec->start + rec->len - 1,
- EXTENT_DAMAGED);
- if (ret && !err)
- err = ret;
+ spin_lock(&inode->io_failure_lock);
+ rb_erase(&rec->rb_node, &inode->io_failure_tree);
+ spin_unlock(&inode->io_failure_lock);
kfree(rec);
- return err;
}
/*
@@ -2303,12 +544,13 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num)
{
- struct bio *bio;
struct btrfs_device *dev;
+ struct bio_vec bvec;
+ struct bio bio;
u64 map_length = 0;
u64 sector;
struct btrfs_io_context *bioc = NULL;
- int ret;
+ int ret = 0;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
BUG_ON(!mirror_num);
@@ -2316,8 +558,6 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
if (btrfs_repair_one_zone(fs_info, logical))
return 0;
- bio = btrfs_bio_alloc(1);
- bio->bi_iter.bi_size = 0;
map_length = length;
/*
@@ -2335,52 +575,50 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
*/
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
&map_length, &bioc, 0);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
- }
+ if (ret)
+ goto out_counter_dec;
ASSERT(bioc->mirror_num == 1);
} else {
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
&map_length, &bioc, mirror_num);
- if (ret) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
- }
+ if (ret)
+ goto out_counter_dec;
BUG_ON(mirror_num != bioc->mirror_num);
}
sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
- bio->bi_iter.bi_sector = sector;
dev = bioc->stripes[bioc->mirror_num - 1].dev;
btrfs_put_bioc(bioc);
+
if (!dev || !dev->bdev ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return -EIO;
+ ret = -EIO;
+ goto out_counter_dec;
}
- bio_set_dev(bio, dev->bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
- bio_add_page(bio, page, length, pg_offset);
- if (btrfsic_submit_bio_wait(bio)) {
+ bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
+ bio.bi_iter.bi_sector = sector;
+ __bio_add_page(&bio, page, length, pg_offset);
+
+ btrfsic_check_bio(&bio);
+ ret = submit_bio_wait(&bio);
+ if (ret) {
/* try to remap that extent elsewhere? */
- btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
- return -EIO;
+ goto out_bio_uninit;
}
btrfs_info_rl_in_rcu(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
ino, start,
rcu_str_deref(dev->name), sector);
+ ret = 0;
+
+out_bio_uninit:
+ bio_uninit(&bio);
+out_counter_dec:
btrfs_bio_counter_dec(fs_info);
- bio_put(bio);
- return 0;
+ return ret;
}
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
@@ -2406,28 +644,36 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
return ret;
}
+static int next_mirror(const struct io_failure_record *failrec, int cur_mirror)
+{
+ if (cur_mirror == failrec->num_copies)
+ return cur_mirror + 1 - failrec->num_copies;
+ return cur_mirror + 1;
+}
+
+static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
+{
+ if (cur_mirror == 1)
+ return failrec->num_copies;
+ return cur_mirror - 1;
+}
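These two helpers walk a 1-based ring over the available copies; a standalone sketch with a quick self-check (names reused for illustration only, num_copies passed explicitly instead of via the failrec):

#include <assert.h>

static int next_mirror(int num_copies, int cur)
{
	return (cur == num_copies) ? 1 : cur + 1;
}

static int prev_mirror(int num_copies, int cur)
{
	return (cur == 1) ? num_copies : cur - 1;
}

int main(void)
{
	/* With 3 copies, starting from failed mirror 2: 2 -> 3 -> 1 -> 2. */
	assert(next_mirror(3, 2) == 3);
	assert(next_mirror(3, 3) == 1);
	assert(next_mirror(3, 1) == 2);
	assert(prev_mirror(3, 1) == 3);
	return 0;
}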
+
/*
* Each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record.
*/
-int clean_io_failure(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree, u64 start,
- struct page *page, u64 ino, unsigned int pg_offset)
+int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
+ struct page *page, unsigned int pg_offset)
{
- u64 private;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ u64 ino = btrfs_ino(inode);
+ u64 locked_start, locked_end;
struct io_failure_record *failrec;
- struct extent_state *state;
- int num_copies;
+ int mirror;
int ret;
- private = 0;
- ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
- EXTENT_DIRTY, 0);
- if (!ret)
- return 0;
-
- failrec = get_state_failrec(failure_tree, start);
+ failrec = get_failrec(inode, start);
if (IS_ERR(failrec))
return 0;
@@ -2436,26 +682,21 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
if (sb_rdonly(fs_info->sb))
goto out;
- spin_lock(&io_tree->lock);
- state = find_first_extent_bit_state(io_tree,
- failrec->start,
- EXTENT_LOCKED);
- spin_unlock(&io_tree->lock);
-
- if (state && state->start <= failrec->start &&
- state->end >= failrec->start + failrec->len - 1) {
- num_copies = btrfs_num_copies(fs_info, failrec->logical,
- failrec->len);
- if (num_copies > 1) {
- repair_io_failure(fs_info, ino, start, failrec->len,
- failrec->logical, page, pg_offset,
- failrec->failed_mirror);
- }
- }
+ ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start,
+ &locked_end, EXTENT_LOCKED, NULL);
+ if (ret || locked_start > failrec->bytenr ||
+ locked_end < failrec->bytenr + failrec->len - 1)
+ goto out;
-out:
- free_io_failure(failure_tree, io_tree, failrec);
+ mirror = failrec->this_mirror;
+ do {
+ mirror = prev_mirror(failrec, mirror);
+ repair_io_failure(fs_info, ino, start, failrec->len,
+ failrec->logical, page, pg_offset, mirror);
+ } while (mirror != failrec->failed_mirror);
+out:
+ free_io_failure(inode, failrec);
return 0;
}
@@ -2467,56 +708,50 @@ out:
*/
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
{
- struct extent_io_tree *failure_tree = &inode->io_failure_tree;
struct io_failure_record *failrec;
- struct extent_state *state, *next;
+ struct rb_node *node, *next;
- if (RB_EMPTY_ROOT(&failure_tree->state))
+ if (RB_EMPTY_ROOT(&inode->io_failure_tree))
return;
- spin_lock(&failure_tree->lock);
- state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
- while (state) {
- if (state->start > end)
+ spin_lock(&inode->io_failure_lock);
+ node = rb_simple_search_first(&inode->io_failure_tree, start);
+ while (node) {
+ failrec = rb_entry(node, struct io_failure_record, rb_node);
+ if (failrec->bytenr > end)
break;
- ASSERT(state->end <= end);
-
- next = next_state(state);
-
- failrec = state->failrec;
- free_extent_state(state);
+ next = rb_next(node);
+ rb_erase(&failrec->rb_node, &inode->io_failure_tree);
kfree(failrec);
- state = next;
+ node = next;
}
- spin_unlock(&failure_tree->lock);
+ spin_unlock(&inode->io_failure_lock);
}
static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
- u64 start)
+ struct btrfs_bio *bbio,
+ unsigned int bio_offset)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 start = bbio->file_offset + bio_offset;
struct io_failure_record *failrec;
- struct extent_map *em;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
const u32 sectorsize = fs_info->sectorsize;
int ret;
- u64 logical;
- failrec = get_state_failrec(failure_tree, start);
+ failrec = get_failrec(BTRFS_I(inode), start);
if (!IS_ERR(failrec)) {
btrfs_debug(fs_info,
"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
- failrec->logical, failrec->start, failrec->len);
+ failrec->logical, failrec->bytenr, failrec->len);
/*
* When data can be on disk in more than two copies, add to failrec here
* (e.g. with a list for failed_mirror) to make
* clean_io_failure() clean all those errors at once.
*/
-
+ ASSERT(failrec->this_mirror == bbio->mirror_num);
+ ASSERT(failrec->len == fs_info->sectorsize);
return failrec;
}
@@ -2524,53 +759,34 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
if (!failrec)
return ERR_PTR(-ENOMEM);
- failrec->start = start;
+ RB_CLEAR_NODE(&failrec->rb_node);
+ failrec->bytenr = start;
failrec->len = sectorsize;
- failrec->this_mirror = 0;
- failrec->bio_flags = 0;
+ failrec->failed_mirror = bbio->mirror_num;
+ failrec->this_mirror = bbio->mirror_num;
+ failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, failrec->len);
- if (!em) {
- read_unlock(&em_tree->lock);
- kfree(failrec);
- return ERR_PTR(-EIO);
- }
+ btrfs_debug(fs_info,
+ "new io failure record logical %llu start %llu",
+ failrec->logical, start);
- if (em->start > start || em->start + em->len <= start) {
- free_extent_map(em);
- em = NULL;
- }
- read_unlock(&em_tree->lock);
- if (!em) {
+ failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize);
+ if (failrec->num_copies == 1) {
+ /*
+ * We only have a single copy of the data, so don't bother with
+ * all the retry and error correction code that follows. No
+ * matter what the error is, it is very likely to persist.
+ */
+ btrfs_debug(fs_info,
+ "cannot repair logical %llu num_copies %d",
+ failrec->logical, failrec->num_copies);
kfree(failrec);
return ERR_PTR(-EIO);
}
- logical = start - em->start;
- logical = em->block_start + logical;
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- logical = em->block_start;
- failrec->bio_flags = EXTENT_BIO_COMPRESSED;
- extent_set_compress_type(&failrec->bio_flags, em->compress_type);
- }
-
- btrfs_debug(fs_info,
- "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
- logical, start, failrec->len);
-
- failrec->logical = logical;
- free_extent_map(em);
-
/* Set the bits in the private failure tree */
- ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
- EXTENT_LOCKED | EXTENT_DIRTY);
- if (ret >= 0) {
- ret = set_state_failrec(failure_tree, start, failrec);
- /* Set the bits in the inode's tree */
- ret = set_extent_bits(tree, start, start + sectorsize - 1,
- EXTENT_DAMAGED);
- } else if (ret < 0) {
+ ret = insert_failrec(BTRFS_I(inode), failrec);
+ if (ret) {
kfree(failrec);
return ERR_PTR(ret);
}
@@ -2578,65 +794,14 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
return failrec;
}
-static bool btrfs_check_repairable(struct inode *inode,
- struct io_failure_record *failrec,
- int failed_mirror)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int num_copies;
-
- num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
- if (num_copies == 1) {
- /*
- * we only have a single copy of the data, so don't bother with
- * all the retry and error correction code that follows. no
- * matter what the error is, it is very likely to persist.
- */
- btrfs_debug(fs_info,
- "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
- num_copies, failrec->this_mirror, failed_mirror);
- return false;
- }
-
- /* The failure record should only contain one sector */
- ASSERT(failrec->len == fs_info->sectorsize);
-
- /*
- * There are two premises:
- * a) deliver good data to the caller
- * b) correct the bad sectors on disk
- *
- * Since we're only doing repair for one sector, we only need to get
- * a good copy of the failed sector and if we succeed, we have setup
- * everything for repair_io_failure to do the rest for us.
- */
- ASSERT(failed_mirror);
- failrec->failed_mirror = failed_mirror;
- failrec->this_mirror++;
- if (failrec->this_mirror == failed_mirror)
- failrec->this_mirror++;
-
- if (failrec->this_mirror > num_copies) {
- btrfs_debug(fs_info,
- "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
- num_copies, failrec->this_mirror, failed_mirror);
- return false;
- }
-
- return true;
-}
-
-int btrfs_repair_one_sector(struct inode *inode,
- struct bio *failed_bio, u32 bio_offset,
- struct page *page, unsigned int pgoff,
- u64 start, int failed_mirror,
+int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
+ u32 bio_offset, struct page *page, unsigned int pgoff,
submit_bio_hook_t *submit_bio_hook)
{
+ u64 start = failed_bbio->file_offset + bio_offset;
struct io_failure_record *failrec;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio);
+ struct bio *failed_bio = &failed_bbio->bio;
const int icsum = bio_offset >> fs_info->sectorsize_bits;
struct bio *repair_bio;
struct btrfs_bio *repair_bbio;
@@ -2646,22 +811,33 @@ int btrfs_repair_one_sector(struct inode *inode,
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
- failrec = btrfs_get_io_failure_record(inode, start);
+ failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset);
if (IS_ERR(failrec))
return PTR_ERR(failrec);
-
- if (!btrfs_check_repairable(inode, failrec, failed_mirror)) {
- free_io_failure(failure_tree, tree, failrec);
+ /*
+ * There are two premises:
+ * a) deliver good data to the caller
+ * b) correct the bad sectors on disk
+ *
+ * Since we're only doing repair for one sector, we only need to get
+ * a good copy of the failed sector and if we succeed, we have set up
+ * everything for repair_io_failure to do the rest for us.
+ */
+ failrec->this_mirror = next_mirror(failrec, failrec->this_mirror);
+ if (failrec->this_mirror == failrec->failed_mirror) {
+ btrfs_debug(fs_info,
+ "failed to repair num_copies %d this_mirror %d failed_mirror %d",
+ failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
+ free_io_failure(BTRFS_I(inode), failrec);
return -EIO;
}
- repair_bio = btrfs_bio_alloc(1);
+ repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io,
+ failed_bbio->private);
repair_bbio = btrfs_bio(repair_bio);
- repair_bio->bi_opf = REQ_OP_READ;
- repair_bio->bi_end_io = failed_bio->bi_end_io;
+ repair_bbio->file_offset = start;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
- repair_bio->bi_private = failed_bio->bi_private;
if (failed_bbio->csum) {
const u32 csum_size = fs_info->csum_size;
@@ -2683,7 +859,7 @@ int btrfs_repair_one_sector(struct inode *inode,
* will be handled by the endio on the repair_bio, so we can't return an
* error here.
*/
- submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags);
+ submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
return BLK_STS_OK;
}
@@ -2709,26 +885,44 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
btrfs_page_set_error(fs_info, page, start, len);
}
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
unlock_page(page);
else
btrfs_subpage_end_reader(fs_info, page, start, len);
}
-static blk_status_t submit_read_repair(struct inode *inode,
- struct bio *failed_bio, u32 bio_offset,
- struct page *page, unsigned int pgoff,
- u64 start, u64 end, int failed_mirror,
- unsigned int error_bitmap,
- submit_bio_hook_t *submit_bio_hook)
+static void end_sector_io(struct page *page, u64 offset, bool uptodate)
{
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct extent_state *cached = NULL;
+
+ end_page_read(page, uptodate, offset, sectorsize);
+ if (uptodate)
+ set_extent_uptodate(&inode->io_tree, offset,
+ offset + sectorsize - 1, &cached, GFP_ATOMIC);
+ unlock_extent_atomic(&inode->io_tree, offset, offset + sectorsize - 1,
+ &cached);
+}
+
+static void submit_data_read_repair(struct inode *inode,
+ struct btrfs_bio *failed_bbio,
+ u32 bio_offset, const struct bio_vec *bvec,
+ unsigned int error_bitmap)
+{
+ const unsigned int pgoff = bvec->bv_offset;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct page *page = bvec->bv_page;
+ const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset;
+ const u64 end = start + bvec->bv_len - 1;
const u32 sectorsize = fs_info->sectorsize;
const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
- int error = 0;
int i;
- BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
+ BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE);
+
+ /* This repair is only for data */
+ ASSERT(is_data_inode(inode));
/* We're here because we had some read errors or csum mismatch */
ASSERT(error_bitmap);
@@ -2737,12 +931,11 @@ static blk_status_t submit_read_repair(struct inode *inode,
* We only get called on buffered IO, thus page must be mapped and bio
* must not be cloned.
*/
- ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED));
+ ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED));
/* Iterate through all the sectors in the range */
for (i = 0; i < nr_bits; i++) {
const unsigned int offset = i * sectorsize;
- struct extent_state *cached = NULL;
bool uptodate = false;
int ret;
@@ -2755,10 +948,9 @@ static blk_status_t submit_read_repair(struct inode *inode,
goto next;
}
- ret = btrfs_repair_one_sector(inode, failed_bio,
- bio_offset + offset,
- page, pgoff + offset, start + offset,
- failed_mirror, submit_bio_hook);
+ ret = btrfs_repair_one_sector(inode, failed_bbio,
+ bio_offset + offset, page, pgoff + offset,
+ btrfs_submit_data_read_bio);
if (!ret) {
/*
* We have submitted the read repair, the page release
@@ -2769,24 +961,12 @@ static blk_status_t submit_read_repair(struct inode *inode,
continue;
}
/*
- * Repair failed, just record the error but still continue.
- * Or the remaining sectors will not be properly unlocked.
+ * Continue on failed repair, otherwise the remaining sectors
+ * will not be properly unlocked.
*/
- if (!error)
- error = ret;
next:
- end_page_read(page, uptodate, start + offset, sectorsize);
- if (uptodate)
- set_extent_uptodate(&BTRFS_I(inode)->io_tree,
- start + offset,
- start + offset + sectorsize - 1,
- &cached, GFP_ATOMIC);
- unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree,
- start + offset,
- start + offset + sectorsize - 1,
- &cached);
+ end_sector_io(page, start + offset, uptodate);
}
- return errno_to_blk_status(error);
}
/* lots and lots of room for performance fixes in the end_bio funcs */
@@ -2824,8 +1004,9 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_writepage(struct bio *bio)
+static void end_bio_extent_writepage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
int error = blk_status_to_errno(bio->bi_status);
struct bio_vec *bvec;
u64 start;
@@ -2925,11 +1106,7 @@ static void endio_readpage_release_extent(struct processed_extent *processed,
* Now we don't have range contiguous to the processed range, release
* the processed range now.
*/
- if (processed->uptodate && tree->track_uptodate)
- set_extent_uptodate(tree, processed->start, processed->end,
- &cached, GFP_ATOMIC);
- unlock_extent_cached_atomic(tree, processed->start, processed->end,
- &cached);
+ unlock_extent_atomic(tree, processed->start, processed->end, &cached);
update:
/* Update processed to current range */
@@ -2942,7 +1119,7 @@ update:
static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
{
ASSERT(PageLocked(page));
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page));
@@ -2964,7 +1141,7 @@ static struct extent_buffer *find_extent_buffer_readpage(
* For regular sectorsize, we can use page->private to grab extent
* buffer
*/
- if (fs_info->sectorsize == PAGE_SIZE) {
+ if (fs_info->nodesize >= PAGE_SIZE) {
ASSERT(PagePrivate(page) && page->private);
return (struct extent_buffer *)page->private;
}
@@ -2989,11 +1166,10 @@ static struct extent_buffer *find_extent_buffer_readpage(
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_readpage(struct bio *bio)
+static void end_bio_extent_readpage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
struct bio_vec *bvec;
- struct btrfs_bio *bbio = btrfs_bio(bio);
- struct extent_io_tree *tree, *failure_tree;
struct processed_extent processed = { 0 };
/*
* The offset to the beginning of a bio, since one bio can never be
@@ -3001,7 +1177,6 @@ static void end_bio_extent_readpage(struct bio *bio)
*/
u32 bio_offset = 0;
int mirror;
- int ret;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
@@ -3012,6 +1187,7 @@ static void end_bio_extent_readpage(struct bio *bio)
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
unsigned int error_bitmap = (unsigned int)-1;
+ bool repair = false;
u64 start;
u64 end;
u32 len;
@@ -3020,8 +1196,6 @@ static void end_bio_extent_readpage(struct bio *bio)
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
bio->bi_iter.bi_sector, bio->bi_status,
bbio->mirror_num);
- tree = &BTRFS_I(inode)->io_tree;
- failure_tree = &BTRFS_I(inode)->io_failure_tree;
/*
* We always issue full-sector reads, but if some block in a
@@ -3049,57 +1223,21 @@ static void end_bio_extent_readpage(struct bio *bio)
if (is_data_inode(inode)) {
error_bitmap = btrfs_verify_data_csum(bbio,
bio_offset, page, start, end);
- ret = error_bitmap;
+ if (error_bitmap)
+ uptodate = false;
} else {
- ret = btrfs_validate_metadata_buffer(bbio,
- page, start, end, mirror);
+ if (btrfs_validate_metadata_buffer(bbio,
+ page, start, end, mirror))
+ uptodate = false;
}
- if (ret)
- uptodate = false;
- else
- clean_io_failure(BTRFS_I(inode)->root->fs_info,
- failure_tree, tree, start,
- page,
- btrfs_ino(BTRFS_I(inode)), 0);
}
- if (likely(uptodate))
- goto readpage_ok;
-
- if (is_data_inode(inode)) {
- /*
- * If we failed to submit the IO at all we'll have a
- * mirror_num == 0, in which case we need to just mark
- * the page with an error and unlock it and carry on.
- */
- if (mirror == 0)
- goto readpage_ok;
-
- /*
- * btrfs_submit_read_repair() will handle all the good
- * and bad sectors, we just continue to the next bvec.
- */
- submit_read_repair(inode, bio, bio_offset, page,
- start - page_offset(page), start,
- end, mirror, error_bitmap,
- btrfs_submit_data_bio);
-
- ASSERT(bio_offset + len > bio_offset);
- bio_offset += len;
- continue;
- } else {
- struct extent_buffer *eb;
-
- eb = find_extent_buffer_readpage(fs_info, page, start);
- set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
- eb->read_mirror = mirror;
- atomic_dec(&eb->io_pages);
- }
-readpage_ok:
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
pgoff_t end_index = i_size >> PAGE_SHIFT;
+ btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0);
+
/*
* Zero out the remaining part if this range straddles
* i_size.
@@ -3116,14 +1254,44 @@ readpage_ok:
zero_user_segment(page, zero_start,
offset_in_page(end) + 1);
}
+ } else if (is_data_inode(inode)) {
+ /*
+ * Only try to repair bios that actually made it to a
+ * device. If the bio failed to be submitted mirror
+ * is 0 and we need to fail it without retrying.
+ *
+ * This also includes the high level bios for compressed
+ * extents - these never make it to a device and repair
+ * is already handled on the lower compressed bio.
+ */
+ if (mirror > 0)
+ repair = true;
+ } else {
+ struct extent_buffer *eb;
+
+ eb = find_extent_buffer_readpage(fs_info, page, start);
+ set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+ eb->read_mirror = mirror;
+ atomic_dec(&eb->io_pages);
+ }
+
+ if (repair) {
+ /*
+ * submit_data_read_repair() will handle all the good
+ * and bad sectors, we just continue to the next bvec.
+ */
+ submit_data_read_repair(inode, bbio, bio_offset, bvec,
+ error_bitmap);
+ } else {
+ /* Update page status and unlock */
+ end_page_read(page, uptodate, start, len);
+ endio_readpage_release_extent(&processed, BTRFS_I(inode),
+ start, end, PageUptodate(page));
}
+
ASSERT(bio_offset + len > bio_offset);
bio_offset += len;
- /* Update page status and unlock */
- end_page_read(page, uptodate, start, len);
- endio_readpage_release_extent(&processed, BTRFS_I(inode),
- start, end, PageUptodate(page));
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
@@ -3131,61 +1299,40 @@ readpage_ok:
bio_put(bio);
}
-/*
- * Initialize the members up to but not including 'bio'. Use after allocating a
- * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
- * 'bio' because use of __GFP_ZERO is not supported.
- */
-static inline void btrfs_bio_init(struct btrfs_bio *bbio)
-{
- memset(bbio, 0, offsetof(struct btrfs_bio, bio));
-}
-
-/*
- * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs.
+/**
+ * Populate every free slot in a provided array with pages.
+ *
+ * @nr_pages: number of pages to allocate
+ * @page_array: the array to fill with pages; any existing non-null entries in
+ * the array will be skipped
*
- * The bio allocation is backed by bioset and does not fail.
+ * Return: 0 if all pages were able to be allocated;
+ * -ENOMEM otherwise, and the caller is responsible for freeing all
+ * non-null page pointers in the array.
*/
-struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
-{
- struct bio *bio;
-
- ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
- bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset);
- btrfs_bio_init(btrfs_bio(bio));
- return bio;
-}
-
-struct bio *btrfs_bio_clone(struct bio *bio)
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
{
- struct btrfs_bio *bbio;
- struct bio *new;
-
- /* Bio allocation backed by a bioset does not fail */
- new = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOFS, &btrfs_bioset);
- bbio = btrfs_bio(new);
- btrfs_bio_init(bbio);
- bbio->iter = bio->bi_iter;
- return new;
-}
+ unsigned int allocated;
-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
-{
- struct bio *bio;
- struct btrfs_bio *bbio;
+ for (allocated = 0; allocated < nr_pages;) {
+ unsigned int last = allocated;
- ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+ allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
- /* this will never fail when it's backed by a bioset */
- bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
- ASSERT(bio);
+ if (allocated == nr_pages)
+ return 0;
- bbio = btrfs_bio(bio);
- btrfs_bio_init(bbio);
+ /*
+ * During this iteration, no page could be allocated, even
+ * though alloc_pages_bulk_array() falls back to alloc_page()
+ * if it could not bulk-allocate. So we must be out of memory.
+ */
+ if (allocated == last)
+ return -ENOMEM;
- bio_trim(bio, offset >> 9, size >> 9);
- bbio->iter = bio->bi_iter;
- return bio;
+ memalloc_retry_wait(GFP_NOFS);
+ }
+ return 0;
}
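The loop above is a generic progress-or-fail retry; a user-space sketch of the same shape, where bulk_fill() is a toy stand-in for alloc_pages_bulk_array() rather than the real allocator:

#include <stddef.h>
#include <stdlib.h>

/* Toy bulk allocator: fills leading NULL slots, may stop early. */
static size_t bulk_fill(size_t nr, void **array)
{
	size_t filled;

	for (filled = 0; filled < nr; filled++) {
		if (!array[filled])
			array[filled] = malloc(64);
		if (!array[filled])
			break;
	}
	return filled;
}

static int fill_array(size_t nr, void **array)
{
	size_t done = 0;

	while (done < nr) {
		size_t last = done;

		done = bulk_fill(nr, array);
		if (done == nr)
			return 0;
		if (done == last)
			return -1;	/* no forward progress: out of memory */
		/* the kernel sleeps here via memalloc_retry_wait(GFP_NOFS) */
	}
	return 0;
}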
/**
@@ -3197,7 +1344,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
* a contiguous page to the previous one
* @size: portion of page that we want to write
* @pg_offset: starting offset in the page
- * @bio_flags: flags of the current bio to see if we can merge them
+ * @compress_type: compression type of the current bio to see if we can merge them
*
* Attempt to add a page to bio considering stripe alignment etc.
*
@@ -3209,25 +1356,50 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
struct page *page,
u64 disk_bytenr, unsigned int size,
unsigned int pg_offset,
- unsigned long bio_flags)
+ enum btrfs_compression_type compress_type)
{
struct bio *bio = bio_ctrl->bio;
u32 bio_size = bio->bi_iter.bi_size;
u32 real_size;
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
- bool contig;
+ bool contig = false;
int ret;
ASSERT(bio);
/* The limit should be calculated when bio_ctrl->bio is allocated */
ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
- if (bio_ctrl->bio_flags != bio_flags)
+ if (bio_ctrl->compress_type != compress_type)
return 0;
- if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
+
+ if (bio->bi_iter.bi_size == 0) {
+ /* We can always add a page into an empty bio. */
+ contig = true;
+ } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) {
+ struct bio_vec *bvec = bio_last_bvec_all(bio);
+
+ /*
+ * The contig check requires the following conditions to be met:
+ * 1) The pages belong to the same inode
+ * This is implied by the call chain.
+ *
+ * 2) The range has an adjacent logical bytenr
+ *
+ * 3) The range has an adjacent file offset
+ * This is required for the use of btrfs_bio->file_offset.
+ */
+ if (bio_end_sector(bio) == sector &&
+ page_offset(bvec->bv_page) + bvec->bv_offset +
+ bvec->bv_len == page_offset(page) + pg_offset)
+ contig = true;
+ } else {
+ /*
+ * For compression, all IO should have its logical bytenr
+ * set to the starting bytenr of the compressed extent.
+ */
contig = bio->bi_iter.bi_sector == sector;
- else
- contig = bio_end_sector(bio) == sector;
+ }
+
if (!contig)
return 0;
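To pin down the three contiguity rules above, a worked example (all numbers invented for illustration):

/*
 * Illustration only: if the bio currently ends at disk sector 0x800 and
 * its last bvec maps file range [1 MiB, 1 MiB + 4 KiB), then a new page
 * at disk sector 0x800 with file offset 1 MiB + 4 KiB is contiguous.
 * The same sector with file offset 2 MiB is not, since merging it would
 * make btrfs_bio->file_offset meaningless for the combined bio.
 */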
@@ -3267,7 +1439,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
* The split only happens for real compressed bios, which are handled in
* btrfs_submit_compressed_read/write().
*/
- if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) {
+ if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
bio_ctrl->len_to_oe_boundary = U32_MAX;
bio_ctrl->len_to_stripe_boundary = U32_MAX;
return 0;
@@ -3307,81 +1479,90 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
static int alloc_new_bio(struct btrfs_inode *inode,
struct btrfs_bio_ctrl *bio_ctrl,
struct writeback_control *wbc,
- unsigned int opf,
- bio_end_io_t end_io_func,
+ blk_opf_t opf,
u64 disk_bytenr, u32 offset, u64 file_offset,
- unsigned long bio_flags)
+ enum btrfs_compression_type compress_type)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio;
int ret;
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ ASSERT(bio_ctrl->end_io_func);
+
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL);
/*
* For a compressed page range, the disk_bytenr is always the @disk_bytenr
* passed in, no matter whether we have added any range to a previous bio.
*/
- if (bio_flags & EXTENT_BIO_COMPRESSED)
+ if (compress_type != BTRFS_COMPRESS_NONE)
bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
else
bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
bio_ctrl->bio = bio;
- bio_ctrl->bio_flags = bio_flags;
- bio->bi_end_io = end_io_func;
- bio->bi_private = &inode->io_tree;
- bio->bi_opf = opf;
+ bio_ctrl->compress_type = compress_type;
ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (ret < 0)
goto error;
- if (wbc) {
- struct block_device *bdev;
- bdev = fs_info->fs_devices->latest_dev->bdev;
- bio_set_dev(bio, bdev);
- wbc_init_bio(wbc, bio);
- }
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct btrfs_device *device;
+ if (wbc) {
+ /*
+ * For zone append the bio needs the block_device we are actually
+ * going to write to, so that the hardware limits can be
+ * respected. Look it up here:
+ */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct btrfs_device *dev;
+
+ dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
+ fs_info->sectorsize);
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto error;
+ }
- device = btrfs_zoned_get_device(fs_info, disk_bytenr,
- fs_info->sectorsize);
- if (IS_ERR(device)) {
- ret = PTR_ERR(device);
- goto error;
+ bio_set_dev(bio, dev->bdev);
+ } else {
+ /*
+ * Otherwise pick the last added device to support
+ * cgroup writeback. For multi-device file systems this
+ * means blk-cgroup policies have to always be set on the
+ * last added/replaced device. This is a bit odd but has
+ * been like that for a long time.
+ */
+ bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
}
-
- btrfs_bio(bio)->device = device;
+ wbc_init_bio(wbc, bio);
+ } else {
+ ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
}
return 0;
error:
bio_ctrl->bio = NULL;
- bio->bi_status = errno_to_blk_status(ret);
- bio_endio(bio);
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
return ret;
}
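The device selection above reduces to a small policy; a hedged restatement follows (the helper name pick_write_bdev is hypothetical, while the btrfs calls are the ones used in the hunk above):

/* Hypothetical condensation of the policy implemented above. */
static struct block_device *pick_write_bdev(struct btrfs_fs_info *fs_info,
					    struct bio *bio, u64 disk_bytenr)
{
	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		struct btrfs_device *dev;

		/* Zone append must target the device that owns the zone. */
		dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
					     fs_info->sectorsize);
		if (IS_ERR(dev))
			return ERR_CAST(dev);
		return dev->bdev;
	}
	/* Otherwise the last added device carries cgroup writeback policy. */
	return fs_info->fs_devices->latest_dev->bdev;
}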
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @wbc: optional writeback control for io accounting
- * @page: page to add to the bio
* @disk_bytenr: logical bytenr where the write will be
+ * @page: page to add to the bio
* @size: portion of page that we want to write to
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
- * @bio_ret: must be valid pointer, newly allocated bio will be stored there
- * @end_io_func: end_io callback for new bio
- * @mirror_num: desired mirror to read/write
- * @prev_bio_flags: flags of previous bio to see if we can merge the current one
- * @bio_flags: flags of the current bio to see if we can merge them
+ * @compress_type: compress type for current bio
+ *
+ * This will either add the page to the existing @bio_ctrl->bio, or allocate a
+ * new one in @bio_ctrl->bio.
+ * The mirror number for this IO should already be initialized in
+ * @bio_ctrl->mirror_num.
*/
-static int submit_extent_page(unsigned int opf,
+static int submit_extent_page(blk_opf_t opf,
struct writeback_control *wbc,
struct btrfs_bio_ctrl *bio_ctrl,
- struct page *page, u64 disk_bytenr,
+ u64 disk_bytenr, struct page *page,
size_t size, unsigned long pg_offset,
- bio_end_io_t end_io_func,
- int mirror_num,
- unsigned long bio_flags,
+ enum btrfs_compression_type compress_type,
bool force_bio_submit)
{
int ret = 0;
@@ -3392,12 +1573,11 @@ static int submit_extent_page(unsigned int opf,
ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
pg_offset + size <= PAGE_SIZE);
- if (force_bio_submit && bio_ctrl->bio) {
- ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags);
- bio_ctrl->bio = NULL;
- if (ret < 0)
- return ret;
- }
+
+ ASSERT(bio_ctrl->end_io_func);
+
+ if (force_bio_submit)
+ submit_one_bio(bio_ctrl);
while (cur < pg_offset + size) {
u32 offset = cur - pg_offset;
@@ -3406,9 +1586,9 @@ static int submit_extent_page(unsigned int opf,
/* Allocate new bio if needed */
if (!bio_ctrl->bio) {
ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
- end_io_func, disk_bytenr, offset,
+ disk_bytenr, offset,
page_offset(page) + cur,
- bio_flags);
+ compress_type);
if (ret < 0)
return ret;
}
@@ -3416,14 +1596,14 @@ static int submit_extent_page(unsigned int opf,
* We must go through btrfs_bio_add_page() to ensure each
* page range won't cross various boundaries.
*/
- if (bio_flags & EXTENT_BIO_COMPRESSED)
+ if (compress_type != BTRFS_COMPRESS_NONE)
added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
size - offset, pg_offset + offset,
- bio_flags);
+ compress_type);
else
added = btrfs_bio_add_page(bio_ctrl, page,
disk_bytenr + offset, size - offset,
- pg_offset + offset, bio_flags);
+ pg_offset + offset, compress_type);
/* Metadata page range should never be split */
if (!is_data_inode(&inode->vfs_inode))
@@ -3437,11 +1617,7 @@ static int submit_extent_page(unsigned int opf,
if (added < size - offset) {
/* The bio should contain some page(s) */
ASSERT(bio_ctrl->bio->bi_iter.bi_size);
- ret = submit_one_bio(bio_ctrl->bio, mirror_num,
- bio_ctrl->bio_flags);
- bio_ctrl->bio = NULL;
- if (ret < 0)
- return ret;
+ submit_one_bio(bio_ctrl);
}
cur += added;
}
@@ -3464,7 +1640,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb,
if (page->mapping)
lockdep_assert_held(&page->mapping->private_lock);
- if (fs_info->sectorsize == PAGE_SIZE) {
+ if (fs_info->nodesize >= PAGE_SIZE) {
if (!PagePrivate(page))
attach_page_private(page, eb);
else
@@ -3499,7 +1675,7 @@ int set_page_extent_mapped(struct page *page)
fs_info = btrfs_sb(page->mapping->host->i_sb);
- if (fs_info->sectorsize < PAGE_SIZE)
+ if (btrfs_is_subpage(fs_info, page))
return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
@@ -3516,7 +1692,7 @@ void clear_page_extent_mapped(struct page *page)
return;
fs_info = btrfs_sb(page->mapping->host->i_sb);
- if (fs_info->sectorsize < PAGE_SIZE)
+ if (btrfs_is_subpage(fs_info, page))
return btrfs_detach_subpage(fs_info, page);
detach_page_private(page);
@@ -3555,9 +1731,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
struct btrfs_bio_ctrl *bio_ctrl,
- unsigned int read_flags, u64 *prev_em_start)
+ blk_opf_t read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3567,7 +1743,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
u64 extent_offset;
u64 last_byte = i_size_read(inode);
u64 block_start;
- u64 cur_end;
struct extent_map *em;
int ret = 0;
size_t pg_offset = 0;
@@ -3577,7 +1752,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
ret = set_page_extent_mapped(page);
if (ret < 0) {
- unlock_extent(tree, start, end);
+ unlock_extent(tree, start, end, NULL);
btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
unlock_page(page);
goto out;
@@ -3589,9 +1764,9 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
if (zero_offset) {
iosize = PAGE_SIZE - zero_offset;
memzero_page(page, zero_offset, iosize);
- flush_dcache_page(page);
}
}
+ bio_ctrl->end_io_func = end_bio_extent_readpage;
begin_page_read(fs_info, page);
while (cur <= end) {
unsigned long this_bio_flag = 0;
@@ -3604,18 +1779,16 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
iosize = PAGE_SIZE - pg_offset;
memzero_page(page, pg_offset, iosize);
- flush_dcache_page(page);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- unlock_extent_cached(tree, cur,
- cur + iosize - 1, &cached);
+ unlock_extent(tree, cur, cur + iosize - 1, &cached);
end_page_read(page, true, cur, iosize);
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
end - cur + 1, em_cached);
if (IS_ERR(em)) {
- unlock_extent(tree, cur, end);
+ unlock_extent(tree, cur, end, NULL);
end_page_read(page, false, cur, end + 1 - cur);
ret = PTR_ERR(em);
break;
@@ -3624,16 +1797,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
- this_bio_flag |= EXTENT_BIO_COMPRESSED;
- extent_set_compress_type(&this_bio_flag,
- em->compress_type);
- }
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ this_bio_flag = em->compress_type;
iosize = min(extent_map_end(em) - cur, end - cur + 1);
- cur_end = min(extent_map_end(em) - 1, end);
iosize = ALIGN(iosize, blocksize);
- if (this_bio_flag & EXTENT_BIO_COMPRESSED)
+ if (this_bio_flag != BTRFS_COMPRESS_NONE)
disk_bytenr = em->block_start;
else
disk_bytenr = em->block_start + extent_offset;
@@ -3691,46 +1860,35 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
struct extent_state *cached = NULL;
memzero_page(page, pg_offset, iosize);
- flush_dcache_page(page);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- unlock_extent_cached(tree, cur,
- cur + iosize - 1, &cached);
+ unlock_extent(tree, cur, cur + iosize - 1, &cached);
end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
/* the get_extent function already copied into the page */
- if (test_range_bit(tree, cur, cur_end,
- EXTENT_UPTODATE, 1, NULL)) {
- unlock_extent(tree, cur, cur + iosize - 1);
- end_page_read(page, true, cur, iosize);
- cur = cur + iosize;
- pg_offset += iosize;
- continue;
- }
- /* we have an inline extent but it didn't get marked up
- * to date. Error out
- */
if (block_start == EXTENT_MAP_INLINE) {
- unlock_extent(tree, cur, cur + iosize - 1);
- end_page_read(page, false, cur, iosize);
+ unlock_extent(tree, cur, cur + iosize - 1, NULL);
+ end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
- bio_ctrl, page, disk_bytenr, iosize,
- pg_offset,
- end_bio_extent_readpage, 0,
- this_bio_flag,
+ bio_ctrl, disk_bytenr, page, iosize,
+ pg_offset, this_bio_flag,
force_bio_submit);
if (ret) {
- unlock_extent(tree, cur, cur + iosize - 1);
- end_page_read(page, false, cur, iosize);
+ /*
+ * We have to unlock the remaining range, or the page
+ * will never be unlocked.
+ */
+ unlock_extent(tree, cur, end, NULL);
+ end_page_read(page, false, cur, end + 1 - cur);
goto out;
}
cur = cur + iosize;
@@ -3740,6 +1898,26 @@ out:
return ret;
}
+int btrfs_read_folio(struct file *file, struct folio *folio)
+{
+ struct page *page = &folio->page;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_SIZE - 1;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ int ret;
+
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
+
+ ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
+ /*
+ * If btrfs_do_readpage() failed we will want to submit the assembled
+ * bio to do the cleanup.
+ */
+ submit_one_bio(&bio_ctrl);
+ return ret;
+}
+
static inline void contiguous_readpages(struct page *pages[], int nr_pages,
u64 start, u64 end,
struct extent_map **em_cached,
@@ -3758,12 +1936,6 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
}
}
-static void update_nr_written(struct writeback_control *wbc,
- unsigned long nr_written)
-{
- wbc->nr_to_write -= nr_written;
-}
-
/*
* helper for __extent_writepage, doing all of the delayed allocation setup.
*
@@ -3863,7 +2035,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
* For regular sector size == page size case, since one page only
* contains one sector, we return the page offset directly.
*/
- if (fs_info->sectorsize == PAGE_SIZE) {
+ if (!btrfs_is_subpage(fs_info, page)) {
*start = page_offset(page);
*end = page_offset(page) + PAGE_SIZE;
return;
@@ -3906,10 +2078,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
u64 extent_offset;
u64 block_start;
struct extent_map *em;
+ int saved_ret = 0;
int ret = 0;
int nr = 0;
- u32 opf = REQ_OP_WRITE;
- const unsigned int write_flags = wbc_to_write_flags(wbc);
+ enum req_op op = REQ_OP_WRITE;
+ const blk_opf_t write_flags = wbc_to_write_flags(wbc);
+ bool has_error = false;
bool compressed;
ret = btrfs_writepage_cow_fixup(page);
@@ -3924,8 +2098,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* we don't want to touch the inode after unlocking the page,
* so we update the mapping writeback index now
*/
- update_nr_written(wbc, 1);
+ wbc->nr_to_write--;
+ epd->bio_ctrl.end_io_func = end_bio_extent_writepage;
while (cur <= end) {
u64 disk_bytenr;
u64 em_end;
@@ -3959,6 +2134,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
if (IS_ERR(em)) {
btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
ret = PTR_ERR_OR_ZERO(em);
+ has_error = true;
+ if (!saved_ret)
+ saved_ret = ret;
break;
}
@@ -3979,7 +2157,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
if (btrfs_use_zone_append(inode, em->block_start))
- opf = REQ_OP_ZONE_APPEND;
+ op = REQ_OP_ZONE_APPEND;
free_extent_map(em);
em = NULL;
@@ -4015,13 +2193,16 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
*/
btrfs_page_clear_dirty(fs_info, page, cur, iosize);
- ret = submit_extent_page(opf | write_flags, wbc,
- &epd->bio_ctrl, page,
- disk_bytenr, iosize,
+ ret = submit_extent_page(op | write_flags, wbc,
+ &epd->bio_ctrl, disk_bytenr,
+ page, iosize,
cur - page_offset(page),
- end_bio_extent_writepage,
- 0, 0, false);
+ 0, false);
if (ret) {
+ has_error = true;
+ if (!saved_ret)
+ saved_ret = ret;
+
btrfs_page_set_error(fs_info, page, cur, iosize);
if (PageWriteback(page))
btrfs_page_clear_writeback(fs_info, page, cur,
@@ -4035,8 +2216,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* If we finish without problem, we should not only clear page dirty,
* but also empty subpage dirty bits
*/
- if (!ret)
+ if (!has_error)
btrfs_page_assert_not_dirty(fs_info, page);
+ else
+ ret = saved_ret;
*nr_ret = nr;
return ret;
}
@@ -4079,10 +2262,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
return 0;
}
- if (page->index == end_index) {
+ if (page->index == end_index)
memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
- flush_dcache_page(page);
- }
ret = set_page_extent_mapped(page);
if (ret < 0) {
@@ -4167,9 +2348,6 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
- if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags))
- btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len);
-
clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
smp_mb__after_atomic();
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
@@ -4189,14 +2367,12 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
struct extent_page_data *epd)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i, num_pages, failed_page_nr;
+ int i, num_pages;
int flush = 0;
int ret = 0;
if (!btrfs_try_tree_write_lock(eb)) {
- ret = flush_write_bio(epd);
- if (ret < 0)
- return ret;
+ submit_write_bio(epd, 0);
flush = 1;
btrfs_tree_lock(eb);
}
@@ -4206,9 +2382,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!epd->sync_io)
return 0;
if (!flush) {
- ret = flush_write_bio(epd);
- if (ret < 0)
- return ret;
+ submit_write_bio(epd, 0);
flush = 1;
}
while (1) {
@@ -4246,7 +2420,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
* Subpage metadata doesn't use page locking at all, so we can skip
* the page locking.
*/
- if (!ret || fs_info->sectorsize < PAGE_SIZE)
+ if (!ret || fs_info->nodesize < PAGE_SIZE)
return ret;
num_pages = num_extent_pages(eb);
@@ -4255,14 +2429,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
if (!trylock_page(p)) {
if (!flush) {
- int err;
-
- err = flush_write_bio(epd);
- if (err < 0) {
- ret = err;
- failed_page_nr = i;
- goto err_unlock;
- }
+ submit_write_bio(epd, 0);
flush = 1;
}
lock_page(p);
@@ -4270,25 +2437,6 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
}
return ret;
-err_unlock:
- /* Unlock already locked pages */
- for (i = 0; i < failed_page_nr; i++)
- unlock_page(eb->pages[i]);
- /*
- * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
- * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can
- * be made and undo everything done before.
- */
- btrfs_tree_lock(eb);
- spin_lock(&eb->refs_lock);
- set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- end_extent_buffer_writeback(eb);
- spin_unlock(&eb->refs_lock);
- percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
- fs_info->dirty_metadata_batch);
- btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
- btrfs_tree_unlock(eb);
- return ret;
}
static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
@@ -4399,14 +2547,15 @@ static struct extent_buffer *find_extent_buffer_nolock(
* Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
* after all extent buffers in the page have finished their writeback.
*/
-static void end_bio_subpage_eb_writepage(struct bio *bio)
+static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
- ASSERT(fs_info->sectorsize < PAGE_SIZE);
+ ASSERT(fs_info->nodesize < PAGE_SIZE);
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
@@ -4456,8 +2605,9 @@ static void end_bio_subpage_eb_writepage(struct bio *bio)
bio_put(bio);
}
-static void end_bio_extent_buffer_writepage(struct bio *bio)
+static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio)
{
+ struct bio *bio = &bbio->bio;
struct bio_vec *bvec;
struct extent_buffer *eb;
int done;
@@ -4523,7 +2673,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page = eb->pages[0];
- unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
+ blk_opf_t write_flags = wbc_to_write_flags(wbc);
bool no_dirty_ebs = false;
int ret;
@@ -4539,10 +2689,11 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
if (no_dirty_ebs)
clear_page_dirty_for_io(page);
+ epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage;
+
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- &epd->bio_ctrl, page, eb->start, eb->len,
- eb->start - page_offset(page),
- end_bio_subpage_eb_writepage, 0, 0, false);
+ &epd->bio_ctrl, eb->start, page, eb->len,
+ eb->start - page_offset(page), 0, false);
if (ret) {
btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
set_btree_ioerr(page, eb);
@@ -4558,7 +2709,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
* dirty anymore, we have submitted a page. Update nr_written in wbc.
*/
if (no_dirty_ebs)
- update_nr_written(wbc, 1);
+ wbc->nr_to_write--;
return ret;
}
@@ -4568,11 +2719,13 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
{
u64 disk_bytenr = eb->start;
int i, num_pages;
- unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
+ blk_opf_t write_flags = wbc_to_write_flags(wbc);
int ret = 0;
prepare_eb_write(eb);
+ epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage;
+
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
@@ -4580,10 +2733,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- &epd->bio_ctrl, p, disk_bytenr,
- PAGE_SIZE, 0,
- end_bio_extent_buffer_writepage,
- 0, 0, false);
+ &epd->bio_ctrl, disk_bytenr, p,
+ PAGE_SIZE, 0, 0, false);
if (ret) {
set_btree_ioerr(p, eb);
if (PageWriteback(p))
@@ -4594,7 +2745,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
break;
}
disk_bytenr += PAGE_SIZE;
- update_nr_written(wbc, 1);
+ wbc->nr_to_write--;
unlock_page(p);
}
@@ -4697,7 +2848,7 @@ static int submit_eb_subpage(struct page *page,
cleanup:
/* We hit error, end bio for the submitted extent buffers */
- end_write_bio(epd, ret);
+ submit_write_bio(epd, ret);
return ret;
}
@@ -4733,7 +2884,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
if (!PagePrivate(page))
return 0;
- if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return submit_eb_subpage(page, wbc, epd);
spin_lock(&mapping->private_lock);
@@ -4789,8 +2940,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
/*
* Implies write in zoned mode. Mark the last eb in a block group.
*/
- if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity)
- set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
+ btrfs_schedule_zone_finish_bg(cache, eb);
btrfs_put_block_group(cache);
}
ret = write_one_eb(eb, wbc, epd);
@@ -4877,10 +3027,6 @@ retry:
index = 0;
goto retry;
}
- if (ret < 0) {
- end_write_bio(&epd, ret);
- goto out;
- }
/*
* If something went wrong, don't allow any metadata write bio to be
* submitted.
@@ -4907,14 +3053,16 @@ retry:
* Now such dirty tree block will not be cleaned by any dirty
* extent io tree. Thus we don't want to submit such wild eb
* if the fs already has error.
+ *
+ * We can get ret > 0 from submit_extent_page() indicating how many ebs
+ * were submitted. Reset it to 0 to avoid false alerts for the caller.
*/
- if (!BTRFS_FS_ERROR(fs_info)) {
- ret = flush_write_bio(&epd);
- } else {
+ if (ret > 0)
+ ret = 0;
+ if (!ret && BTRFS_FS_ERROR(fs_info))
ret = -EROFS;
- end_write_bio(&epd, ret);
- }
-out:
+ submit_write_bio(&epd, ret);
+
btrfs_zoned_meta_io_unlock(fs_info);
return ret;
}
@@ -5017,8 +3165,7 @@ retry:
* tmpfs file mapping
*/
if (!trylock_page(page)) {
- ret = flush_write_bio(epd);
- BUG_ON(ret < 0);
+ submit_write_bio(epd, 0);
lock_page(page);
}
@@ -5028,10 +3175,8 @@ retry:
}
if (wbc->sync_mode != WB_SYNC_NONE) {
- if (PageWriteback(page)) {
- ret = flush_write_bio(epd);
- BUG_ON(ret < 0);
- }
+ if (PageWriteback(page))
+ submit_write_bio(epd, 0);
wait_on_page_writeback(page);
}
@@ -5071,9 +3216,8 @@ retry:
* page in our current bio, and thus deadlock, so flush the
* write bio here.
*/
- ret = flush_write_bio(epd);
- if (!ret)
- goto retry;
+ submit_write_bio(epd, 0);
+ goto retry;
}
if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
@@ -5083,27 +3227,6 @@ retry:
return ret;
}
-int extent_write_full_page(struct page *page, struct writeback_control *wbc)
-{
- int ret;
- struct extent_page_data epd = {
- .bio_ctrl = { 0 },
- .extent_locked = 0,
- .sync_io = wbc->sync_mode == WB_SYNC_ALL,
- };
-
- ret = __extent_writepage(page, wbc, &epd);
- ASSERT(ret <= 0);
- if (ret < 0) {
- end_write_bio(&epd, ret);
- return ret;
- }
-
- ret = flush_write_bio(&epd);
- ASSERT(ret <= 0);
- return ret;
-}
-
/*
* Submit the pages in the range to bio for call sites which delalloc range has
* already been ran (aka, ordered extent inserted) and all pages are still
@@ -5161,10 +3284,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
cur = cur_end + 1;
}
- if (!found_error)
- ret = flush_write_bio(&epd);
- else
- end_write_bio(&epd, ret);
+ submit_write_bio(&epd, found_error ? ret : 0);
wbc_detach_inode(&wbc_writepages);
if (found_error)
@@ -5189,13 +3309,8 @@ int extent_writepages(struct address_space *mapping,
*/
btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
ret = extent_write_cache_pages(mapping, wbc, &epd);
+ submit_write_bio(&epd, ret);
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
- ASSERT(ret <= 0);
- if (ret < 0) {
- end_write_bio(&epd, ret);
- return ret;
- }
- ret = flush_write_bio(&epd);
return ret;
}
@@ -5217,11 +3332,7 @@ void extent_readahead(struct readahead_control *rac)
if (em_cached)
free_extent_map(em_cached);
-
- if (bio_ctrl.bio) {
- if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags))
- return;
- }
+ submit_one_bio(&bio_ctrl);
}
/*
@@ -5244,7 +3355,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
if (start > end)
return 0;
- lock_extent_bits(tree, start, end, &cached_state);
+ lock_extent(tree, start, end, &cached_state);
folio_wait_writeback(folio);
/*
@@ -5252,12 +3363,12 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* so here we only need to unlock the extent range to free any
* existing extent state.
*/
- unlock_extent_cached(tree, start, end, &cached_state);
+ unlock_extent(tree, start, end, &cached_state);
return 0;
}
/*
- * a helper for releasepage, this tests for areas of the page that
+ * a helper for release_folio, this tests for areas of the page that
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
*/
@@ -5271,15 +3382,17 @@ static int try_release_extent_state(struct extent_io_tree *tree,
if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
ret = 0;
} else {
+ u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
+ EXTENT_DELALLOC_NEW | EXTENT_CTLBITS);
+
/*
* At this point we can safely clear everything except the
* locked bit, the nodatasum bit and the delalloc new bit.
* The delalloc new bit will be cleared by ordered extent
* completion.
*/
- ret = __clear_extent_bit(tree, start, end,
- ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
- 0, 0, NULL, mask, NULL);
+ ret = __clear_extent_bit(tree, start, end, clear_bits, NULL,
+ mask, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
@@ -5293,7 +3406,7 @@ static int try_release_extent_state(struct extent_io_tree *tree,
}
/*
- * a helper for releasepage. As long as there are no locked extents
+ * a helper for release_folio. As long as there are no locked extents
* in the range corresponding to the page, both state records and extent
* map records are removed
*/
@@ -5378,42 +3491,6 @@ next:
}
/*
- * helper function for fiemap, which doesn't want to see any holes.
- * This maps until we find something past 'last'
- */
-static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
- u64 offset, u64 last)
-{
- u64 sectorsize = btrfs_inode_sectorsize(inode);
- struct extent_map *em;
- u64 len;
-
- if (offset >= last)
- return NULL;
-
- while (1) {
- len = last - offset;
- if (len == 0)
- break;
- len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(inode, offset, len);
- if (IS_ERR(em))
- return em;
-
- /* if this isn't a hole return it */
- if (em->block_start != EXTENT_MAP_HOLE)
- return em;
-
- /* this is a hole, advance to the next extent */
- offset = extent_map_end(em);
- free_extent_map(em);
- if (offset >= last)
- break;
- }
- return NULL;
-}
-
-/*
* To cache previous fiemap extent
*
* Will be used for merging fiemap extent
@@ -5442,6 +3519,9 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
{
int ret = 0;
+ /* Set at the end of extent_fiemap(). */
+ ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
+
if (!cache->cached)
goto assign;
@@ -5465,16 +3545,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
* So truly compressed (physical size smaller than logical size)
* extents won't get merged with each other
*
- * 3) Share same flags except FIEMAP_EXTENT_LAST
- * So regular extent won't get merged with prealloc extent
+ * 3) Share same flags
*/
if (cache->offset + cache->len == offset &&
cache->phys + cache->len == phys &&
- (cache->flags & ~FIEMAP_EXTENT_LAST) ==
- (flags & ~FIEMAP_EXTENT_LAST)) {
+ cache->flags == flags) {
cache->len += len;
- cache->flags |= flags;
- goto try_submit_last;
+ return 0;
}
/* Not mergeable, need to submit cached one */
@@ -5489,13 +3566,8 @@ assign:
cache->phys = phys;
cache->len = len;
cache->flags = flags;
-try_submit_last:
- if (cache->flags & FIEMAP_EXTENT_LAST) {
- ret = fiemap_fill_next_extent(fieinfo, cache->offset,
- cache->phys, cache->len, cache->flags);
- cache->cached = false;
- }
- return ret;
+
+ return 0;
}
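The merge condition above can be read as one predicate. A standalone sketch (struct and function names are invented; the fields mirror struct fiemap_cache):

struct fiemap_cache_view {
	u64 offset;
	u64 phys;
	u64 len;
	u32 flags;
	bool cached;
};

/* True iff the new extent can be folded into the cached one. */
static bool fiemap_can_merge(const struct fiemap_cache_view *c,
			     u64 offset, u64 phys, u32 flags)
{
	return c->cached &&
	       c->offset + c->len == offset &&
	       c->phys + c->len == phys &&
	       c->flags == flags;
}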
/*
@@ -5525,215 +3597,534 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
return ret;
}
-int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len)
+static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
- int ret = 0;
- u64 off;
- u64 max = start + len;
- u32 flags = 0;
- u32 found_type;
- u64 last;
- u64 last_for_get_extent = 0;
- u64 disko = 0;
- u64 isize = i_size_read(&inode->vfs_inode);
- struct btrfs_key found_key;
- struct extent_map *em = NULL;
- struct extent_state *cached_state = NULL;
- struct btrfs_path *path;
- struct btrfs_root *root = inode->root;
- struct fiemap_cache cache = { 0 };
- struct ulist *roots;
- struct ulist *tmp_ulist;
- int end = 0;
- u64 em_start = 0;
- u64 em_len = 0;
- u64 em_end = 0;
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
- if (len == 0)
- return -EINVAL;
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
+ return 0;
- path = btrfs_alloc_path();
- if (!path)
+ ret = btrfs_next_leaf(inode->root, path);
+ if (ret != 0)
+ return ret;
+
+ /*
+ * Don't bother with cloning if there are no more file extent items for
+ * our inode.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
+
+ /* See the comment at fiemap_search_slot() about why we clone. */
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
return -ENOMEM;
- roots = ulist_alloc(GFP_KERNEL);
- tmp_ulist = ulist_alloc(GFP_KERNEL);
- if (!roots || !tmp_ulist) {
- ret = -ENOMEM;
- goto out_free_ulist;
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Search for the first file extent item that starts at a given file offset or
+ * the one that starts immediately before that offset.
+ * Returns: 0 on success, < 0 on error, 1 if not found.
+ */
+static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
+ u64 file_offset)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = file_offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret != 0)
+ return ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
}
/*
- * We can't initialize that to 'start' as this could miss extents due
- * to extent item merging
+ * We clone the leaf and use it during fiemap. This is because while
+ * using the leaf we do expensive things like checking if an extent is
+ * shared, which can take a long time. In order to prevent blocking
+ * other tasks for too long, we use a clone of the leaf. We have locked
+ * the file range in the inode's io tree, so we know none of our file
+ * extent items can change. This way we avoid blocking other tasks that
+ * want to insert items for other inodes in the same leaf or b+tree
+ * rebalance operations (triggered for example when someone tries
+ * to push items into this leaf while inserting an item in a
+ * neighbour leaf).
+ * We also need the private clone because holding a read lock on an
+ * extent buffer of the subvolume's b+tree will make lockdep unhappy
+ * when we call fiemap_fill_next_extent(), because that may cause a page
+ * fault when filling the user space buffer with fiemap data.
*/
- off = 0;
- start = round_down(start, btrfs_inode_sectorsize(inode));
- len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
+ return -ENOMEM;
+
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Process a range which is a hole or a prealloc extent in the inode's subvolume
+ * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
+ * extent. The end offset (@end) is inclusive.
+ */
+static int fiemap_process_hole(struct btrfs_inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ struct btrfs_backref_shared_cache *backref_cache,
+ u64 disk_bytenr, u64 extent_offset,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp_ulist,
+ u64 start, u64 end)
+{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ u64 cur_offset = start;
+ u64 last_delalloc_end = 0;
+ u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
+ bool checked_extent_shared = false;
+ int ret;
/*
- * lookup the last file extent. We're not using i_size here
- * because there might be preallocation past i_size
+ * There can be no delalloc past i_size, so don't waste time looking for
+ * it beyond i_size.
*/
- ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
- 0);
- if (ret < 0) {
- goto out_free_ulist;
- } else {
- WARN_ON(!ret);
- if (ret == 1)
- ret = 0;
- }
+ while (cur_offset < end && cur_offset < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ u64 prealloc_start;
+ u64 prealloc_len = 0;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
- path->slots[0]--;
- btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
- found_type = found_key.type;
-
- /* No extents, but there might be delalloc bits */
- if (found_key.objectid != btrfs_ino(inode) ||
- found_type != BTRFS_EXTENT_DATA_KEY) {
- /* have to trust i_size as the end */
- last = (u64)-1;
- last_for_get_extent = isize;
- } else {
/*
- * remember the start of the last extent. There are a
- * bunch of different factors that go into the length of the
- * extent, so its much less complex to remember where it started
+ * If this is a prealloc extent we have to report every section
+ * of it that has no delalloc.
*/
- last = found_key.offset;
- last_for_get_extent = last + 1;
+ if (disk_bytenr != 0) {
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = delalloc_start - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = delalloc_start - prealloc_start;
+ }
+ }
+
+ if (prealloc_len > 0) {
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode->root,
+ ino, disk_bytenr,
+ extent_gen, roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+
+ checked_extent_shared = true;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ extent_offset += prealloc_len;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
+ delalloc_end + 1 - delalloc_start,
+ FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ if (ret)
+ return ret;
+
+ last_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ extent_offset += cur_offset - delalloc_start;
+ cond_resched();
}
- btrfs_release_path(path);
/*
- * we might have some extents allocated but more delalloc past those
- * extents. so, we trust isize unless the start of the last extent is
- * beyond isize
+ * Either we found no delalloc for the whole prealloc extent or we have
+ * a prealloc extent that spans i_size or starts at or after i_size.
*/
- if (last < isize) {
- last = (u64)-1;
- last_for_get_extent = isize;
+ if (disk_bytenr != 0 && last_delalloc_end < end) {
+ u64 prealloc_start;
+ u64 prealloc_len;
+
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = end + 1 - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = end + 1 - prealloc_start;
+ }
+
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode->root,
+ ino, disk_bytenr,
+ extent_gen, roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
}
- lock_extent_bits(&inode->io_tree, start, start + len - 1,
- &cached_state);
+ return 0;
+}
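A worked example of how this function interleaves prealloc and delalloc reporting (offsets invented for illustration): a prealloc extent covering file range [0, 1 MiB) with delalloc over [256 KiB, 512 KiB) is reported as three fiemap entries.

/*
 * Illustration only:
 *   [0, 256K)    -> FIEMAP_EXTENT_UNWRITTEN (prealloc); the shared
 *                   check runs once and is cached via
 *                   checked_extent_shared
 *   [256K, 512K) -> FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN
 *   [512K, 1M)   -> FIEMAP_EXTENT_UNWRITTEN (trailing prealloc part,
 *                   emitted by the post-loop block above)
 */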
- em = get_extent_skip_holes(inode, start, last_for_get_extent);
- if (!em)
- goto out;
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ u64 *last_extent_end_ret)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 disk_bytenr;
+ int ret;
+
+ /*
+ * Lookup the last file extent. We're not using i_size here because
+ * there might be preallocation past i_size.
+ */
+ ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
+ /* There can't be a file extent item at offset (u64)-1 */
+ ASSERT(ret != 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * For a non-existing key, btrfs_search_slot() always leaves us at a
+ * slot > 0, unless the btree is empty, which is impossible because
+ * it has at least the inode item for this inode and all the items for
+ * the root inode 256.
+ */
+ ASSERT(path->slots[0] > 0);
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* No file extent items in the subvolume tree. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+
+ /*
+ * For an inline extent, the disk_bytenr is where the inline data starts,
+ * so first check if we have an inline extent item before checking if we
+ * have an implicit hole (disk_bytenr == 0).
+ */
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+ }
+
+ /*
+ * Find the last file extent item that is not a hole (when NO_HOLES is
+ * not enabled). This should take at most 2 iterations in the worst
+ * case: we have one hole file extent item at slot 0 of a leaf and
+ * another hole file extent item as the last item in the previous leaf.
+ * This is because we merge file extent items that represent holes.
+ */
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ while (disk_bytenr == 0) {
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /* No file extent items that are not holes. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ }
+
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+}
+
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct extent_state *cached_state = NULL;
+ struct btrfs_path *path;
+ struct btrfs_root *root = inode->root;
+ struct fiemap_cache cache = { 0 };
+ struct btrfs_backref_shared_cache *backref_cache;
+ struct ulist *roots;
+ struct ulist *tmp_ulist;
+ u64 last_extent_end;
+ u64 prev_extent_end;
+ u64 lockstart;
+ u64 lockend;
+ bool stopped = false;
+ int ret;
+
+ backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL);
+ path = btrfs_alloc_path();
+ roots = ulist_alloc(GFP_KERNEL);
+ tmp_ulist = ulist_alloc(GFP_KERNEL);
+ if (!backref_cache || !path || !roots || !tmp_ulist) {
+ ret = -ENOMEM;
goto out;
}
- while (!end) {
- u64 offset_in_extent = 0;
+ lockstart = round_down(start, root->fs_info->sectorsize);
+ lockend = round_up(start + len, root->fs_info->sectorsize);
+ prev_extent_end = lockstart;
- /* break if the extent we found is outside the range */
- if (em->start >= max || extent_map_end(em) < off)
- break;
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
- /*
- * get_extent may return an extent that starts before our
- * requested range. We have to make sure the ranges
- * we return to fiemap always move forward and don't
- * overlap, so adjust the offsets here
- */
- em_start = max(em->start, off);
+ ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
+ if (ret < 0)
+ goto out_unlock;
+ btrfs_release_path(path);
+ path->reada = READA_FORWARD;
+ ret = fiemap_search_slot(inode, path, lockstart);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
/*
- * record the offset from the start of the extent
- * for adjusting the disk offset below. Only do this if the
- * extent isn't compressed since our in ram offset may be past
- * what we have actually allocated on disk.
+ * No file extent item found, but we may have delalloc between
+ * the current offset and i_size. So check for that.
*/
- if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- offset_in_extent = em_start - em->start;
- em_end = extent_map_end(em);
- em_len = em_end - em_start;
- flags = 0;
- if (em->block_start < EXTENT_MAP_LAST_BYTE)
- disko = em->block_start + offset_in_extent;
- else
- disko = 0;
+ ret = 0;
+ goto check_eof_delalloc;
+ }
+
+ while (prev_extent_end < lockend) {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 extent_end;
+ u64 extent_len;
+ u64 extent_offset = 0;
+ u64 extent_gen;
+ u64 disk_bytenr = 0;
+ u64 flags = 0;
+ int extent_type;
+ u8 compression;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ extent_end = btrfs_file_extent_end(path);
/*
- * bump off for our next call to get_extent
+ * The first iteration can leave us at an extent item that ends
+ * before our range's start. Move to the next item.
*/
- off = extent_map_end(em);
- if (off >= max)
- end = 1;
-
- if (em->block_start == EXTENT_MAP_LAST_BYTE) {
- end = 1;
- flags |= FIEMAP_EXTENT_LAST;
- } else if (em->block_start == EXTENT_MAP_INLINE) {
- flags |= (FIEMAP_EXTENT_DATA_INLINE |
- FIEMAP_EXTENT_NOT_ALIGNED);
- } else if (em->block_start == EXTENT_MAP_DELALLOC) {
- flags |= (FIEMAP_EXTENT_DELALLOC |
- FIEMAP_EXTENT_UNKNOWN);
- } else if (fieinfo->fi_extents_max) {
- u64 bytenr = em->block_start -
- (em->start - em->orig_start);
+ if (extent_end <= lockstart)
+ goto next_item;
- /*
- * As btrfs supports shared space, this information
- * can be exported to userspace tools via
- * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
- * then we're just getting a count and we can skip the
- * lookup stuff.
- */
- ret = btrfs_check_shared(root, btrfs_ino(inode),
- bytenr, roots, tmp_ulist);
- if (ret < 0)
- goto out_free;
- if (ret)
- flags |= FIEMAP_EXTENT_SHARED;
- ret = 0;
+ /* We have an implicit hole (NO_HOLES feature enabled). */
+ if (prev_extent_end < key.offset) {
+ const u64 range_end = min(key.offset, lockend) - 1;
+
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache, 0, 0, 0,
+ roots, tmp_ulist,
+ prev_extent_end, range_end);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
+ }
+
+ /* We've reached the end of the fiemap range, stop. */
+ if (key.offset >= lockend) {
+ stopped = true;
+ break;
+ }
}
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+
+ extent_len = extent_end - key.offset;
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ compression = btrfs_file_extent_compression(leaf, ei);
+ extent_type = btrfs_file_extent_type(leaf, ei);
+ extent_gen = btrfs_file_extent_generation(leaf, ei);
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ if (compression == BTRFS_COMPRESS_NONE)
+ extent_offset = btrfs_file_extent_offset(leaf, ei);
+ }
+
+ if (compression != BTRFS_COMPRESS_NONE)
flags |= FIEMAP_EXTENT_ENCODED;
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- flags |= FIEMAP_EXTENT_UNWRITTEN;
- free_extent_map(em);
- em = NULL;
- if ((em_start >= last) || em_len == (u64)-1 ||
- (last == (u64)-1 && isize <= em_end)) {
- flags |= FIEMAP_EXTENT_LAST;
- end = 1;
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
+ flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
+ extent_len, flags);
+ } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache,
+ disk_bytenr, extent_offset,
+ extent_gen, roots, tmp_ulist,
+ key.offset, extent_end - 1);
+ } else if (disk_bytenr == 0) {
+ /* We have an explicit hole. */
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache, 0, 0, 0,
+ roots, tmp_ulist,
+ key.offset, extent_end - 1);
+ } else {
+ /* We have a regular extent. */
+ if (fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(root, ino,
+ disk_bytenr,
+ extent_gen,
+ roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ goto out_unlock;
+ else if (ret > 0)
+ flags |= FIEMAP_EXTENT_SHARED;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
+ disk_bytenr + extent_offset,
+ extent_len, flags);
}
- /* now scan forward to see if this is really the last extent. */
- em = get_extent_skip_holes(inode, off, last_for_get_extent);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out;
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
}
- if (!em) {
- flags |= FIEMAP_EXTENT_LAST;
- end = 1;
+
+ prev_extent_end = extent_end;
+next_item:
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out_unlock;
}
- ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
- em_len, flags);
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto out_free;
+
+ ret = fiemap_next_leaf_item(inode, path);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* No more file extent items for this inode. */
+ break;
}
+ cond_resched();
}
-out_free:
- if (!ret)
- ret = emit_last_fiemap_cache(fieinfo, &cache);
- free_extent_map(em);
-out:
- unlock_extent_cached(&inode->io_tree, start, start + len - 1,
- &cached_state);
-out_free_ulist:
+check_eof_delalloc:
+ /*
+ * Release (and free) the path before emitting any final entries to
+ * fiemap_fill_next_extent() to keep lockdep happy. This is because
+ * once we have found that no more file extent items exist, we may
+ * hold a non-cloned leaf, and fiemap_fill_next_extent() can trigger page
+ * faults when copying data to the user space buffer.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+
+ if (!stopped && prev_extent_end < lockend) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache,
+ 0, 0, 0, roots, tmp_ulist,
+ prev_extent_end, lockend - 1);
+ if (ret < 0)
+ goto out_unlock;
+ prev_extent_end = lockend;
+ }
+
+ if (cache.cached && cache.offset + cache.len >= last_extent_end) {
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+
+ if (prev_extent_end < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode,
+ prev_extent_end,
+ i_size - 1,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ } else {
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ }
+ }
+
+ ret = emit_last_fiemap_cache(fieinfo, &cache);
+
+out_unlock:
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+out:
+ kfree(backref_cache);
btrfs_free_path(path);
ulist_free(roots);
ulist_free(tmp_ulist);
@@ -5790,7 +4181,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag
return;
}
- if (fs_info->sectorsize == PAGE_SIZE) {
+ if (fs_info->nodesize >= PAGE_SIZE) {
/*
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
@@ -5864,7 +4255,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
btrfs_release_extent_buffer_pages(eb);
- btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
+ btrfs_leak_debug_del_eb(eb);
__free_extent_buffer(eb);
}
@@ -5881,8 +4272,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
eb->bflags = 0;
init_rwsem(&eb->lock);
- btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
- &fs_info->allocated_ebs);
+ btrfs_leak_debug_add_eb(eb);
INIT_LIST_HEAD(&eb->release_list);
spin_lock_init(&eb->refs_lock);
@@ -5897,9 +4287,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
int i;
- struct page *p;
struct extent_buffer *new;
int num_pages = num_extent_pages(src);
+ int ret;
new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
if (new == NULL)
@@ -5912,22 +4302,23 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+ memset(new->pages, 0, sizeof(*new->pages) * num_pages);
+ ret = btrfs_alloc_page_array(num_pages, new->pages);
+ if (ret) {
+ btrfs_release_extent_buffer(new);
+ return NULL;
+ }
+
for (i = 0; i < num_pages; i++) {
int ret;
+ struct page *p = new->pages[i];
- p = alloc_page(GFP_NOFS);
- if (!p) {
- btrfs_release_extent_buffer(new);
- return NULL;
- }
ret = attach_extent_buffer_page(new, p, NULL);
if (ret < 0) {
- put_page(p);
btrfs_release_extent_buffer(new);
return NULL;
}
WARN_ON(PageDirty(p));
- new->pages[i] = p;
copy_page(page_address(p), page_address(src->pages[i]));
}
set_extent_buffer_uptodate(new);
@@ -5941,31 +4332,36 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb;
int num_pages;
int i;
+ int ret;
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
num_pages = num_extent_pages(eb);
+ ret = btrfs_alloc_page_array(num_pages, eb->pages);
+ if (ret)
+ goto err;
+
for (i = 0; i < num_pages; i++) {
- int ret;
+ struct page *p = eb->pages[i];
- eb->pages[i] = alloc_page(GFP_NOFS);
- if (!eb->pages[i])
- goto err;
- ret = attach_extent_buffer_page(eb, eb->pages[i], NULL);
+ ret = attach_extent_buffer_page(eb, p, NULL);
if (ret < 0)
goto err;
}
+
set_extent_buffer_uptodate(eb);
btrfs_set_header_nritems(eb, 0);
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
return eb;
err:
- for (; i > 0; i--) {
- detach_extent_buffer_page(eb, eb->pages[i - 1]);
- __free_page(eb->pages[i - 1]);
+ for (i = 0; i < num_pages; i++) {
+ if (eb->pages[i]) {
+ detach_extent_buffer_page(eb, eb->pages[i]);
+ __free_page(eb->pages[i]);
+ }
}
__free_extent_buffer(eb);
return NULL;
@@ -5987,10 +4383,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
*
* It is only cleared in two cases: freeing the last non-tree
* reference to the extent_buffer when its STALE bit is set or
- * calling releasepage when the tree reference is the only reference.
+ * calling release_folio when the tree reference is the only reference.
*
* In both cases, care is taken to ensure that the extent_buffer's
- * pages are not under io. However, releasepage can be concurrently
+ * pages are not under io. However, release_folio can be concurrently
* called with creating new references, which is prone to race
* conditions between the calls to check_buffer_tree_ref in those
* codepaths and clearing TREE_REF in try_release_extent_buffer.
@@ -6110,7 +4506,7 @@ static struct extent_buffer *grab_extent_buffer(
* don't try to insert two ebs for the same bytenr. So here we always
* return NULL and just continue.
*/
- if (fs_info->sectorsize < PAGE_SIZE)
+ if (fs_info->nodesize < PAGE_SIZE)
return NULL;
/* Page not yet attached to an extent buffer */
@@ -6132,6 +4528,30 @@ static struct extent_buffer *grab_extent_buffer(
return NULL;
}
+static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
+{
+ if (!IS_ALIGNED(start, fs_info->sectorsize)) {
+ btrfs_err(fs_info, "bad tree block start %llu", start);
+ return -EINVAL;
+ }
+
+ if (fs_info->nodesize < PAGE_SIZE &&
+ offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
+ btrfs_err(fs_info,
+ "tree block crosses page boundary, start %llu nodesize %u",
+ start, fs_info->nodesize);
+ return -EINVAL;
+ }
+ if (fs_info->nodesize >= PAGE_SIZE &&
+ !PAGE_ALIGNED(start)) {
+ btrfs_err(fs_info,
+ "tree block is not page aligned, start %llu nodesize %u",
+ start, fs_info->nodesize);
+ return -EINVAL;
+ }
+ return 0;
+}
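Concrete instances of the three checks above (numbers are illustrative, not from the patch):

/*
 * Illustrative numbers only (4K sectorsize assumed):
 *  - start = 6K fails IS_ALIGNED(start, sectorsize).
 *  - 64K pages, 16K nodesize (subpage): start = 60K fails, because
 *    offset_in_page(60K) + 16K = 76K crosses the 64K page boundary.
 *  - 64K pages, 64K nodesize: start = 16K is sectorsize aligned but
 *    fails !PAGE_ALIGNED(start); such tree blocks must start on a page.
 */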
+
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
@@ -6143,13 +4563,12 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
struct extent_buffer *exists = NULL;
struct page *p;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
+ u64 lockdep_owner = owner_root;
int uptodate = 1;
int ret;
- if (!IS_ALIGNED(start, fs_info->sectorsize)) {
- btrfs_err(fs_info, "bad tree block start %llu", start);
+ if (check_eb_alignment(fs_info, start))
return ERR_PTR(-EINVAL);
- }
#if BITS_PER_LONG == 32
if (start >= MAX_LFS_FILESIZE) {
@@ -6162,14 +4581,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
btrfs_warn_32bit_limit(fs_info);
#endif
- if (fs_info->sectorsize < PAGE_SIZE &&
- offset_in_page(start) + len > PAGE_SIZE) {
- btrfs_err(fs_info,
- "tree block crosses page boundary, start %llu nodesize %lu",
- start, len);
- return ERR_PTR(-EINVAL);
- }
-
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
@@ -6177,7 +4588,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return ERR_PTR(-ENOMEM);
- btrfs_set_buffer_lockdep_class(owner_root, eb, level);
+
+ /*
+ * The reloc trees are just snapshots, so we need them to appear to be
+ * just like any other fs tree WRT lockdep.
+ */
+ if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
+ lockdep_owner = BTRFS_FS_TREE_OBJECTID;
+
+ btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++, index++) {
@@ -6199,7 +4618,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
* page, but it may change in the future for 16K page size
* support, so we still preallocate the memory in the loop.
*/
- if (fs_info->sectorsize < PAGE_SIZE) {
+ if (fs_info->nodesize < PAGE_SIZE) {
prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
ret = PTR_ERR(prealloc);
@@ -6243,7 +4662,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
/*
* We can't unlock the pages just yet since the extent buffer
* hasn't been properly inserted in the radix tree, this
- * opens a race with btree_releasepage which can free a page
+ * opens a race with btree_release_folio which can free a page
* while we are still filling in all pages for the buffer and
* we could crash.
*/
@@ -6275,7 +4694,7 @@ again:
/*
* Now it's safe to unlock the pages because any calls to
- * btree_releasepage will correctly detect that a page belongs to a
+ * btree_release_folio will correctly detect that a page belongs to a
* live buffer and won't free them prematurely.
*/
for (i = 0; i < num_pages; i++)
@@ -6321,7 +4740,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
- btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
+ btrfs_leak_debug_del_eb(eb);
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -6341,18 +4760,16 @@ static int release_extent_buffer(struct extent_buffer *eb)
void free_extent_buffer(struct extent_buffer *eb)
{
int refs;
- int old;
if (!eb)
return;
+ refs = atomic_read(&eb->refs);
while (1) {
- refs = atomic_read(&eb->refs);
if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
|| (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
refs == 1))
break;
- old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
- if (old == refs)
+ if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1))
return;
}
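The atomic_try_cmpxchg() form updates its "expected" argument in place on failure, which is why the re-read could move out of the loop. The shape of the pattern, as a standalone sketch using GCC's __atomic builtins (not the kernel's atomic_t API):

	#include <stdbool.h>

	static int refs;

	/* Decrement refs unless it is at or below the floor; returns true
	 * if this call performed the decrement. */
	static bool dec_unless_low(int floor)
	{
		int expected = __atomic_load_n(&refs, __ATOMIC_RELAXED);

		while (expected > floor) {
			/* On failure, 'expected' is refreshed with the
			 * current value, so the loop needs no explicit
			 * re-read at the top. */
			if (__atomic_compare_exchange_n(&refs, &expected,
							expected - 1, false,
							__ATOMIC_SEQ_CST,
							__ATOMIC_RELAXED))
				return true;
		}
		return false;	/* caller takes the slow path */
	}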
@@ -6418,7 +4835,7 @@ void clear_extent_buffer_dirty(const struct extent_buffer *eb)
int num_pages;
struct page *page;
- if (eb->fs_info->sectorsize < PAGE_SIZE)
+ if (eb->fs_info->nodesize < PAGE_SIZE)
return clear_subpage_extent_buffer_dirty(eb);
num_pages = num_extent_pages(eb);
@@ -6450,7 +4867,7 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
if (!was_dirty) {
- bool subpage = eb->fs_info->sectorsize < PAGE_SIZE;
+ bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
/*
* For subpage case, we can have other extent buffers in the
@@ -6490,9 +4907,18 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb)
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
- if (page)
- btrfs_page_clear_uptodate(fs_info, page,
- eb->start, eb->len);
+ if (!page)
+ continue;
+
+ /*
+ * This is special handling for metadata subpage, as regular
+ * btrfs_is_subpage() cannot handle cloned/dummy metadata.
+ */
+ if (fs_info->nodesize >= PAGE_SIZE)
+ ClearPageUptodate(page);
+ else
+ btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
+ eb->len);
}
}
@@ -6507,7 +4933,16 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
- btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len);
+
+ /*
+ * This is special handling for metadata subpage, as regular
+ * btrfs_is_subpage() cannot handle cloned/dummy metadata.
+ */
+ if (fs_info->nodesize >= PAGE_SIZE)
+ SetPageUptodate(page);
+ else
+ btrfs_subpage_set_uptodate(fs_info, page, eb->start,
+ eb->len);
}
}
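The split matters because with nodesize < PAGE_SIZE several tree blocks share one page, so a page-wide flag is too coarse. A quick back-of-the-envelope with assumed values:

	/* 64K pages, 16K nodes: PAGE_SIZE / nodesize = 65536 / 16384 = 4
	 * tree blocks per page. SetPageUptodate() would mark all four at
	 * once, so the btrfs_subpage_* helpers track uptodate per sector
	 * instead: PAGE_SIZE / sectorsize = 65536 / 4096 = 16 bits per
	 * page. */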
@@ -6517,7 +4952,9 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
struct page *page = eb->pages[0];
- struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .mirror_num = mirror_num,
+ };
int ret = 0;
ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
@@ -6528,7 +4965,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
return -EAGAIN;
} else {
- ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+ ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL);
if (ret < 0)
return ret;
}
@@ -6538,7 +4975,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
PageUptodate(page) ||
btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL);
return ret;
}
@@ -6546,14 +4983,14 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
eb->read_mirror = 0;
atomic_set(&eb->io_pages, 1);
check_buffer_tree_ref(eb);
+ bio_ctrl.end_io_func = end_bio_extent_readpage;
+
btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
- ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl,
- page, eb->start, eb->len,
- eb->start - page_offset(page),
- end_bio_extent_readpage, mirror_num, 0,
- true);
+ ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
+ eb->start, page, eb->len,
+ eb->start - page_offset(page), 0, true);
if (ret) {
/*
* In the endio function, if we hit something wrong we will
@@ -6562,14 +4999,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
*/
atomic_dec(&eb->io_pages);
}
- if (bio_ctrl.bio) {
- int tmp;
-
- tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0);
- bio_ctrl.bio = NULL;
- if (tmp < 0)
- return tmp;
- }
+ submit_one_bio(&bio_ctrl);
if (ret || wait != WAIT_COMPLETE)
return ret;
@@ -6589,7 +5019,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
int all_uptodate = 1;
int num_pages;
unsigned long num_reads = 0;
- struct btrfs_bio_ctrl bio_ctrl = { 0 };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .mirror_num = mirror_num,
+ };
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -6602,7 +5034,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
return -EIO;
- if (eb->fs_info->sectorsize < PAGE_SIZE)
+ if (eb->fs_info->nodesize < PAGE_SIZE)
return read_extent_buffer_subpage(eb, wait, mirror_num);
num_pages = num_extent_pages(eb);
@@ -6645,10 +5077,11 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
eb->read_mirror = 0;
atomic_set(&eb->io_pages, num_reads);
/*
- * It is possible for releasepage to clear the TREE_REF bit before we
+ * It is possible for release_folio to clear the TREE_REF bit before we
* set io_pages. See check_buffer_tree_ref for a more detailed comment.
*/
check_buffer_tree_ref(eb);
+ bio_ctrl.end_io_func = end_bio_extent_readpage;
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
@@ -6660,10 +5093,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
ClearPageError(page);
- err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
- &bio_ctrl, page, page_offset(page),
- PAGE_SIZE, 0, end_bio_extent_readpage,
- mirror_num, 0, false);
+ err = submit_extent_page(REQ_OP_READ, NULL,
+ &bio_ctrl, page_offset(page), page,
+ PAGE_SIZE, 0, 0, false);
if (err) {
/*
* We failed to submit the bio so it's the
@@ -6680,12 +5112,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
}
- if (bio_ctrl.bio) {
- err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags);
- bio_ctrl.bio = NULL;
- if (err)
- return err;
- }
+ submit_one_bio(&bio_ctrl);
if (ret || wait != WAIT_COMPLETE)
return ret;
@@ -6857,7 +5284,7 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
* would have !PageUptodate && !PageError, as we clear PageError before
* reading.
*/
- if (fs_info->sectorsize < PAGE_SIZE) {
+ if (fs_info->nodesize < PAGE_SIZE) {
bool uptodate, error;
uptodate = btrfs_subpage_test_uptodate(fs_info, page,
@@ -6959,7 +5386,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst,
ASSERT(dst->len == src->len);
- if (dst->fs_info->sectorsize == PAGE_SIZE) {
+ if (dst->fs_info->nodesize >= PAGE_SIZE) {
num_pages = num_extent_pages(dst);
for (i = 0; i < num_pages; i++)
copy_page(page_address(dst->pages[i]),
@@ -6968,7 +5395,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst,
size_t src_offset = get_eb_offset_in_page(src, 0);
size_t dst_offset = get_eb_offset_in_page(dst, 0);
- ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
+ ASSERT(src->fs_info->nodesize < PAGE_SIZE);
memcpy(page_address(dst->pages[0]) + dst_offset,
page_address(src->pages[0]) + src_offset,
src->len);
@@ -7361,7 +5788,7 @@ int try_release_extent_buffer(struct page *page)
{
struct extent_buffer *eb;
- if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return try_release_subpage_extent_buffer(page);
/*
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 0399cf8e3c32..7929f054dda3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -7,15 +7,9 @@
#include <linux/refcount.h>
#include <linux/fiemap.h>
#include <linux/btrfs_tree.h>
+#include "compression.h"
#include "ulist.h"
-/*
- * flags for bio submission. The high bits indicate the compression
- * type for this bio
- */
-#define EXTENT_BIO_COMPRESSED 1
-#define EXTENT_BIO_FLAG_SHIFT 16
-
enum {
EXTENT_BUFFER_UPTODATE,
EXTENT_BUFFER_DIRTY,
@@ -32,7 +26,6 @@ enum {
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
EXTENT_BUFFER_NO_CHECK,
- EXTENT_BUFFER_ZONE_FINISH,
};
/* these are flags for __process_pages_contig */
@@ -64,16 +57,19 @@ enum {
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
+struct btrfs_bio;
struct btrfs_root;
struct btrfs_inode;
-struct btrfs_io_bio;
struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;
-typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
+int __init extent_buffer_init_cachep(void);
+void __cold extent_buffer_free_cachep(void);
+
+typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
int mirror_num,
- unsigned long bio_flags);
+ enum btrfs_compression_type compress_type);
typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode,
struct bio *bio, u64 dio_file_offset);
@@ -103,22 +99,11 @@ struct extent_buffer {
};
/*
- * Structure to record info about the bio being assembled, and other info like
- * how many bytes are there before stripe/ordered extent boundary.
- */
-struct btrfs_bio_ctrl {
- struct bio *bio;
- unsigned long bio_flags;
- u32 len_to_stripe_boundary;
- u32 len_to_oe_boundary;
-};
-
-/*
* Structure to record how many bytes and which ranges are set/cleared
*/
struct extent_changeset {
/* How many bytes are set/cleared in this operation */
- unsigned int bytes_changed;
+ u64 bytes_changed;
/* Changed ranges */
struct ulist range_changed;
@@ -158,32 +143,12 @@ static inline void extent_changeset_free(struct extent_changeset *changeset)
kfree(changeset);
}
-static inline void extent_set_compress_type(unsigned long *bio_flags,
- int compress_type)
-{
- *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
-}
-
-static inline int extent_compress_type(unsigned long bio_flags)
-{
- return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
-}
-
struct extent_map_tree;
-typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len);
-
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags);
-int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl,
- unsigned int read_flags, u64 *prev_em_start);
-int extent_write_full_page(struct page *page, struct writeback_control *wbc);
+int btrfs_read_folio(struct file *file, struct folio *folio);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
@@ -277,9 +242,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 bits_to_clear, unsigned long page_ops);
-struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
-struct bio *btrfs_bio_clone(struct bio *bio);
-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
+int extent_invalidate_folio(struct extent_io_tree *tree,
+ struct folio *folio, size_t offset);
+
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
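A caller sketch for btrfs_alloc_page_array() (kernel-context pseudocode, mirroring the error handling of __alloc_dummy_extent_buffer() earlier in this patch: on failure the array may be partially populated, so non-NULL slots must be freed):

	struct page *pages[16] = { NULL };
	int i;

	if (btrfs_alloc_page_array(16, pages)) {
		for (i = 0; i < 16; i++) {
			if (pages[i])
				__free_page(pages[i]);
		}
		return -ENOMEM;
	}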
@@ -293,20 +259,25 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
* bio end_io callback is called to indicate things have failed.
*/
struct io_failure_record {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
struct page *page;
- u64 start;
u64 len;
u64 logical;
- unsigned long bio_flags;
int this_mirror;
int failed_mirror;
+ int num_copies;
};
-int btrfs_repair_one_sector(struct inode *inode,
- struct bio *failed_bio, u32 bio_offset,
- struct page *page, unsigned int pgoff,
- u64 start, int failed_mirror,
+int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
+ u32 bio_offset, struct page *page, unsigned int pgoff,
submit_bio_hook_t *submit_bio_hook);
+void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end);
+int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
+ struct page *page, unsigned int pg_offset);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6fee14ce2e6b..6092a4eedc92 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -7,6 +7,7 @@
#include "volumes.h"
#include "extent_map.h"
#include "compression.h"
+#include "btrfs_inode.h"
static struct kmem_cache *extent_map_cache;
@@ -54,9 +55,7 @@ struct extent_map *alloc_extent_map(void)
if (!em)
return NULL;
RB_CLEAR_NODE(&em->rb_node);
- em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
- em->generation = 0;
refcount_set(&em->refs, 1);
INIT_LIST_HEAD(&em->list);
return em;
@@ -73,7 +72,6 @@ void free_extent_map(struct extent_map *em)
{
if (!em)
return;
- WARN_ON(refcount_read(&em->refs) == 0);
if (refcount_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
@@ -143,8 +141,7 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
* it can't be found, try to find some neighboring extents
*/
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
- struct rb_node **prev_ret,
- struct rb_node **next_ret)
+ struct rb_node **prev_or_next_ret)
{
struct rb_node *n = root->rb_node;
struct rb_node *prev = NULL;
@@ -152,6 +149,8 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
struct extent_map *entry;
struct extent_map *prev_entry = NULL;
+ ASSERT(prev_or_next_ret);
+
while (n) {
entry = rb_entry(n, struct extent_map, rb_node);
prev = n;
@@ -165,24 +164,29 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
return n;
}
- if (prev_ret) {
- orig_prev = prev;
- while (prev && offset >= extent_map_end(prev_entry)) {
- prev = rb_next(prev);
- prev_entry = rb_entry(prev, struct extent_map, rb_node);
- }
- *prev_ret = prev;
- prev = orig_prev;
+ orig_prev = prev;
+ while (prev && offset >= extent_map_end(prev_entry)) {
+ prev = rb_next(prev);
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ }
+
+ /*
+ * Previous extent map found, return as in this case the caller does not
+ * care about the next one.
+ */
+ if (prev) {
+ *prev_or_next_ret = prev;
+ return NULL;
}
- if (next_ret) {
+ prev = orig_prev;
+ prev_entry = rb_entry(prev, struct extent_map, rb_node);
+ while (prev && offset < prev_entry->start) {
+ prev = rb_prev(prev);
prev_entry = rb_entry(prev, struct extent_map, rb_node);
- while (prev && offset < prev_entry->start) {
- prev = rb_prev(prev);
- prev_entry = rb_entry(prev, struct extent_map, rb_node);
- }
- *next_ret = prev;
}
+ *prev_or_next_ret = prev;
+
return NULL;
}
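A worked example of the merged prev-or-next semantics (tree contents assumed):

	/* Tree holds extent maps [0, 50) and [200, 250).
	 *
	 * Search offset 100: no entry contains it; the forward walk stops
	 * at [200, 250), the first entry whose end is beyond 100, which
	 * becomes *prev_or_next_ret -- the nearest neighbour used by the
	 * caller (__lookup_extent_mapping).
	 *
	 * Search offset 300: the forward walk runs off the tree, so we
	 * restore orig_prev and walk backwards, landing on [200, 250). */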
@@ -336,6 +340,8 @@ out:
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
+ lockdep_assert_held_write(&tree->lock);
+
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
if (extent_map_in_tree(em))
try_merge_map(tree, em);
@@ -382,7 +388,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
__clear_extent_bit(&device->alloc_state, stripe->physical,
stripe->physical + stripe_size - 1, bits,
- 0, 0, NULL, GFP_NOWAIT, NULL);
+ NULL, GFP_NOWAIT, NULL);
}
}
@@ -425,16 +431,13 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
{
struct extent_map *em;
struct rb_node *rb_node;
- struct rb_node *prev = NULL;
- struct rb_node *next = NULL;
+ struct rb_node *prev_or_next = NULL;
u64 end = range_end(start, len);
- rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next);
+ rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
if (!rb_node) {
- if (prev)
- rb_node = prev;
- else if (next)
- rb_node = next;
+ if (prev_or_next)
+ rb_node = prev_or_next;
else
return NULL;
}
@@ -658,3 +661,293 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
ASSERT(ret == 0 || ret == -EEXIST);
return ret;
}
+
+/*
+ * Drop all extent maps from a tree in the fastest possible way, rescheduling
+ * if needed. This avoids searching the tree, from the root down to the first
+ * extent map, before each deletion.
+ */
+static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
+{
+ write_lock(&tree->lock);
+ while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
+ struct extent_map *em;
+ struct rb_node *node;
+
+ node = rb_first_cached(&tree->map);
+ em = rb_entry(node, struct extent_map, rb_node);
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ remove_extent_mapping(tree, em);
+ free_extent_map(em);
+ cond_resched_rwlock_write(&tree->lock);
+ }
+ write_unlock(&tree->lock);
+}
+
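The general shape of the teardown pattern above, sketched with placeholder helpers (more_work()/drop_one() are stand-ins, not real functions):

	write_lock(&tree->lock);
	while (more_work(tree)) {	/* stand-in: tree still has entries */
		drop_one(tree);		/* stand-in: remove one entry */
		/* Drops the lock and reschedules if needed, then retakes
		 * it, so tearing down a huge tree cannot hog the CPU. */
		cond_resched_rwlock_write(&tree->lock);
	}
	write_unlock(&tree->lock);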
+/*
+ * Drop all extent maps in a given range.
+ *
+ * @inode: The target inode.
+ * @start: Start offset of the range.
+ * @end: End offset of the range (inclusive value).
+ * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
+ *
+ * This drops all the extent maps that intersect the given range [@start, @end].
+ * Extent maps that partially overlap the range, extending before it or
+ * beyond it, are split.
+ * The caller should have locked an appropriate file range in the inode's io
+ * tree before calling this function.
+ */
+void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
+ bool skip_pinned)
+{
+ struct extent_map *split;
+ struct extent_map *split2;
+ struct extent_map *em;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ u64 len = end - start + 1;
+
+ WARN_ON(end < start);
+ if (end == (u64)-1) {
+ if (start == 0 && !skip_pinned) {
+ drop_all_extent_maps_fast(em_tree);
+ return;
+ }
+ len = (u64)-1;
+ } else {
+ /* Make end offset exclusive for use in the loop below. */
+ end++;
+ }
+
+ /*
+ * It's ok if we fail to allocate the extent maps, see the comment near
+ * the bottom of the loop below. We only need two spare extent maps in
+ * the worst case, where the first extent map that intersects our range
+ * starts before the range and the last extent map that intersects our
+ * range ends after our range (and they might be the same extent map),
+ * because we need to split those two extent maps at the boundaries.
+ */
+ split = alloc_extent_map();
+ split2 = alloc_extent_map();
+
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+
+ while (em) {
+ /* extent_map_end() returns exclusive value (last byte + 1). */
+ const u64 em_end = extent_map_end(em);
+ struct extent_map *next_em = NULL;
+ u64 gen;
+ unsigned long flags;
+ bool modified;
+ bool compressed;
+
+ if (em_end < end) {
+ next_em = next_extent_map(em);
+ if (next_em) {
+ if (next_em->start < end)
+ refcount_inc(&next_em->refs);
+ else
+ next_em = NULL;
+ }
+ }
+
+ if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+ start = em_end;
+ if (end != (u64)-1)
+ len = start + len - em_end;
+ goto next;
+ }
+
+ flags = em->flags;
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ modified = !list_empty(&em->list);
+
+ /*
+ * The extent map does not cross our target range, so no need to
+ * split it, we can remove it directly.
+ */
+ if (em->start >= start && em_end <= end)
+ goto remove_em;
+
+ gen = em->generation;
+ compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
+ if (em->start < start) {
+ if (!split) {
+ split = split2;
+ split2 = NULL;
+ if (!split)
+ goto remove_em;
+ }
+ split->start = em->start;
+ split->len = start - em->start;
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_start = em->orig_start;
+ split->block_start = em->block_start;
+
+ if (compressed)
+ split->block_len = em->block_len;
+ else
+ split->block_len = split->len;
+ split->orig_block_len = max(split->block_len,
+ em->orig_block_len);
+ split->ram_bytes = em->ram_bytes;
+ } else {
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->block_start = em->block_start;
+ split->orig_block_len = 0;
+ split->ram_bytes = split->len;
+ }
+
+ split->generation = gen;
+ split->flags = flags;
+ split->compress_type = em->compress_type;
+ replace_extent_mapping(em_tree, em, split, modified);
+ free_extent_map(split);
+ split = split2;
+ split2 = NULL;
+ }
+ if (em_end > end) {
+ if (!split) {
+ split = split2;
+ split2 = NULL;
+ if (!split)
+ goto remove_em;
+ }
+ split->start = start + len;
+ split->len = em_end - (start + len);
+ split->block_start = em->block_start;
+ split->flags = flags;
+ split->compress_type = em->compress_type;
+ split->generation = gen;
+
+ if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ split->orig_block_len = max(em->block_len,
+ em->orig_block_len);
+
+ split->ram_bytes = em->ram_bytes;
+ if (compressed) {
+ split->block_len = em->block_len;
+ split->orig_start = em->orig_start;
+ } else {
+ const u64 diff = start + len - em->start;
+
+ split->block_len = split->len;
+ split->block_start += diff;
+ split->orig_start = em->orig_start;
+ }
+ } else {
+ split->ram_bytes = split->len;
+ split->orig_start = split->start;
+ split->block_len = 0;
+ split->orig_block_len = 0;
+ }
+
+ if (extent_map_in_tree(em)) {
+ replace_extent_mapping(em_tree, em, split,
+ modified);
+ } else {
+ int ret;
+
+ ret = add_extent_mapping(em_tree, split,
+ modified);
+ /* Logic error, shouldn't happen. */
+ ASSERT(ret == 0);
+ if (WARN_ON(ret != 0) && modified)
+ btrfs_set_inode_full_sync(inode);
+ }
+ free_extent_map(split);
+ split = NULL;
+ }
+remove_em:
+ if (extent_map_in_tree(em)) {
+ /*
+ * If the extent map is still in the tree it means that
+ * either of the following is true:
+ *
+ * 1) It fits entirely in our range (doesn't end beyond
+ * it or starts before it);
+ *
+ * 2) It starts before our range and/or ends after our
+ * range, and we were not able to allocate the extent
+ * maps for split operations, @split and @split2.
+ *
+ * If we are at case 2) then we just remove the entire
+ * extent map - this is fine since anyone who needs to
+ * access the subranges outside our range will just
+ * load it again from the subvolume tree's file extent
+ * item. However if the extent map was in the list of
+ * modified extents, then we must mark the inode for a
+ * full fsync, otherwise a fast fsync will miss this
+ * extent if it's new and needs to be logged.
+ */
+ if ((em->start < start || em_end > end) && modified) {
+ ASSERT(!split);
+ btrfs_set_inode_full_sync(inode);
+ }
+ remove_extent_mapping(em_tree, em);
+ }
+
+ /*
+ * Once for the tree reference (we replaced or removed the
+ * extent map from the tree).
+ */
+ free_extent_map(em);
+next:
+ /* Once for us (for our lookup reference). */
+ free_extent_map(em);
+
+ em = next_em;
+ }
+
+ write_unlock(&em_tree->lock);
+
+ free_extent_map(split);
+ free_extent_map(split2);
+}
+
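A caller sketch honouring the locking contract documented above (kernel-context pseudocode; the io tree range must be locked first):

	struct extent_state *cached = NULL;

	lock_extent(&inode->io_tree, start, end, &cached);
	btrfs_drop_extent_map_range(inode, start, end, false /* skip_pinned */);
	unlock_extent(&inode->io_tree, start, end, &cached);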
+/*
+ * Replace a range in the inode's extent map tree with a new extent map.
+ *
+ * @inode: The target inode.
+ * @new_em: The new extent map to add to the inode's extent map tree.
+ * @modified: Indicate if the new extent map should be added to the list of
+ * modified extents (for fast fsync tracking).
+ *
+ * Drops all the extent maps in the inode's extent map tree that intersect the
+ * range of the new extent map and adds the new extent map to the tree.
+ * The caller should have locked an appropriate file range in the inode's io
+ * tree before calling this function.
+ */
+int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
+ struct extent_map *new_em,
+ bool modified)
+{
+ const u64 end = new_em->start + new_em->len - 1;
+ struct extent_map_tree *tree = &inode->extent_tree;
+ int ret;
+
+ ASSERT(!extent_map_in_tree(new_em));
+
+ /*
+ * The caller has locked an appropriate file range in the inode's io
+ * tree, but getting -EEXIST when adding the new extent map can still
+ * happen in case there are extents that partially cover the range, and
+ * this is due to two tasks operating on different parts of the extent.
+ * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
+ * btrfs_get_extent") for an example and details.
+ */
+ do {
+ btrfs_drop_extent_map_range(inode, new_em->start, end, false);
+ write_lock(&tree->lock);
+ ret = add_extent_mapping(tree, new_em, modified);
+ write_unlock(&tree->lock);
+ } while (ret == -EEXIST);
+
+ return ret;
+}
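Its first caller is the fill_holes() conversion later in this patch; abridged (kernel-context sketch, several extent map fields omitted):

	hole_em = alloc_extent_map();
	if (hole_em) {
		hole_em->start = offset;
		hole_em->len = end - offset;
		hole_em->block_start = EXTENT_MAP_HOLE;	/* other fields omitted */
		/* Loops internally while add_extent_mapping() returns -EEXIST. */
		ret = btrfs_replace_extent_map_range(inode, hole_em, true);
		free_extent_map(hole_em);
	}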
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index d2fa32ffe304..ad311864272a 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -63,6 +63,8 @@ struct extent_map_tree {
rwlock_t lock;
};
+struct btrfs_inode;
+
static inline int extent_map_in_tree(const struct extent_map *em)
{
return !RB_EMPTY_NODE(&em->rb_node);
@@ -104,5 +106,11 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree,
struct extent_map **em_in, u64 start, u64 len);
+void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
+ u64 start, u64 end,
+ bool skip_pinned);
+int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
+ struct extent_map *new_em,
+ bool modified);
#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index c828f971a346..6bb9fa961a6a 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -118,7 +118,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
return clear_extent_bit(&inode->file_extent_tree, start,
- start + len - 1, EXTENT_DIRTY, 0, 0, NULL);
+ start + len - 1, EXTENT_DIRTY, NULL);
}
static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
@@ -129,12 +129,20 @@ static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
return ncsums * fs_info->sectorsize;
}
-int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+/*
+ * Calculate the total size needed to allocate for an ordered sum structure
+ * spanning @bytes in the file.
+ */
+static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes)
+{
+ int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
+
+ return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size;
+}
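For instance (values assumed: 4K sectors, crc32c so csum_size == 4), an ordered extent spanning 16K of data needs:

	/* DIV_ROUND_UP(16384, 4096) = 4 sectors
	 * size = sizeof(struct btrfs_ordered_sum) + 4 * 4
	 *      = header + 16 bytes of checksums */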
+
+int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 objectid, u64 pos,
- u64 disk_offset, u64 disk_num_bytes,
- u64 num_bytes, u64 offset, u64 ram_bytes,
- u8 compression, u8 encryption, u16 other_encoding)
+ u64 objectid, u64 pos, u64 num_bytes)
{
int ret = 0;
struct btrfs_file_extent_item *item;
@@ -157,16 +165,16 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
- btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
- btrfs_set_file_extent_offset(leaf, item, offset);
+ btrfs_set_file_extent_disk_bytenr(leaf, item, 0);
+ btrfs_set_file_extent_disk_num_bytes(leaf, item, 0);
+ btrfs_set_file_extent_offset(leaf, item, 0);
btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
- btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, item, num_bytes);
btrfs_set_file_extent_generation(leaf, item, trans->transid);
btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
- btrfs_set_file_extent_compression(leaf, item, compression);
- btrfs_set_file_extent_encryption(leaf, item, encryption);
- btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+ btrfs_set_file_extent_compression(leaf, item, 0);
+ btrfs_set_file_extent_encryption(leaf, item, 0);
+ btrfs_set_file_extent_other_encoding(leaf, item, 0);
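+ /*
+ * The result is a regular file extent item whose disk_bytenr and
+ * disk_num_bytes are zero, which btrfs interprets as a hole spanning
+ * num_bytes of the file.
+ */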
btrfs_mark_buffer_dirty(leaf);
out:
@@ -503,7 +511,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list, int search_commit)
+ struct list_head *list, int search_commit,
+ bool nowait)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
@@ -525,6 +534,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (!path)
return -ENOMEM;
+ path->nowait = nowait;
if (search_commit) {
path->skip_locking = 1;
path->reada = READA_FORWARD;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9f455c96c974..176b432035ae 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -473,7 +473,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
*/
clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, cached);
+ cached);
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
extra_bits, cached);
@@ -499,159 +499,6 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
}
/*
- * this drops all the extents in the cache that intersect the range
- * [start, end]. Existing extents are split as required.
- */
-void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
- int skip_pinned)
-{
- struct extent_map *em;
- struct extent_map *split = NULL;
- struct extent_map *split2 = NULL;
- struct extent_map_tree *em_tree = &inode->extent_tree;
- u64 len = end - start + 1;
- u64 gen;
- int ret;
- int testend = 1;
- unsigned long flags;
- int compressed = 0;
- bool modified;
-
- WARN_ON(end < start);
- if (end == (u64)-1) {
- len = (u64)-1;
- testend = 0;
- }
- while (1) {
- int no_splits = 0;
-
- modified = false;
- if (!split)
- split = alloc_extent_map();
- if (!split2)
- split2 = alloc_extent_map();
- if (!split || !split2)
- no_splits = 1;
-
- write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em) {
- write_unlock(&em_tree->lock);
- break;
- }
- flags = em->flags;
- gen = em->generation;
- if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
- if (testend && em->start + em->len >= start + len) {
- free_extent_map(em);
- write_unlock(&em_tree->lock);
- break;
- }
- start = em->start + em->len;
- if (testend)
- len = start + len - (em->start + em->len);
- free_extent_map(em);
- write_unlock(&em_tree->lock);
- continue;
- }
- compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &flags);
- modified = !list_empty(&em->list);
- if (no_splits)
- goto next;
-
- if (em->start < start) {
- split->start = em->start;
- split->len = start - em->start;
-
- if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- split->orig_start = em->orig_start;
- split->block_start = em->block_start;
-
- if (compressed)
- split->block_len = em->block_len;
- else
- split->block_len = split->len;
- split->orig_block_len = max(split->block_len,
- em->orig_block_len);
- split->ram_bytes = em->ram_bytes;
- } else {
- split->orig_start = split->start;
- split->block_len = 0;
- split->block_start = em->block_start;
- split->orig_block_len = 0;
- split->ram_bytes = split->len;
- }
-
- split->generation = gen;
- split->flags = flags;
- split->compress_type = em->compress_type;
- replace_extent_mapping(em_tree, em, split, modified);
- free_extent_map(split);
- split = split2;
- split2 = NULL;
- }
- if (testend && em->start + em->len > start + len) {
- u64 diff = start + len - em->start;
-
- split->start = start + len;
- split->len = em->start + em->len - (start + len);
- split->flags = flags;
- split->compress_type = em->compress_type;
- split->generation = gen;
-
- if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- split->orig_block_len = max(em->block_len,
- em->orig_block_len);
-
- split->ram_bytes = em->ram_bytes;
- if (compressed) {
- split->block_len = em->block_len;
- split->block_start = em->block_start;
- split->orig_start = em->orig_start;
- } else {
- split->block_len = split->len;
- split->block_start = em->block_start
- + diff;
- split->orig_start = em->orig_start;
- }
- } else {
- split->ram_bytes = split->len;
- split->orig_start = split->start;
- split->block_len = 0;
- split->block_start = em->block_start;
- split->orig_block_len = 0;
- }
-
- if (extent_map_in_tree(em)) {
- replace_extent_mapping(em_tree, em, split,
- modified);
- } else {
- ret = add_extent_mapping(em_tree, split,
- modified);
- ASSERT(ret == 0); /* Logic error */
- }
- free_extent_map(split);
- split = NULL;
- }
-next:
- if (extent_map_in_tree(em))
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* once for us */
- free_extent_map(em);
- /* once for the tree*/
- free_extent_map(em);
- }
- if (split)
- free_extent_map(split);
- if (split2)
- free_extent_map(split2);
-}
-
-/*
* this is very complex, but the basic idea is to drop all extents
* in the range start - end. hint_block is filled in with a block number
* that would be a good hint to the block allocator for this file.
@@ -708,7 +555,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
}
if (args->drop_cache)
- btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0);
+ btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
if (args->start >= inode->disk_i_size && !args->replace_extent)
modify_tree = 0;
@@ -1307,11 +1154,12 @@ static int prepare_uptodate_page(struct inode *inode,
struct page *page, u64 pos,
bool force_uptodate)
{
+ struct folio *folio = page_folio(page);
int ret = 0;
if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
!PageUptodate(page)) {
- ret = btrfs_readpage(NULL, page);
+ ret = btrfs_read_folio(NULL, folio);
if (ret)
return ret;
lock_page(page);
@@ -1321,8 +1169,8 @@ static int prepare_uptodate_page(struct inode *inode,
}
/*
- * Since btrfs_readpage() will unlock the page before it
- * returns, there is a window where btrfs_releasepage() can be
+ * Since btrfs_read_folio() will unlock the folio before it
+ * returns, there is a window where btrfs_release_folio() can be
* called to release the page. Here we check both inode
* mapping and PagePrivate() to make sure the page was not
* released.
@@ -1338,26 +1186,54 @@ static int prepare_uptodate_page(struct inode *inode,
return 0;
}
+static unsigned int get_prepare_fgp_flags(bool nowait)
+{
+ unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
+
+ if (nowait)
+ fgp_flags |= FGP_NOWAIT;
+
+ return fgp_flags;
+}
+
+static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
+{
+ gfp_t gfp;
+
+ gfp = btrfs_alloc_write_mask(inode->i_mapping);
+ if (nowait) {
+ gfp &= ~__GFP_DIRECT_RECLAIM;
+ gfp |= GFP_NOWAIT;
+ }
+
+ return gfp;
+}
+
/*
* this just gets pages into the page cache and locks them down.
*/
static noinline int prepare_pages(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos,
- size_t write_bytes, bool force_uptodate)
+ size_t write_bytes, bool force_uptodate,
+ bool nowait)
{
int i;
unsigned long index = pos >> PAGE_SHIFT;
- gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ gfp_t mask = get_prepare_gfp_flags(inode, nowait);
+ unsigned int fgp_flags = get_prepare_fgp_flags(nowait);
int err = 0;
int faili;
for (i = 0; i < num_pages; i++) {
again:
- pages[i] = find_or_create_page(inode->i_mapping, index + i,
- mask | __GFP_WRITE);
+ pages[i] = pagecache_get_page(inode->i_mapping, index + i,
+ fgp_flags, mask | __GFP_WRITE);
if (!pages[i]) {
faili = i - 1;
- err = -ENOMEM;
+ if (nowait)
+ err = -EAGAIN;
+ else
+ err = -ENOMEM;
goto fail;
}
@@ -1375,7 +1251,7 @@ again:
pos + write_bytes, false);
if (err) {
put_page(pages[i]);
- if (err == -EAGAIN) {
+ if (!nowait && err == -EAGAIN) {
err = 0;
goto again;
}
@@ -1410,7 +1286,7 @@ static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos,
size_t write_bytes,
- u64 *lockstart, u64 *lockend,
+ u64 *lockstart, u64 *lockend, bool nowait,
struct extent_state **cached_state)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1425,15 +1301,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
if (start_pos < inode->vfs_inode.i_size) {
struct btrfs_ordered_extent *ordered;
- lock_extent_bits(&inode->io_tree, start_pos, last_pos,
- cached_state);
+ if (nowait) {
+ if (!try_lock_extent(&inode->io_tree, start_pos, last_pos)) {
+ for (i = 0; i < num_pages; i++) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ pages[i] = NULL;
+ }
+
+ return -EAGAIN;
+ }
+ } else {
+ lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
+ }
+
ordered = btrfs_lookup_ordered_range(inode, start_pos,
last_pos - start_pos + 1);
if (ordered &&
ordered->file_offset + ordered->num_bytes > start_pos &&
ordered->file_offset <= last_pos) {
- unlock_extent_cached(&inode->io_tree, start_pos,
- last_pos, cached_state);
+ unlock_extent(&inode->io_tree, start_pos, last_pos,
+ cached_state);
for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]);
put_page(pages[i]);
@@ -1460,7 +1348,26 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
return ret;
}
-static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
+/*
+ * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
+ *
+ * @pos: File offset.
+ * @write_bytes: The length to write, will be updated to the nocow writeable
+ * range.
+ *
+ * This function will flush ordered extents in the range to ensure proper
+ * nocow checks.
+ *
+ * Return:
+ * > 0 If we can nocow, and updates @write_bytes.
+ * 0 If we can't do a nocow write.
+ * -EAGAIN If we can't do a nocow write because snapshotting of the inode's
+ * root is in progress.
+ * < 0 If an error happened.
+ *
+ * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
+ */
+int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes, bool nowait)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1472,7 +1379,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
return 0;
- if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
+ if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
@@ -1481,70 +1388,25 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
num_bytes = lockend - lockstart + 1;
if (nowait) {
- struct btrfs_ordered_extent *ordered;
-
- if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
+ if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend)) {
+ btrfs_drew_write_unlock(&root->snapshot_lock);
return -EAGAIN;
-
- ordered = btrfs_lookup_ordered_range(inode, lockstart,
- num_bytes);
- if (ordered) {
- btrfs_put_ordered_extent(ordered);
- ret = -EAGAIN;
- goto out_unlock;
}
} else {
- btrfs_lock_and_flush_ordered_range(inode, lockstart,
- lockend, NULL);
+ btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
}
-
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, NULL, NULL, false);
- if (ret <= 0) {
- ret = 0;
- if (!nowait)
- btrfs_drew_write_unlock(&root->snapshot_lock);
- } else {
+ NULL, NULL, NULL, nowait, false);
+ if (ret <= 0)
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+ else
*write_bytes = min_t(size_t, *write_bytes,
num_bytes - pos + lockstart);
- }
-out_unlock:
- unlock_extent(&inode->io_tree, lockstart, lockend);
+ unlock_extent(&inode->io_tree, lockstart, lockend, NULL);
return ret;
}
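A caller following the tri-state contract above (kernel-context pseudocode; write_nocow() is a placeholder):

	size_t write_bytes = count;
	int can_nocow;

	can_nocow = btrfs_check_nocow_lock(inode, pos, &write_bytes, nowait);
	if (can_nocow < 0)
		return can_nocow;	/* -EAGAIN or another error */
	if (can_nocow > 0) {
		write_nocow(pos, write_bytes);	/* placeholder nocow path */
		btrfs_check_nocow_unlock(inode);	/* mandatory on > 0 */
	} else {
		/* 0: fall back to the COW path */
	}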
-static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
-{
- return check_can_nocow(inode, pos, write_bytes, true);
-}
-
-/*
- * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
- *
- * @pos: File offset
- * @write_bytes: The length to write, will be updated to the nocow writeable
- * range
- *
- * This function will flush ordered extents in the range to ensure proper
- * nocow checks.
- *
- * Return:
- * >0 and update @write_bytes if we can do nocow write
- * 0 if we can't do nocow write
- * -EAGAIN if we can't get the needed lock or there are ordered extents
- * for * (nowait == true) case
- * <0 if other error happened
- *
- * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
- */
-int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
-{
- return check_can_nocow(inode, pos, write_bytes, false);
-}
-
void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
{
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
@@ -1579,20 +1441,15 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
loff_t oldsize;
loff_t start_pos;
- if (iocb->ki_flags & IOCB_NOWAIT) {
- size_t nocow_bytes = count;
-
- /* We will allocate space in case nodatacow is not set, so bail */
- if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0)
- return -EAGAIN;
- /*
- * There are holes in the range or parts of the range that must
- * be COWed (shared extents, RO block groups, etc), so just bail
- * out.
- */
- if (nocow_bytes < count)
- return -EAGAIN;
- }
+ /*
+ * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
+ * prealloc flags, as without those flags we always have to COW. We will
+ * later check if we can really do a NOCOW write into the target range
+ * (using can_nocow_extent() at btrfs_get_blocks_direct_write()).
+ */
+ if ((iocb->ki_flags & IOCB_NOWAIT) &&
+ !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ return -EAGAIN;
current->backing_dev_info = inode_to_bdi(inode);
ret = file_remove_privs(file);
@@ -1642,8 +1499,10 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
bool force_page_uptodate = false;
loff_t old_isize = i_size_read(inode);
unsigned int ilock_flags = 0;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
+ unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (nowait)
ilock_flags |= BTRFS_ILOCK_TRY;
ret = btrfs_inode_lock(inode, ilock_flags);
@@ -1699,18 +1558,29 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
extent_changeset_release(data_reserved);
ret = btrfs_check_data_free_space(BTRFS_I(inode),
&data_reserved, pos,
- write_bytes);
+ write_bytes, nowait);
if (ret < 0) {
+ int can_nocow;
+
+ if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
+ ret = -EAGAIN;
+ break;
+ }
+
/*
* If we don't have to COW at the offset, reserve
* metadata only. write_bytes may get smaller than
* requested here.
*/
- if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
- &write_bytes) > 0)
- only_release_metadata = true;
- else
+ can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
+ &write_bytes, nowait);
+ if (can_nocow < 0)
+ ret = can_nocow;
+ if (can_nocow > 0)
+ ret = 0;
+ if (ret)
break;
+ only_release_metadata = true;
}
num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
@@ -1720,7 +1590,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
WARN_ON(reserve_bytes == 0);
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
reserve_bytes,
- reserve_bytes);
+ reserve_bytes, nowait);
if (ret) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -1733,14 +1603,17 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
release_bytes = reserve_bytes;
again:
+ ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
+ if (ret)
+ break;
+
/*
* This is going to setup the pages array with the number of
* pages we want, so we don't really need to worry about the
* contents of pages from loop to loop
*/
ret = prepare_pages(inode, pages, num_pages,
- pos, write_bytes,
- force_page_uptodate);
+ pos, write_bytes, force_page_uptodate, false);
if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
reserve_bytes);
@@ -1750,10 +1623,11 @@ again:
extents_locked = lock_and_cleanup_extent_if_need(
BTRFS_I(inode), pages,
num_pages, pos, write_bytes, &lockstart,
- &lockend, &cached_state);
+ &lockend, nowait, &cached_state);
if (extents_locked < 0) {
- if (extents_locked == -EAGAIN)
+ if (!nowait && extents_locked == -EAGAIN)
goto again;
+
btrfs_delalloc_release_extents(BTRFS_I(inode),
reserve_bytes);
ret = extents_locked;
@@ -1817,8 +1691,8 @@ again:
* possible cached extent state to avoid a memory leak.
*/
if (extents_locked)
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
else
free_extent_state(cached_state);
@@ -1836,8 +1710,6 @@ again:
cond_resched();
- balance_dirty_pages_ratelimited(inode->i_mapping);
-
pos += copied;
num_written += copied;
}
@@ -1883,7 +1755,6 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
- const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -1937,15 +1808,6 @@ relock:
}
/*
- * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
- * calls generic_write_sync() (through iomap_dio_complete()), because
- * that results in calling fsync (btrfs_sync_file()) which will try to
- * lock the inode in exclusive/write mode.
- */
- if (is_sync_write)
- iocb->ki_flags &= ~IOCB_DSYNC;
-
- /*
* The iov_iter can be mapped to the same file range we are writing to.
* If that's the case, then we will deadlock in the iomap code, because
* it first calls our callback btrfs_dio_iomap_begin(), which will create
@@ -1965,8 +1827,7 @@ relock:
*/
again:
from->nofault = true;
- err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, written);
+ err = btrfs_dio_rw(iocb, from, written);
from->nofault = false;
/* No increment (+=) because iomap returns a cumulative value. */
@@ -2001,17 +1862,24 @@ again:
btrfs_inode_unlock(inode, ilock_flags);
/*
- * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
- * the fsync (call generic_write_sync()).
+ * If 'err' is -ENOTBLK or we have not written all data, then it means
+ * we must fallback to buffered IO.
*/
- if (is_sync_write)
- iocb->ki_flags |= IOCB_DSYNC;
-
- /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
goto out;
buffered:
+ /*
+ * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
+ * it must retry the operation in a context where blocking is acceptable,
+ * since we currently don't have NOWAIT semantics support for buffered IO
+ * and may block there for many reasons (reserving space for example).
+ */
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ err = -EAGAIN;
+ goto out;
+ }
+
pos = iocb->ki_pos;
written_buffered = btrfs_buffered_write(iocb, from);
if (written_buffered < 0) {
@@ -2074,7 +1942,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
struct file *file = iocb->ki_filp;
struct btrfs_inode *inode = BTRFS_I(file_inode(file));
ssize_t num_written, num_sync;
- const bool sync = iocb->ki_flags & IOCB_DSYNC;
+ const bool sync = iocb_is_dsync(iocb);
/*
* If the fs flips readonly due to some impossible error, although we
@@ -2084,7 +1952,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
if (BTRFS_FS_ERROR(inode->root->fs_info))
return -EROFS;
- if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
return -EOPNOTSUPP;
if (sync)
@@ -2094,9 +1962,11 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
num_written = btrfs_encoded_write(iocb, from, encoded);
num_sync = encoded->len;
} else if (iocb->ki_flags & IOCB_DIRECT) {
- num_written = num_sync = btrfs_direct_write(iocb, from);
+ num_written = btrfs_direct_write(iocb, from);
+ num_sync = num_written;
} else {
- num_written = num_sync = btrfs_buffered_write(iocb, from);
+ num_written = btrfs_buffered_write(iocb, from);
+ num_sync = num_written;
}
btrfs_set_inode_last_sub_trans(inode);
@@ -2238,14 +2108,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
- * Always check for the full sync flag while holding the inode's lock,
- * to avoid races with other tasks. The flag must be either set all the
- * time during logging or always off all the time while logging.
- */
- full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
-
- /*
* Before we acquired the inode's lock and the mmap lock, someone may
* have dirtied more pages in the target range. We need to make sure
* that writeback for any such pages does not start while we are logging
@@ -2270,6 +2132,17 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
/*
+ * Always check for the full sync flag while holding the inode's lock,
+ * to avoid races with other tasks. The flag must be either set all the
+ * time during logging or always off all the time while logging.
+ * We check the flag here after starting delalloc above, because when
+ * running delalloc the full sync flag may be set if we need to drop
+ * extra extent map ranges due to temporary memory allocation failures.
+ */
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+
+ /*
* We have to do this here to avoid the priority inversion of waiting on
* IO of a lower priority task while holding a transaction open.
*
@@ -2344,7 +2217,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
}
/* we've logged all the items and now have a consistent
@@ -2359,27 +2232,65 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
- if (ret != BTRFS_NO_LOG_SYNC) {
+ if (ret == BTRFS_NO_LOG_SYNC) {
+ ret = btrfs_end_transaction(trans);
+ goto out;
+ }
+
+ /* We successfully logged the inode, attempt to sync the log. */
+ if (!ret) {
+ ret = btrfs_sync_log(trans, root, &ctx);
if (!ret) {
- ret = btrfs_sync_log(trans, root, &ctx);
- if (!ret) {
- ret = btrfs_end_transaction(trans);
- goto out;
- }
- }
- if (!full_sync) {
- ret = btrfs_wait_ordered_range(inode, start, len);
- if (ret) {
- btrfs_end_transaction(trans);
- goto out;
- }
+ ret = btrfs_end_transaction(trans);
+ goto out;
}
- ret = btrfs_commit_transaction(trans);
- } else {
+ }
+
+ /*
+ * At this point we need to commit the transaction because we had
+ * btrfs_need_log_full_commit() or some other error.
+ *
+ * If we didn't do a full sync we have to stop the trans handle, wait on
+ * the ordered extents, start it again and commit the transaction. If
+ * we attempt to wait on the ordered extents here we could deadlock with
+ * something like fallocate() that is holding the extent lock trying to
+ * start a transaction while some other thread is trying to commit the
+ * transaction while we (fsync) are currently holding the transaction
+ * open.
+ */
+ if (!full_sync) {
ret = btrfs_end_transaction(trans);
+ if (ret)
+ goto out;
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ if (ret)
+ goto out;
+
+ /*
+ * This is safe to use here because we're only interested in
+ * making sure the transaction that had the ordered extents is
+ * committed. We aren't waiting on anything past this point,
+ * we're purely getting the transaction and committing it.
+ */
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+
+ /*
+ * We committed the transaction and there's no currently
+ * running transaction, this means everything we care
+ * about made it to disk and we are done.
+ */
+ if (ret == -ENOENT)
+ ret = 0;
+ goto out;
+ }
}
+
+ ret = btrfs_commit_transaction(trans);
out:
ASSERT(list_empty(&ctx.list));
+ ASSERT(list_empty(&ctx.conflict_inodes));
err = file_check_and_advance_wb_err(file);
if (!ret)
ret = err;
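Condensed, the fast-fsync fallback ordering that avoids the deadlock described above:

	/* 1. btrfs_end_transaction(trans)           -- release the handle first
	 * 2. btrfs_wait_ordered_range(inode, ...)   -- safe now, no open handle
	 * 3. btrfs_attach_transaction_barrier(root) -- -ENOENT means already
	 *                                              committed, we are done
	 * 4. btrfs_commit_transaction(trans)        -- otherwise commit */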
@@ -2401,7 +2312,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct address_space *mapping = filp->f_mapping;
- if (!mapping->a_ops->readpage)
+ if (!mapping->a_ops->read_folio)
return -ENOEXEC;
file_accessed(filp);
@@ -2448,7 +2359,6 @@ static int fill_holes(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
struct extent_map *hole_em;
- struct extent_map_tree *em_tree = &inode->extent_tree;
struct btrfs_key key;
int ret;
@@ -2482,6 +2392,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_mark_buffer_dirty(leaf);
goto out;
}
@@ -2498,13 +2409,14 @@ static int fill_holes(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_mark_buffer_dirty(leaf);
goto out;
}
btrfs_release_path(path);
- ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
- offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
+ end - offset);
if (ret)
return ret;
@@ -2513,7 +2425,7 @@ out:
hole_em = alloc_extent_map();
if (!hole_em) {
- btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+ btrfs_drop_extent_map_range(inode, offset, end - 1, false);
btrfs_set_inode_full_sync(inode);
} else {
hole_em->start = offset;
@@ -2527,12 +2439,7 @@ out:
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
- do {
- btrfs_drop_extent_cache(inode, offset, end - 1, 0);
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, hole_em, 1);
- write_unlock(&em_tree->lock);
- } while (ret == -EEXIST);
+ ret = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
if (ret)
btrfs_set_inode_full_sync(inode);
@@ -2570,10 +2477,10 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
return ret;
}
-static int btrfs_punch_hole_lock_range(struct inode *inode,
- const u64 lockstart,
- const u64 lockend,
- struct extent_state **cached_state)
+static void btrfs_punch_hole_lock_range(struct inode *inode,
+ const u64 lockstart,
+ const u64 lockend,
+ struct extent_state **cached_state)
{
/*
* For the subpage case, if the range is not at a page boundary, we could
@@ -2587,40 +2494,29 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
while (1) {
- struct btrfs_ordered_extent *ordered;
- int ret;
-
truncate_pagecache_range(inode, lockstart, lockend);
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
- ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
- lockend);
-
+ lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
/*
- * We need to make sure we have no ordered extents in this range
- * and nobody raced in and read a page in this range, if we did
- * we need to try again.
+ * We can't have ordered extents in the range, nor dirty/writeback
+ * pages, because we have locked the inode's VFS lock in exclusive
+ * mode, we have locked the inode's i_mmap_lock in exclusive mode,
+ * we have flushed all delalloc in the range and we have waited
+ * for any ordered extents in the range to complete.
+ * We can race with anyone reading pages from this range, so after
+ * locking the range check if we have pages in the range, and if
+ * we do, unlock the range and retry.
*/
- if ((!ordered ||
- (ordered->file_offset + ordered->num_bytes <= lockstart ||
- ordered->file_offset > lockend)) &&
- !filemap_range_has_page(inode->i_mapping,
- page_lockstart, page_lockend)) {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
+ if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
+ page_lockend))
break;
- }
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, cached_state);
- ret = btrfs_wait_ordered_range(inode, lockstart,
- lockend - lockstart + 1);
- if (ret)
- return ret;
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
}
- return 0;
+
+ btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
}
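The page_lockstart/page_lockend rounding keeps only pages that lie fully inside the punched range, since boundary pages may legitimately remain to back sectors outside the hole. A standalone worked example of that arithmetic, with a hypothetical 64K page size and made-up offsets:

        #include <stdio.h>
        #include <stdint.h>

        #define ROUND_UP(x, a)   ((((x) + (a) - 1) / (a)) * (a))
        #define ROUND_DOWN(x, a) (((x) / (a)) * (a))

        int main(void)
        {
                const uint64_t page_size = 65536;   /* subpage: 4K sectors, 64K pages */
                const uint64_t lockstart = 4096, lockend = 196607;

                /* Only full pages inside [lockstart, lockend] matter for the
                 * filemap_range_has_page() retry check. */
                uint64_t page_lockstart = ROUND_UP(lockstart, page_size);
                uint64_t page_lockend = ROUND_DOWN(lockend + 1, page_size) - 1;

                printf("check pages in [%llu, %llu]\n",   /* [65536, 196607] */
                       (unsigned long long)page_lockstart,
                       (unsigned long long)page_lockend);
                return 0;
        }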
static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
@@ -2744,7 +2640,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
goto out;
}
rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
- rsv->failfast = 1;
+ rsv->failfast = true;
/*
* 1 - update the inode
@@ -2766,7 +2662,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
min_size, false);
- BUG_ON(ret);
+ if (WARN_ON(ret))
+ goto out_trans;
trans->block_rsv = rsv;
cur_offset = start;
@@ -2850,6 +2747,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
extent_info->file_offset += replace_len;
}
+ /*
+ * We are releasing our handle on the transaction, balance the
+ * dirty pages of the btree inode and flush delayed items, and
+ * then get a new transaction handle, which may now point to a
+ * new transaction in case someone else may have committed the
+ * transaction we used to replace/drop file extent items. So
+ * bump the inode's iversion and update mtime and ctime except
+ * if we are called from a dedupe context. This is because a
+ * power failure/crash may happen after the transaction is
+ * committed and before we finish replacing/dropping all the
+ * file extent items we need.
+ */
+ inode_inc_iversion(&inode->vfs_inode);
+
+ if (!extent_info || extent_info->update_times) {
+ inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
+ inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
+ }
+
ret = btrfs_update_inode(trans, root, inode);
if (ret)
break;
@@ -2866,7 +2782,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, false);
- BUG_ON(ret); /* shouldn't happen */
+ if (WARN_ON(ret))
+ break;
trans->block_rsv = rsv;
cur_offset = drop_args.drop_end;
@@ -2957,8 +2874,9 @@ out:
return ret;
}
-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
@@ -2975,11 +2893,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
bool truncated_block = false;
bool updated_inode = false;
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
+
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
- return ret;
+ goto out_only_mutex;
- btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
ino_size = round_up(inode->i_size, fs_info->sectorsize);
ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
@@ -2990,9 +2909,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
- lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
- lockend = round_down(offset + len,
- btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
+ ret = file_modified(file);
+ if (ret)
+ goto out_only_mutex;
+
+ lockstart = round_up(offset, fs_info->sectorsize);
+ lockend = round_down(offset + len, fs_info->sectorsize) - 1;
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
@@ -3067,10 +2989,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
- ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
- &cached_state);
- if (ret)
- goto out_only_mutex;
+ btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
path = btrfs_alloc_path();
if (!path) {
@@ -3086,14 +3005,15 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
ASSERT(trans != NULL);
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = inode->i_mtime;
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
updated_inode = true;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
out_only_mutex:
if (!updated_inode && truncated_block && !ret) {
/*
@@ -3196,7 +3116,7 @@ enum {
static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
u64 offset)
{
- const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
struct extent_map *em;
int ret;
@@ -3226,14 +3146,12 @@ static int btrfs_zero_range(struct inode *inode,
struct extent_changeset *data_reserved = NULL;
int ret;
u64 alloc_hint = 0;
- const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
+ const u64 sectorsize = fs_info->sectorsize;
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
bool space_reserved = false;
- inode_dio_wait(inode);
-
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
alloc_end - alloc_start);
if (IS_ERR(em)) {
@@ -3363,23 +3281,21 @@ reserve_space:
if (ret < 0)
goto out;
space_reserved = true;
- ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
- &cached_state);
- if (ret)
- goto out;
+ btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+ &cached_state);
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
if (ret) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
goto out;
}
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
i_blocksize(inode),
offset + len, &alloc_hint);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
/* btrfs_prealloc_file_range releases reserved space on error */
if (ret) {
space_reserved = false;
@@ -3412,8 +3328,11 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 alloc_hint = 0;
u64 locked_end;
u64 actual_end = 0;
+ u64 data_space_needed = 0;
+ u64 data_space_reserved = 0;
+ u64 qgroup_reserved = 0;
struct extent_map *em;
- int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
+ int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
int ret;
/* Do not allow fallocate in ZONED mode */
@@ -3430,19 +3349,7 @@ static long btrfs_fallocate(struct file *file, int mode,
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
- return btrfs_punch_hole(inode, offset, len);
-
- /*
- * Only trigger disk allocation, don't trigger qgroup reserve
- *
- * For qgroup space, it will be checked later.
- */
- if (!(mode & FALLOC_FL_ZERO_RANGE)) {
- ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
- alloc_end - alloc_start);
- if (ret < 0)
- return ret;
- }
+ return btrfs_punch_hole(file, offset, len);
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
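For context, this path is reached from userspace via fallocate(2). A minimal caller, with a hypothetical file path; note that punch-hole requires FALLOC_FL_KEEP_SIZE, so i_size never changes:

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/mnt/btrfs/file", O_RDWR);   /* hypothetical path */

                if (fd < 0)
                        return 1;
                /* Punch a 1 MiB hole starting at offset 4 KiB. */
                if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                              4096, 1024 * 1024) < 0)
                        perror("fallocate");
                close(fd);
                return 0;
        }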
@@ -3452,6 +3359,10 @@ static long btrfs_fallocate(struct file *file, int mode,
goto out;
}
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
/*
* TODO: Move these two operations after we have checked
* accurate reserved space, or fallocate can still fail but
@@ -3476,8 +3387,12 @@ static long btrfs_fallocate(struct file *file, int mode,
}
/*
- * wait for ordered IO before we have any locks. We'll loop again
- * below with the locks held.
+ * We have locked the inode at the VFS level (in exclusive mode) and we
+ * have locked the inode's i_mmap_lock (in exclusive mode). Now before
+ * locking the file range, flush all delalloc in the range and wait for
+ * all ordered extents in the range to complete. After this we can lock
+ * the file range and, due to the previous locking we did, we know there
+ * can't be more delalloc or ordered extents in the range.
*/
ret = btrfs_wait_ordered_range(inode, alloc_start,
alloc_end - alloc_start);
@@ -3491,38 +3406,10 @@ static long btrfs_fallocate(struct file *file, int mode,
}
locked_end = alloc_end - 1;
- while (1) {
- struct btrfs_ordered_extent *ordered;
+ lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
- /* the extent lock is ordered inside the running
- * transaction
- */
- lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
- locked_end);
-
- if (ordered &&
- ordered->file_offset + ordered->num_bytes > alloc_start &&
- ordered->file_offset < alloc_end) {
- btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- alloc_start, locked_end,
- &cached_state);
- /*
- * we can't wait on the range with the transaction
- * running or with the extent lock held
- */
- ret = btrfs_wait_ordered_range(inode, alloc_start,
- alloc_end - alloc_start);
- if (ret)
- goto out;
- } else {
- if (ordered)
- btrfs_put_ordered_extent(ordered);
- break;
- }
- }
+ btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
/* First, check if we exceed the qgroup limit */
INIT_LIST_HEAD(&reserve_list);
@@ -3539,48 +3426,64 @@ static long btrfs_fallocate(struct file *file, int mode,
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = add_falloc_range(&reserve_list, cur_offset,
- last_byte - cur_offset);
+ const u64 range_len = last_byte - cur_offset;
+
+ ret = add_falloc_range(&reserve_list, cur_offset, range_len);
if (ret < 0) {
free_extent_map(em);
break;
}
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
- &data_reserved, cur_offset,
- last_byte - cur_offset);
+ &data_reserved, cur_offset, range_len);
if (ret < 0) {
- cur_offset = last_byte;
free_extent_map(em);
break;
}
- } else {
- /*
- * Do not need to reserve unwritten extent for this
- * range, free reserved data space first, otherwise
- * it'll result in false ENOSPC error.
- */
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- data_reserved, cur_offset,
- last_byte - cur_offset);
+ qgroup_reserved += range_len;
+ data_space_needed += range_len;
}
free_extent_map(em);
cur_offset = last_byte;
}
+ if (!ret && data_space_needed > 0) {
+ /*
+ * We are safe to reserve space here as we can't have delalloc
+ * in the range, see above.
+ */
+ ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+ data_space_needed);
+ if (!ret)
+ data_space_reserved = data_space_needed;
+ }
+
/*
* If ret is still 0, it means we're OK to fallocate.
* Otherwise just clean up the list and exit.
*/
list_for_each_entry_safe(range, tmp, &reserve_list, list) {
- if (!ret)
+ if (!ret) {
ret = btrfs_prealloc_file_range(inode, mode,
range->start,
range->len, i_blocksize(inode),
offset + len, &alloc_hint);
- else
+ /*
+ * btrfs_prealloc_file_range() releases space even
+ * if it returns an error.
+ */
+ data_space_reserved -= range->len;
+ qgroup_reserved -= range->len;
+ } else if (data_space_reserved > 0) {
btrfs_free_reserved_data_space(BTRFS_I(inode),
- data_reserved, range->start,
- range->len);
+ data_reserved, range->start,
+ range->len);
+ data_space_reserved -= range->len;
+ qgroup_reserved -= range->len;
+ } else if (qgroup_reserved > 0) {
+ btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
+ range->start, range->len);
+ qgroup_reserved -= range->len;
+ }
list_del(&range->list);
kfree(range);
}
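The bookkeeping in this loop reads best as an invariant. A comment-form sketch of it, inferred from what the helpers do elsewhere in btrfs, so treat it as annotation rather than authority:

        /*
         * After each range is processed:
         *   data_space_reserved == bytes still holding a data space reservation
         *   qgroup_reserved    == bytes still holding a qgroup reservation
         *
         * btrfs_prealloc_file_range() consumes (or, on error, releases) both
         * reservations, and btrfs_free_reserved_data_space() releases both,
         * while btrfs_qgroup_free_data() releases only the qgroup part. That
         * is why the error path first drains data_space_reserved and only
         * then falls back to freeing the remaining qgroup reservation.
         */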
@@ -3593,35 +3496,290 @@ static long btrfs_fallocate(struct file *file, int mode,
*/
ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
out:
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
- /* Let go of our reservation. */
- if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
- btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
- cur_offset, alloc_end - cur_offset);
extent_changeset_free(data_reserved);
return ret;
}
+/*
+ * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
+ * that has unflushed and/or flushing delalloc. There might be other adjacent
+ * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
+ * looping while it gets adjacent subranges, merging them together.
+ */
+static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret)
+{
+ const u64 len = end + 1 - start;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ struct extent_map *em;
+ u64 em_end;
+ u64 delalloc_len;
+
+ /*
+ * Search the io tree first for EXTENT_DELALLOC. If we find any, it
+ * means we have delalloc (dirty pages) for which writeback has not
+ * started yet.
+ */
+ *delalloc_start_ret = start;
+ delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end,
+ len, EXTENT_DELALLOC, 1);
+ /*
+ * If delalloc was found then *delalloc_start_ret has a sector size
+ * aligned value (rounded down).
+ */
+ if (delalloc_len > 0)
+ *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
+
+ /*
+ * Now also check if there's any extent map in the range that does not
+ * map to a hole or prealloc extent. We do this because:
+ *
+ * 1) When delalloc is flushed, the file range is locked, we clear the
+ * EXTENT_DELALLOC bit from the io tree and create an extent map for
+ * an allocated extent. So we might just have been called after
+ * delalloc is flushed and before the ordered extent completes and
+ * inserts the new file extent item in the subvolume's btree;
+ *
+ * 2) We may have an extent map created by flushing delalloc for a
+ * subrange that starts before the subrange we found marked with
+ * EXTENT_DELALLOC in the io tree.
+ */
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ read_unlock(&em_tree->lock);
+
+ /* extent_map_end() returns a non-inclusive end offset. */
+ em_end = em ? extent_map_end(em) : 0;
+
+ /*
+ * If we have a hole/prealloc extent map, check the next one if this one
+ * ends before our range's end.
+ */
+ if (em && (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) {
+ struct extent_map *next_em;
+
+ read_lock(&em_tree->lock);
+ next_em = lookup_extent_mapping(em_tree, em_end, len - em_end);
+ read_unlock(&em_tree->lock);
+
+ free_extent_map(em);
+ em_end = next_em ? extent_map_end(next_em) : 0;
+ em = next_em;
+ }
+
+ if (em && (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ free_extent_map(em);
+ em = NULL;
+ }
+
+ /*
+ * No extent map or one for a hole or prealloc extent. Use the delalloc
+ * range we found in the io tree if we have one.
+ */
+ if (!em)
+ return (delalloc_len > 0);
+
+ /*
+ * We don't have any range as EXTENT_DELALLOC in the io tree, so the
+ * extent map is the only subrange representing delalloc.
+ */
+ if (delalloc_len == 0) {
+ *delalloc_start_ret = em->start;
+ *delalloc_end_ret = min(end, em_end - 1);
+ free_extent_map(em);
+ return true;
+ }
+
+ /*
+ * The extent map represents a delalloc range that starts before the
+ * delalloc range we found in the io tree.
+ */
+ if (em->start < *delalloc_start_ret) {
+ *delalloc_start_ret = em->start;
+ /*
+ * If the ranges are adjacent, return a combined range.
+ * Otherwise return the extent map's range.
+ */
+ if (em_end < *delalloc_start_ret)
+ *delalloc_end_ret = min(end, em_end - 1);
+
+ free_extent_map(em);
+ return true;
+ }
+
+ /*
+ * The extent map starts after the delalloc range we found in the io
+ * tree. If it's adjacent, return a combined range, otherwise return
+ * the range found in the io tree.
+ */
+ if (*delalloc_end_ret + 1 == em->start)
+ *delalloc_end_ret = min(end, em_end - 1);
+
+ free_extent_map(em);
+ return true;
+}
+
+/*
+ * Check if there's delalloc in a given range.
+ *
+ * @inode: The inode.
+ * @start: The start offset of the range. It does not need to be
+ * sector size aligned.
+ * @end: The end offset (inclusive value) of the search range.
+ * It does not need to be sector size aligned.
+ * @delalloc_start_ret: Output argument, set to the start offset of the
+ * subrange found with delalloc (may not be sector size
+ * aligned).
+ * @delalloc_end_ret: Output argument, set to the end offset (inclusive value)
+ * of the subrange found with delalloc.
+ *
+ * Returns true if a subrange with delalloc is found within the given range, and
+ * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
+ * end offsets of the subrange.
+ */
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+ u64 *delalloc_start_ret, u64 *delalloc_end_ret)
+{
+ u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
+ u64 prev_delalloc_end = 0;
+ bool ret = false;
+
+ while (cur_offset < end) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = find_delalloc_subrange(inode, cur_offset, end,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
+
+ if (prev_delalloc_end == 0) {
+ /* First subrange found. */
+ *delalloc_start_ret = max(delalloc_start, start);
+ *delalloc_end_ret = delalloc_end;
+ ret = true;
+ } else if (delalloc_start == prev_delalloc_end + 1) {
+ /* Subrange adjacent to the previous one, merge them. */
+ *delalloc_end_ret = delalloc_end;
+ } else {
+ /* Subrange not adjacent to the previous one, exit. */
+ break;
+ }
+
+ prev_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ cond_resched();
+ }
+
+ return ret;
+}
+
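The merge logic above is the usual "extend while adjacent" scan. A self-contained userspace analog over a sorted array, with made-up offsets:

        #include <stdio.h>
        #include <stdint.h>

        struct range { uint64_t start, end; };  /* end is inclusive */

        int main(void)
        {
                /* Sorted, as the subrange search would return them. */
                struct range sub[] = { {0, 4095}, {4096, 8191}, {16384, 20479} };
                struct range merged = sub[0];

                for (size_t i = 1; i < sizeof(sub) / sizeof(sub[0]); i++) {
                        if (sub[i].start == merged.end + 1)
                                merged.end = sub[i].end;  /* adjacent: extend */
                        else
                                break;  /* gap: stop, like the loop above */
                }
                /* Prints [0, 8191]: the third subrange is not adjacent. */
                printf("first merged run: [%llu, %llu]\n",
                       (unsigned long long)merged.start,
                       (unsigned long long)merged.end);
                return 0;
        }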
+/*
+ * Check if there's a hole or a delalloc subrange within a file range that the
+ * inode's subvolume btree reports as a hole (or prealloc extent).
+ *
+ * @inode: The inode.
+ * @whence: Seek mode (SEEK_DATA or SEEK_HOLE).
+ * @start: Start offset of the hole region. It does not need to be sector
+ * size aligned.
+ * @end: End offset (inclusive value) of the hole region. It does not
+ * need to be sector size aligned.
+ * @start_ret: Return parameter, used to set the start of the subrange in the
+ * hole that matches the search criteria (seek mode), if such
+ * subrange is found (return value of the function is true).
+ * The value returned here may not be sector size aligned.
+ *
+ * Returns true if a subrange matching the given seek mode is found, and if one
+ * is found, it updates @start_ret with the start of the subrange.
+ */
+static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
+ u64 start, u64 end, u64 *start_ret)
+{
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, start, end,
+ &delalloc_start, &delalloc_end);
+ if (delalloc && whence == SEEK_DATA) {
+ *start_ret = delalloc_start;
+ return true;
+ }
+
+ if (delalloc && whence == SEEK_HOLE) {
+ /*
+ * We found delalloc but it starts after our start offset. So we
+ * have a hole between our start offset and the delalloc start.
+ */
+ if (start < delalloc_start) {
+ *start_ret = start;
+ return true;
+ }
+ /*
+ * Delalloc range starts at our start offset.
+ * If the delalloc range's length is smaller than our range,
+ * then it means we have a hole that starts where the delalloc
+ * subrange ends.
+ */
+ if (delalloc_end < end) {
+ *start_ret = delalloc_end + 1;
+ return true;
+ }
+
+ /* There's delalloc for the whole range. */
+ return false;
+ }
+
+ if (!delalloc && whence == SEEK_HOLE) {
+ *start_ret = start;
+ return true;
+ }
+
+ /*
+ * No delalloc in the range and we are seeking for data. The caller has
+ * to iterate to the next extent item in the subvolume btree.
+ */
+ return false;
+}
+
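All of this machinery ultimately serves lseek(2) with SEEK_DATA and SEEK_HOLE. A small runnable walker that enumerates a file's data extents, which is exactly the user-visible behavior the delalloc handling above must keep correct:

        #define _GNU_SOURCE
        #include <errno.h>
        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(int argc, char **argv)
        {
                off_t off = 0, hole;
                int fd;

                if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                        return 1;
                /* SEEK_DATA skips holes; dirty-but-unwritten (delalloc) data
                 * must still be reported as data for this loop to be correct. */
                while ((off = lseek(fd, off, SEEK_DATA)) >= 0) {
                        hole = lseek(fd, off, SEEK_HOLE);
                        printf("data: [%lld, %lld)\n", (long long)off, (long long)hole);
                        off = hole;
                }
                if (errno != ENXIO)     /* ENXIO just means "no more data" */
                        perror("lseek");
                close(fd);
                return 0;
        }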
static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
int whence)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- loff_t i_size = inode->vfs_inode.i_size;
+ const loff_t i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 last_extent_end;
u64 lockstart;
u64 lockend;
u64 start;
- u64 len;
- int ret = 0;
+ int ret;
+ bool found = false;
if (i_size == 0 || offset >= i_size)
return -ENXIO;
/*
+ * Quick path. If the inode has no prealloc extents and its number of
+ * bytes used matches its i_size, then it cannot have holes.
+ */
+ if (whence == SEEK_HOLE &&
+ !(inode->flags & BTRFS_INODE_PREALLOC) &&
+ inode_get_bytes(&inode->vfs_inode) == i_size)
+ return i_size;
+
+ /*
* offset can be negative, in this case we start finding DATA/HOLE from
* the very start of the file.
*/
@@ -3632,45 +3790,164 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
if (lockend <= lockstart)
lockend = lockstart + fs_info->sectorsize;
lockend--;
- len = lockend - lockstart + 1;
- lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = READA_FORWARD;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ last_extent_end = lockstart;
+
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
while (start < i_size) {
- em = btrfs_get_extent_fiemap(inode, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- em = NULL;
- break;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *extent;
+ u64 extent_end;
+
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+
+ leaf = path->nodes[0];
}
- if (whence == SEEK_HOLE &&
- (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
- break;
- else if (whence == SEEK_DATA &&
- (em->block_start != EXTENT_MAP_HOLE &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
break;
- start = em->start + em->len;
- free_extent_map(em);
- em = NULL;
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * In the first iteration we may have a slot that points to an
+ * extent that ends before our start offset, so skip it.
+ */
+ if (extent_end <= start) {
+ path->slots[0]++;
+ continue;
+ }
+
+ /* We have an implicit hole; the NO_HOLES feature is likely set. */
+ if (last_extent_end < key.offset) {
+ u64 search_start = last_extent_end;
+ u64 found_start;
+
+ /*
+ * First iteration, @start matches @offset and it's
+ * within the hole.
+ */
+ if (start == offset)
+ search_start = offset;
+
+ found = find_desired_extent_in_hole(inode, whence,
+ search_start,
+ key.offset - 1,
+ &found_start);
+ if (found) {
+ start = found_start;
+ break;
+ }
+ /*
+ * Didn't find data or a hole (due to delalloc) in the
+ * implicit hole range, so we need to analyze the extent.
+ */
+ }
+
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 ||
+ btrfs_file_extent_type(leaf, extent) ==
+ BTRFS_FILE_EXTENT_PREALLOC) {
+ /*
+ * Explicit hole or prealloc extent, search for delalloc.
+ * A prealloc extent is treated like a hole.
+ */
+ u64 search_start = key.offset;
+ u64 found_start;
+
+ /*
+ * First iteration, @start matches @offset and it's
+ * within the hole.
+ */
+ if (start == offset)
+ search_start = offset;
+
+ found = find_desired_extent_in_hole(inode, whence,
+ search_start,
+ extent_end - 1,
+ &found_start);
+ if (found) {
+ start = found_start;
+ break;
+ }
+ /*
+ * Didn't find data or a hole (due to delalloc) in the
+ * explicit hole or prealloc range, so we need to analyze
+ * the next extent item.
+ */
+ } else {
+ /*
+ * Found a regular or inline extent.
+ * If we are seeking for data, adjust the start offset
+ * and stop, we're done.
+ */
+ if (whence == SEEK_DATA) {
+ start = max_t(u64, key.offset, offset);
+ found = true;
+ break;
+ }
+ /*
+ * Else, we are seeking for a hole, check the next file
+ * extent item.
+ */
+ }
+
+ start = extent_end;
+ last_extent_end = extent_end;
+ path->slots[0]++;
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
cond_resched();
}
- free_extent_map(em);
- unlock_extent_cached(&inode->io_tree, lockstart, lockend,
- &cached_state);
- if (ret) {
- offset = ret;
- } else {
- if (whence == SEEK_DATA && start >= i_size)
- offset = -ENXIO;
- else
- offset = min_t(loff_t, start, i_size);
+
+ /* We have an implicit hole from the last extent found up to i_size. */
+ if (!found && start < i_size) {
+ found = find_desired_extent_in_hole(inode, whence, start,
+ i_size - 1, &start);
+ if (!found)
+ start = i_size;
}
- return offset;
+out:
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ btrfs_free_path(path);
+
+ if (ret < 0)
+ return ret;
+
+ if (whence == SEEK_DATA && start >= i_size)
+ return -ENXIO;
+
+ return min_t(loff_t, start, i_size);
}
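The quick path at the top of this function (no prealloc flag and byte usage equal to i_size implies no holes) has a rough userspace counterpart, since st_blocks is derived from the inode's byte accounting. This is a heuristic only, as delalloc and accounting details can skew it:

        #include <stdio.h>
        #include <sys/stat.h>

        int main(int argc, char **argv)
        {
                struct stat st;

                if (argc < 2 || stat(argv[1], &st) != 0)
                        return 1;
                /* st_blocks is in 512-byte units. */
                if ((long long)st.st_blocks * 512 >= (long long)st.st_size)
                        printf("likely no holes\n");
                else
                        printf("sparse: holes present\n");
                return 0;
        }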
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
@@ -3698,7 +3975,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
{
int ret;
- filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
ret = fsverity_file_open(inode, filp);
if (ret)
@@ -3758,8 +4035,7 @@ again:
*/
pagefault_disable();
to->nofault = true;
- ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, read);
+ ret = btrfs_dio_rw(iocb, to, read);
to->nofault = false;
pagefault_enable();
@@ -3816,6 +4092,7 @@ const struct file_operations btrfs_file_operations = {
.mmap = btrfs_file_mmap,
.open = btrfs_file_open,
.release = btrfs_release_file,
+ .get_unmapped_area = thp_get_unmapped_area,
.fsync = btrfs_sync_file,
.fallocate = btrfs_fallocate,
.unlocked_ioctl = btrfs_ioctl,
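FMODE_NOWAIT is what lets RWF_NOWAIT reads and writes fail fast instead of blocking, and the newly added FMODE_BUF_WASYNC similarly tells io_uring that buffered writes may be attempted asynchronously. A minimal RWF_NOWAIT illustration, with a hypothetical path:

        #define _GNU_SOURCE
        #include <errno.h>
        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/uio.h>
        #include <unistd.h>

        int main(void)
        {
                char buf[4096];
                struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
                int fd = open("/mnt/btrfs/file", O_RDONLY);  /* hypothetical path */
                ssize_t n;

                if (fd < 0)
                        return 1;
                /* Works only because the open sets FMODE_NOWAIT; if the data
                 * is not cached we get EAGAIN instead of blocking on I/O. */
                n = preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
                if (n < 0 && errno == EAGAIN)
                        printf("would block; fall back to a blocking read\n");
                close(fd);
                return 0;
        }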
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 01a408db5683..f4023651dd68 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -48,6 +48,24 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes, bool update_stats);
+static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *node;
+
+ while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
+ info = rb_entry(node, struct btrfs_free_space, offset_index);
+ if (!info->bitmap) {
+ unlink_free_space(ctl, info, true);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ } else {
+ free_bitmap(ctl, info);
+ }
+
+ cond_resched_lock(&ctl->tree_lock);
+ }
+}
+
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
u64 offset)
@@ -126,10 +144,8 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
block_group->disk_cache_state = BTRFS_DC_CLEAR;
}
- if (!block_group->iref) {
+ if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
block_group->inode = igrab(inode);
- block_group->iref = 1;
- }
spin_unlock(&block_group->lock);
return inode;
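Folding the old ->iref bool into runtime_flags turns the check-and-set into one atomic step. A userspace analogue of test_and_set_bit() built on C11 atomics, showing why no separate read-then-write is needed:

        #include <stdatomic.h>
        #include <stdio.h>

        static atomic_ulong runtime_flags;
        #define FLAG_IREF (1UL << 0)

        /* Returns 1 only for the single caller that actually set the bit, so
         * "take this reference exactly once" needs no extra locking. */
        static int take_iref_once(void)
        {
                unsigned long old = atomic_fetch_or(&runtime_flags, FLAG_IREF);

                return !(old & FLAG_IREF);
        }

        int main(void)
        {
                printf("%d %d\n", take_iref_once(), take_iref_once());  /* 1 0 */
                return 0;
        }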
@@ -241,8 +257,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
clear_nlink(inode);
/* One for the block groups ref */
spin_lock(&block_group->lock);
- if (block_group->iref) {
- block_group->iref = 0;
+ if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) {
block_group->inode = NULL;
spin_unlock(&block_group->lock);
iput(inode);
@@ -333,8 +348,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
btrfs_i_size_write(inode, 0);
truncate_pagecache(vfs_inode, 0);
- lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state);
- btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
+ btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
/*
* We skip the throttling logic for free space cache inodes, so we don't
@@ -345,7 +360,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
- unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
+ unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
if (ret)
goto fail;
@@ -465,7 +480,7 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
io_ctl->pages[i] = page;
if (uptodate && !PageUptodate(page)) {
- btrfs_readpage(NULL, page);
+ btrfs_read_folio(NULL, page_folio(page));
lock_page(page);
if (page->mapping != inode->i_mapping) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
@@ -693,6 +708,12 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
max_bitmaps = max_t(u64, max_bitmaps, 1);
+ if (ctl->total_bitmaps > max_bitmaps)
+ btrfs_err(block_group->fs_info,
+"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
+ block_group->start, block_group->length,
+ ctl->total_bitmaps, ctl->unit, max_bitmaps,
+ bytes_per_bg);
ASSERT(ctl->total_bitmaps <= max_bitmaps);
/*
@@ -875,7 +896,10 @@ out:
return ret;
free_cache:
io_ctl_drop_pages(&io_ctl);
+
+ spin_lock(&ctl->tree_lock);
__btrfs_remove_free_space_cache(ctl);
+ spin_unlock(&ctl->tree_lock);
goto out;
}
@@ -914,6 +938,8 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
return ret;
}
+static struct lock_class_key btrfs_free_space_inode_key;
+
int load_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -983,6 +1009,14 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
}
spin_unlock(&block_group->lock);
+ /*
+ * Reinitialize the lockdep class of struct inode's mapping->invalidate_lock
+ * for free space inodes, so lockdep does not report false positives against
+ * the class used by normal inodes.
+ */
+ lockdep_set_class(&(&inode->i_data)->invalidate_lock,
+ &btrfs_free_space_inode_key);
+
ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
path, block_group->start);
btrfs_free_path(path);
@@ -1001,7 +1035,13 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
if (ret == 0)
ret = 1;
} else {
+ /*
+ * Call the raw removal helper with the tree lock held so we don't
+ * try to update the discard counters.
+ */
+ spin_lock(&tmp_ctl.tree_lock);
__btrfs_remove_free_space_cache(&tmp_ctl);
+ spin_unlock(&tmp_ctl.tree_lock);
btrfs_warn(fs_info,
"block group %llu has wrong amount of free space",
block_group->start);
@@ -1123,7 +1163,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DELALLOC, 0, 0, NULL);
+ EXTENT_DELALLOC, NULL);
goto fail;
}
leaf = path->nodes[0];
@@ -1135,8 +1175,8 @@ update_cache_item(struct btrfs_trans_handle *trans,
if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
found_key.offset != offset) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
- inode->i_size - 1, EXTENT_DELALLOC, 0,
- 0, NULL);
+ inode->i_size - 1, EXTENT_DELALLOC,
+ NULL);
btrfs_release_path(path);
goto fail;
}
@@ -1232,7 +1272,7 @@ static int flush_dirty_cache(struct inode *inode)
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DELALLOC, 0, 0, NULL);
+ EXTENT_DELALLOC, NULL);
return ret;
}
@@ -1252,8 +1292,8 @@ cleanup_write_cache_enospc(struct inode *inode,
struct extent_state **cached_state)
{
io_ctl_drop_pages(io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ cached_state);
}
static int __btrfs_wait_cache_io(struct btrfs_root *root,
@@ -1378,8 +1418,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
if (ret)
goto out_unlock;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- &cached_state);
+ lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
io_ctl_set_generation(io_ctl, trans->transid);
@@ -1434,8 +1474,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
io_ctl_drop_pages(io_ctl);
io_ctl_free(io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
/*
* at this point the pages are under IO and we're happy,
@@ -2630,16 +2670,19 @@ out:
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 bytenr, u64 size, bool used)
{
- struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_space_info *sinfo = block_group->space_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
- const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
+ int bg_reclaim_threshold = 0;
bool initial = (size == block_group->length);
u64 reclaimable_unusable;
WARN_ON(!initial && offset + size > block_group->zone_capacity);
+ if (!initial)
+ bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
+
spin_lock(&ctl->tree_lock);
if (!used)
to_free = size;
@@ -2857,7 +2900,8 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
if (btrfs_is_zoned(fs_info)) {
btrfs_info(fs_info, "free space %llu active %d",
block_group->zone_capacity - block_group->alloc_offset,
- block_group->zone_is_active);
+ test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags));
return;
}
@@ -2961,34 +3005,6 @@ static void __btrfs_return_cluster_to_free_space(
btrfs_put_block_group(block_group);
}
-static void __btrfs_remove_free_space_cache_locked(
- struct btrfs_free_space_ctl *ctl)
-{
- struct btrfs_free_space *info;
- struct rb_node *node;
-
- while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
- info = rb_entry(node, struct btrfs_free_space, offset_index);
- if (!info->bitmap) {
- unlink_free_space(ctl, info, true);
- kmem_cache_free(btrfs_free_space_cachep, info);
- } else {
- free_bitmap(ctl, info);
- }
-
- cond_resched_lock(&ctl->tree_lock);
- }
-}
-
-void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
-{
- spin_lock(&ctl->tree_lock);
- __btrfs_remove_free_space_cache_locked(ctl);
- if (ctl->block_group)
- btrfs_discard_update_discardable(ctl->block_group);
- spin_unlock(&ctl->tree_lock);
-}
-
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -3006,7 +3022,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
cond_resched_lock(&ctl->tree_lock);
}
- __btrfs_remove_free_space_cache_locked(ctl);
+ __btrfs_remove_free_space_cache(ctl);
btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
@@ -3533,7 +3549,8 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
* data, keep it dense.
*/
if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
- cont1_bytes = min_bytes = bytes + empty_size;
+ cont1_bytes = bytes + empty_size;
+ min_bytes = cont1_bytes;
} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
cont1_bytes = bytes;
min_bytes = fs_info->sectorsize;
@@ -3988,7 +4005,7 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group,
*trimmed = 0;
spin_lock(&block_group->lock);
- if (block_group->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
@@ -4018,7 +4035,7 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
*trimmed = 0;
spin_lock(&block_group->lock);
- if (block_group->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
@@ -4040,7 +4057,7 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
*trimmed = 0;
spin_lock(&block_group->lock);
- if (block_group->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
@@ -4069,7 +4086,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
btrfs_info(fs_info, "cleaning free space cache v1");
- node = rb_first(&fs_info->block_group_cache_tree);
+ node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
block_group = rb_entry(node, struct btrfs_block_group, cache_node);
ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
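The rb_first_cached() switch reflects the block group tree becoming an rb_root_cached, which maintains a leftmost pointer so the first in-order node is found in O(1) instead of walking down the tree. A minimal kernel-style sketch, assuming only the standard <linux/rbtree.h> helpers:

        /* Sketch: rb_root_cached keeps ->rb_leftmost up to date on insert
         * and erase, so the first node needs no descent from the root. */
        struct rb_root_cached groups = RB_ROOT_CACHED;
        struct rb_node *node;

        for (node = rb_first_cached(&groups); node; node = rb_next(node)) {
                /* rb_entry() maps each node back to its containing struct. */
        }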
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 15591b299895..6d419ba53e95 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -113,7 +113,6 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
-void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group);
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group);
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 0ae54d8c10d6..367bcfcf68f5 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1178,7 +1178,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
goto abort;
}
- node = rb_first(&fs_info->block_group_cache_tree);
+ node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
@@ -1453,8 +1453,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
ASSERT(key.objectid < end && key.objectid + key.offset <= end);
- caching_ctl->progress = key.objectid;
-
offset = key.objectid;
while (offset < key.objectid + key.offset) {
bit = free_space_test_bit(block_group, path, offset);
@@ -1490,8 +1488,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
goto out;
}
- caching_ctl->progress = (u64)-1;
-
ret = 0;
out:
return ret;
@@ -1531,8 +1527,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
ASSERT(key.objectid < end && key.objectid + key.offset <= end);
- caching_ctl->progress = key.objectid;
-
total_found += add_new_free_space(block_group, key.objectid,
key.objectid + key.offset);
if (total_found > CACHING_CTL_WAKE_UP) {
@@ -1552,8 +1546,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
goto out;
}
- caching_ctl->progress = (u64)-1;
-
ret = 0;
out:
return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6bfc4343c98d..45ebef8d3ea8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -64,8 +64,36 @@ struct btrfs_iget_args {
struct btrfs_dio_data {
ssize_t submitted;
struct extent_changeset *data_reserved;
+ bool data_space_reserved;
+ bool nocow_done;
};
+struct btrfs_dio_private {
+ struct inode *inode;
+
+ /*
+ * Since DIO can use anonymous page, we cannot use page_offset() to
+ * grab the file offset, thus need a dedicated member for file offset.
+ */
+ u64 file_offset;
+ /* Used for bio::bi_size */
+ u32 bytes;
+
+ /*
+ * References to this structure. There is one reference per in-flight
+ * bio plus one while we're still setting up.
+ */
+ refcount_t refs;
+
+ /* Array of checksums */
+ u8 *csums;
+
+ /* This must be last */
+ struct bio bio;
+};
+
+static struct bio_set btrfs_dio_bioset;
+
struct btrfs_rename_ctx {
/* Output field. Stores the index number of the old directory entry. */
u64 index;
@@ -86,21 +114,17 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep;
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
-static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written, int unlock);
+ unsigned long *nr_written, int unlock,
+ u64 *done_offset);
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
u64 len, u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
u64 ram_bytes, int compress_type,
int type);
-static void __endio_write_update_ordered(struct btrfs_inode *inode,
- const u64 offset, const u64 bytes,
- const bool uptodate);
-
/*
* btrfs_inode_lock - lock inode i_rwsem based on arguments passed
*
@@ -167,11 +191,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
- u64 page_start = page_offset(locked_page);
- u64 page_end = page_start + PAGE_SIZE - 1;
-
+ u64 page_start, page_end;
struct page *page;
+ if (locked_page) {
+ page_start = page_offset(locked_page);
+ page_end = page_start + PAGE_SIZE - 1;
+ }
+
while (index <= end_index) {
/*
* For locked page, we will call end_extent_writepage() on it
@@ -184,7 +211,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* btrfs_mark_ordered_io_finished() would skip the accounting
* for the page range, and the ordered extent will never finish.
*/
- if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
+ if (locked_page && index == (page_start >> PAGE_SHIFT)) {
index++;
continue;
}
@@ -195,7 +222,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
/*
* Here we just clear all Ordered bits for every page in the
- * range, then __endio_write_update_ordered() will handle
+ * range, then btrfs_mark_ordered_io_finished() will handle
* the ordered extent accounting for the range.
*/
btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
@@ -203,34 +230,47 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
put_page(page);
}
- /* The locked page covers the full range, nothing needs to be done */
- if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
- return;
- /*
- * In case this page belongs to the delalloc range being instantiated
- * then skip it, since the first page of a range is going to be
- * properly cleaned up by the caller of run_delalloc_range
- */
- if (page_start >= offset && page_end <= (offset + bytes - 1)) {
- bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
- offset = page_offset(locked_page) + PAGE_SIZE;
+ if (locked_page) {
+ /* The locked page covers the full range, nothing needs to be done */
+ if (bytes + offset <= page_start + PAGE_SIZE)
+ return;
+ /*
+ * In case this page belongs to the delalloc range being
+ * instantiated then skip it, since the first page of a range is
+ * going to be properly cleaned up by the caller of
+ * run_delalloc_range
+ */
+ if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+ bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
+ offset = page_offset(locked_page) + PAGE_SIZE;
+ }
}
- return __endio_write_update_ordered(inode, offset, bytes, false);
+ return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
}
static int btrfs_dirty_inode(struct inode *inode);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
+ struct btrfs_new_inode_args *args)
{
int err;
- err = btrfs_init_acl(trans, inode, dir);
- if (!err)
- err = btrfs_xattr_security_init(trans, inode, dir, qstr);
- return err;
+ if (args->default_acl) {
+ err = __btrfs_set_acl(trans, args->inode, args->default_acl,
+ ACL_TYPE_DEFAULT);
+ if (err)
+ return err;
+ }
+ if (args->acl) {
+ err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
+ if (err)
+ return err;
+ }
+ if (!args->default_acl && !args->acl)
+ cache_no_acl(args->inode);
+ return btrfs_xattr_security_init(trans, args->inode, args->dir,
+ &args->dentry->d_name);
}
/*
@@ -294,9 +334,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
- kaddr = kmap_atomic(cpage);
+ kaddr = kmap_local_page(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
i++;
ptr += cur_size;
@@ -307,9 +347,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
} else {
page = find_get_page(inode->vfs_inode.i_mapping, 0);
btrfs_set_file_extent_compression(leaf, ei, 0);
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
write_extent_buffer(leaf, kaddr, ptr, size);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
put_page(page);
}
btrfs_mark_buffer_dirty(leaf);
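kmap_local_page() provides a CPU-local mapping without disabling preemption and pagefaults the way kmap_atomic() does; the one rule is that unmaps must strictly nest. An illustrative two-page copy, not taken from this diff:

        void *a = kmap_local_page(src_page);
        void *b = kmap_local_page(dst_page);

        memcpy(b, a, PAGE_SIZE);
        kunmap_local(b);        /* strict nesting: unmap in reverse order */
        kunmap_local(a);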
@@ -447,7 +487,7 @@ struct async_chunk {
struct page *locked_page;
u64 start;
u64 end;
- unsigned int write_flags;
+ blk_opf_t write_flags;
struct list_head extents;
struct cgroup_subsys_state *blkcg_css;
struct btrfs_work work;
@@ -481,17 +521,6 @@ static noinline int add_async_extent(struct async_chunk *cow,
}
/*
- * Check if the inode has flags compatible with compression
- */
-static inline bool inode_can_compress(struct btrfs_inode *inode)
-{
- if (inode->flags & BTRFS_INODE_NODATACOW ||
- inode->flags & BTRFS_INODE_NODATASUM)
- return false;
- return true;
-}
-
-/*
* Check if the inode needs to be submitted to compression, based on mount
* options, defragmentation, properties or heuristics.
*/
@@ -500,7 +529,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- if (!inode_can_compress(inode)) {
+ if (!btrfs_inode_can_compress(inode)) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
btrfs_ino(inode));
@@ -533,8 +562,8 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
* will unlock the full page.
*/
if (fs_info->sectorsize < PAGE_SIZE) {
- if (!IS_ALIGNED(start, PAGE_SIZE) ||
- !IS_ALIGNED(end + 1, PAGE_SIZE))
+ if (!PAGE_ALIGNED(start) ||
+ !PAGE_ALIGNED(end + 1))
return 0;
}
@@ -651,8 +680,8 @@ again:
* Thus we must also check against @actual_end, not just @end.
*/
if (blocksize < PAGE_SIZE) {
- if (!IS_ALIGNED(start, PAGE_SIZE) ||
- !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
+ if (!PAGE_ALIGNED(start) ||
+ !PAGE_ALIGNED(round_up(actual_end, blocksize)))
goto cleanup_and_bail_uncompressed;
}
@@ -893,15 +922,25 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
* can directly submit them without interruption.
*/
ret = cow_file_range(inode, locked_page, start, end, &page_started,
- &nr_written, 0);
+ &nr_written, 0, NULL);
/* Inline extent inserted, page gets unlocked and everything is done */
if (page_started) {
ret = 0;
goto out;
}
if (ret < 0) {
- if (locked_page)
+ btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
+ if (locked_page) {
+ const u64 page_start = page_offset(locked_page);
+ const u64 page_end = page_start + PAGE_SIZE - 1;
+
+ btrfs_page_set_error(inode->root->fs_info, locked_page,
+ page_start, PAGE_SIZE);
+ set_page_writeback(locked_page);
+ end_page_writeback(locked_page);
+ end_extent_writepage(locked_page, ret, page_start, page_end);
unlock_page(locked_page);
+ }
goto out;
}
@@ -938,7 +977,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
if (!(start >= locked_page_end || end <= locked_page_start))
locked_page = async_chunk->locked_page;
}
- lock_extent(io_tree, start, end);
+ lock_extent(io_tree, start, end, NULL);
/* We have fall back to uncompressed write */
if (!async_extent->pages)
@@ -985,7 +1024,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
1 << BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
if (ret) {
- btrfs_drop_extent_cache(inode, start, end, 0);
+ btrfs_drop_extent_map_range(inode, start, end, false);
goto out_free_reserve;
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -1106,15 +1145,39 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* *page_started is set to one if we unlock locked_page and do everything
* required to start IO on it. It may be clean and already done with
* IO when we return.
+ *
+ * When unlock == 1, we unlock the pages in successfully allocated regions.
+ * When unlock == 0, we leave them locked for writing them out.
+ *
+ * However, we unlock all the pages except @locked_page in case of failure.
+ *
+ * In summary, the page locking state will be as follows:
+ *
+ * - page_started == 1 (return value)
+ * - All the pages are unlocked. IO is started.
+ * - Note that this can happen only on success
+ * - unlock == 1
+ * - All the pages except @locked_page are unlocked in any case
+ * - unlock == 0
+ * - On success, all the pages are locked for writing them out
+ * - On failure, all the pages except @locked_page are unlocked
+ *
+ * When a failure happens in the second or later iteration of the
+ * while-loop, the ordered extents created in previous iterations are kept
+ * intact. So, the caller must clean them up by calling
+ * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
+ * an example.
*/
static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written, int unlock)
+ unsigned long *nr_written, int unlock,
+ u64 *done_offset)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 alloc_hint = 0;
+ u64 orig_start = start;
u64 num_bytes;
unsigned long ram_size;
u64 cur_alloc_size = 0;
@@ -1128,7 +1191,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
int ret = 0;
if (btrfs_is_free_space_inode(inode)) {
- WARN_ON_ONCE(1);
ret = -EINVAL;
goto out_unlock;
}
@@ -1192,7 +1254,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
}
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
- btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
/*
* Relocation relies on the relocated extents to have exactly the same
@@ -1257,8 +1318,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* skip current ordered extent.
*/
if (ret)
- btrfs_drop_extent_cache(inode, start,
- start + ram_size - 1, 0);
+ btrfs_drop_extent_map_range(inode, start,
+ start + ram_size - 1,
+ false);
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -1298,23 +1360,67 @@ out:
return ret;
out_drop_extent_cache:
- btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
+ btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
+ /*
+ * If done_offset is non-NULL and ret == -EAGAIN, we expect the
+ * caller to write out the successfully allocated region and retry.
+ */
+ if (done_offset && ret == -EAGAIN) {
+ if (orig_start < start)
+ *done_offset = start - 1;
+ else
+ *done_offset = start;
+ return ret;
+ } else if (ret == -EAGAIN) {
+ /* Convert to -ENOSPC since the caller cannot retry. */
+ ret = -ENOSPC;
+ }
+
+ /*
+ * Now, we have three regions to clean up:
+ *
+ * |-------(1)----|---(2)---|-------------(3)----------|
+ * `- orig_start `- start `- start + cur_alloc_size `- end
+ *
+ * We process each region below.
+ */
+
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
+ /*
+ * For the range (1). We have already instantiated the ordered extents
+ * for this region. They are cleaned up by
+ * btrfs_cleanup_ordered_extents() in e.g.,
+ * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
+ * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
+ * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
+ * function.
+ *
+ * However, in case of unlock == 0, we still need to unlock the pages
+ * (except @locked_page) to ensure all the pages are unlocked.
+ */
+ if (!unlock && orig_start < start) {
+ if (!locked_page)
+ mapping_set_error(inode->vfs_inode.i_mapping, ret);
+ extent_clear_unlock_delalloc(inode, orig_start, start - 1,
+ locked_page, 0, page_ops);
+ }
+
/*
- * If we reserved an extent for our delalloc range (or a subrange) and
- * failed to create the respective ordered extent, then it means that
- * when we reserved the extent we decremented the extent's size from
- * the data space_info's bytes_may_use counter and incremented the
- * space_info's bytes_reserved counter by the same amount. We must make
- * sure extent_clear_unlock_delalloc() does not try to decrement again
- * the data space_info's bytes_may_use counter, therefore we do not pass
- * it the flag EXTENT_CLEAR_DATA_RESV.
+ * For the range (2). If we reserved an extent for our delalloc range
+ * (or a subrange) and failed to create the respective ordered extent,
+ * then it means that when we reserved the extent we decremented the
+ * extent's size from the data space_info's bytes_may_use counter and
+ * incremented the space_info's bytes_reserved counter by the same
+ * amount. We must make sure extent_clear_unlock_delalloc() does not try
+ * to decrement again the data space_info's bytes_may_use counter,
+ * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
*/
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
@@ -1324,12 +1430,19 @@ out_unlock:
page_ops);
start += cur_alloc_size;
if (start >= end)
- goto out;
+ return ret;
}
+
+ /*
+ * For the range (3). We never touched the region. In addition to the
+ * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
+ * space_info's bytes_may_use counter, reserved in
+ * btrfs_check_data_free_space().
+ */
extent_clear_unlock_delalloc(inode, start, end, locked_page,
clear_bits | EXTENT_CLEAR_DATA_RESV,
page_ops);
- goto out;
+ return ret;
}
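
To make the cleanup above concrete, here is a worked example with hypothetical values, following the diagram in the comment:

	/*
	 * Sketch: orig_start = 0, start = 64K, cur_alloc_size = 64K,
	 * end = 256K - 1. The failure then leaves:
	 *
	 *   (1) [0,    64K - 1]   ordered extents created; cleaned up by
	 *                         btrfs_cleanup_ordered_extents()
	 *   (2) [64K,  128K - 1]  extent reserved but no ordered extent;
	 *                         cleared without EXTENT_CLEAR_DATA_RESV
	 *   (3) [128K, 256K - 1]  never touched; cleared with
	 *                         EXTENT_CLEAR_DATA_RESV to release the
	 *                         data reservation
	 */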
/*
@@ -1409,9 +1522,9 @@ static int cow_file_range_async(struct btrfs_inode *inode,
int i;
bool should_compress;
unsigned nofs_flag;
- const unsigned int write_flags = wbc_to_write_flags(wbc);
+ const blk_opf_t write_flags = wbc_to_write_flags(wbc);
- unlock_extent(&inode->io_tree, start, end);
+ unlock_extent(&inode->io_tree, start, end, NULL);
if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
!btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
@@ -1512,26 +1625,48 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
u64 end, int *page_started,
unsigned long *nr_written)
{
+ u64 done_offset = end;
int ret;
+ bool locked_page_done = false;
- ret = cow_file_range(inode, locked_page, start, end, page_started,
- nr_written, 0);
- if (ret)
- return ret;
+ while (start <= end) {
+ ret = cow_file_range(inode, locked_page, start, end, page_started,
+ nr_written, 0, &done_offset);
+ if (ret && ret != -EAGAIN)
+ return ret;
- if (*page_started)
- return 0;
+ if (*page_started) {
+ ASSERT(ret == 0);
+ return 0;
+ }
+
+ if (ret == 0)
+ done_offset = end;
+
+ if (done_offset == start) {
+ wait_on_bit_io(&inode->root->fs_info->flags,
+ BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
+ continue;
+ }
+
+ if (!locked_page_done) {
+ __set_page_dirty_nobuffers(locked_page);
+ account_page_redirty(locked_page);
+ }
+ locked_page_done = true;
+ extent_write_locked_range(&inode->vfs_inode, start, done_offset);
+
+ start = done_offset + 1;
+ }
- __set_page_dirty_nobuffers(locked_page);
- account_page_redirty(locked_page);
- extent_write_locked_range(&inode->vfs_inode, start, end);
*page_started = 1;
return 0;
}
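
The -EAGAIN/done_offset contract between cow_file_range() and run_delalloc_zoned() condenses to the following (a restatement of the code above, not an addition to the patch):

	/*
	 * cow_file_range(..., &done_offset) on a zoned filesystem:
	 *
	 *   ret == 0:        the whole range was allocated.
	 *   ret == -EAGAIN:  the active zone filled up mid-range;
	 *     done_offset > start:  [start, done_offset] was allocated, so
	 *                           write it out and retry from
	 *                           done_offset + 1,
	 *     done_offset == start: nothing was allocated, so wait on
	 *                           BTRFS_FS_NEED_ZONE_FINISH and retry.
	 *
	 * Callers that pass done_offset == NULL cannot retry, which is why
	 * -EAGAIN is converted to -ENOSPC for them.
	 */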
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 num_bytes)
+ u64 bytenr, u64 num_bytes, bool nowait)
{
struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
struct btrfs_ordered_sum *sums;
@@ -1539,7 +1674,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
LIST_HEAD(list);
ret = btrfs_lookup_csums_range(csum_root, bytenr,
- bytenr + num_bytes - 1, &list, 0);
+ bytenr + num_bytes - 1, &list, 0,
+ nowait);
if (ret == 0 && list_empty(&list))
return 0;
@@ -1612,11 +1748,148 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
if (count > 0)
clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
- 0, 0, NULL);
+ NULL);
}
return cow_file_range(inode, locked_page, start, end, page_started,
- nr_written, 1);
+ nr_written, 1, NULL);
+}
+
+struct can_nocow_file_extent_args {
+ /* Input fields. */
+
+ /* Start file offset of the range we want to NOCOW. */
+ u64 start;
+ /* End file offset (inclusive) of the range we want to NOCOW. */
+ u64 end;
+ bool writeback_path;
+ bool strict;
+ /*
+ * Free the path passed to can_nocow_file_extent() once it's not needed
+ * anymore.
+ */
+ bool free_path;
+
+ /* Output fields. Only set when can_nocow_file_extent() returns 1. */
+
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 extent_offset;
+ /* Number of bytes that can be written to in NOCOW mode. */
+ u64 num_bytes;
+};
+
+/*
+ * Check if we can NOCOW the file extent that the path points to.
+ * This function may return with the path released, so the caller should check
+ * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
+ *
+ * Returns: < 0 on error
+ * 0 if we can not NOCOW
+ * 1 if we can NOCOW
+ */
+static int can_nocow_file_extent(struct btrfs_path *path,
+ struct btrfs_key *key,
+ struct btrfs_inode *inode,
+ struct can_nocow_file_extent_args *args)
+{
+ const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *fi;
+ u64 extent_end;
+ u8 extent_type;
+ int can_nocow = 0;
+ int ret = 0;
+ bool nowait = path->nowait;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ extent_type = btrfs_file_extent_type(leaf, fi);
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ goto out;
+
+ /* Can't access these fields unless we know it's not an inline extent. */
+ args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ args->extent_offset = btrfs_file_extent_offset(leaf, fi);
+
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
+ extent_type == BTRFS_FILE_EXTENT_REG)
+ goto out;
+
+ /*
+ * If the extent was created before the generation where the last snapshot
+ * for its subvolume was created, then this implies the extent is shared,
+ * hence we must COW.
+ */
+ if (!args->strict &&
+ btrfs_file_extent_generation(leaf, fi) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ goto out;
+
+ /* An explicit hole, must COW. */
+ if (args->disk_bytenr == 0)
+ goto out;
+
+ /* Compressed/encrypted/encoded extents must be COWed. */
+ if (btrfs_file_extent_compression(leaf, fi) ||
+ btrfs_file_extent_encryption(leaf, fi) ||
+ btrfs_file_extent_other_encoding(leaf, fi))
+ goto out;
+
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * The following checks can be expensive, as they need to take other
+ * locks and do btree or rbtree searches, so release the path to avoid
+ * blocking other tasks for too long.
+ */
+ btrfs_release_path(path);
+
+ ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
+ key->offset - args->extent_offset,
+ args->disk_bytenr, false, path);
+ WARN_ON_ONCE(ret > 0 && is_freespace_inode);
+ if (ret != 0)
+ goto out;
+
+ if (args->free_path) {
+ /*
+ * We don't need the path anymore, and the csum_exist_in_range()
+ * call below will end up allocating another one. So free the path
+ * to avoid unnecessary extra memory usage.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+ }
+
+ /* If there are pending snapshots for this root, we must COW. */
+ if (args->writeback_path && !is_freespace_inode &&
+ atomic_read(&root->snapshot_force_cow))
+ goto out;
+
+ args->disk_bytenr += args->extent_offset;
+ args->disk_bytenr += args->start - key->offset;
+ args->num_bytes = min(args->end + 1, extent_end) - args->start;
+
+ /*
+ * Force COW if csums exist in the range. This ensures that csums for a
+ * given extent are either valid or do not exist.
+ */
+ ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
+ nowait);
+ WARN_ON_ONCE(ret > 0 && is_freespace_inode);
+ if (ret != 0)
+ goto out;
+
+ can_nocow = 1;
+ out:
+ if (args->free_path && path)
+ btrfs_free_path(path);
+
+ return ret < 0 ? ret : can_nocow;
}
/*
@@ -1639,11 +1912,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
u64 cur_offset = start;
int ret;
bool check_prev = true;
- const bool freespace_inode = btrfs_is_free_space_inode(inode);
u64 ino = btrfs_ino(inode);
+ struct btrfs_block_group *bg;
bool nocow = false;
- u64 disk_bytenr = 0;
- const bool force = inode->flags & BTRFS_INODE_NODATACOW;
+ struct can_nocow_file_extent_args nocow_args = { 0 };
path = btrfs_alloc_path();
if (!path) {
@@ -1656,15 +1928,16 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
return -ENOMEM;
}
+ nocow_args.end = end;
+ nocow_args.writeback_path = true;
+
while (1) {
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
u64 extent_end;
- u64 extent_offset;
- u64 num_bytes = 0;
- u64 disk_num_bytes;
u64 ram_bytes;
+ u64 nocow_end;
int extent_type;
nocow = false;
@@ -1740,116 +2013,38 @@ next_slot:
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
-
+ /* If this is triggered then we have a memory corruption. */
+ ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
+ if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
+ ret = -EUCLEAN;
+ goto error;
+ }
ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
- if (extent_type == BTRFS_FILE_EXTENT_REG ||
- extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- extent_offset = btrfs_file_extent_offset(leaf, fi);
- extent_end = found_key.offset +
- btrfs_file_extent_num_bytes(leaf, fi);
- disk_num_bytes =
- btrfs_file_extent_disk_num_bytes(leaf, fi);
- /*
- * If the extent we got ends before our current offset,
- * skip to the next extent.
- */
- if (extent_end <= cur_offset) {
- path->slots[0]++;
- goto next_slot;
- }
- /* Skip holes */
- if (disk_bytenr == 0)
- goto out_check;
- /* Skip compressed/encrypted/encoded extents */
- if (btrfs_file_extent_compression(leaf, fi) ||
- btrfs_file_extent_encryption(leaf, fi) ||
- btrfs_file_extent_other_encoding(leaf, fi))
- goto out_check;
- /*
- * If extent is created before the last volume's snapshot
- * this implies the extent is shared, hence we can't do
- * nocow. This is the same check as in
- * btrfs_cross_ref_exist but without calling
- * btrfs_search_slot.
- */
- if (!freespace_inode &&
- btrfs_file_extent_generation(leaf, fi) <=
- btrfs_root_last_snapshot(&root->root_item))
- goto out_check;
- if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
- goto out_check;
+ extent_end = btrfs_file_extent_end(path);
- /*
- * The following checks can be expensive, as they need to
- * take other locks and do btree or rbtree searches, so
- * release the path to avoid blocking other tasks for too
- * long.
- */
- btrfs_release_path(path);
+ /*
+ * If the extent we got ends before our current offset, skip to
+ * the next extent.
+ */
+ if (extent_end <= cur_offset) {
+ path->slots[0]++;
+ goto next_slot;
+ }
- ret = btrfs_cross_ref_exist(root, ino,
- found_key.offset -
- extent_offset, disk_bytenr, false);
- if (ret) {
- /*
- * ret could be -EIO if the above fails to read
- * metadata.
- */
- if (ret < 0) {
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
- goto error;
- }
+ nocow_args.start = cur_offset;
+ ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
+ if (ret < 0) {
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+ goto error;
+ } else if (ret == 0) {
+ goto out_check;
+ }
- WARN_ON_ONCE(freespace_inode);
- goto out_check;
- }
- disk_bytenr += extent_offset;
- disk_bytenr += cur_offset - found_key.offset;
- num_bytes = min(end + 1, extent_end) - cur_offset;
- /*
- * If there are pending snapshots for this root, we
- * fall into common COW way
- */
- if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
- goto out_check;
- /*
- * force cow if csum exists in the range.
- * this ensure that csum for a given extent are
- * either valid or do not exist.
- */
- ret = csum_exist_in_range(fs_info, disk_bytenr,
- num_bytes);
- if (ret) {
- /*
- * ret could be -EIO if the above fails to read
- * metadata.
- */
- if (ret < 0) {
- if (cow_start != (u64)-1)
- cur_offset = cow_start;
- goto error;
- }
- WARN_ON_ONCE(freespace_inode);
- goto out_check;
- }
- /* If the extent's block group is RO, we must COW */
- if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
- goto out_check;
+ ret = 0;
+ bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
+ if (bg)
nocow = true;
- } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- extent_end = found_key.offset + ram_bytes;
- extent_end = ALIGN(extent_end, fs_info->sectorsize);
- /* Skip extents outside of our requested range */
- if (extent_end <= start) {
- path->slots[0]++;
- goto next_slot;
- }
- } else {
- /* If this triggers then we have a memory corruption */
- BUG();
- }
out_check:
/*
* If nocow is false then record the beginning of the range
@@ -1881,15 +2076,17 @@ out_check:
cow_start = (u64)-1;
}
+ nocow_end = cur_offset + nocow_args.num_bytes - 1;
+
if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
- u64 orig_start = found_key.offset - extent_offset;
+ u64 orig_start = found_key.offset - nocow_args.extent_offset;
struct extent_map *em;
- em = create_io_em(inode, cur_offset, num_bytes,
+ em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
orig_start,
- disk_bytenr, /* block_start */
- num_bytes, /* block_len */
- disk_num_bytes, /* orig_block_len */
+ nocow_args.disk_bytenr, /* block_start */
+ nocow_args.num_bytes, /* block_len */
+ nocow_args.disk_num_bytes, /* orig_block_len */
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
@@ -1898,20 +2095,23 @@ out_check:
}
free_extent_map(em);
ret = btrfs_add_ordered_extent(inode,
- cur_offset, num_bytes, num_bytes,
- disk_bytenr, num_bytes, 0,
+ cur_offset, nocow_args.num_bytes,
+ nocow_args.num_bytes,
+ nocow_args.disk_bytenr,
+ nocow_args.num_bytes, 0,
1 << BTRFS_ORDERED_PREALLOC,
BTRFS_COMPRESS_NONE);
if (ret) {
- btrfs_drop_extent_cache(inode, cur_offset,
- cur_offset + num_bytes - 1,
- 0);
+ btrfs_drop_extent_map_range(inode, cur_offset,
+ nocow_end, false);
goto error;
}
} else {
ret = btrfs_add_ordered_extent(inode, cur_offset,
- num_bytes, num_bytes,
- disk_bytenr, num_bytes,
+ nocow_args.num_bytes,
+ nocow_args.num_bytes,
+ nocow_args.disk_bytenr,
+ nocow_args.num_bytes,
0,
1 << BTRFS_ORDERED_NOCOW,
BTRFS_COMPRESS_NONE);
@@ -1919,9 +2119,10 @@ out_check:
goto error;
}
- if (nocow)
- btrfs_dec_nocow_writers(fs_info, disk_bytenr);
- nocow = false;
+ if (nocow) {
+ btrfs_dec_nocow_writers(bg);
+ nocow = false;
+ }
if (btrfs_is_data_reloc_root(root))
/*
@@ -1930,10 +2131,9 @@ out_check:
* from freeing metadata of created ordered extent.
*/
ret = btrfs_reloc_clone_csums(inode, cur_offset,
- num_bytes);
+ nocow_args.num_bytes);
- extent_clear_unlock_delalloc(inode, cur_offset,
- cur_offset + num_bytes - 1,
+ extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
@@ -1966,7 +2166,7 @@ out_check:
error:
if (nocow)
- btrfs_dec_nocow_writers(fs_info, disk_bytenr);
+ btrfs_dec_nocow_writers(bg);
if (ret && cur_offset < end)
extent_clear_unlock_delalloc(inode, cur_offset, end,
@@ -2017,18 +2217,17 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
* to use run_delalloc_nocow() here, like for regular
* preallocated inodes.
*/
- ASSERT(!zoned ||
- (zoned && btrfs_is_data_reloc_root(inode->root)));
+ ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, nr_written);
- } else if (!inode_can_compress(inode) ||
+ } else if (!btrfs_inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
if (zoned)
ret = run_delalloc_zoned(inode, locked_page, start, end,
page_started, nr_written);
else
ret = cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
+ page_started, nr_written, 1, NULL);
} else {
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
@@ -2044,6 +2243,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 size;
/* not delalloc, ignore it */
@@ -2051,7 +2251,7 @@ void btrfs_split_delalloc_extent(struct inode *inode,
return;
size = orig->end - orig->start + 1;
- if (size > BTRFS_MAX_EXTENT_SIZE) {
+ if (size > fs_info->max_extent_size) {
u32 num_extents;
u64 new_size;
@@ -2060,10 +2260,10 @@ void btrfs_split_delalloc_extent(struct inode *inode,
* applies here, just in reverse.
*/
new_size = orig->end - split + 1;
- num_extents = count_max_extents(new_size);
+ num_extents = count_max_extents(fs_info, new_size);
new_size = split - orig->start;
- num_extents += count_max_extents(new_size);
- if (count_max_extents(size) >= num_extents)
+ num_extents += count_max_extents(fs_info, new_size);
+ if (count_max_extents(fs_info, size) >= num_extents)
return;
}
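
count_max_extents() now takes fs_info because the largest extent size is per-filesystem (capped by the zone append limit on zoned devices). A sketch of the helper under that assumption, mirroring the old BTRFS_MAX_EXTENT_SIZE based version:

	static inline u32 count_max_extents(struct btrfs_fs_info *fs_info,
					    u64 size)
	{
		/* Round up: a partial chunk still costs one extent. */
		return div_u64(size + fs_info->max_extent_size - 1,
			       fs_info->max_extent_size);
	}

	/* e.g. with max_extent_size = 128M, a 300M range counts as 3. */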
@@ -2080,6 +2280,7 @@ void btrfs_split_delalloc_extent(struct inode *inode,
void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
struct extent_state *other)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 new_size, old_size;
u32 num_extents;
@@ -2093,7 +2294,7 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
new_size = other->end - new->start + 1;
/* we're not bigger than the max, unreserve the space and go */
- if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
+ if (new_size <= fs_info->max_extent_size) {
spin_lock(&BTRFS_I(inode)->lock);
btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
spin_unlock(&BTRFS_I(inode)->lock);
@@ -2119,10 +2320,10 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
* this case.
*/
old_size = other->end - other->start + 1;
- num_extents = count_max_extents(old_size);
+ num_extents = count_max_extents(fs_info, old_size);
old_size = new->end - new->start + 1;
- num_extents += count_max_extents(old_size);
- if (count_max_extents(new_size) >= num_extents)
+ num_extents += count_max_extents(fs_info, old_size);
+ if (count_max_extents(fs_info, new_size) >= num_extents)
return;
spin_lock(&BTRFS_I(inode)->lock);
@@ -2187,21 +2388,21 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
* list of inodes that have pending delalloc work to be done.
*/
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
- unsigned *bits)
+ u32 bits)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+ if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
WARN_ON(1);
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
- u32 num_extents = count_max_extents(len);
+ u32 num_extents = count_max_extents(fs_info, len);
bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
spin_lock(&BTRFS_I(inode)->lock);
@@ -2216,7 +2417,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
- if (*bits & EXTENT_DEFRAG)
+ if (bits & EXTENT_DEFRAG)
BTRFS_I(inode)->defrag_bytes += len;
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&BTRFS_I(inode)->runtime_flags))
@@ -2225,7 +2426,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
}
if (!(state->state & EXTENT_DELALLOC_NEW) &&
- (*bits & EXTENT_DELALLOC_NEW)) {
+ (bits & EXTENT_DELALLOC_NEW)) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
state->start;
@@ -2238,14 +2439,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
* accounting happens.
*/
void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
- struct extent_state *state, unsigned *bits)
+ struct extent_state *state, u32 bits)
{
struct btrfs_inode *inode = BTRFS_I(vfs_inode);
struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
u64 len = state->end + 1 - state->start;
- u32 num_extents = count_max_extents(len);
+ u32 num_extents = count_max_extents(fs_info, len);
- if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
+ if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
spin_lock(&inode->lock);
inode->defrag_bytes -= len;
spin_unlock(&inode->lock);
@@ -2256,7 +2457,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
* but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = inode->root;
bool do_list = !btrfs_is_free_space_inode(inode);
@@ -2269,7 +2470,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
* don't need to call delalloc_release_metadata if there is an
* error.
*/
- if (*bits & EXTENT_CLEAR_META_RESV &&
+ if (bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, len, false);
@@ -2279,7 +2480,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
if (!btrfs_is_data_reloc_root(root) &&
do_list && !(state->state & EXTENT_NORESERVE) &&
- (*bits & EXTENT_CLEAR_DATA_RESV))
+ (bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(fs_info, len);
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
@@ -2294,11 +2495,11 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
}
if ((state->state & EXTENT_DELALLOC_NEW) &&
- (*bits & EXTENT_DELALLOC_NEW)) {
+ (bits & EXTENT_DELALLOC_NEW)) {
spin_lock(&inode->lock);
ASSERT(inode->new_delalloc_bytes >= len);
inode->new_delalloc_bytes -= len;
- if (*bits & EXTENT_ADD_INODE_BYTES)
+ if (bits & EXTENT_ADD_INODE_BYTES)
inode_add_bytes(&inode->vfs_inode, len);
spin_unlock(&inode->lock);
}
@@ -2350,7 +2551,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
ASSERT(pre + post < len);
- lock_extent(&inode->io_tree, start, start + len - 1);
+ lock_extent(&inode->io_tree, start, start + len - 1, NULL);
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (!em) {
@@ -2424,7 +2625,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
out_unlock:
write_unlock(&em_tree->lock);
- unlock_extent(&inode->io_tree, start, start + len - 1);
+ unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
out:
free_extent_map(split_pre);
free_extent_map(split_mid);
@@ -2493,100 +2694,75 @@ out:
return errno_to_blk_status(ret);
}
-/*
- * extent_io.c submission hook. This does the right thing for csum calculation
- * on write, or reading the csums from the tree before a read.
- *
- * Rules about async/sync submit,
- * a) read: sync submit
- *
- * b) write without checksum: sync submit
- *
- * c) write with checksum:
- * c-1) if bio is issued by fsync: sync submit
- * (sync_writers != 0)
- *
- * c-2) if root is reloc root: sync submit
- * (only in case of buffered IO)
- *
- * c-3) otherwise: async submit
- */
-blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
-
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
- blk_status_t ret = 0;
- int skip_sum;
- int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
- skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
-
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
- metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
+ struct btrfs_inode *bi = BTRFS_I(inode);
+ blk_status_t ret;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct page *page = bio_first_bvec_all(bio)->bv_page;
- loff_t file_offset = page_offset(page);
-
- ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
- if (ret)
- goto out;
+ ret = extract_ordered_extent(bi, bio,
+ page_offset(bio_first_bvec_all(bio)->bv_page));
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
+ }
}
- if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
- ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
- if (ret)
- goto out;
+ /*
+ * If we need to checksum, and the I/O is not issued by fsync and
+ * friends (which would have ->sync_writers != 0), defer the
+ * submission to a workqueue to parallelize it.
+ *
+ * Csum items for reloc roots have already been cloned at this point,
+ * so they are handled as part of the no-checksum case.
+ */
+ if (!(bi->flags & BTRFS_INODE_NODATASUM) &&
+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
+ !btrfs_is_data_reloc_root(bi->root)) {
+ if (!atomic_read(&bi->sync_writers) &&
+ btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
+ btrfs_submit_bio_start))
+ return;
- if (bio_flags & EXTENT_BIO_COMPRESSED) {
- /*
- * btrfs_submit_compressed_read will handle completing
- * the bio if there were any errors, so just return
- * here.
- */
- ret = btrfs_submit_compressed_read(inode, bio,
- mirror_num,
- bio_flags);
- goto out_no_endio;
- } else {
- /*
- * Lookup bio sums does extra checks around whether we
- * need to csum or not, which is why we ignore skip_sum
- * here.
- */
- ret = btrfs_lookup_bio_sums(inode, bio, NULL);
- if (ret)
- goto out;
+ ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
}
- goto mapit;
- } else if (async && !skip_sum) {
- /* csum items have already been cloned */
- if (btrfs_is_data_reloc_root(root))
- goto mapit;
- /* we're doing a write, do the async checksumming */
- ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
- 0, btrfs_submit_bio_start);
- goto out;
- } else if (!skip_sum) {
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
- if (ret)
- goto out;
}
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+}
-mapit:
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, enum btrfs_compression_type compress_type)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ blk_status_t ret;
-out:
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ /*
+ * btrfs_submit_compressed_read will handle completing the bio
+ * if there were any errors, so just return here.
+ */
+ btrfs_submit_compressed_read(inode, bio, mirror_num);
+ return;
+ }
+
+ /* Save the original iter for read repair */
+ btrfs_bio(bio)->iter = bio->bi_iter;
+
+ /*
+ * btrfs_lookup_bio_sums() does its own checks for whether we need
+ * checksums or not, which is why we don't check NODATASUM here.
+ */
+ ret = btrfs_lookup_bio_sums(inode, bio, NULL);
if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
}
-out_no_endio:
- return ret;
+
+ btrfs_submit_bio(fs_info, bio, mirror_num);
}
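
The rules from the deleted submission-hook comment now live in the two helpers above; condensed for reference (a restatement, not new behaviour):

	/*
	 * Write: REQ_OP_ZONE_APPEND        -> extract_ordered_extent() first
	 *        csums needed, no fsync
	 *        waiter, not a reloc root  -> async csum via btrfs_wq_submit_bio()
	 *        otherwise                 -> btrfs_csum_one_bio() inline
	 * Read:  compressed                -> btrfs_submit_compressed_read()
	 *        otherwise                 -> btrfs_lookup_bio_sums(), then submit
	 */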
/*
@@ -2642,8 +2818,8 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
ret = set_extent_bit(&inode->io_tree, search_start,
search_start + em_len - 1,
- EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
- GFP_NOFS, NULL);
+ EXTENT_DELALLOC_NEW, cached_state,
+ GFP_NOFS);
next:
search_start = extent_map_end(em);
free_extent_map(em);
@@ -2755,7 +2931,7 @@ again:
if (ret)
goto out_page;
- lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+ lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
if (PageOrdered(page))
@@ -2763,8 +2939,8 @@ again:
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
- unlock_extent_cached(&inode->io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
unlock_page(page);
btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
@@ -2790,8 +2966,7 @@ out_reserved:
if (free_delalloc_space)
btrfs_delalloc_release_space(inode, data_reserved, page_start,
PAGE_SIZE, true);
- unlock_extent_cached(&inode->io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
out_page:
if (ret) {
/*
@@ -2993,8 +3168,10 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
oe->disk_num_bytes);
btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
- if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
- num_bytes = ram_bytes = oe->truncated_len;
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
+ num_bytes = oe->truncated_len;
+ ram_bytes = num_bytes;
+ }
btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
@@ -3020,7 +3197,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
* an ordered extent if the range of bytes in the file it covers are
* fully written.
*/
-static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
+int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
struct btrfs_root *root = inode->root;
@@ -3047,6 +3224,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
clear_bits |= EXTENT_DELALLOC_NEW;
freespace_inode = btrfs_is_free_space_inode(inode);
+ if (!freespace_inode)
+ btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
ret = -EIO;
@@ -3091,7 +3270,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
}
clear_bits |= EXTENT_LOCKED;
- lock_extent_bits(io_tree, start, end, &cached_state);
+ lock_extent(io_tree, start, end, &cached_state);
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
@@ -3113,6 +3292,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
+ btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
} else {
BUG_ON(root == fs_info->tree_root);
ret = insert_ordered_extent_file_extent(trans, ordered_extent);
@@ -3145,7 +3326,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
!test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
clear_extent_bit(&inode->io_tree, start, end,
EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
- 0, 0, &cached_state);
+ &cached_state);
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, root, inode);
@@ -3156,7 +3337,6 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ret = 0;
out:
clear_extent_bit(&inode->io_tree, start, end, clear_bits,
- (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
&cached_state);
if (trans)
@@ -3181,8 +3361,8 @@ out:
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
- /* Drop the cache for the part of the extent we didn't write. */
- btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
+ /* Drop extent maps for the part of the extent we didn't write. */
+ btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
/*
* If the ordered extent had an IOERR or something else went
@@ -3227,71 +3407,82 @@ out:
return ret;
}
-static void finish_ordered_fn(struct btrfs_work *work)
-{
- struct btrfs_ordered_extent *ordered_extent;
- ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
- btrfs_finish_ordered_io(ordered_extent);
-}
-
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate)
{
trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
- btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
- finish_ordered_fn, uptodate);
+ btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate);
+}
+
+/*
+ * Verify the checksum for a single sector without any extra action that depends
+ * on the type of I/O.
+ */
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+ u32 pgoff, u8 *csum, const u8 * const csum_expected)
+{
+ SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+ char *kaddr;
+
+ ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
+
+ shash->tfm = fs_info->csum_shash;
+
+ kaddr = kmap_local_page(page) + pgoff;
+ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
+ kunmap_local(kaddr);
+
+ if (memcmp(csum, csum_expected, fs_info->csum_size))
+ return -EIO;
+ return 0;
+}
+
+static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset)
+{
+ u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;
+
+ return csums + offset_in_sectors * fs_info->csum_size;
}
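
A worked example of the pointer arithmetic above, with hypothetical parameters:

	/*
	 * sectorsize = 4K => sectorsize_bits = 12, csum_size = 32
	 * (e.g. SHA-256). For offset = 8192 into the bio:
	 *
	 *   offset_in_sectors = 8192 >> 12 = 2
	 *   return csums + 2 * 32 = csums + 64  (the third per-sector csum)
	 */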
/*
 * btrfs_check_data_csum - verify checksum of one sector of uncompressed data
* @inode: inode
- * @io_bio: btrfs_io_bio which contains the csum
+ * @bbio: btrfs_bio which contains the csum
* @bio_offset: offset to the beginning of the bio (in bytes)
* @page: page where is the data to be verified
* @pgoff: offset inside the page
- * @start: logical offset in the file
*
* The length of such check is always one sector size.
+ *
+ * When a csum mismatch is detected, we will also report the error and
+ * fill the corrupted range with zero. (Thus it needs the extra parameters.)
*/
-static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
- u32 bio_offset, struct page *page, u32 pgoff,
- u64 start)
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page, u32 pgoff)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- char *kaddr;
u32 len = fs_info->sectorsize;
- const u32 csum_size = fs_info->csum_size;
- unsigned int offset_sectors;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
ASSERT(pgoff + len <= PAGE_SIZE);
- offset_sectors = bio_offset >> fs_info->sectorsize_bits;
- csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
+ csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset);
- kaddr = kmap_atomic(page);
- shash->tfm = fs_info->csum_shash;
-
- crypto_shash_digest(shash, kaddr + pgoff, len, csum);
-
- if (memcmp(csum, csum_expected, csum_size))
+ if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected))
goto zeroit;
-
- kunmap_atomic(kaddr);
return 0;
+
zeroit:
- btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
- bbio->mirror_num);
+ btrfs_print_data_csum_error(BTRFS_I(inode),
+ bbio->file_offset + bio_offset,
+ csum, csum_expected, bbio->mirror_num);
if (bbio->device)
btrfs_dev_stat_inc_and_print(bbio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
- memset(kaddr + pgoff, 1, len);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
+ memzero_page(page, pgoff, len);
return -EIO;
}
@@ -3319,11 +3510,6 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 pg_off;
unsigned int result = 0;
- if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
- btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
- return 0;
- }
-
/*
* This only happens for NODATASUM or compressed read.
* Normally this should be covered by above check for compressed read
@@ -3356,8 +3542,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
EXTENT_NODATASUM);
continue;
}
- ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
- page_offset(page) + pg_off);
+ ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off);
if (ret < 0) {
const int nr_bit = (pg_off - offset_in_page(start)) >>
root->fs_info->sectorsize_bits;
@@ -4146,7 +4331,7 @@ skip_backref:
/*
* If we are in a rename context, we don't need to update anything in the
* log. That will be done later during the rename by btrfs_log_new_name().
- * Besides that, doing it here would only cause extra unncessary btree
+ * Besides that, doing it here would only cause extra unnecessary btree
* operations on the log tree, increasing latency for applications.
*/
if (!rename_ctx) {
@@ -4174,8 +4359,9 @@ err:
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
- dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
+ inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
+ dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime;
+ dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime;
ret = btrfs_update_inode(trans, root, dir);
out:
return ret;
@@ -4212,8 +4398,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
* 1 for the dir index
* 1 for the inode ref
* 1 for the inode
+ * 1 for the parent inode
*/
- return btrfs_start_transaction_fallback_global_rsv(root, 5);
+ return btrfs_start_transaction_fallback_global_rsv(root, 6);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -4336,7 +4523,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = current_time(dir);
+ dir->i_ctime = dir->i_mtime;
ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -4488,6 +4676,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
dest->root_key.objectid);
return -EPERM;
}
+ if (atomic_read(&dest->nr_swapfiles)) {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(fs_info,
+ "attempt to delete subvolume %llu with active swapfile",
+ root->root_key.objectid);
+ return -EPERM;
+ }
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
@@ -4690,16 +4885,16 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
block_end = block_start + blocksize - 1;
ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
- blocksize);
+ blocksize, false);
if (ret < 0) {
- if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
+ if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
/* For nocow case, no need to reserve data space */
only_release_metadata = true;
} else {
goto out;
}
}
- ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize);
+ ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
if (ret < 0) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode, data_reserved,
@@ -4720,7 +4915,7 @@ again:
goto out_unlock;
if (!PageUptodate(page)) {
- ret = btrfs_readpage(NULL, page);
+ ret = btrfs_read_folio(NULL, page_folio(page));
lock_page(page);
if (page->mapping != mapping) {
unlock_page(page);
@@ -4734,12 +4929,11 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, block_start, block_end, &cached_state);
+ lock_extent(io_tree, block_start, block_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
- unlock_extent_cached(io_tree, block_start, block_end,
- &cached_state);
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
unlock_page(page);
put_page(page);
btrfs_start_ordered_extent(ordered, 1);
@@ -4749,13 +4943,12 @@ again:
clear_extent_bit(&inode->io_tree, block_start, block_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, &cached_state);
+ &cached_state);
ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
&cached_state);
if (ret) {
- unlock_extent_cached(io_tree, block_start, block_end,
- &cached_state);
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
goto out_unlock;
}
@@ -4768,16 +4961,15 @@ again:
else
memzero_page(page, (block_start - page_offset(page)) + offset,
len);
- flush_dcache_page(page);
}
btrfs_page_clear_checked(fs_info, page, block_start,
block_end + 1 - block_start);
btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
- unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
+ unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
set_extent_bit(&inode->io_tree, block_start, block_end,
- EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
+ EXTENT_NORESERVE, NULL, GFP_NOFS);
out_unlock:
if (ret) {
@@ -4834,8 +5026,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
return ret;
}
- ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
- offset, 0, 0, len, 0, len, 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
if (ret) {
btrfs_abort_transaction(trans, ret);
} else {
@@ -4859,7 +5050,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
struct extent_io_tree *io_tree = &inode->io_tree;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- struct extent_map_tree *em_tree = &inode->extent_tree;
u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
u64 block_end = ALIGN(size, fs_info->sectorsize);
u64 last_byte;
@@ -4907,10 +5097,11 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
if (err)
break;
- btrfs_drop_extent_cache(inode, cur_offset,
- cur_offset + hole_size - 1, 0);
hole_em = alloc_extent_map();
if (!hole_em) {
+ btrfs_drop_extent_map_range(inode, cur_offset,
+ cur_offset + hole_size - 1,
+ false);
btrfs_set_inode_full_sync(inode);
goto next;
}
@@ -4925,16 +5116,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = fs_info->generation;
- while (1) {
- write_lock(&em_tree->lock);
- err = add_extent_mapping(em_tree, hole_em, 1);
- write_unlock(&em_tree->lock);
- if (err != -EEXIST)
- break;
- btrfs_drop_extent_cache(inode, cur_offset,
- cur_offset +
- hole_size - 1, 0);
- }
+ err = btrfs_replace_extent_map_range(inode, hole_em, true);
free_extent_map(hole_em);
} else {
err = btrfs_inode_set_file_extent_range(inode,
@@ -4950,7 +5132,7 @@ next:
break;
}
free_extent_map(em);
- unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
+ unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
return err;
}
@@ -4971,9 +5153,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
*/
if (newsize != oldsize) {
inode_inc_iversion(inode);
- if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
- inode->i_ctime = inode->i_mtime =
- current_time(inode);
+ if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = inode->i_mtime;
+ }
}
if (newsize > oldsize) {
@@ -5083,7 +5266,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
* While truncating the inode pages during eviction, we get the VFS
* calling btrfs_invalidate_folio() against each folio of the inode. This
* is slow because the calls to btrfs_invalidate_folio() result in a
- * huge amount of calls to lock_extent_bits() and clear_extent_bit(),
+ * huge amount of calls to lock_extent() and clear_extent_bit(),
* which keep merging and splitting extent_state structures over and over,
* wasting lots of time.
*
@@ -5095,29 +5278,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
static void evict_inode_truncate_pages(struct inode *inode)
{
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
struct rb_node *node;
ASSERT(inode->i_state & I_FREEING);
truncate_inode_pages_final(&inode->i_data);
- write_lock(&map_tree->lock);
- while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
- struct extent_map *em;
-
- node = rb_first_cached(&map_tree->map);
- em = rb_entry(node, struct extent_map, rb_node);
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
- remove_extent_mapping(map_tree, em);
- free_extent_map(em);
- if (need_resched()) {
- write_unlock(&map_tree->lock);
- cond_resched();
- write_lock(&map_tree->lock);
- }
- }
- write_unlock(&map_tree->lock);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
/*
* Keep looping until we have no more ranges in the io tree.
@@ -5150,7 +5316,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
state_flags = state->state;
spin_unlock(&io_tree->lock);
- lock_extent_bits(io_tree, start, end, &cached_state);
+ lock_extent(io_tree, start, end, &cached_state);
/*
* If still has DELALLOC flag, the extent didn't reach disk,
@@ -5165,8 +5331,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
end - start + 1);
clear_extent_bit(io_tree, start, end,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+ EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
&cached_state);
cond_resched();
@@ -5281,7 +5446,7 @@ void btrfs_evict_inode(struct inode *inode)
if (!rsv)
goto no_delete;
rsv->size = btrfs_calc_metadata_size(fs_info, 1);
- rsv->failfast = 1;
+ rsv->failfast = true;
btrfs_i_size_write(BTRFS_I(inode), 0);
@@ -5519,6 +5684,11 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
BTRFS_I(inode)->location.offset = 0;
BTRFS_I(inode)->root = btrfs_grab_root(args->root);
BUG_ON(args->root && !BTRFS_I(inode)->root);
+
+ if (args->root && args->root == args->root->fs_info->tree_root &&
+ args->ino != BTRFS_BTREE_INODE_OBJECTID)
+ set_bit(BTRFS_INODE_FREE_SPACE_INODE,
+ &BTRFS_I(inode)->runtime_flags);
return 0;
}
@@ -5673,14 +5843,14 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (ret != -ENOENT)
inode = ERR_PTR(ret);
else
- inode = new_simple_dir(dir->i_sb, &location, sub_root);
+ inode = new_simple_dir(dir->i_sb, &location, root);
} else {
inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
- }
- if (root != sub_root)
btrfs_put_root(sub_root);
- if (!IS_ERR(inode) && root != sub_root) {
+ if (IS_ERR(inode))
+ return inode;
+
down_read(&fs_info->cleanup_work_sem);
if (!sb_rdonly(inode->i_sb))
ret = btrfs_orphan_cleanup(sub_root);
@@ -5786,8 +5956,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
struct list_head ins_list;
struct list_head del_list;
int ret;
- struct extent_buffer *leaf;
- int slot;
char *name_ptr;
int name_len;
int entries = 0;
@@ -5814,35 +5982,19 @@ again:
key.offset = ctx->pos;
key.objectid = btrfs_ino(BTRFS_I(inode));
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto err;
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, ret) {
struct dir_entry *entry;
-
- leaf = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto err;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ struct extent_buffer *leaf = path->nodes[0];
if (found_key.objectid != key.objectid)
break;
if (found_key.type != BTRFS_DIR_INDEX_KEY)
break;
if (found_key.offset < ctx->pos)
- goto next;
+ continue;
if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
- goto next;
- di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ continue;
+ di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
name_len = btrfs_dir_name_len(leaf, di);
if ((total_len + sizeof(struct dir_entry) + name_len) >=
PAGE_SIZE) {
@@ -5869,9 +6021,11 @@ again:
entries++;
addr += sizeof(struct dir_entry) + name_len;
total_len += sizeof(struct dir_entry) + name_len;
-next:
- path->slots[0]++;
}
+ /* Catch error encountered during iteration */
+ if (ret < 0)
+ goto err;
+
btrfs_release_path(path);
ret = btrfs_filldir(private->filldir_buf, entries, ctx);
@@ -6059,6 +6213,57 @@ static int btrfs_insert_inode_locked(struct inode *inode)
btrfs_find_actor, &args);
}
+int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
+ unsigned int *trans_num_items)
+{
+ struct inode *dir = args->dir;
+ struct inode *inode = args->inode;
+ int ret;
+
+ ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
+ if (ret)
+ return ret;
+
+ /* 1 to add inode item */
+ *trans_num_items = 1;
+ /* 1 to add compression property */
+ if (BTRFS_I(dir)->prop_compress)
+ (*trans_num_items)++;
+ /* 1 to add default ACL xattr */
+ if (args->default_acl)
+ (*trans_num_items)++;
+ /* 1 to add access ACL xattr */
+ if (args->acl)
+ (*trans_num_items)++;
+#ifdef CONFIG_SECURITY
+ /* 1 to add LSM xattr */
+ if (dir->i_security)
+ (*trans_num_items)++;
+#endif
+ if (args->orphan) {
+ /* 1 to add orphan item */
+ (*trans_num_items)++;
+ } else {
+ /*
+ * 1 to add dir item
+ * 1 to add dir index
+ * 1 to update parent inode item
+ *
+ * No need for 1 unit for the inode ref item because it is
+ * inserted in a batch together with the inode item at
+ * btrfs_create_new_inode().
+ */
+ *trans_num_items += 3;
+ }
+ return 0;
+}
+
+void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
+{
+ posix_acl_release(args->acl);
+ posix_acl_release(args->default_acl);
+}
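
As a worked example of the reservation count (a hypothetical case, not taken from the patch): a regular, non-orphan create in a directory with a compression property, where posix_acl_create() returned both a default and an access ACL and CONFIG_SECURITY is enabled, reserves:

	/*
	 * 1 (inode item) + 1 (compression prop) + 1 (default ACL xattr) +
	 * 1 (access ACL xattr) + 1 (LSM xattr) + 3 (dir item + dir index +
	 * parent inode update) = 8 transaction units
	 */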
+
/*
* Inherit flags from the parent inode.
*
@@ -6068,9 +6273,6 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
{
unsigned int flags;
- if (!dir)
- return;
-
flags = BTRFS_I(dir)->flags;
if (flags & BTRFS_INODE_NOCOMPRESS) {
@@ -6090,76 +6292,86 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
btrfs_sync_inode_flags_to_i_flags(inode);
}
-static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct user_namespace *mnt_userns,
- struct inode *dir,
- const char *name, int name_len,
- u64 ref_objectid, u64 objectid,
- umode_t mode, u64 *index)
+int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_new_inode_args *args)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
+ struct inode *dir = args->dir;
+ struct inode *inode = args->inode;
+ const char *name = args->orphan ? NULL : args->dentry->d_name.name;
+ int name_len = args->orphan ? 0 : args->dentry->d_name.len;
+ struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
+ struct btrfs_root *root;
struct btrfs_inode_item *inode_item;
struct btrfs_key *location;
struct btrfs_path *path;
+ u64 objectid;
struct btrfs_inode_ref *ref;
struct btrfs_key key[2];
u32 sizes[2];
struct btrfs_item_batch batch;
unsigned long ptr;
- unsigned int nofs_flag;
int ret;
path = btrfs_alloc_path();
if (!path)
- return ERR_PTR(-ENOMEM);
-
- nofs_flag = memalloc_nofs_save();
- inode = new_inode(fs_info->sb);
- memalloc_nofs_restore(nofs_flag);
- if (!inode) {
- btrfs_free_path(path);
- return ERR_PTR(-ENOMEM);
- }
+ return -ENOMEM;
- /*
- * O_TMPFILE, set link count to 0, so that after this point,
- * we fill in an inode item with the correct link count.
- */
- if (!name)
- set_nlink(inode, 0);
+ if (!args->subvol)
+ BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
+ root = BTRFS_I(inode)->root;
- /*
- * we have to initialize this early, so we can reclaim the inode
- * number if we fail afterwards in this function.
- */
+ ret = btrfs_get_free_objectid(root, &objectid);
+ if (ret)
+ goto out;
inode->i_ino = objectid;
- if (dir && name) {
+ if (args->orphan) {
+ /*
+ * O_TMPFILE, set link count to 0, so that after this point, we
+ * fill in an inode item with the correct link count.
+ */
+ set_nlink(inode, 0);
+ } else {
trace_btrfs_inode_request(dir);
- ret = btrfs_set_inode_index(BTRFS_I(dir), index);
- if (ret) {
- btrfs_free_path(path);
- iput(inode);
- return ERR_PTR(ret);
- }
- } else if (dir) {
- *index = 0;
+ ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
+ if (ret)
+ goto out;
}
- /*
- * index_cnt is ignored for everything but a dir,
- * btrfs_set_inode_index_count has an explanation for the magic
- * number
- */
- BTRFS_I(inode)->index_cnt = 2;
- BTRFS_I(inode)->dir_index = *index;
- BTRFS_I(inode)->root = btrfs_grab_root(root);
+ /* index_cnt is ignored for everything but a dir. */
+ BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
/*
+ * Subvolumes don't inherit flags from their parent directory.
+ * This was probably an accident originally, but we can't change it
+ * now without compatibility issues.
+ */
+ if (!args->subvol)
+ btrfs_inherit_iflags(inode, dir);
+
+ if (S_ISREG(inode->i_mode)) {
+ if (btrfs_test_opt(fs_info, NODATASUM))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ if (btrfs_test_opt(fs_info, NODATACOW))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
+ }
+
+ location = &BTRFS_I(inode)->location;
+ location->objectid = objectid;
+ location->offset = 0;
+ location->type = BTRFS_INODE_ITEM_KEY;
+
+ ret = btrfs_insert_inode_locked(inode);
+ if (ret < 0) {
+ if (!args->orphan)
+ BTRFS_I(dir)->index_cnt--;
+ goto out;
+ }
+
+ /*
* We could have gotten an inode number from somebody who was fsynced
* and then removed in this same transaction, so let's just set full
* sync since it will be a full sync anyway and this will blow away the
@@ -6173,7 +6385,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
sizes[0] = sizeof(struct btrfs_inode_item);
- if (name) {
+ if (!args->orphan) {
/*
* Start new inodes with an inode_ref. This is slightly more
* efficient for small numbers of hard links since they will
@@ -6182,64 +6394,101 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
*/
key[1].objectid = objectid;
key[1].type = BTRFS_INODE_REF_KEY;
- key[1].offset = ref_objectid;
-
- sizes[1] = name_len + sizeof(*ref);
- }
-
- location = &BTRFS_I(inode)->location;
- location->objectid = objectid;
- location->offset = 0;
- location->type = BTRFS_INODE_ITEM_KEY;
-
- ret = btrfs_insert_inode_locked(inode);
- if (ret < 0) {
- iput(inode);
- goto fail;
+ if (args->subvol) {
+ key[1].offset = objectid;
+ sizes[1] = 2 + sizeof(*ref);
+ } else {
+ key[1].offset = btrfs_ino(BTRFS_I(dir));
+ sizes[1] = name_len + sizeof(*ref);
+ }
}
batch.keys = &key[0];
batch.data_sizes = &sizes[0];
- batch.total_data_size = sizes[0] + (name ? sizes[1] : 0);
- batch.nr = name ? 2 : 1;
+ batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
+ batch.nr = args->orphan ? 1 : 2;
ret = btrfs_insert_empty_items(trans, root, path, &batch);
- if (ret != 0)
- goto fail_unlock;
-
- inode_init_owner(mnt_userns, inode, dir, mode);
- inode_set_bytes(inode, 0);
+ if (ret != 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
inode->i_mtime = current_time(inode);
inode->i_atime = inode->i_mtime;
inode->i_ctime = inode->i_mtime;
BTRFS_I(inode)->i_otime = inode->i_mtime;
+ /*
+ * We're going to fill the inode item now, so at this point the inode
+ * must be fully initialized.
+ */
+
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
sizeof(*inode_item));
fill_inode_item(trans, path->nodes[0], inode_item, inode);
- if (name) {
+ if (!args->orphan) {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
struct btrfs_inode_ref);
- btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
- btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
ptr = (unsigned long)(ref + 1);
- write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ if (args->subvol) {
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
+ write_extent_buffer(path->nodes[0], "..", ptr, 2);
+ } else {
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref,
+ BTRFS_I(inode)->dir_index);
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ }
}
btrfs_mark_buffer_dirty(path->nodes[0]);
+ /*
+ * We don't need the path anymore, and inheriting properties, adding
+ * ACLs, security xattrs, the orphan item or the link will each result
+ * in allocating yet another path. So just free our path.
+ */
btrfs_free_path(path);
+ path = NULL;
- btrfs_inherit_iflags(inode, dir);
+ if (args->subvol) {
+ struct inode *parent;
- if (S_ISREG(mode)) {
- if (btrfs_test_opt(fs_info, NODATASUM))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
- if (btrfs_test_opt(fs_info, NODATACOW))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
- BTRFS_INODE_NODATASUM;
+ /*
+ * Subvolumes inherit properties from their parent subvolume,
+ * not the directory they were created in.
+ */
+ parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_I(dir)->root);
+ if (IS_ERR(parent)) {
+ ret = PTR_ERR(parent);
+ } else {
+ ret = btrfs_inode_inherit_props(trans, inode, parent);
+ iput(parent);
+ }
+ } else {
+ ret = btrfs_inode_inherit_props(trans, inode, dir);
+ }
+ if (ret) {
+ btrfs_err(fs_info,
+ "error inheriting props for ino %llu (root %llu): %d",
+ btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
+ ret);
+ }
+
+ /*
+ * Subvolumes don't inherit ACLs or get passed to the LSM. This is
+ * probably a bug.
+ */
+ if (!args->subvol) {
+ ret = btrfs_init_inode_security(trans, args);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
}
inode_tree_add(inode);
@@ -6249,21 +6498,29 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_update_root_times(trans, root);
- ret = btrfs_inode_inherit_props(trans, inode, dir);
- if (ret)
- btrfs_err(fs_info,
- "error inheriting props for ino %llu (root %llu): %d",
- btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);
+ if (args->orphan) {
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ } else {
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
+ name_len, 0, BTRFS_I(inode)->dir_index);
+ }
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
- return inode;
+ return 0;
-fail_unlock:
+discard:
+ /*
+	 * discard_new_inode() calls iput(), but the caller owns the reference
+	 * to the inode, so take an extra one here to keep it alive.
+ */
+ ihold(inode);
discard_new_inode(inode);
-fail:
- if (dir && name)
- BTRFS_I(dir)->index_cnt--;
+out:
btrfs_free_path(path);
- return ERR_PTR(ret);
+ return ret;
}
/*
@@ -6355,147 +6612,71 @@ fail_dir_item:
return ret;
}
-static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
- struct btrfs_inode *dir, struct dentry *dentry,
- struct btrfs_inode *inode, int backref, u64 index)
-{
- int err = btrfs_add_link(trans, dir, inode,
- dentry->d_name.name, dentry->d_name.len,
- backref, index);
- if (err > 0)
- err = -EEXIST;
- return err;
-}
-
-static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t rdev)
+static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
+ struct inode *inode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct inode *inode = NULL;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ .inode = inode,
+ };
+ unsigned int trans_num_items;
+ struct btrfs_trans_handle *trans;
int err;
- u64 objectid;
- u64 index = 0;
- /*
- * 2 for inode item and ref
- * 2 for dir items
- * 1 for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- err = btrfs_get_free_objectid(root, &objectid);
+ err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
if (err)
- goto out_unlock;
+ goto out_inode;
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name, dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_unlock;
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_new_inode_args;
}
- /*
- * If the active LSM wants to access the inode during
- * d_instantiate it needs these. Smack checks to see
- * if the filesystem supports xattrs by looking at the
- * ops vector.
- */
- inode->i_op = &btrfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode, rdev);
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err)
- goto out_unlock;
-
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
- 0, index);
- if (err)
- goto out_unlock;
-
- btrfs_update_inode(trans, root, BTRFS_I(inode));
- d_instantiate_new(dentry, inode);
+ err = btrfs_create_new_inode(trans, &new_inode_args);
+ if (!err)
+ d_instantiate_new(dentry, inode);
-out_unlock:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (err)
+ iput(inode);
return err;
}
-static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode, bool excl)
+static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(dir)->root;
- struct inode *inode = NULL;
- int err;
- u64 objectid;
- u64 index = 0;
+ struct inode *inode;
- /*
- * 2 for inode item and ref
- * 2 for dir items
- * 1 for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
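+	/*
+	 * Set i_op before btrfs_create_common() so that an active LSM (e.g.
+	 * Smack) can see xattr support when d_instantiate() runs.
+	 */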
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, rdev);
+ return btrfs_create_common(dir, dentry, inode);
+}
- err = btrfs_get_free_objectid(root, &objectid);
- if (err)
- goto out_unlock;
+static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
+{
+ struct inode *inode;
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name, dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_unlock;
- }
- /*
- * If the active LSM wants to access the inode during
- * d_instantiate it needs these. Smack checks to see
- * if the filesystem supports xattrs by looking at the
- * ops vector.
- */
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err)
- goto out_unlock;
-
- err = btrfs_update_inode(trans, root, BTRFS_I(inode));
- if (err)
- goto out_unlock;
-
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
- 0, index);
- if (err)
- goto out_unlock;
-
- d_instantiate_new(dentry, inode);
-
-out_unlock:
- btrfs_end_transaction(trans);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
- btrfs_btree_balance_dirty(fs_info);
- return err;
+ return btrfs_create_common(dir, dentry, inode);
}
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -6541,8 +6722,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
- 1, index);
+ err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ dentry->d_name.name, dentry->d_name.len, 1, index);
if (err) {
drop_inode = 1;
@@ -6579,66 +6760,15 @@ fail:
static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct inode *inode = NULL;
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(dir)->root;
- int err = 0;
- u64 objectid = 0;
- u64 index = 0;
-
- /*
- * 2 items for inode and ref
- * 2 items for dir items
- * 1 for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- err = btrfs_get_free_objectid(root, &objectid);
- if (err)
- goto out_fail;
-
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name, dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)), objectid,
- S_IFDIR | mode, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_fail;
- }
+ struct inode *inode;
- /* these must be set before we unlock the inode */
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode);
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
- if (err)
- goto out_fail;
-
- btrfs_i_size_write(BTRFS_I(inode), 0);
- err = btrfs_update_inode(trans, root, BTRFS_I(inode));
- if (err)
- goto out_fail;
-
- err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- dentry->d_name.name,
- dentry->d_name.len, 0, index);
- if (err)
- goto out_fail;
-
- d_instantiate_new(dentry, inode);
-
-out_fail:
- btrfs_end_transaction(trans);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
- btrfs_btree_balance_dirty(fs_info);
- return err;
+ return btrfs_create_common(dir, dentry, inode);
}
static noinline int uncompress_inline(struct btrfs_path *path,
@@ -6719,7 +6849,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_io_tree *io_tree = &inode->io_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -6882,8 +7011,6 @@ next:
}
flush_dcache_page(page);
}
- set_extent_uptodate(io_tree, em->start,
- extent_map_end(em) - 1, NULL, GFP_NOFS);
goto insert;
}
not_found:
@@ -6917,133 +7044,6 @@ out:
return em;
}
-struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
- u64 start, u64 len)
-{
- struct extent_map *em;
- struct extent_map *hole_em = NULL;
- u64 delalloc_start = start;
- u64 end;
- u64 delalloc_len;
- u64 delalloc_end;
- int err = 0;
-
- em = btrfs_get_extent(inode, NULL, 0, start, len);
- if (IS_ERR(em))
- return em;
- /*
- * If our em maps to:
- * - a hole or
- * - a pre-alloc extent,
- * there might actually be delalloc bytes behind it.
- */
- if (em->block_start != EXTENT_MAP_HOLE &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return em;
- else
- hole_em = em;
-
- /* check to see if we've wrapped (len == -1 or similar) */
- end = start + len;
- if (end < start)
- end = (u64)-1;
- else
- end -= 1;
-
- em = NULL;
-
- /* ok, we didn't find anything, lets look for delalloc */
- delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
- end, len, EXTENT_DELALLOC, 1);
- delalloc_end = delalloc_start + delalloc_len;
- if (delalloc_end < delalloc_start)
- delalloc_end = (u64)-1;
-
- /*
- * We didn't find anything useful, return the original results from
- * get_extent()
- */
- if (delalloc_start > end || delalloc_end <= start) {
- em = hole_em;
- hole_em = NULL;
- goto out;
- }
-
- /*
- * Adjust the delalloc_start to make sure it doesn't go backwards from
- * the start they passed in
- */
- delalloc_start = max(start, delalloc_start);
- delalloc_len = delalloc_end - delalloc_start;
-
- if (delalloc_len > 0) {
- u64 hole_start;
- u64 hole_len;
- const u64 hole_end = extent_map_end(hole_em);
-
- em = alloc_extent_map();
- if (!em) {
- err = -ENOMEM;
- goto out;
- }
-
- ASSERT(hole_em);
- /*
- * When btrfs_get_extent can't find anything it returns one
- * huge hole
- *
- * Make sure what it found really fits our range, and adjust to
- * make sure it is based on the start from the caller
- */
- if (hole_end <= start || hole_em->start > end) {
- free_extent_map(hole_em);
- hole_em = NULL;
- } else {
- hole_start = max(hole_em->start, start);
- hole_len = hole_end - hole_start;
- }
-
- if (hole_em && delalloc_start > hole_start) {
- /*
- * Our hole starts before our delalloc, so we have to
- * return just the parts of the hole that go until the
- * delalloc starts
- */
- em->len = min(hole_len, delalloc_start - hole_start);
- em->start = hole_start;
- em->orig_start = hole_start;
- /*
- * Don't adjust block start at all, it is fixed at
- * EXTENT_MAP_HOLE
- */
- em->block_start = hole_em->block_start;
- em->block_len = hole_len;
- if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
- } else {
- /*
- * Hole is out of passed range or it starts after
- * delalloc range
- */
- em->start = delalloc_start;
- em->len = delalloc_len;
- em->orig_start = delalloc_start;
- em->block_start = EXTENT_MAP_DELALLOC;
- em->block_len = delalloc_len;
- }
- } else {
- return hole_em;
- }
-out:
-
- free_extent_map(hole_em);
- if (err) {
- free_extent_map(em);
- return ERR_PTR(err);
- }
- return em;
-}
-
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
const u64 start,
const u64 len,
@@ -7073,7 +7073,8 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
if (ret) {
if (em) {
free_extent_map(em);
- btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+ btrfs_drop_extent_map_range(inode, start,
+ start + len - 1, false);
}
em = ERR_PTR(ret);
}
@@ -7144,9 +7145,10 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool strict)
+ u64 *ram_bytes, bool nowait, bool strict)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct can_nocow_file_extent_args nocow_args = { 0 };
struct btrfs_path *path;
int ret;
struct extent_buffer *leaf;
@@ -7154,35 +7156,29 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
- u64 disk_bytenr;
- u64 backref_offset;
- u64 extent_end;
- u64 num_bytes;
- int slot;
int found_type;
- bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
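+	/* In nowait mode, tree searches return -EAGAIN instead of blocking on locks. */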
+ path->nowait = nowait;
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(inode)), offset, 0);
if (ret < 0)
goto out;
- slot = path->slots[0];
if (ret == 1) {
- if (slot == 0) {
+ if (path->slots[0] == 0) {
/* can't find the item, must cow */
ret = 0;
goto out;
}
- slot--;
+ path->slots[0]--;
}
ret = 0;
leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, slot);
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
key.type != BTRFS_EXTENT_DATA_KEY) {
/* not our file or wrong item type, must cow */
@@ -7194,55 +7190,38 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
goto out;
}
- fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- found_type = btrfs_file_extent_type(leaf, fi);
- if (found_type != BTRFS_FILE_EXTENT_REG &&
- found_type != BTRFS_FILE_EXTENT_PREALLOC) {
- /* not a regular extent, must cow */
- goto out;
- }
-
- if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+ if (btrfs_file_extent_end(path) <= offset)
goto out;
- extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
- if (extent_end <= offset)
- goto out;
+ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, fi);
+ if (ram_bytes)
+ *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (disk_bytenr == 0)
- goto out;
+ nocow_args.start = offset;
+ nocow_args.end = offset + *len - 1;
+ nocow_args.strict = strict;
+ nocow_args.free_path = true;
- if (btrfs_file_extent_compression(leaf, fi) ||
- btrfs_file_extent_encryption(leaf, fi) ||
- btrfs_file_extent_other_encoding(leaf, fi))
- goto out;
+ ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
+ /* can_nocow_file_extent() has freed the path. */
+ path = NULL;
- /*
- * Do the same check as in btrfs_cross_ref_exist but without the
- * unnecessary search.
- */
- if (!strict &&
- (btrfs_file_extent_generation(leaf, fi) <=
- btrfs_root_last_snapshot(&root->root_item)))
+ if (ret != 1) {
+ /* Treat errors as not being able to NOCOW. */
+ ret = 0;
goto out;
-
- backref_offset = btrfs_file_extent_offset(leaf, fi);
-
- if (orig_start) {
- *orig_start = key.offset - backref_offset;
- *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
- *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
}
- if (btrfs_extent_readonly(fs_info, disk_bytenr))
+ ret = 0;
+ if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
goto out;
- num_bytes = min(offset + *len, extent_end) - offset;
- if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 range_end;
- range_end = round_up(offset + num_bytes,
+ range_end = round_up(offset + nocow_args.num_bytes,
root->fs_info->sectorsize) - 1;
ret = test_range_bit(io_tree, offset, range_end,
EXTENT_DELALLOC, 0, NULL);
@@ -7252,36 +7231,12 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
}
}
- btrfs_release_path(path);
+ if (orig_start)
+ *orig_start = key.offset - nocow_args.extent_offset;
+ if (orig_block_len)
+ *orig_block_len = nocow_args.disk_num_bytes;
- /*
- * look for other files referencing this extent, if we
- * find any we must cow
- */
-
- ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
- key.offset - backref_offset, disk_bytenr,
- strict);
- if (ret) {
- ret = 0;
- goto out;
- }
-
- /*
- * adjust disk_bytenr and num_bytes to cover just the bytes
- * in this extent we are about to write. If there
- * are any csums in that range we have to cow in order
- * to keep the csums correct
- */
- disk_bytenr += backref_offset;
- disk_bytenr += offset - key.offset;
- if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
- goto out;
- /*
- * all of the above have passed, it is safe to overwrite this extent
- * without cow
- */
- *len = num_bytes;
+ *len = nocow_args.num_bytes;
ret = 1;
out:
btrfs_free_path(path);
@@ -7289,14 +7244,22 @@ out:
}
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state, bool writing)
+ struct extent_state **cached_state,
+ unsigned int iomap_flags)
{
+ const bool writing = (iomap_flags & IOMAP_WRITE);
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
int ret = 0;
while (1) {
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ if (nowait) {
+ if (!try_lock_extent(io_tree, lockstart, lockend))
+ return -EAGAIN;
+ } else {
+ lock_extent(io_tree, lockstart, lockend, cached_state);
+ }
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure there's no ordered
@@ -7317,10 +7280,14 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
lockstart, lockend)))
break;
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ unlock_extent(io_tree, lockstart, lockend, cached_state);
if (ordered) {
+ if (nowait) {
+ btrfs_put_ordered_extent(ordered);
+ ret = -EAGAIN;
+ break;
+ }
/*
* If we are doing a DIO read and the ordered extent we
* found is for a buffered write, we can not wait for it
@@ -7340,7 +7307,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
btrfs_start_ordered_extent(ordered, 1);
else
- ret = -ENOTBLK;
+ ret = nowait ? -EAGAIN : -ENOTBLK;
btrfs_put_ordered_extent(ordered);
} else {
/*
@@ -7356,7 +7323,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
* ordered extent to complete while holding a lock on
* that page.
*/
- ret = -ENOTBLK;
+ ret = nowait ? -EAGAIN : -ENOTBLK;
}
if (ret)
@@ -7375,7 +7342,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
u64 ram_bytes, int compress_type,
int type)
{
- struct extent_map_tree *em_tree;
struct extent_map *em;
int ret;
@@ -7384,7 +7350,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_REGULAR);
- em_tree = &inode->extent_tree;
em = alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
@@ -7405,18 +7370,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
em->compress_type = compress_type;
}
- do {
- btrfs_drop_extent_cache(inode, em->start,
- em->start + em->len - 1, 0);
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 1);
- write_unlock(&em_tree->lock);
- /*
- * The caller has taken lock_extent(), who could race with us
- * to add em?
- */
- } while (ret == -EEXIST);
-
+ ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
free_extent_map(em);
return ERR_PTR(ret);
@@ -7430,14 +7384,18 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
static int btrfs_get_blocks_direct_write(struct extent_map **map,
struct inode *inode,
struct btrfs_dio_data *dio_data,
- u64 start, u64 len)
+ u64 start, u64 len,
+ unsigned int iomap_flags)
{
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em = *map;
int type;
u64 block_start, orig_start, orig_block_len, ram_bytes;
+ struct btrfs_block_group *bg;
bool can_nocow = false;
bool space_reserved = false;
+ u64 prev_len;
int ret = 0;
/*
@@ -7460,21 +7418,27 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes, false) == 1 &&
- btrfs_inc_nocow_writers(fs_info, block_start))
- can_nocow = true;
+ &orig_block_len, &ram_bytes, false, false) == 1) {
+ bg = btrfs_inc_nocow_writers(fs_info, block_start);
+ if (bg)
+ can_nocow = true;
+ }
}
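+	/*
+	 * len may be shrunk below to match the extent we get back; keep the
+	 * original length so any excess reservation can be released.
+	 */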
+ prev_len = len;
if (can_nocow) {
struct extent_map *em2;
/* We can NOCOW, so only need to reserve metadata space. */
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len);
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ nowait);
if (ret < 0) {
/* Our caller expects us to free the input extent map. */
free_extent_map(em);
*map = NULL;
- btrfs_dec_nocow_writers(fs_info, block_start);
+ btrfs_dec_nocow_writers(bg);
+ if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
+ ret = -EAGAIN;
goto out;
}
space_reserved = true;
@@ -7483,27 +7447,40 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
orig_start, block_start,
len, orig_block_len,
ram_bytes, type);
- btrfs_dec_nocow_writers(fs_info, block_start);
+ btrfs_dec_nocow_writers(bg);
if (type == BTRFS_ORDERED_PREALLOC) {
free_extent_map(em);
- *map = em = em2;
+ *map = em2;
+ em = em2;
}
if (IS_ERR(em2)) {
ret = PTR_ERR(em2);
goto out;
}
- } else {
- const u64 prev_len = len;
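+		/*
+		 * Record that we went the NOCOW path so iomap_begin can
+		 * release the data space it reserved up front.
+		 */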
+ dio_data->nocow_done = true;
+ } else {
/* Our caller expects us to free the input extent map. */
free_extent_map(em);
*map = NULL;
- /* We have to COW, so need to reserve metadata and data space. */
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
- &dio_data->data_reserved,
- start, len);
+ if (nowait)
+ return -EAGAIN;
+
+ /*
+ * If we could not allocate data space before locking the file
+ * range and we can't do a NOCOW write, then we have to fail.
+ */
+ if (!dio_data->data_space_reserved)
+ return -ENOSPC;
+
+ /*
+ * We have to COW and we have already reserved data space before,
+ * so now we reserve only metadata.
+ */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ false);
if (ret < 0)
goto out;
space_reserved = true;
@@ -7516,17 +7493,15 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
*map = em;
len = min(len, em->len - (start - em->start));
if (len < prev_len)
- btrfs_delalloc_release_space(BTRFS_I(inode),
- dio_data->data_reserved,
- start + len, prev_len - len,
- true);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ prev_len - len, true);
}
/*
* We have created our ordered extent, so we can now release our reservation
* for an outstanding extent.
*/
- btrfs_delalloc_release_extents(BTRFS_I(inode), len);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
/*
* Need to update the i_size under the extent lock so buffered
@@ -7537,15 +7512,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
out:
if (ret && space_reserved) {
btrfs_delalloc_release_extents(BTRFS_I(inode), len);
- if (can_nocow) {
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
- } else {
- btrfs_delalloc_release_space(BTRFS_I(inode),
- dio_data->data_reserved,
- start, len, true);
- extent_changeset_free(dio_data->data_reserved);
- dio_data->data_reserved = NULL;
- }
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
}
return ret;
}
@@ -7554,51 +7521,104 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
loff_t length, unsigned int flags, struct iomap *iomap,
struct iomap *srcmap)
{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
struct extent_state *cached_state = NULL;
- struct btrfs_dio_data *dio_data = NULL;
+ struct btrfs_dio_data *dio_data = iter->private;
u64 lockstart, lockend;
const bool write = !!(flags & IOMAP_WRITE);
int ret = 0;
u64 len = length;
+ const u64 data_alloc_len = length;
bool unlock_extents = false;
+ /*
+ * We could potentially fault if we have a buffer > PAGE_SIZE, and if
+ * we're NOWAIT we may submit a bio for a partial range and return
+ * EIOCBQUEUED, which would result in an errant short read.
+ *
+ * The best way to handle this would be to allow for partial completions
+ * of iocb's, so we could submit the partial bio, return and fault in
+ * the rest of the pages, and then submit the io for the rest of the
+ * range. However we don't have that currently, so simply return
+ * -EAGAIN at this point so that the normal path is used.
+ */
+ if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
+ return -EAGAIN;
+
+ /*
+ * Cap the size of reads to that usually seen in buffered I/O as we need
+ * to allocate a contiguous array for the checksums.
+ */
if (!write)
- len = min_t(u64, len, fs_info->sectorsize);
+ len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
lockstart = start;
lockend = start + len - 1;
/*
- * The generic stuff only does filemap_write_and_wait_range, which
- * isn't enough if we've written compressed pages to this area, so we
- * need to flush the dirty pages again to make absolutely sure that any
- * outstanding dirty pages are on disk.
+ * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
+ * enough if we've written compressed pages to this area, so we need to
+ * flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk - the first flush only starts
+ * compression on the data, while keeping the pages locked, so by the
+ * time the second flush returns we know bios for the compressed pages
+ * were submitted and finished, and the pages no longer under writeback.
+ *
+ * If we have a NOWAIT request and we have any pages in the range that
+ * are locked, likely due to compression still in progress, we don't want
+ * to block on page locks. We also don't want to block on pages marked as
+ * dirty or under writeback (same as for the non-compression case).
+ * iomap_dio_rw() did the same check, but after that and before we got
+ * here, mmap'ed writes may have happened or buffered reads started
+ * (readpage() and readahead(), which lock pages), as we haven't locked
+ * the file range yet.
*/
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags)) {
- ret = filemap_fdatawrite_range(inode->i_mapping, start,
- start + length - 1);
- if (ret)
- return ret;
+ if (flags & IOMAP_NOWAIT) {
+ if (filemap_range_needs_writeback(inode->i_mapping,
+ lockstart, lockend))
+ return -EAGAIN;
+ } else {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+ if (ret)
+ return ret;
+ }
}
- dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
- if (!dio_data)
- return -ENOMEM;
-
- iomap->private = dio_data;
+ memset(dio_data, 0, sizeof(*dio_data));
+ /*
+ * We always try to allocate data space and must do it before locking
+ * the file range, to avoid deadlocks with concurrent writes to the same
+ * range if the range has several extents and the writes don't expand the
+ * current i_size (the inode lock is taken in shared mode). If we fail to
+ * allocate data space here we continue and later, after locking the
+	 * file range, we fail with ENOSPC only if we figure out we cannot do a
+ * NOCOW write.
+ */
+ if (write && !(flags & IOMAP_NOWAIT)) {
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, data_alloc_len, false);
+ if (!ret)
+ dio_data->data_space_reserved = true;
+ else if (ret && !(BTRFS_I(inode)->flags &
+ (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ goto err;
+ }
/*
* If this errors out it's because we couldn't invalidate pagecache for
- * this range and we need to fallback to buffered.
+	 * this range and we need to fall back to buffered IO, or we are
+	 * doing a NOWAIT read/write and we would have to block.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
- ret = -ENOTBLK;
+ ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
+ if (ret < 0)
goto err;
- }
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
if (IS_ERR(em)) {
@@ -7623,7 +7643,19 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
- ret = -ENOTBLK;
+		/*
+		 * If we are in a NOWAIT context, return -EAGAIN in order to
+		 * fall back to buffered IO. We do this not only because
+		 * buffered IO can block (it has no NOWAIT support at the
+		 * moment), but also to avoid returning short reads to user
+		 * space. A short read happens if we were able to read some
+		 * data from previous non-compressed extents: when we then
+		 * fall back to buffered IO, btrfs_file_read_iter() calls
+		 * filemap_read(), which may fail to fault in pages for the
+		 * read buffer and, since the number of bytes previously read
+		 * is > 0, returns a short read rather than -EFAULT.
+		 */
+ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
goto unlock_err;
}
@@ -7658,12 +7690,30 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (write) {
ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
- start, len);
+ start, len, flags);
if (ret < 0)
goto unlock_err;
unlock_extents = true;
/* Recalc len in case the new em is smaller than requested */
len = min(len, em->len - (start - em->start));
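+		/*
+		 * Release any data space reserved up front that this write
+		 * won't consume: all of it after a NOCOW write, or just the
+		 * unused tail.
+		 */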
+ if (dio_data->data_space_reserved) {
+ u64 release_offset;
+ u64 release_len = 0;
+
+ if (dio_data->nocow_done) {
+ release_offset = start;
+ release_len = data_alloc_len;
+ } else if (len < data_alloc_len) {
+ release_offset = start + len;
+ release_len = data_alloc_len - len;
+ }
+
+ if (release_len > 0)
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ release_offset,
+ release_len);
+ }
} else {
/*
* We need to unlock only the end area that we aren't using.
@@ -7675,8 +7725,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
}
if (unlock_extents)
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
else
free_extent_state(cached_state);
@@ -7705,10 +7755,15 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
return 0;
unlock_err:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
err:
- kfree(dio_data);
+ if (dio_data->data_space_reserved) {
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ start, data_alloc_len);
+ extent_changeset_free(dio_data->data_reserved);
+ }
return ret;
}
@@ -7716,35 +7771,33 @@ err:
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
ssize_t written, unsigned int flags, struct iomap *iomap)
{
- int ret = 0;
- struct btrfs_dio_data *dio_data = iomap->private;
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+ struct btrfs_dio_data *dio_data = iter->private;
size_t submitted = dio_data->submitted;
const bool write = !!(flags & IOMAP_WRITE);
+ int ret = 0;
if (!write && (iomap->type == IOMAP_HOLE)) {
/* If reading from a hole, unlock and return */
- unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
- goto out;
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
+ NULL);
+ return 0;
}
if (submitted < length) {
pos += submitted;
length -= submitted;
if (write)
- __endio_write_update_ordered(BTRFS_I(inode), pos,
- length, false);
+ btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
+ pos, length, false);
else
unlock_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1);
+ pos + length - 1, NULL);
ret = -ENOTBLK;
}
if (write)
extent_changeset_free(dio_data->data_reserved);
-out:
- kfree(dio_data);
- iomap->private = NULL;
-
return ret;
}
@@ -7757,40 +7810,31 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
if (!refcount_dec_and_test(&dip->refs))
return;
- if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
- __endio_write_update_ordered(BTRFS_I(dip->inode),
- dip->file_offset,
- dip->bytes,
- !dip->dio_bio->bi_status);
+ if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) {
+ btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL,
+ dip->file_offset, dip->bytes,
+ !dip->bio.bi_status);
} else {
unlock_extent(&BTRFS_I(dip->inode)->io_tree,
dip->file_offset,
- dip->file_offset + dip->bytes - 1);
+ dip->file_offset + dip->bytes - 1, NULL);
}
- bio_endio(dip->dio_bio);
- kfree(dip);
+ kfree(dip->csums);
+ bio_endio(&dip->bio);
}
-static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+ int mirror_num,
+ enum btrfs_compression_type compress_type)
{
- struct btrfs_dio_private *dip = bio->bi_private;
+ struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- blk_status_t ret;
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret)
- return ret;
-
refcount_inc(&dip->refs);
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- if (ret)
- refcount_dec(&dip->refs);
- return ret;
+ btrfs_submit_bio(fs_info, bio, mirror_num);
}
static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
@@ -7799,60 +7843,32 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
{
struct inode *inode = dip->inode;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- const u32 sectorsize = fs_info->sectorsize;
- struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
- struct bio_vec bvec;
- struct bvec_iter iter;
- const u64 orig_file_offset = dip->file_offset;
- u64 start = orig_file_offset;
- u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ u32 offset;
- __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
- unsigned int i, nr_sectors, pgoff;
+ btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
+ u64 start = bbio->file_offset + offset;
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
- pgoff = bvec.bv_offset;
- for (i = 0; i < nr_sectors; i++) {
- ASSERT(pgoff < PAGE_SIZE);
- if (uptodate &&
- (!csum || !check_data_csum(inode, bbio,
- bio_offset, bvec.bv_page,
- pgoff, start))) {
- clean_io_failure(fs_info, failure_tree, io_tree,
- start, bvec.bv_page,
- btrfs_ino(BTRFS_I(inode)),
- pgoff);
- } else {
- int ret;
-
- ASSERT((start - orig_file_offset) < UINT_MAX);
- ret = btrfs_repair_one_sector(inode,
- &bbio->bio,
- start - orig_file_offset,
- bvec.bv_page, pgoff,
- start, bbio->mirror_num,
- submit_dio_repair_bio);
- if (ret)
- err = errno_to_blk_status(ret);
- }
- start += sectorsize;
- ASSERT(bio_offset + sectorsize > bio_offset);
- bio_offset += sectorsize;
- pgoff += sectorsize;
+ if (uptodate &&
+ (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page,
+ bv.bv_offset))) {
+ btrfs_clean_io_failure(BTRFS_I(inode), start,
+ bv.bv_page, bv.bv_offset);
+ } else {
+ int ret;
+
+ ret = btrfs_repair_one_sector(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset,
+ submit_dio_repair_bio);
+ if (ret)
+ err = errno_to_blk_status(ret);
}
}
- return err;
-}
-static void __endio_write_update_ordered(struct btrfs_inode *inode,
- const u64 offset, const u64 bytes,
- const bool uptodate)
-{
- btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes,
- finish_ordered_fn, uptodate);
+ return err;
}
static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
@@ -7862,9 +7878,10 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false);
}
-static void btrfs_end_dio_bio(struct bio *bio)
+static void btrfs_end_dio_bio(struct btrfs_bio *bbio)
{
- struct btrfs_dio_private *dip = bio->bi_private;
+ struct btrfs_dio_private *dip = bbio->private;
+ struct bio *bio = &bbio->bio;
blk_status_t err = bio->bi_status;
if (err)
@@ -7875,108 +7892,65 @@ static void btrfs_end_dio_bio(struct bio *bio)
bio->bi_iter.bi_size, err);
if (bio_op(bio) == REQ_OP_READ)
- err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err);
+ err = btrfs_check_read_dio_bio(dip, bbio, !err);
if (err)
- dip->dio_bio->bi_status = err;
+ dip->bio.bi_status = err;
- btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio);
+ btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio);
bio_put(bio);
btrfs_dio_private_put(dip);
}
-static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
- struct inode *inode, u64 file_offset, int async_submit)
+static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
+ u64 file_offset, int async_submit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_dio_private *dip = bio->bi_private;
- bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
+ struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
blk_status_t ret;
- /* Check btrfs_submit_bio_hook() for rules about async submit. */
- if (async_submit)
- async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
- if (!write) {
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret)
- goto err;
- }
+ /* Save the original iter for read repair */
+ if (btrfs_op(bio) == BTRFS_MAP_READ)
+ btrfs_bio(bio)->iter = bio->bi_iter;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
goto map;
- if (write && async_submit) {
- ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
- btrfs_submit_bio_start_direct_io);
- goto err;
- } else if (write) {
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
+ /* Check btrfs_submit_data_write_bio() for async submit rules */
+ if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) &&
+ btrfs_wq_submit_bio(inode, bio, 0, file_offset,
+ btrfs_submit_bio_start_direct_io))
+ return;
+
/*
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
- if (ret)
- goto err;
+ if (ret) {
+ btrfs_bio_end_io(btrfs_bio(bio), ret);
+ return;
+ }
} else {
- u64 csum_offset;
-
- csum_offset = file_offset - dip->file_offset;
- csum_offset >>= fs_info->sectorsize_bits;
- csum_offset *= fs_info->csum_size;
- btrfs_bio(bio)->csum = dip->csums + csum_offset;
+ btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums,
+ file_offset - dip->file_offset);
}
map:
- ret = btrfs_map_bio(fs_info, bio, 0);
-err:
- return ret;
-}
-
-/*
- * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked
- * or ordered extents whether or not we submit any bios.
- */
-static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
- struct inode *inode,
- loff_t file_offset)
-{
- const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
- const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
- size_t dip_size;
- struct btrfs_dio_private *dip;
-
- dip_size = sizeof(*dip);
- if (!write && csum) {
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- size_t nblocks;
-
- nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits;
- dip_size += fs_info->csum_size * nblocks;
- }
-
- dip = kzalloc(dip_size, GFP_NOFS);
- if (!dip)
- return NULL;
-
- dip->inode = inode;
- dip->file_offset = file_offset;
- dip->bytes = dio_bio->bi_iter.bi_size;
- dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
- dip->dio_bio = dio_bio;
- refcount_set(&dip->refs, 1);
- return dip;
+ btrfs_submit_bio(fs_info, bio, 0);
}
static void btrfs_submit_direct(const struct iomap_iter *iter,
struct bio *dio_bio, loff_t file_offset)
{
+ struct btrfs_dio_private *dip =
+ container_of(dio_bio, struct btrfs_dio_private, bio);
struct inode *inode = iter->inode;
const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
BTRFS_BLOCK_GROUP_RAID56_MASK);
- struct btrfs_dio_private *dip;
struct bio *bio;
u64 start_sector;
int async_submit = 0;
@@ -7987,27 +7961,28 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
- struct btrfs_dio_data *dio_data = iter->iomap.private;
+ struct btrfs_dio_data *dio_data = iter->private;
struct extent_map *em = NULL;
- dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
- if (!dip) {
- if (!write) {
- unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
- file_offset + dio_bio->bi_iter.bi_size - 1);
- }
- dio_bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(dio_bio);
- return;
- }
+ dip->inode = inode;
+ dip->file_offset = file_offset;
+ dip->bytes = dio_bio->bi_iter.bi_size;
+ refcount_set(&dip->refs, 1);
+ dip->csums = NULL;
+
+ if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ unsigned int nr_sectors =
+ (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
- if (!write) {
/*
* Load the csums up front to reduce csum tree searches and
* contention when submitting bios.
- *
- * If we have csums disabled this will do nothing.
*/
+ status = BLK_STS_RESOURCE;
+ dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS);
+		if (!dip->csums)
+ goto out_err;
+
status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
if (status != BLK_STS_OK)
goto out_err;
@@ -8038,9 +8013,9 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
	 * This will never fail as it's passing GFP_NOFS and
* the allocation is backed by btrfs_bioset.
*/
- bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
- bio->bi_private = dip;
- bio->bi_end_io = btrfs_end_dio_bio;
+ bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len,
+ btrfs_end_dio_bio, dip);
+ btrfs_bio(bio)->file_offset = file_offset;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
status = extract_ordered_extent(BTRFS_I(inode), bio,
@@ -8075,14 +8050,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
async_submit = 1;
}
- status = btrfs_submit_dio_bio(bio, inode, file_offset,
- async_submit);
- if (status) {
- bio_put(bio);
- if (submit_len > 0)
- refcount_dec(&dip->refs);
- goto out_err_em;
- }
+ btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
dio_data->submitted += clone_len;
clone_offset += clone_len;
@@ -8096,19 +8064,29 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
out_err_em:
free_extent_map(em);
out_err:
- dip->dio_bio->bi_status = status;
+ dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
}
-const struct iomap_ops btrfs_dio_iomap_ops = {
+static const struct iomap_ops btrfs_dio_iomap_ops = {
.iomap_begin = btrfs_dio_iomap_begin,
.iomap_end = btrfs_dio_iomap_end,
};
-const struct iomap_dio_ops btrfs_dio_ops = {
+static const struct iomap_dio_ops btrfs_dio_ops = {
.submit_io = btrfs_submit_direct,
+ .bio_set = &btrfs_dio_bioset,
};
+ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
+{
+ struct btrfs_dio_data data;
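+	/* Per-call DIO state on the stack; iomap passes it back via iter->private. */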
+
+ return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC,
+ &data, done_before);
+}
+
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
@@ -8118,53 +8096,26 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
- return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
-}
-
-int btrfs_readpage(struct file *file, struct page *page)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- struct btrfs_bio_ctrl bio_ctrl = { 0 };
- int ret;
-
- btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
-
- ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
- if (bio_ctrl.bio) {
- int ret2;
-
- ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
- if (ret == 0)
- ret = ret2;
- }
- return ret;
-}
-
-static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct inode *inode = page->mapping->host;
- int ret;
-
- if (current->flags & PF_MEMALLOC) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
- }
-
/*
- * If we are under memory pressure we will call this directly from the
- * VM, we need to make sure we have the inode referenced for the ordered
- * extent. If not just return like we didn't do anything.
- */
- if (!igrab(inode)) {
- redirty_page_for_writepage(wbc, page);
- return AOP_WRITEPAGE_ACTIVATE;
+ * fiemap_prep() called filemap_write_and_wait() for the whole possible
+ * file range (0 to LLONG_MAX), but that is not enough if we have
+	 * compression enabled. The first filemap_fdatawrite_range() only kicks
+	 * off compression of the data (in an async thread) and returns before
+	 * the compression is done and writeback has started. A second
+ * filemap_fdatawrite_range() is needed to wait for the compression to
+ * complete and writeback to start. We also need to wait for ordered
+ * extents to complete, because our fiemap implementation uses mainly
+ * file extent items to list the extents, searching for extent maps
+ * only for file ranges with holes or prealloc extents to figure out
+ * if we have delalloc in those ranges.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
+ if (ret)
+ return ret;
}
- ret = extent_write_full_page(page, wbc);
- btrfs_add_delayed_iput(inode);
- return ret;
+
+ return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
static int btrfs_writepages(struct address_space *mapping,
@@ -8179,7 +8130,7 @@ static void btrfs_readahead(struct readahead_control *rac)
}
/*
- * For releasepage() and invalidate_folio() we have a race window where
+ * For release_folio() and invalidate_folio() we have a race window where
* folio_end_writeback() is called but the subpage spinlock is not yet released.
* If we continue to release/invalidate the page, we could cause use-after-free
* for subpage spinlock. So this function is to spin and wait for subpage
@@ -8190,7 +8141,7 @@ static void wait_subpage_spinlock(struct page *page)
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
struct btrfs_subpage *subpage;
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->private);
@@ -8211,49 +8162,43 @@ static void wait_subpage_spinlock(struct page *page)
spin_unlock_irq(&subpage->lock);
}
-static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- int ret = try_release_extent_mapping(page, gfp_flags);
+ int ret = try_release_extent_mapping(&folio->page, gfp_flags);
if (ret == 1) {
- wait_subpage_spinlock(page);
- clear_page_extent_mapped(page);
+ wait_subpage_spinlock(&folio->page);
+ clear_page_extent_mapped(&folio->page);
}
return ret;
}
-static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- if (PageWriteback(page) || PageDirty(page))
- return 0;
- return __btrfs_releasepage(page, gfp_flags);
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return false;
+ return __btrfs_release_folio(folio, gfp_flags);
}
#ifdef CONFIG_MIGRATION
-static int btrfs_migratepage(struct address_space *mapping,
- struct page *newpage, struct page *page,
+static int btrfs_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
- int ret;
+ int ret = filemap_migrate_folio(mapping, dst, src, mode);
- ret = migrate_page_move_mapping(mapping, newpage, page, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
- if (page_has_private(page))
- attach_page_private(newpage, detach_page_private(page));
-
- if (PageOrdered(page)) {
- ClearPageOrdered(page);
- SetPageOrdered(newpage);
+ if (folio_test_ordered(src)) {
+ folio_clear_ordered(src);
+ folio_set_ordered(dst);
}
- if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
- else
- migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
+#else
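+/* With CONFIG_MIGRATION disabled, leave .migrate_folio unset in the aops. */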
+#define btrfs_migrate_folio NULL
#endif
static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
@@ -8297,19 +8242,19 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* still safe to wait for ordered extent to finish.
*/
if (!(offset == 0 && length == folio_size(folio))) {
- btrfs_releasepage(&folio->page, GFP_NOFS);
+ btrfs_release_folio(folio, GFP_NOFS);
return;
}
if (!inode_evicting)
- lock_extent_bits(tree, page_start, page_end, &cached_state);
+ lock_extent(tree, page_start, page_end, &cached_state);
cur = page_start;
while (cur < page_end) {
struct btrfs_ordered_extent *ordered;
- bool delete_states;
u64 range_end;
u32 range_len;
+ u32 extra_flags = 0;
ordered = btrfs_lookup_first_ordered_range(inode, cur,
page_end + 1 - cur);
@@ -8319,7 +8264,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* No ordered extent covering this range, we are safe
* to delete all extent states in the range.
*/
- delete_states = true;
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
goto next;
}
if (ordered->file_offset > cur) {
@@ -8330,7 +8275,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* the ordered extent in the next iteration.
*/
range_end = ordered->file_offset - 1;
- delete_states = true;
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
goto next;
}
@@ -8345,7 +8290,6 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* We can't delete the extent states as
* btrfs_finish_ordered_io() may still use some of them.
*/
- delete_states = false;
goto next;
}
btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
@@ -8362,7 +8306,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
clear_extent_bit(tree, cur, range_end,
EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 0, &cached_state);
+ EXTENT_DEFRAG, &cached_state);
spin_lock_irq(&inode->ordered_tree.lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
@@ -8370,6 +8314,12 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
cur - ordered->file_offset);
spin_unlock_irq(&inode->ordered_tree.lock);
+ /*
+	 * If the ordered extent has finished, we're safe to delete all
+	 * the extent states of the range; otherwise endio for other
+	 * pages may still run btrfs_finish_ordered_io(), which uses
+	 * those extent states, so we can't delete them yet.
+ */
if (btrfs_dec_test_ordered_pending(inode, &ordered,
cur, range_end + 1 - cur)) {
btrfs_finish_ordered_io(ordered);
@@ -8377,14 +8327,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* The ordered extent has finished, now we're again
* safe to delete all extent states of the range.
*/
- delete_states = true;
- } else {
- /*
- * btrfs_finish_ordered_io() will get executed by endio
- * of other pages, thus we can't delete extent states
- * anymore
- */
- delete_states = false;
+ extra_flags = EXTENT_CLEAR_ALL_BITS;
}
next:
if (ordered)
@@ -8408,8 +8351,8 @@ next:
if (!inode_evicting) {
clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_UPTODATE |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
- delete_states, &cached_state);
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
+ extra_flags, &cached_state);
}
cur = range_end + 1;
}
@@ -8421,7 +8364,7 @@ next:
ASSERT(!folio_test_ordered(folio));
btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
- __btrfs_releasepage(&folio->page, GFP_NOFS);
+ __btrfs_release_folio(folio, GFP_NOFS);
clear_page_extent_mapped(&folio->page);
}
@@ -8470,7 +8413,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
* Reserving delalloc space after obtaining the page lock can lead to
* deadlock. For example, if a dirty page is locked by this function
* and the call to btrfs_delalloc_reserve_space() ends up triggering
- * dirty page write out, then the btrfs_writepage() function could
+ * dirty page write out, then the btrfs_writepages() function could
* end up waiting indefinitely to get a lock on the page currently
* being processed by btrfs_page_mkwrite() function.
*/
@@ -8500,11 +8443,11 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, &cached_state);
+ lock_extent(io_tree, page_start, page_end, &cached_state);
ret2 = set_page_extent_mapped(page);
if (ret2 < 0) {
ret = vmf_error(ret2);
- unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
goto out_unlock;
}
@@ -8515,8 +8458,7 @@ again:
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
PAGE_SIZE);
if (ordered) {
- unlock_extent_cached(io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
unlock_page(page);
up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_start_ordered_extent(ordered, 1);
@@ -8544,13 +8486,12 @@ again:
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 0, 0, &cached_state);
+ EXTENT_DEFRAG, &cached_state);
ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
&cached_state);
if (ret2) {
- unlock_extent_cached(io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
ret = VM_FAULT_SIGBUS;
goto out_unlock;
}
@@ -8561,17 +8502,16 @@ again:
else
zero_start = PAGE_SIZE;
- if (zero_start != PAGE_SIZE) {
+ if (zero_start != PAGE_SIZE)
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
- flush_dcache_page(page);
- }
+
btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
- unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+ unlock_extent(io_tree, page_start, page_end, &cached_state);
up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
@@ -8647,7 +8587,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
if (!rsv)
return -ENOMEM;
rsv->size = min_size;
- rsv->failfast = 1;
+ rsv->failfast = true;
/*
* 1 for the truncate slack space
@@ -8672,24 +8612,24 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
control.new_size = new_size;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ lock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
&cached_state);
/*
* We want to drop from the next block forward in case this new
* size is not block aligned since we will be keeping the last
* block of the extent just the way it is.
*/
- btrfs_drop_extent_cache(BTRFS_I(inode),
- ALIGN(new_size, fs_info->sectorsize),
- (u64)-1, 0);
+ btrfs_drop_extent_map_range(BTRFS_I(inode),
+ ALIGN(new_size, fs_info->sectorsize),
+ (u64)-1, false);
ret = btrfs_truncate_inode_items(trans, root, &control);
inode_sub_bytes(inode, control.sub_bytes);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
- (u64)-1, &cached_state);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ &cached_state);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
@@ -8772,46 +8712,23 @@ out:
return ret;
}
-/*
- * create a new subvolume directory/inode (helper for the ioctl).
- */
-int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- struct user_namespace *mnt_userns)
+struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
+ struct inode *dir)
{
struct inode *inode;
- int err;
- u64 index = 0;
- u64 ino;
-
- err = btrfs_get_free_objectid(new_root, &ino);
- if (err < 0)
- return err;
-
- inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
- ino, ino,
- S_IFDIR | (~current_umask() & S_IRWXUGO),
- &index);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- inode->i_op = &btrfs_dir_inode_operations;
- inode->i_fop = &btrfs_dir_file_operations;
-
- set_nlink(inode, 1);
- btrfs_i_size_write(BTRFS_I(inode), 0);
- unlock_new_inode(inode);
-
- err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
- if (err)
- btrfs_err(new_root->fs_info,
- "error inheriting subvolume %llu properties: %d",
- new_root->root_key.objectid, err);
- err = btrfs_update_inode(trans, new_root, BTRFS_I(inode));
-
- iput(inode);
- return err;
+ inode = new_inode(dir->i_sb);
+ if (inode) {
+ /*
+ * Subvolumes don't inherit the sgid bit or the parent's gid if
+ * the parent's sgid bit is set. This is probably a bug.
+ */
+ inode_init_owner(mnt_userns, inode, NULL,
+ S_IFDIR | (~current_umask() & S_IRWXUGO));
+ inode->i_op = &btrfs_dir_inode_operations;
+ inode->i_fop = &btrfs_dir_file_operations;
+ }
+ return inode;
}
struct inode *btrfs_alloc_inode(struct super_block *sb)
@@ -8843,6 +8760,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
+ spin_lock_init(&ei->io_failure_lock);
ei->outstanding_extents = 0;
if (sb->s_magic != BTRFS_TEST_MAGIC)
btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
@@ -8859,12 +8777,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
- extent_io_tree_init(fs_info, &ei->io_failure_tree,
- IO_TREE_INODE_IO_FAILURE, inode);
extent_io_tree_init(fs_info, &ei->file_extent_tree,
- IO_TREE_INODE_FILE_EXTENT, inode);
- ei->io_tree.track_uptodate = true;
- ei->io_failure_tree.track_uptodate = true;
+ IO_TREE_INODE_FILE_EXTENT, NULL);
+ ei->io_failure_tree = RB_ROOT;
atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -8879,7 +8794,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode)
{
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif
@@ -8894,6 +8809,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
struct btrfs_ordered_extent *ordered;
struct btrfs_inode *inode = BTRFS_I(vfs_inode);
struct btrfs_root *root = inode->root;
+ bool freespace_inode;
WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
WARN_ON(vfs_inode->i_data.nrpages);
@@ -8915,6 +8831,12 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
if (!root)
return;
+ /*
+ * If this is a free space inode, do not take the ordered extents lockdep
+ * map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(inode);
+
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
if (!ordered)
@@ -8923,6 +8845,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
btrfs_err(root->fs_info,
"found ordered extent %llu %llu on inode cleanup",
ordered->file_offset, ordered->num_bytes);
+
+ if (!freespace_inode)
+ btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
+
btrfs_remove_ordered_extent(inode, ordered);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
@@ -8930,7 +8856,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
}
btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
- btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
btrfs_put_root(inode->root);
}
@@ -8951,7 +8877,7 @@ int btrfs_drop_inode(struct inode *inode)
static void init_once(void *foo)
{
- struct btrfs_inode *ei = (struct btrfs_inode *) foo;
+ struct btrfs_inode *ei = foo;
inode_init_once(&ei->vfs_inode);
}
@@ -8963,6 +8889,7 @@ void __cold btrfs_destroy_cachep(void)
* destroy cache.
*/
rcu_barrier();
+ bioset_exit(&btrfs_dio_bioset);
kmem_cache_destroy(btrfs_inode_cachep);
kmem_cache_destroy(btrfs_trans_handle_cachep);
kmem_cache_destroy(btrfs_path_cachep);
@@ -9003,6 +8930,11 @@ int __init btrfs_init_cachep(void)
if (!btrfs_free_space_bitmap_cachep)
goto fail;
+ if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_dio_private, bio),
+ BIOSET_NEED_BVECS))
+ goto fail;
+
return 0;
fail:
btrfs_destroy_cachep();
@@ -9058,6 +8990,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
struct btrfs_trans_handle *trans;
+ unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = new_dentry->d_inode;
@@ -9089,14 +9022,37 @@ static int btrfs_rename_exchange(struct inode *old_dir,
down_read(&fs_info->subvol_sem);
/*
- * We want to reserve the absolute worst case amount of items. So if
- * both inodes are subvols and we need to unlink them then that would
- * require 4 item modifications, but if they are both normal inodes it
- * would require 5 item modifications, so we'll assume their normal
- * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items
- * should cover the worst case number of items we'll modify.
+ * For each inode:
+ * 1 to remove old dir item
+ * 1 to remove old dir index
+ * 1 to add new dir item
+ * 1 to add new dir index
+ * 1 to update parent inode
+ *
+ * If the parents are the same, we only need to account for one parent inode update.
*/
- trans = btrfs_start_transaction(root, 12);
+ trans_num_items = (old_dir == new_dir ? 9 : 10);
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * 1 to remove old root ref
+ * 1 to remove old root backref
+ * 1 to add new root ref
+ * 1 to add new root backref
+ */
+ trans_num_items += 4;
+ } else {
+ /*
+ * 1 to update inode item
+ * 1 to remove old inode ref
+ * 1 to add new inode ref
+ */
+ trans_num_items += 3;
+ }
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+ trans_num_items += 4;
+ else
+ trans_num_items += 3;
+ trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_notrans;
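
A minimal sketch of the reservation math above, using a hypothetical helper
name (rename_exchange_items() is illustrative and not part of the patch):

static unsigned int rename_exchange_items(u64 old_ino, u64 new_ino,
					  bool same_parent)
{
	/* Per inode: remove/add dir item, remove/add dir index, plus one
	 * parent inode update; a shared parent saves one update. */
	unsigned int items = same_parent ? 9 : 10;

	/* A subvolume costs 4 items (old/new root ref and backref); a
	 * regular inode costs 3 (inode item, old and new inode refs). */
	items += (old_ino == BTRFS_FIRST_FREE_OBJECTID) ? 4 : 3;
	items += (new_ino == BTRFS_FIRST_FREE_OBJECTID) ? 4 : 3;
	return items;
}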
@@ -9161,8 +9117,10 @@ static int btrfs_rename_exchange(struct inode *old_dir,
inode_inc_iversion(new_dir);
inode_inc_iversion(old_inode);
inode_inc_iversion(new_inode);
- old_dir->i_ctime = old_dir->i_mtime = ctime;
- new_dir->i_ctime = new_dir->i_mtime = ctime;
+ old_dir->i_mtime = ctime;
+ old_dir->i_ctime = ctime;
+ new_dir->i_mtime = ctime;
+ new_dir->i_ctime = ctime;
old_inode->i_ctime = ctime;
new_inode->i_ctime = ctime;
@@ -9263,56 +9221,19 @@ out_notrans:
return ret;
}
-static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct user_namespace *mnt_userns,
- struct inode *dir,
- struct dentry *dentry)
+static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns,
+ struct inode *dir)
{
- int ret;
struct inode *inode;
- u64 objectid;
- u64 index;
-
- ret = btrfs_get_free_objectid(root, &objectid);
- if (ret)
- return ret;
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name,
- dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)),
- objectid,
- S_IFCHR | WHITEOUT_MODE,
- &index);
-
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- return ret;
+ inode = new_inode(dir->i_sb);
+ if (inode) {
+ inode_init_owner(mnt_userns, inode, dir,
+ S_IFCHR | WHITEOUT_MODE);
+ inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
}
-
- inode->i_op = &btrfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode,
- WHITEOUT_DEV);
-
- ret = btrfs_init_inode_security(trans, inode, dir,
- &dentry->d_name);
- if (ret)
- goto out;
-
- ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
- BTRFS_I(inode), 0, index);
- if (ret)
- goto out;
-
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
-out:
- unlock_new_inode(inode);
- if (ret)
- inode_dec_link_count(inode);
- iput(inode);
-
- return ret;
+ return inode;
}
static int btrfs_rename(struct user_namespace *mnt_userns,
@@ -9321,6 +9242,10 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
unsigned int flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
+ struct btrfs_new_inode_args whiteout_args = {
+ .dir = old_dir,
+ .dentry = old_dentry,
+ };
struct btrfs_trans_handle *trans;
unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
@@ -9375,23 +9300,56 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
filemap_flush(old_inode->i_mapping);
- /* close the racy window with snapshot create/destroy ioctl */
- if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (flags & RENAME_WHITEOUT) {
+ whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir);
+ if (!whiteout_args.inode)
+ return -ENOMEM;
+ ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
+ if (ret)
+ goto out_whiteout_inode;
+ } else {
+ /* 1 to update the old parent inode. */
+ trans_num_items = 1;
+ }
+
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ /* Close the race window with snapshot create/destroy ioctl */
down_read(&fs_info->subvol_sem);
+ /*
+ * 1 to remove old root ref
+ * 1 to remove old root backref
+ * 1 to add new root ref
+ * 1 to add new root backref
+ */
+ trans_num_items += 4;
+ } else {
+ /*
+ * 1 to update inode
+ * 1 to remove old inode ref
+ * 1 to add new inode ref
+ */
+ trans_num_items += 3;
+ }
/*
- * We want to reserve the absolute worst case amount of items. So if
- * both inodes are subvols and we need to unlink them then that would
- * require 4 item modifications, but if they are both normal inodes it
- * would require 5 item modifications, so we'll assume they are normal
- * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
- * should cover the worst case number of items we'll modify.
- * If our rename has the whiteout flag, we need more 5 units for the
- * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
- * when selinux is enabled).
- */
- trans_num_items = 11;
- if (flags & RENAME_WHITEOUT)
+ * 1 to remove old dir item
+ * 1 to remove old dir index
+ * 1 to add new dir item
+ * 1 to add new dir index
+ */
+ trans_num_items += 4;
+ /* 1 to update new parent inode if it's not the same as the old parent */
+ if (new_dir != old_dir)
+ trans_num_items++;
+ if (new_inode) {
+ /*
+ * 1 to update inode
+ * 1 to remove inode ref
+ * 1 to remove dir item
+ * 1 to remove dir index
+ * 1 to possibly add orphan item
+ */
trans_num_items += 5;
+ }
trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -9425,9 +9383,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
inode_inc_iversion(old_inode);
- old_dir->i_ctime = old_dir->i_mtime =
- new_dir->i_ctime = new_dir->i_mtime =
- old_inode->i_ctime = current_time(old_dir);
+ old_dir->i_mtime = current_time(old_dir);
+ old_dir->i_ctime = old_dir->i_mtime;
+ new_dir->i_mtime = old_dir->i_mtime;
+ new_dir->i_ctime = old_dir->i_mtime;
+ old_inode->i_ctime = old_dir->i_mtime;
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
@@ -9487,12 +9447,14 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
rename_ctx.index, new_dentry->d_parent);
if (flags & RENAME_WHITEOUT) {
- ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
- old_dir, old_dentry);
-
+ ret = btrfs_create_new_inode(trans, &whiteout_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
+ } else {
+ unlock_new_inode(whiteout_args.inode);
+ iput(whiteout_args.inode);
+ whiteout_args.inode = NULL;
}
}
out_fail:
@@ -9501,7 +9463,11 @@ out_fail:
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
-
+ if (flags & RENAME_WHITEOUT)
+ btrfs_new_inode_args_destroy(&whiteout_args);
+out_whiteout_inode:
+ if (flags & RENAME_WHITEOUT)
+ iput(whiteout_args.inode);
return ret;
}
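
A condensed sketch of the whiteout inode lifecycle in this function, assuming
iput(NULL) is a no-op and btrfs_new_inode_args_destroy() only releases state
set up by btrfs_new_inode_prepare():

/* Setup, before the transaction starts: */
whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir); /* NULL => -ENOMEM */
ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);

/* Inside the transaction: */
ret = btrfs_create_new_inode(trans, &whiteout_args);
if (!ret) {
	unlock_new_inode(whiteout_args.inode);
	iput(whiteout_args.inode);
	whiteout_args.inode = NULL;	/* make the final iput() a no-op */
}

/* Teardown, on success and failure alike: */
btrfs_new_inode_args_destroy(&whiteout_args);
iput(whiteout_args.inode);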
@@ -9509,15 +9475,21 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags)
{
+ int ret;
+
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
if (flags & RENAME_EXCHANGE)
- return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
- new_dentry);
+ ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+ new_dentry);
+ else
+ ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+ new_dentry, flags);
- return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
- new_dentry, flags);
+ btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
+
+ return ret;
}
struct btrfs_delalloc_work {
@@ -9720,10 +9692,13 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
struct btrfs_key key;
- struct inode *inode = NULL;
+ struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ };
+ unsigned int trans_num_items;
int err;
- u64 objectid;
- u64 index = 0;
int name_len;
int datasize;
unsigned long ptr;
@@ -9734,49 +9709,40 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
return -ENAMETOOLONG;
- /*
- * 2 items for inode item and ref
- * 2 items for dir items
- * 1 item for updating parent inode item
- * 1 item for the inline extent item
- * 1 item for xattr if selinux is on
- */
- trans = btrfs_start_transaction(root, 7);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO);
+ inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &btrfs_aops;
+ btrfs_i_size_write(BTRFS_I(inode), name_len);
+ inode_set_bytes(inode, name_len);
- err = btrfs_get_free_objectid(root, &objectid);
+ new_inode_args.inode = inode;
+ err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
if (err)
- goto out_unlock;
+ goto out_inode;
+ /* 1 additional item for the inline extent */
+ trans_num_items++;
- inode = btrfs_new_inode(trans, root, mnt_userns, dir,
- dentry->d_name.name, dentry->d_name.len,
- btrfs_ino(BTRFS_I(dir)), objectid,
- S_IFLNK | S_IRWXUGO, &index);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- inode = NULL;
- goto out_unlock;
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_new_inode_args;
}
- /*
- * If the active LSM wants to access the inode during
- * d_instantiate it needs these. Smack checks to see
- * if the filesystem supports xattrs by looking at the
- * ops vector.
- */
- inode->i_fop = &btrfs_file_operations;
- inode->i_op = &btrfs_file_inode_operations;
- inode->i_mapping->a_ops = &btrfs_aops;
-
- err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+ err = btrfs_create_new_inode(trans, &new_inode_args);
if (err)
- goto out_unlock;
+ goto out;
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
- goto out_unlock;
+ btrfs_abort_transaction(trans, err);
+ discard_new_inode(inode);
+ inode = NULL;
+ goto out;
}
key.objectid = btrfs_ino(BTRFS_I(inode));
key.offset = 0;
@@ -9785,8 +9751,11 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
err = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
if (err) {
+ btrfs_abort_transaction(trans, err);
btrfs_free_path(path);
- goto out_unlock;
+ discard_new_inode(inode);
+ inode = NULL;
+ goto out;
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -9804,31 +9773,16 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- inode->i_op = &btrfs_symlink_inode_operations;
- inode_nohighmem(inode);
- inode_set_bytes(inode, name_len);
- btrfs_i_size_write(BTRFS_I(inode), name_len);
- err = btrfs_update_inode(trans, root, BTRFS_I(inode));
- /*
- * Last step, add directory indexes for our symlink inode. This is the
- * last step to avoid extra cleanup of these indexes if an error happens
- * elsewhere above.
- */
- if (!err)
- err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
- BTRFS_I(inode), 0, index);
- if (err)
- goto out_unlock;
-
d_instantiate_new(dentry, inode);
-
-out_unlock:
+ err = 0;
+out:
btrfs_end_transaction(trans);
- if (err && inode) {
- inode_dec_link_count(inode);
- discard_new_inode(inode);
- }
btrfs_btree_balance_dirty(fs_info);
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (err)
+ iput(inode);
return err;
}
@@ -9877,6 +9831,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
extent_info.file_offset = file_offset;
extent_info.extent_buf = (char *)&stack_fi;
extent_info.is_new_extent = true;
+ extent_info.update_times = true;
extent_info.qgroup_reserved = qgroup_released;
extent_info.insertions = 0;
@@ -9914,7 +9869,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
@@ -9970,11 +9924,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
break;
}
- btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
- cur_offset + ins.offset -1, 0);
-
em = alloc_extent_map();
if (!em) {
+ btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
+ cur_offset + ins.offset - 1, false);
btrfs_set_inode_full_sync(BTRFS_I(inode));
goto next;
}
@@ -9989,16 +9942,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
em->generation = trans->transid;
- while (1) {
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 1);
- write_unlock(&em_tree->lock);
- if (ret != -EEXIST)
- break;
- btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
- cur_offset + ins.offset - 1,
- 0);
- }
+ ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
free_extent_map(em);
next:
num_bytes -= ins.offset;
@@ -10079,62 +10023,58 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct inode *inode = NULL;
- u64 objectid;
- u64 index;
- int ret = 0;
-
- /*
- * 5 units required for adding orphan entry
- */
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_get_free_objectid(root, &objectid);
- if (ret)
- goto out;
-
- inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
- btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- inode = NULL;
- goto out;
- }
+ struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ .orphan = true,
+ };
+ unsigned int trans_num_items;
+ int ret;
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return -ENOMEM;
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
-
inode->i_mapping->a_ops = &btrfs_aops;
- ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+ new_inode_args.inode = inode;
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
if (ret)
- goto out;
+ goto out_inode;
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
- if (ret)
- goto out;
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (ret)
- goto out;
+ trans = btrfs_start_transaction(root, trans_num_items);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_new_inode_args;
+ }
+
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
/*
- * We set number of links to 0 in btrfs_new_inode(), and here we set
- * it to 1 because d_tmpfile() will issue a warning if the count is 0,
- * through:
+ * We set number of links to 0 in btrfs_create_new_inode(), and here we
+ * set it to 1 because d_tmpfile() will issue a warning if the count is
+ * 0, through:
*
* d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
*/
set_nlink(inode, 1);
- d_tmpfile(dentry, inode);
- unlock_new_inode(inode);
- mark_inode_dirty(inode);
-out:
+
+ if (!ret) {
+ d_tmpfile(dentry, inode);
+ unlock_new_inode(inode);
+ mark_inode_dirty(inode);
+ }
+
btrfs_end_transaction(trans);
- if (ret && inode)
- discard_new_inode(inode);
btrfs_btree_balance_dirty(fs_info);
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ if (ret)
+ iput(inode);
return ret;
}
@@ -10158,9 +10098,8 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
}
}
-static int btrfs_encoded_io_compression_from_extent(
- struct btrfs_fs_info *fs_info,
- int compress_type)
+int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
+ int compress_type)
{
switch (compress_type) {
case BTRFS_COMPRESS_NONE:
@@ -10257,7 +10196,7 @@ static ssize_t btrfs_encoded_read_inline(
}
read_extent_buffer(leaf, tmp, ptr, count);
btrfs_release_path(path);
- unlock_extent_cached(io_tree, start, lockend, cached_state);
+ unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
@@ -10282,8 +10221,7 @@ struct btrfs_encoded_read_private {
static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
struct bio *bio, int mirror_num)
{
- struct btrfs_encoded_read_private *priv = bio->bi_private;
- struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
blk_status_t ret;
@@ -10293,31 +10231,20 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
return ret;
}
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret) {
- btrfs_bio_free_csum(bbio);
- return ret;
- }
-
atomic_inc(&priv->pending);
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- if (ret) {
- atomic_dec(&priv->pending);
- btrfs_bio_free_csum(bbio);
- }
- return ret;
+ btrfs_submit_bio(fs_info, bio, mirror_num);
+ return BLK_STS_OK;
}
static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
{
const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
- struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
+ struct btrfs_encoded_read_private *priv = bbio->private;
struct btrfs_inode *inode = priv->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u32 sectorsize = fs_info->sectorsize;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
- u64 start = priv->file_offset;
u32 bio_offset = 0;
if (priv->skip_csum || !uptodate)
@@ -10330,10 +10257,9 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
pgoff = bvec->bv_offset;
for (i = 0; i < nr_sectors; i++) {
ASSERT(pgoff < PAGE_SIZE);
- if (check_data_csum(&inode->vfs_inode, bbio, bio_offset,
- bvec->bv_page, pgoff, start))
+ if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset,
+ bvec->bv_page, pgoff))
return BLK_STS_IOERR;
- start += sectorsize;
bio_offset += sectorsize;
pgoff += sectorsize;
}
@@ -10341,10 +10267,9 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
return BLK_STS_OK;
}
-static void btrfs_encoded_read_endio(struct bio *bio)
+static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
{
- struct btrfs_encoded_read_private *priv = bio->bi_private;
- struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_encoded_read_private *priv = bbio->private;
blk_status_t status;
status = btrfs_encoded_read_verify_csum(bbio);
@@ -10362,14 +10287,12 @@ static void btrfs_encoded_read_endio(struct bio *bio)
if (!atomic_dec_return(&priv->pending))
wake_up(&priv->wait);
btrfs_bio_free_csum(bbio);
- bio_put(bio);
+ bio_put(&bbio->bio);
}
-static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
- u64 file_offset,
- u64 disk_bytenr,
- u64 disk_io_size,
- struct page **pages)
+int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset, u64 disk_bytenr,
+ u64 disk_io_size, struct page **pages)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_encoded_read_private priv = {
@@ -10411,12 +10334,11 @@ static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
size_t bytes = min_t(u64, remaining, PAGE_SIZE);
if (!bio) {
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ,
+ btrfs_encoded_read_endio,
+ &priv);
bio->bi_iter.bi_sector =
(disk_bytenr + cur) >> SECTOR_SHIFT;
- bio->bi_end_io = btrfs_encoded_read_endio;
- bio->bi_private = &priv;
- bio->bi_opf = REQ_OP_READ;
}
if (!bytes ||
@@ -10466,20 +10388,18 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
- for (i = 0; i < nr_pages; i++) {
- pages[i] = alloc_page(GFP_NOFS);
- if (!pages[i]) {
- ret = -ENOMEM;
- goto out;
+ ret = btrfs_alloc_page_array(nr_pages, pages);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
}
- }
ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
disk_io_size, pages);
if (ret)
goto out;
- unlock_extent_cached(io_tree, start, lockend, cached_state);
+ unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
@@ -10549,13 +10469,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
lockend - start + 1);
if (ret)
goto out_unlock_inode;
- lock_extent_bits(io_tree, start, lockend, &cached_state);
+ lock_extent(io_tree, start, lockend, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1);
if (!ordered)
break;
btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ unlock_extent(io_tree, start, lockend, &cached_state);
cond_resched();
}
@@ -10602,7 +10522,8 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ret = -ENOBUFS;
goto out_em;
}
- disk_io_size = count = em->block_len;
+ disk_io_size = em->block_len;
+ count = em->block_len;
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
ret = btrfs_encoded_io_compression_from_extent(fs_info,
@@ -10628,7 +10549,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
em = NULL;
if (disk_bytenr == EXTENT_MAP_HOLE) {
- unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
unlocked = true;
ret = iov_iter_zero(count, iter);
@@ -10649,7 +10570,7 @@ out_em:
free_extent_map(em);
out_unlock_extent:
if (!unlocked)
- unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ unlock_extent(io_tree, start, lockend, &cached_state);
out_unlock_inode:
if (!unlocked)
btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
@@ -10765,15 +10686,15 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = -ENOMEM;
goto out_pages;
}
- kaddr = kmap(pages[i]);
+ kaddr = kmap_local_page(pages[i]);
if (copy_from_iter(kaddr, bytes, from) != bytes) {
- kunmap(pages[i]);
+ kunmap_local(kaddr);
ret = -EFAULT;
goto out_pages;
}
if (bytes < PAGE_SIZE)
memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
- kunmap(pages[i]);
+ kunmap_local(kaddr);
}
for (;;) {
@@ -10787,14 +10708,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
end >> PAGE_SHIFT);
if (ret)
goto out_pages;
- lock_extent_bits(io_tree, start, end, &cached_state);
+ lock_extent(io_tree, start, end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
if (!ordered &&
!filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
break;
if (ordered)
btrfs_put_ordered_extent(ordered);
- unlock_extent_cached(io_tree, start, end, &cached_state);
+ unlock_extent(io_tree, start, end, &cached_state);
cond_resched();
}
@@ -10808,7 +10729,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
if (ret)
goto out_free_data_space;
- ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes);
+ ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
+ false);
if (ret)
goto out_qgroup_free_data;
@@ -10847,7 +10769,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
(1 << BTRFS_ORDERED_COMPRESSED),
compression);
if (ret) {
- btrfs_drop_extent_cache(inode, start, end, 0);
+ btrfs_drop_extent_map_range(inode, start, end, false);
goto out_free_reserved;
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -10855,7 +10777,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (start + encoded->len > inode->vfs_inode.i_size)
i_size_write(&inode->vfs_inode, start + encoded->len);
- unlock_extent_cached(io_tree, start, end, &cached_state);
+ unlock_extent(io_tree, start, end, &cached_state);
btrfs_delalloc_release_extents(inode, num_bytes);
@@ -10886,7 +10808,7 @@ out_free_data_space:
if (!extent_reserved)
btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
out_unlock:
- unlock_extent_cached(io_tree, start, end, &cached_state);
+ unlock_extent(io_tree, start, end, &cached_state);
out_pages:
for (i = 0; i < nr_pages; i++) {
if (pages[i])
@@ -11107,12 +11029,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
* set. We use this counter to prevent snapshots. We must increment it
* before walking the extents because we don't want a concurrent
* snapshot to run after we've already checked the extents.
+ *
+ * It is possible that the subvolume is marked for deletion but not yet
+ * removed. To prevent this race, we check the root status before
+ * activating the swapfile.
*/
+ spin_lock(&root->root_item_lock);
+ if (btrfs_root_dead(root)) {
+ spin_unlock(&root->root_item_lock);
+
+ btrfs_exclop_finish(fs_info);
+ btrfs_warn(fs_info,
+ "cannot activate swapfile because subvolume %llu is being deleted",
+ root->root_key.objectid);
+ return -EPERM;
+ }
atomic_inc(&root->nr_swapfiles);
+ spin_unlock(&root->root_item_lock);
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
- lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
+ lock_extent(io_tree, 0, isize - 1, &cached_state);
start = 0;
while (start < isize) {
u64 logical_block_start, physical_block_start;
@@ -11153,7 +11090,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
free_extent_map(em);
em = NULL;
- ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
+ ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -11249,7 +11186,7 @@ out:
if (!IS_ERR_OR_NULL(em))
free_extent_map(em);
- unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
+ unlock_extent(io_tree, 0, isize - 1, &cached_state);
if (ret)
btrfs_swap_deactivate(file);
@@ -11302,6 +11239,41 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode,
spin_unlock(&inode->lock);
}
+/**
+ * Verify that there are no ordered extents for a given file range.
+ *
+ * @inode: The target inode.
+ * @start: Start offset of the file range, should be sector size aligned.
+ * @end: End offset (inclusive) of the file range, its value +1 should be
+ * sector size aligned.
+ *
+ * This should typically be used for cases where we have locked the inode's VFS
+ * lock in exclusive mode, locked the inode's i_mmap_lock in exclusive mode,
+ * flushed all delalloc in the range, waited for all ordered extents in the
+ * range to complete, and finally locked the file range in the inode's io_tree.
+ */
+void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_ordered_extent *ordered;
+
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ return;
+
+ ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
+ if (ordered) {
+ btrfs_err(root->fs_info,
+"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
+ start, end, btrfs_ino(inode), root->root_key.objectid,
+ ordered->file_offset,
+ ordered->file_offset + ordered->num_bytes - 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ ASSERT(ordered == NULL);
+}
+
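
A sketch of the expected call pattern, assuming the caller has already done
the flushing described above and that BTRFS_ILOCK_MMAP takes the i_mmap_lock
along with the VFS lock:

/* Illustrative only; the preconditions are the caller's responsibility. */
btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_MMAP);
/* ... flush delalloc and wait for ordered extents in [start, end] ... */
lock_extent(&inode->io_tree, start, end, &cached_state);
btrfs_assert_inode_range_clean(inode, start, end);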
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
@@ -11350,16 +11322,13 @@ static const struct file_operations btrfs_dir_file_operations = {
* For now we're avoiding this by dropping bmap.
*/
static const struct address_space_operations btrfs_aops = {
- .readpage = btrfs_readpage,
- .writepage = btrfs_writepage,
+ .read_folio = btrfs_read_folio,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
.direct_IO = noop_direct_IO,
.invalidate_folio = btrfs_invalidate_folio,
- .releasepage = btrfs_releasepage,
-#ifdef CONFIG_MIGRATION
- .migratepage = btrfs_migratepage,
-#endif
+ .release_folio = btrfs_release_folio,
+ .migrate_folio = btrfs_migrate_folio,
.dirty_folio = filemap_dirty_folio,
.error_remove_page = generic_error_remove_page,
.swap_activate = btrfs_swap_activate,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 238cee5b5254..d5dd8bed1488 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -468,7 +468,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_device *device;
- struct request_queue *q;
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
@@ -498,14 +497,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
dev_list) {
- if (!device->bdev)
+ if (!device->bdev || !bdev_max_discard_sectors(device->bdev))
continue;
- q = bdev_get_queue(device->bdev);
- if (blk_queue_discard(q)) {
- num_devices++;
- minlen = min_t(u64, q->limits.discard_granularity,
- minlen);
- }
+ num_devices++;
+ minlen = min_t(u64, bdev_discard_granularity(device->bdev),
+ minlen);
}
rcu_read_unlock();
@@ -544,9 +540,35 @@ int __pure btrfs_is_empty_uuid(u8 *uuid)
return 1;
}
+/*
+ * Calculate the number of transaction items to reserve for creating a subvolume
+ * or snapshot, not including the inode, directory entries, or parent directory.
+ */
+static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
+{
+ /*
+ * 1 to add root block
+ * 1 to add root item
+ * 1 to add root ref
+ * 1 to add root backref
+ * 1 to add UUID item
+ * 1 to add qgroup info
+ * 1 to add qgroup limit
+ *
+ * Ideally the last two would only be accounted if qgroups are enabled,
+ * but that can change between now and the time we would insert them.
+ */
+ unsigned int num_items = 7;
+
+ if (inherit) {
+ /* 2 to add qgroup relations for each inherited qgroup */
+ num_items += 2 * inherit->num_qgroups;
+ }
+ return num_items;
+}
+
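
A worked example of the accounting, assuming two inherited qgroups:
create_subvol_num_items() returns 7 + 2 * 2 = 11, and the snapshot path below
adds 3 more (dir item, dir index, parent inode update), reserving 14 items in
total.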
static noinline int create_subvol(struct user_namespace *mnt_userns,
struct inode *dir, struct dentry *dentry,
- const char *name, int namelen,
struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
@@ -559,11 +581,15 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
struct btrfs_root *new_root;
struct btrfs_block_rsv block_rsv;
struct timespec64 cur_time = current_time(dir);
- struct inode *inode;
+ struct btrfs_new_inode_args new_inode_args = {
+ .dir = dir,
+ .dentry = dentry,
+ .subvol = true,
+ };
+ unsigned int trans_num_items;
int ret;
- dev_t anon_dev = 0;
+ dev_t anon_dev;
u64 objectid;
- u64 index = 0;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
@@ -571,11 +597,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
if (ret)
- goto fail_free;
-
- ret = get_anon_bdev(&anon_dev);
- if (ret < 0)
- goto fail_free;
+ goto out_root_item;
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
@@ -583,36 +605,47 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
*/
if (btrfs_qgroup_level(objectid)) {
ret = -ENOSPC;
- goto fail_free;
+ goto out_root_item;
}
+ ret = get_anon_bdev(&anon_dev);
+ if (ret < 0)
+ goto out_root_item;
+
+ new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir);
+ if (!new_inode_args.inode) {
+ ret = -ENOMEM;
+ goto out_anon_dev;
+ }
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
+ goto out_inode;
+ trans_num_items += create_subvol_num_items(inherit);
+
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
- /*
- * The same as the snapshot creation, please see the comment
- * of create_snapshot().
- */
- ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+ trans_num_items, false);
if (ret)
- goto fail_free;
+ goto out_new_inode_args;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_subvolume_release_metadata(root, &block_rsv);
- goto fail_free;
+ goto out_new_inode_args;
}
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
if (ret)
- goto fail;
+ goto out;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
- goto fail;
+ goto out;
}
btrfs_mark_buffer_dirty(leaf);
@@ -667,75 +700,46 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
btrfs_tree_unlock(leaf);
btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
free_extent_buffer(leaf);
- goto fail;
+ goto out;
}
free_extent_buffer(leaf);
leaf = NULL;
- key.offset = (u64)-1;
new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
if (IS_ERR(new_root)) {
- free_anon_bdev(anon_dev);
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- /* Freeing will be done in btrfs_put_root() of new_root */
+ /* anon_dev is owned by new_root now. */
anon_dev = 0;
+ BTRFS_I(new_inode_args.inode)->root = new_root;
+ /* ... and new_root is owned by new_inode_args.inode now. */
ret = btrfs_record_root_in_trans(trans, new_root);
if (ret) {
- btrfs_put_root(new_root);
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
- ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns);
- btrfs_put_root(new_root);
- if (ret) {
- /* We potentially lose an unused inode item here */
btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
- /*
- * insert the directory item
- */
- ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
- ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
- BTRFS_FT_DIR, index);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
- ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+ ret = btrfs_uuid_tree_add(trans, root_item->uuid,
+ BTRFS_UUID_KEY_SUBVOL, objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
- btrfs_ino(BTRFS_I(dir)), index, name, namelen);
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto fail;
+ goto out;
}
- ret = btrfs_uuid_tree_add(trans, root_item->uuid,
- BTRFS_UUID_KEY_SUBVOL, objectid);
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ d_instantiate_new(dentry, new_inode_args.inode);
+ new_inode_args.inode = NULL;
-fail:
- kfree(root_item);
+out:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
btrfs_subvolume_release_metadata(root, &block_rsv);
@@ -744,18 +748,14 @@ fail:
btrfs_end_transaction(trans);
else
ret = btrfs_commit_transaction(trans);
-
- if (!ret) {
- inode = btrfs_lookup_dentry(dir, dentry);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
- d_instantiate(dentry, inode);
- }
- return ret;
-
-fail_free:
+out_new_inode_args:
+ btrfs_new_inode_args_destroy(&new_inode_args);
+out_inode:
+ iput(new_inode_args.inode);
+out_anon_dev:
if (anon_dev)
free_anon_bdev(anon_dev);
+out_root_item:
kfree(root_item);
return ret;
}
@@ -767,6 +767,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
+ unsigned int trans_num_items;
struct btrfs_trans_handle *trans;
int ret;
@@ -804,16 +805,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
/*
- * 1 - parent dir inode
- * 2 - dir entries
- * 1 - root item
- * 2 - root ref/backref
- * 1 - root of snapshot
- * 1 - UUID item
+ * 1 to add dir item
+ * 1 to add dir index
+ * 1 to update parent inode item
*/
+ trans_num_items = create_subvol_num_items(inherit) + 3;
ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
- &pending_snapshot->block_rsv, 8,
- false);
+ &pending_snapshot->block_rsv,
+ trans_num_items, false);
if (ret)
goto free_pending;
@@ -983,7 +982,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
if (snap_src)
error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
else
- error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit);
+ error = create_subvol(mnt_userns, dir, dentry, inherit);
if (!error)
fsnotify_mkdir(dir, dentry);
@@ -1219,10 +1218,10 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
/* get the big lock and read metadata off disk */
if (!locked)
- lock_extent_bits(io_tree, start, end, &cached);
+ lock_extent(io_tree, start, end, &cached);
em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
if (!locked)
- unlock_extent_cached(io_tree, start, end, &cached);
+ unlock_extent(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
@@ -1231,16 +1230,18 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
return em;
}
-static u32 get_extent_max_capacity(const struct extent_map *em)
+static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
+ const struct extent_map *em)
{
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
return BTRFS_MAX_COMPRESSED;
- return BTRFS_MAX_EXTENT_SIZE;
+ return fs_info->max_extent_size;
}
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
- bool locked)
+ u32 extent_thresh, u64 newer_than, bool locked)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *next;
bool ret = false;
@@ -1249,11 +1250,12 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
return false;
/*
- * We want to check if the next extent can be merged with the current
- * one, which can be an extent created in a past generation, so we pass
- * a minimum generation of 0 to defrag_lookup_extent().
+ * Here we need to pass @newer_than when checking the next extent, or
+ * we will hit a case where we mark the current extent for defrag but
+ * the next one will not be a target.
+ * That would just cause extra IO without really reducing the number
+ * of fragments.
*/
- next = defrag_lookup_extent(inode, em->start + em->len, 0, locked);
+ next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
/* No more em or hole */
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
goto out;
@@ -1263,8 +1265,15 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
* If the next extent is at its max capacity, defragging current extent
* makes no sense, as the total number of extents won't change.
*/
- if (next->len >= get_extent_max_capacity(em))
+ if (next->len >= get_extent_max_capacity(fs_info, em))
goto out;
+ /* Skip older extent */
+ if (next->generation < newer_than)
+ goto out;
+ /* Also check extent size */
+ if (next->len >= extent_thresh)
+ goto out;
+
ret = true;
out:
free_extent_map(next);
@@ -1324,10 +1333,10 @@ again:
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+ lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
- unlock_extent_cached(&inode->io_tree, page_start, page_end,
- &cached_state);
+ unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
if (!ordered)
break;
@@ -1351,7 +1360,7 @@ again:
* make it uptodate.
*/
if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
+ btrfs_read_folio(NULL, page_folio(page));
lock_page(page);
if (page->mapping != mapping || !PagePrivate(page)) {
unlock_page(page);
@@ -1393,6 +1402,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
bool locked, struct list_head *target_list,
u64 *last_scanned_ret)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
bool last_is_target = false;
u64 cur = start;
int ret = 0;
@@ -1409,8 +1419,19 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (!em)
break;
- /* Skip hole/inline/preallocated extents */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+ /*
+ * If the file extent is an inline one, we may still want to
+ * defrag it (fall through) if doing so will produce a regular
+ * extent. This is for users who want to convert inline extents
+ * to regular ones through the max_inline= mount option.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE &&
+ em->len <= inode->root->fs_info->max_inline)
+ goto next;
+
+ /* Skip hole/delalloc/preallocated extents */
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ em->block_start == EXTENT_MAP_DELALLOC ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
goto next;
@@ -1466,11 +1487,20 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* Skip extents already at its max capacity, this is mostly for
* compressed extents, which max cap is only 128K.
*/
- if (em->len >= get_extent_max_capacity(em))
+ if (em->len >= get_extent_max_capacity(fs_info, em))
goto next;
+ /*
+ * Normally there are no more extents after an inline one, thus
+ * @next_mergeable will normally be false and the extent will not
+ * be defragged. So if an inline extent passed all the checks
+ * above, just add it for defrag so it can be converted to a
+ * regular extent.
+ */
+ if (em->block_start == EXTENT_MAP_INLINE)
+ goto add;
+
next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
- locked);
+ extent_thresh, newer_than, locked);
if (!next_mergeable) {
struct defrag_target_range *last;
@@ -1586,7 +1616,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
return ret;
clear_extent_bit(&inode->io_tree, start, start + len - 1,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 0, 0, cached_state);
+ EXTENT_DEFRAG, cached_state);
set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
/* Update the page status */
@@ -1636,9 +1666,9 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
wait_on_page_writeback(pages[i]);
/* Lock the pages range */
- lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
+ lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
/*
* Now we have a consistent view about the extent map, re-check
* which range really needs to be defragged.
@@ -1664,9 +1694,9 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
kfree(entry);
}
unlock_extent:
- unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
+ unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
free_pages:
for (i = 0; i < nr_pages; i++) {
if (pages[i]) {
@@ -2557,7 +2587,12 @@ static noinline int search_ioctl(struct inode *inode,
while (1) {
ret = -EFAULT;
- if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
+ /*
+ * Ensure that the whole user buffer is faulted in at sub-page
+ * granularity, otherwise the loop may live-lock.
+ */
+ if (fault_in_subpage_writeable(ubuf + sk_offset,
+ *buf_size - sk_offset))
break;
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
@@ -2585,7 +2620,7 @@ err:
static noinline int btrfs_ioctl_tree_search(struct inode *inode,
void __user *argp)
{
- struct btrfs_ioctl_search_args __user *uargs;
+ struct btrfs_ioctl_search_args __user *uargs = argp;
struct btrfs_ioctl_search_key sk;
int ret;
size_t buf_size;
@@ -2593,8 +2628,6 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- uargs = (struct btrfs_ioctl_search_args __user *)argp;
-
if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
return -EFAULT;
@@ -2617,7 +2650,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
void __user *argp)
{
- struct btrfs_ioctl_search_args_v2 __user *uarg;
+ struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
struct btrfs_ioctl_search_args_v2 args;
int ret;
size_t buf_size;
@@ -2627,7 +2660,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
return -EPERM;
/* copy search header and buffer size */
- uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
if (copy_from_user(&args, uarg, sizeof(args)))
return -EFAULT;
@@ -4214,26 +4246,6 @@ out:
return ret;
}
-static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
-{
- struct btrfs_data_container *inodes = ctx;
- const size_t c = 3 * sizeof(u64);
-
- if (inodes->bytes_left >= c) {
- inodes->bytes_left -= c;
- inodes->val[inodes->elem_cnt] = inum;
- inodes->val[inodes->elem_cnt + 1] = offset;
- inodes->val[inodes->elem_cnt + 2] = root;
- inodes->elem_cnt += 3;
- } else {
- inodes->bytes_missing += c - inodes->bytes_left;
- inodes->bytes_left = 0;
- inodes->elem_missed += 3;
- }
-
- return 0;
-}
-
static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
void __user *arg, int version)
{
@@ -4283,7 +4295,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
}
ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
- build_ino_list, inodes, ignore_offset);
+ inodes, ignore_offset);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
@@ -4326,19 +4338,81 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
spin_unlock(&fs_info->balance_lock);
}
+/**
+ * Try to acquire fs_info::balance_mutex and set BTRFS_EXCLOP_BALANCE as
+ * required.
+ *
+ * @fs_info: the filesystem
+ * @excl_acquired: ptr to boolean value which is set to false in case balance
+ * is being resumed
+ *
+ * Return 0 on success, in which case fs_info::balance_mutex is acquired and
+ * exclusive ops are blocked. In case of failure return an error code.
+ */
+static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired)
+{
+ int ret;
+
+ /*
+ * Exclusive operation is locked. Three possibilities:
+ * (1) some other op is running
+ * (2) balance is running
+ * (3) balance is paused -- special case (think resume)
+ */
+ while (1) {
+ if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ *excl_acquired = true;
+ mutex_lock(&fs_info->balance_mutex);
+ return 0;
+ }
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (fs_info->balance_ctl) {
+ /* This is either (2) or (3) */
+ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+ /* This is (2) */
+ ret = -EINPROGRESS;
+ goto out_failure;
+
+ } else {
+ mutex_unlock(&fs_info->balance_mutex);
+ /*
+ * Lock released to allow other waiters to
+ * continue; we'll then re-examine the status.
+ */
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl &&
+ !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
+ /* This is (3) */
+ *excl_acquired = false;
+ return 0;
+ }
+ }
+ } else {
+ /* This is (1) */
+ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ goto out_failure;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ }
+
+out_failure:
+ mutex_unlock(&fs_info->balance_mutex);
+ *excl_acquired = false;
+ return ret;
+}
+
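
A sketch of the caller's side, matching the unlock order btrfs_ioctl_balance()
uses further down (error handling elided):

bool need_unlock = true;

ret = btrfs_try_lock_balance(fs_info, &need_unlock);
if (ret)
	return ret;	/* neither the mutex nor the exclusive op is held */

/* balance_mutex is held here; need_unlock is false when resuming a paused
 * balance, where the exclusive op is handled separately. */
mutex_unlock(&fs_info->balance_mutex);
if (need_unlock)
	btrfs_exclop_finish(fs_info);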
static long btrfs_ioctl_balance(struct file *file, void __user *arg)
{
struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ioctl_balance_args *bargs;
struct btrfs_balance_control *bctl;
- bool need_unlock; /* for mut. excl. ops lock */
+ bool need_unlock = true;
int ret;
- if (!arg)
- btrfs_warn(fs_info,
- "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18");
-
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4346,106 +4420,55 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
if (ret)
return ret;
-again:
- if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
- mutex_lock(&fs_info->balance_mutex);
- need_unlock = true;
- goto locked;
+ bargs = memdup_user(arg, sizeof(*bargs));
+ if (IS_ERR(bargs)) {
+ ret = PTR_ERR(bargs);
+ bargs = NULL;
+ goto out;
}
- /*
- * mut. excl. ops lock is locked. Three possibilities:
- * (1) some other op is running
- * (2) balance is running
- * (3) balance is paused -- special case (think resume)
- */
- mutex_lock(&fs_info->balance_mutex);
- if (fs_info->balance_ctl) {
- /* this is either (2) or (3) */
- if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
- mutex_unlock(&fs_info->balance_mutex);
- /*
- * Lock released to allow other waiters to continue,
- * we'll reexamine the status again.
- */
- mutex_lock(&fs_info->balance_mutex);
-
- if (fs_info->balance_ctl &&
- !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
- /* this is (3) */
- need_unlock = false;
- goto locked;
- }
-
- mutex_unlock(&fs_info->balance_mutex);
- goto again;
- } else {
- /* this is (2) */
- mutex_unlock(&fs_info->balance_mutex);
- ret = -EINPROGRESS;
- goto out;
- }
- } else {
- /* this is (1) */
- mutex_unlock(&fs_info->balance_mutex);
- ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ ret = btrfs_try_lock_balance(fs_info, &need_unlock);
+ if (ret)
goto out;
- }
-locked:
+ lockdep_assert_held(&fs_info->balance_mutex);
- if (arg) {
- bargs = memdup_user(arg, sizeof(*bargs));
- if (IS_ERR(bargs)) {
- ret = PTR_ERR(bargs);
+ if (bargs->flags & BTRFS_BALANCE_RESUME) {
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
goto out_unlock;
}
- if (bargs->flags & BTRFS_BALANCE_RESUME) {
- if (!fs_info->balance_ctl) {
- ret = -ENOTCONN;
- goto out_bargs;
- }
+ bctl = fs_info->balance_ctl;
+ spin_lock(&fs_info->balance_lock);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+ btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
- bctl = fs_info->balance_ctl;
- spin_lock(&fs_info->balance_lock);
- bctl->flags |= BTRFS_BALANCE_RESUME;
- spin_unlock(&fs_info->balance_lock);
- btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
+ goto do_balance;
+ }
- goto do_balance;
- }
- } else {
- bargs = NULL;
+ if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
+ ret = -EINVAL;
+ goto out_unlock;
}
if (fs_info->balance_ctl) {
ret = -EINPROGRESS;
- goto out_bargs;
+ goto out_unlock;
}
bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
if (!bctl) {
ret = -ENOMEM;
- goto out_bargs;
- }
-
- if (arg) {
- memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
- memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
- memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
-
- bctl->flags = bargs->flags;
- } else {
- /* balance everything - no filters */
- bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+ goto out_unlock;
}
- if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
- ret = -EINVAL;
- goto out_bctl;
- }
+ memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+ memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+ memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+ bctl->flags = bargs->flags;
do_balance:
/*
* Ownership of bctl and exclusive operation goes to btrfs_balance.
@@ -4458,21 +4481,19 @@ do_balance:
ret = btrfs_balance(fs_info, bctl, bargs);
bctl = NULL;
- if ((ret == 0 || ret == -ECANCELED) && arg) {
+ if (ret == 0 || ret == -ECANCELED) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
}
-out_bctl:
kfree(bctl);
-out_bargs:
- kfree(bargs);
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
if (need_unlock)
btrfs_exclop_finish(fs_info);
out:
mnt_drop_write_file(file);
+ kfree(bargs);
return ret;
}
@@ -5448,8 +5469,6 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_fs_info(fs_info, argp);
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(fs_info, argp);
- case BTRFS_IOC_BALANCE:
- return btrfs_ioctl_balance(file, NULL);
case BTRFS_IOC_TREE_SEARCH:
return btrfs_ioctl_tree_search(inode, argp);
case BTRFS_IOC_TREE_SEARCH_V2:
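
With the v1 BTRFS_IOC_BALANCE case removed above, every caller goes through
the v2 path and must pass a populated btrfs_ioctl_balance_args. A minimal
userspace sketch of that call, assuming only the uapi definitions in
<linux/btrfs.h>; error handling is abbreviated:

        #include <fcntl.h>
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <linux/btrfs.h>

        /* Start a full balance (all chunk types, no filters) on @mnt. */
        static int start_full_balance(const char *mnt)
        {
                struct btrfs_ioctl_balance_args args;
                int ret, fd = open(mnt, O_RDONLY);

                if (fd < 0)
                        return -1;
                memset(&args, 0, sizeof(args));
                args.flags = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA |
                             BTRFS_BALANCE_SYSTEM;
                ret = ioctl(fd, BTRFS_IOC_BALANCE_V2, &args);
                if (ret < 0)
                        perror("BTRFS_IOC_BALANCE_V2");
                close(fd);
                return ret;
        }
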
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 313d9d685adb..0eab3cb274a1 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -14,6 +14,93 @@
#include "locking.h"
/*
+ * Lockdep class keys for extent_buffer->lock's in this root. For a given
+ * eb, the lockdep key is determined by the btrfs_root it belongs to and
+ * the level the eb occupies in the tree.
+ *
+ * Different roots are used for different purposes and may nest inside each
+ * other and they require separate keysets. As lockdep keys should be
+ * static, assign keysets according to the purpose of the root as indicated
+ * by btrfs_root->root_key.objectid. This ensures that all special purpose
+ * roots have separate keysets.
+ *
+ * Lock-nesting across peer nodes is always done with the immediate parent
+ * node locked thus preventing deadlock. As lockdep doesn't know this, use
+ * subclass to avoid triggering lockdep warning in such cases.
+ *
+ * The key is set by the readpage_end_io_hook after the buffer has passed
+ * csum validation but before the pages are unlocked. It is also set by
+ * btrfs_init_new_buffer on freshly allocated blocks.
+ *
+ * We also add a check to make sure the highest level of the tree is the
+ * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
+ * needs update as well.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#if BTRFS_MAX_LEVEL != 8
+#error
+#endif
+
+#define DEFINE_LEVEL(stem, level) \
+ .names[level] = "btrfs-" stem "-0" #level,
+
+#define DEFINE_NAME(stem) \
+ DEFINE_LEVEL(stem, 0) \
+ DEFINE_LEVEL(stem, 1) \
+ DEFINE_LEVEL(stem, 2) \
+ DEFINE_LEVEL(stem, 3) \
+ DEFINE_LEVEL(stem, 4) \
+ DEFINE_LEVEL(stem, 5) \
+ DEFINE_LEVEL(stem, 6) \
+ DEFINE_LEVEL(stem, 7)
+
+static struct btrfs_lockdep_keyset {
+ u64 id; /* root objectid */
+ /* Longest entry: btrfs-free-space-00 */
+ char names[BTRFS_MAX_LEVEL][20];
+ struct lock_class_key keys[BTRFS_MAX_LEVEL];
+} btrfs_lockdep_keysets[] = {
+ { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") },
+ { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") },
+ { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") },
+ { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") },
+ { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") },
+ { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") },
+ { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") },
+ { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") },
+ { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") },
+ { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
+ { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
+ { .id = 0, DEFINE_NAME("tree") },
+};
+
+#undef DEFINE_LEVEL
+#undef DEFINE_NAME
+
+void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level)
+{
+ struct btrfs_lockdep_keyset *ks;
+
+ BUG_ON(level >= ARRAY_SIZE(ks->keys));
+
+ /* Find the matching keyset, id 0 is the default entry */
+ for (ks = btrfs_lockdep_keysets; ks->id; ks++)
+ if (ks->id == objectid)
+ break;
+
+ lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]);
+}
+
+void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
+{
+ if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
+ btrfs_set_buffer_lockdep_class(root->root_key.objectid,
+ eb, btrfs_header_level(eb));
+}
+
+#endif
+
+/*
* Extent buffer locking
* =====================
*
@@ -45,7 +132,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne
start_ns = ktime_get_ns();
down_read_nested(&eb->lock, nest);
- eb->lock_owner = current->pid;
trace_btrfs_tree_read_lock(eb, start_ns);
}
@@ -62,7 +148,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb)
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
if (down_read_trylock(&eb->lock)) {
- eb->lock_owner = current->pid;
trace_btrfs_try_tree_read_lock(eb);
return 1;
}
@@ -90,7 +175,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
trace_btrfs_tree_read_unlock(eb);
- eb->lock_owner = 0;
up_read(&eb->lock);
}
@@ -167,6 +251,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
while (1) {
eb = btrfs_root_node(root);
+
+ btrfs_maybe_reset_lockdep_class(root, eb);
btrfs_tree_lock(eb);
if (eb == root->node)
break;
@@ -188,6 +274,8 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
while (1) {
eb = btrfs_root_node(root);
+
+ btrfs_maybe_reset_lockdep_class(root, eb);
btrfs_tree_read_lock(eb);
if (eb == root->node)
break;
@@ -198,6 +286,31 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
}
/*
+ * Loop around taking references on and locking the root node of the tree in
+ * nowait mode until we end up with a lock on the root node, or return early
+ * to avoid blocking.
+ *
+ * Return: root extent buffer with read lock held, or ERR_PTR(-EAGAIN).
+ */
+struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ if (!btrfs_try_tree_read_lock(eb)) {
+ free_extent_buffer(eb);
+ return ERR_PTR(-EAGAIN);
+ }
+ if (eb == root->node)
+ break;
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/*
* DREW locks
* ==========
*
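
For readers unfamiliar with lockdep keysets, a stripped-down sketch of the
pattern the table above instantiates; the demo_* names are hypothetical and
not part of btrfs:

        #include <linux/lockdep.h>
        #include <linux/rwsem.h>

        /* One static key per tree level so lockdep can tell them apart. */
        static struct lock_class_key demo_keys[8];
        static const char *demo_names[8] = {
                "demo-00", "demo-01", "demo-02", "demo-03",
                "demo-04", "demo-05", "demo-06", "demo-07",
        };

        static void demo_set_level_class(struct rw_semaphore *sem, int level)
        {
                /* Re-key the lock so holding level N while taking level N-1
                 * is not reported as recursive locking. */
                lockdep_set_class_and_name(sem, &demo_keys[level],
                                           demo_names[level]);
        }
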
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index bbc45534ae9a..490c7a79e995 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -94,6 +94,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
@@ -131,4 +132,18 @@ void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock);
void btrfs_drew_read_lock(struct btrfs_drew_lock *lock);
void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level);
+void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb);
+#else
+static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
+ struct extent_buffer *eb, int level)
+{
+}
+static inline void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root,
+ struct extent_buffer *eb)
+{
+}
+#endif
+
#endif
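
A hedged sketch of how a nowait caller is expected to consume the new
btrfs_try_read_lock_root_node(); demo_search_nowait() is hypothetical:

        static int demo_search_nowait(struct btrfs_root *root)
        {
                struct extent_buffer *eb;

                eb = btrfs_try_read_lock_root_node(root);
                if (IS_ERR(eb))
                        return PTR_ERR(eb); /* -EAGAIN: retry blocking */

                /* ... walk down from eb without sleeping ... */

                btrfs_tree_read_unlock(eb);
                free_extent_buffer(eb);
                return 0;
        }
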
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 430ad36b8b08..89bc5f825e0a 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -155,7 +155,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
out_pages[*cur_out / PAGE_SIZE] = cur_page;
}
- kaddr = kmap(cur_page);
+ kaddr = kmap_local_page(cur_page);
write_compress_length(kaddr + offset_in_page(*cur_out),
compressed_size);
*cur_out += LZO_LEN;
@@ -167,7 +167,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
orig_out + compressed_size - *cur_out);
- kunmap(cur_page);
+ kunmap_local(kaddr);
if ((*cur_out / PAGE_SIZE) >= max_nr_page)
return -E2BIG;
@@ -180,7 +180,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
return -ENOMEM;
out_pages[*cur_out / PAGE_SIZE] = cur_page;
}
- kaddr = kmap(cur_page);
+ kaddr = kmap_local_page(cur_page);
memcpy(kaddr + offset_in_page(*cur_out),
compressed_data + *cur_out - orig_out, copy_len);
@@ -202,7 +202,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
*cur_out += sector_bytes_left;
out:
- kunmap(cur_page);
+ kunmap_local(kaddr);
return 0;
}
@@ -248,12 +248,12 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Compress at most one sector of data each time */
in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
ASSERT(in_len);
- data_in = kmap(page_in);
+ data_in = kmap_local_page(page_in);
ret = lzo1x_1_compress(data_in +
offset_in_page(cur_in), in_len,
workspace->cbuf, &out_len,
workspace->mem);
- kunmap(page_in);
+ kunmap_local(data_in);
if (ret < 0) {
pr_debug("BTRFS: lzo in loop returned %d\n", ret);
ret = -EIO;
@@ -310,7 +310,6 @@ static void copy_compressed_segment(struct compressed_bio *cb,
u32 orig_in = *cur_in;
while (*cur_in < orig_in + len) {
- char *kaddr;
struct page *cur_page;
u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in),
orig_in + len - *cur_in);
@@ -318,11 +317,8 @@ static void copy_compressed_segment(struct compressed_bio *cb,
ASSERT(copy_len);
cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE];
- kaddr = kmap(cur_page);
- memcpy(dest + *cur_in - orig_in,
- kaddr + offset_in_page(*cur_in),
- copy_len);
- kunmap(cur_page);
+ memcpy_from_page(dest + *cur_in - orig_in, cur_page,
+ offset_in_page(*cur_in), copy_len);
*cur_in += copy_len;
}
@@ -342,9 +338,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Bytes decompressed so far */
u32 cur_out = 0;
- kaddr = kmap(cb->compressed_pages[0]);
+ kaddr = kmap_local_page(cb->compressed_pages[0]);
len_in = read_compress_length(kaddr);
- kunmap(cb->compressed_pages[0]);
+ kunmap_local(kaddr);
cur_in += LZO_LEN;
/*
@@ -378,9 +374,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
(cur_in + LZO_LEN - 1) / sectorsize);
cur_page = cb->compressed_pages[cur_in / PAGE_SIZE];
ASSERT(cur_page);
- kaddr = kmap(cur_page);
+ kaddr = kmap_local_page(cur_page);
seg_len = read_compress_length(kaddr + offset_in_page(cur_in));
- kunmap(cur_page);
+ kunmap_local(kaddr);
cur_in += LZO_LEN;
if (seg_len > WORKSPACE_CBUF_LENGTH) {
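
The conversions above all follow the same shape; a self-contained sketch of
the two equivalent forms, assuming nothing beyond <linux/highmem.h>:

        #include <linux/highmem.h>
        #include <linux/string.h>

        static void demo_copy_open_coded(struct page *page, void *dst,
                                         size_t off, size_t len)
        {
                /* Map, copy, unmap -- local mappings must be released in
                 * reverse (LIFO) order. */
                char *kaddr = kmap_local_page(page);

                memcpy(dst, kaddr + off, len);
                kunmap_local(kaddr);
        }

        static void demo_copy_helper(struct page *page, void *dst,
                                     size_t off, size_t len)
        {
                /* Same operation via the one-line helper used above. */
                memcpy_from_page(dst, page, off, len);
        }
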
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 340f995652f2..f9850edfd726 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -88,6 +88,41 @@ static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
return NULL;
}
+/*
+ * Search @root from an entry that starts or comes after @bytenr.
+ *
+ * @root: the root to search.
+ * @bytenr: bytenr to search from.
+ *
+ * Return the rb_node that starts at or after @bytenr. If there is no entry
+ * at or after @bytenr then return NULL.
+ */
+static inline struct rb_node *rb_simple_search_first(struct rb_root *root,
+ u64 bytenr)
+{
+ struct rb_node *node = root->rb_node, *ret = NULL;
+ struct rb_simple_node *entry, *ret_entry = NULL;
+
+ while (node) {
+ entry = rb_entry(node, struct rb_simple_node, rb_node);
+
+ if (bytenr < entry->bytenr) {
+ if (!ret || entry->bytenr < ret_entry->bytenr) {
+ ret = node;
+ ret_entry = entry;
+ }
+
+ node = node->rb_left;
+ } else if (bytenr > entry->bytenr) {
+ node = node->rb_right;
+ } else {
+ return node;
+ }
+ }
+
+ return ret;
+}
+
static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr,
struct rb_node *node)
{
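
A hedged usage sketch for the new helper, assuming hypothetical entries at
bytenrs 4096, 8192 and 16384:

        static void demo_search_first(struct rb_root *root)
        {
                struct rb_node *node;

                node = rb_simple_search_first(root, 8192);  /* exact: 8192 */
                node = rb_simple_search_first(root, 9000);  /* next: 16384 */
                node = rb_simple_search_first(root, 20000); /* none: NULL */
                (void)node;
        }
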
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1957b14b329a..e54f8280031f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -272,25 +272,30 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
spin_unlock_irq(&tree->lock);
}
+static void finish_ordered_fn(struct btrfs_work *work)
+{
+ struct btrfs_ordered_extent *ordered_extent;
+
+ ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
+ btrfs_finish_ordered_io(ordered_extent);
+}
+
/*
* Mark all ordered extents io inside the specified range finished.
*
- * @page: The invovled page for the opeartion.
+ * @page: The involved page for the operation.
* For uncompressed buffered IO, the page status also needs to be
* updated to indicate whether the pending ordered io is finished.
* Can be NULL for direct IO and compressed write.
* For these cases, callers are ensured they won't execute the
* endio function twice.
- * @finish_func: The function to be executed when all the IO of an ordered
- * extent are finished.
*
* This function is called for endio, thus the range must have ordered
- * extent(s) coveri it.
+ * extent(s) covering it.
*/
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
- struct page *page, u64 file_offset,
- u64 num_bytes, btrfs_func_t finish_func,
- bool uptodate)
+ struct page *page, u64 file_offset,
+ u64 num_bytes, bool uptodate)
{
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -401,8 +406,9 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
cond_wake_up(&entry->wait);
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_mark_finished(inode, entry);
spin_unlock_irqrestore(&tree->lock, flags);
- btrfs_init_work(&entry->work, finish_func, NULL, NULL);
+ btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL);
btrfs_queue_work(wq, &entry->work);
spin_lock_irqsave(&tree->lock, flags);
}
@@ -473,6 +479,7 @@ out:
if (finished && cached && entry) {
*cached = entry;
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
}
spin_unlock_irqrestore(&tree->lock, flags);
return finished;
@@ -517,7 +524,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
bool pending;
+ bool freespace_inode;
+
+ /*
+ * If this is a free space inode the thread has not acquired the ordered
+ * extents lockdep map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(&btrfs_inode->lock);
btrfs_mod_outstanding_extents(btrfs_inode, -1);
@@ -573,6 +588,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
}
}
+ btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);
+
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
@@ -587,6 +604,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
}
spin_unlock(&root->ordered_extent_lock);
wake_up(&entry->wait);
+ if (!freespace_inode)
+ btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
}
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
@@ -705,10 +724,17 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
struct btrfs_inode *inode = BTRFS_I(entry->inode);
+ bool freespace_inode;
trace_btrfs_ordered_extent_start(inode, entry);
/*
+ * If this is a free space inode do not take the ordered extents lockdep
+ * map.
+ */
+ freespace_inode = btrfs_is_free_space_inode(inode);
+
+ /*
* pages in the range can be dirty, clean or writeback. We
* start IO on any dirty ones so the wait doesn't stall waiting
* for the flusher thread to find them
@@ -716,6 +742,8 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
if (wait) {
+ if (!freespace_inode)
+ btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
}
@@ -807,8 +835,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
entry = NULL;
- if (entry)
+ if (entry) {
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup(inode, entry);
+ }
out:
spin_unlock_irqrestore(&tree->lock, flags);
return entry;
@@ -848,8 +878,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
break;
}
out:
- if (entry)
+ if (entry) {
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_range(inode, entry);
+ }
spin_unlock_irq(&tree->lock);
return entry;
}
@@ -878,6 +910,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
ASSERT(list_empty(&ordered->log_list));
list_add_tail(&ordered->log_list, list);
refcount_inc(&ordered->refs);
+ trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
}
spin_unlock_irq(&tree->lock);
}
@@ -901,6 +934,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
spin_unlock_irq(&tree->lock);
return entry;
@@ -975,8 +1009,11 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
/* No ordered extent in the range */
entry = NULL;
out:
- if (entry)
+ if (entry) {
refcount_inc(&entry->refs);
+ trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
+ }
+
spin_unlock_irq(&tree->lock);
return entry;
}
@@ -1006,7 +1043,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
cachedp = cached_state;
while (1) {
- lock_extent_bits(&inode->io_tree, start, end, cachedp);
+ lock_extent(&inode->io_tree, start, end, cachedp);
ordered = btrfs_lookup_ordered_range(inode, start,
end - start + 1);
if (!ordered) {
@@ -1019,12 +1056,37 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
refcount_dec(&cache->refs);
break;
}
- unlock_extent_cached(&inode->io_tree, start, end, cachedp);
+ unlock_extent(&inode->io_tree, start, end, cachedp);
btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
}
}
+/*
+ * Lock the passed range and ensure all pending ordered extents in it are run
+ * to completion in nowait mode.
+ *
+ * Return true if the range was locked and no ordered extents were pending in
+ * it, otherwise false.
+ */
+bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ if (!try_lock_extent(&inode->io_tree, start, end))
+ return false;
+
+ ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
+ if (!ordered)
+ return true;
+
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(&inode->io_tree, start, end, NULL);
+
+ return false;
+}
+
static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
u64 len)
{
@@ -1055,6 +1117,8 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret = 0;
+ trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);
+
spin_lock_irq(&tree->lock);
/* Remove from tree once */
node = &ordered->rb_node;
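
A sketch of the intended nowait caller for btrfs_try_lock_ordered_range();
everything except the helpers visible in this diff is hypothetical:

        static ssize_t demo_nowait_write(struct btrfs_inode *inode,
                                         u64 start, u64 end)
        {
                if (!btrfs_try_lock_ordered_range(inode, start, end))
                        return -EAGAIN; /* contended or ordered IO pending */

                /* ... do the write with the extent range locked ... */

                unlock_extent(&inode->io_tree, start, end, NULL);
                return 0;
        }
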
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ecad67a2c745..f59f2dbdb25e 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -160,18 +160,6 @@ struct btrfs_ordered_extent {
struct block_device *bdev;
};
-/*
- * calculates the total size you need to allocate for an ordered sum
- * structure spanning 'bytes' in the file
- */
-static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info,
- unsigned long bytes)
-{
- int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
-
- return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size;
-}
-
static inline void
btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
{
@@ -180,13 +168,14 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
t->last = NULL;
}
+int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
+
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
struct page *page, u64 file_offset,
- u64 num_bytes, btrfs_func_t finish_func,
- bool uptodate);
+ u64 num_bytes, bool uptodate);
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
@@ -217,6 +206,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
+bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
u64 post);
int __init ordered_data_init(void);
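
finish_ordered_fn() above relies on the standard embedded-work-item pattern;
a generic, self-contained sketch (using plain workqueues rather than
btrfs_work, purely for illustration):

        #include <linux/kernel.h>
        #include <linux/workqueue.h>

        struct demo_item {
                int payload;
                struct work_struct work;
        };

        static void demo_work_fn(struct work_struct *work)
        {
                /* Recover the owning object from the bare work pointer. */
                struct demo_item *item =
                        container_of(work, struct demo_item, work);

                item->payload++;
        }
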
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 1a6d2d5b4b33..055a631276ce 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -17,9 +17,11 @@ static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
struct prop_handler {
struct hlist_node node;
const char *xattr_name;
- int (*validate)(const char *value, size_t len);
+ int (*validate)(const struct btrfs_inode *inode, const char *value,
+ size_t len);
int (*apply)(struct inode *inode, const char *value, size_t len);
const char *(*extract)(struct inode *inode);
+ bool (*ignore)(const struct btrfs_inode *inode);
int inheritable;
};
@@ -55,7 +57,8 @@ find_prop_handler(const char *name,
return NULL;
}
-int btrfs_validate_prop(const char *name, const char *value, size_t value_len)
+int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
+ const char *value, size_t value_len)
{
const struct prop_handler *handler;
@@ -69,7 +72,29 @@ int btrfs_validate_prop(const char *name, const char *value, size_t value_len)
if (value_len == 0)
return 0;
- return handler->validate(value, value_len);
+ return handler->validate(inode, value, value_len);
+}
+
+/*
+ * Check if a property should be ignored (not set) for an inode.
+ *
+ * @inode: The target inode.
+ * @name: The property's name.
+ *
+ * The caller must be sure the given property name is valid, for example by
+ * having previously called btrfs_validate_prop().
+ *
+ * Returns: true if the property should be ignored for the given inode
+ * false if the property must not be ignored for the given inode
+ */
+bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name)
+{
+ const struct prop_handler *handler;
+
+ handler = find_prop_handler(name, NULL);
+ ASSERT(handler != NULL);
+
+ return handler->ignore(inode);
}
int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
@@ -245,15 +270,16 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 ino = btrfs_ino(BTRFS_I(inode));
- int ret;
-
- ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode);
- return ret;
+ return iterate_object_props(root, path, ino, inode_prop_iterator, inode);
}
-static int prop_compression_validate(const char *value, size_t len)
+static int prop_compression_validate(const struct btrfs_inode *inode,
+ const char *value, size_t len)
{
+ if (!btrfs_inode_can_compress(inode))
+ return -EINVAL;
+
if (!value)
return 0;
@@ -310,6 +336,22 @@ static int prop_compression_apply(struct inode *inode, const char *value,
return 0;
}
+static bool prop_compression_ignore(const struct btrfs_inode *inode)
+{
+ /*
+ * Compression only has effect for regular files, and for directories
+ * we set it just to propagate it to new files created inside them.
+ * Everything else (symlinks, devices, sockets, fifos) is pointless as
+ * it will do nothing, so don't waste metadata space on a compression
+ * xattr for anything that is neither a file nor a directory.
+ */
+ if (!S_ISREG(inode->vfs_inode.i_mode) &&
+ !S_ISDIR(inode->vfs_inode.i_mode))
+ return true;
+
+ return false;
+}
+
static const char *prop_compression_extract(struct inode *inode)
{
switch (BTRFS_I(inode)->prop_compress) {
@@ -330,13 +372,13 @@ static struct prop_handler prop_handlers[] = {
.validate = prop_compression_validate,
.apply = prop_compression_apply,
.extract = prop_compression_extract,
+ .ignore = prop_compression_ignore,
.inheritable = 1
},
};
-static int inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode,
- struct inode *parent)
+int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *parent)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -356,6 +398,9 @@ static int inherit_props(struct btrfs_trans_handle *trans,
if (!h->inheritable)
continue;
+ if (h->ignore(BTRFS_I(inode)))
+ continue;
+
value = h->extract(parent);
if (!value)
continue;
@@ -364,7 +409,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
* This is not strictly necessary as the property should be
* valid, but in case it isn't, don't propagate it further.
*/
- ret = h->validate(value, strlen(value));
+ ret = h->validate(BTRFS_I(inode), value, strlen(value));
if (ret)
continue;
@@ -408,41 +453,6 @@ static int inherit_props(struct btrfs_trans_handle *trans,
return 0;
}
-int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode,
- struct inode *dir)
-{
- if (!dir)
- return 0;
-
- return inherit_props(trans, inode, dir);
-}
-
-int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *parent_root)
-{
- struct super_block *sb = root->fs_info->sb;
- struct inode *parent_inode, *child_inode;
- int ret;
-
- parent_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, parent_root);
- if (IS_ERR(parent_inode))
- return PTR_ERR(parent_inode);
-
- child_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, root);
- if (IS_ERR(child_inode)) {
- iput(parent_inode);
- return PTR_ERR(child_inode);
- }
-
- ret = inherit_props(trans, child_inode, parent_inode);
- iput(child_inode);
- iput(parent_inode);
-
- return ret;
-}
-
void __init btrfs_props_init(void)
{
int i;
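
What a handler entry must provide after this change, sketched with a
hypothetical "demo" property (not a real btrfs property):

        static int prop_demo_validate(const struct btrfs_inode *inode,
                                      const char *value, size_t len)
        {
                return 0; /* accept any value, for illustration only */
        }

        static bool prop_demo_ignore(const struct btrfs_inode *inode)
        {
                /* Pretend the property only matters for regular files. */
                return !S_ISREG(inode->vfs_inode.i_mode);
        }
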
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 40b2c65b518c..ca9dd3df129b 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -13,7 +13,9 @@ void __init btrfs_props_init(void);
int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
const char *name, const char *value, size_t value_len,
int flags);
-int btrfs_validate_prop(const char *name, const char *value, size_t value_len);
+int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
+ const char *value, size_t value_len);
+bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name);
int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
@@ -21,8 +23,4 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
struct inode *dir);
-int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_root *parent_root);
-
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 1866b1f0da01..9334c3157c22 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -275,7 +275,7 @@ static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p
}
/*
- * Add relation specified by two qgoup ids.
+ * Add relation specified by two qgroup ids.
*
* Must be called with qgroup_lock held.
*
@@ -333,6 +333,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
}
#endif
+static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
+{
+ fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
+ BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
+}
+
/*
* The full config is read in one go, only called from open_ctree()
* It doesn't use any locking, as at this point we're still single-threaded
@@ -401,7 +408,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
}
if (btrfs_qgroup_status_generation(l, ptr) !=
fs_info->generation) {
- flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
btrfs_err(fs_info,
"qgroup generation mismatch, marked as inconsistent");
}
@@ -419,7 +426,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
(!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
btrfs_err(fs_info, "inconsistent qgroup config");
- flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
}
if (!qgroup) {
qgroup = add_qgroup_rb(fs_info, found_key.offset);
@@ -878,7 +885,8 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
- btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
+ btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAGS_MASK);
btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
btrfs_set_qgroup_status_rescan(l, ptr,
fs_info->qgroup_rescan_progress.objectid);
@@ -1052,7 +1060,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
+ btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
+ BTRFS_QGROUP_STATUS_FLAGS_MASK);
btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
btrfs_mark_buffer_dirty(leaf);
@@ -1174,6 +1183,21 @@ out_add_root:
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
+ } else {
+ /*
+ * We have set both BTRFS_FS_QUOTA_ENABLED and
+ * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
+ * -EINPROGRESS. That can happen because someone started the
+ * rescan worker by calling quota rescan ioctl before we
+ * attempted to initialize the rescan worker. Failure due to
+ * quotas disabled in the meanwhile is not possible, because
+ * we are holding a write lock on fs_info->subvol_sem, which
+ * is also acquired when disabling quotas.
+ * Ignore such error, and any other error would need to undo
+ * everything we did in the transaction we just committed.
+ */
+ ASSERT(ret == -EINPROGRESS);
+ ret = 0;
}
out_free_path:
@@ -1255,6 +1279,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
spin_unlock(&fs_info->qgroup_lock);
btrfs_free_qgroup_config(fs_info);
@@ -1717,7 +1742,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
ret = update_qgroup_limit_item(trans, qgroup);
if (ret) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
btrfs_info(fs_info, "unable to update quota limit for %llu",
qgroupid);
}
@@ -1790,10 +1815,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
*/
ASSERT(trans != NULL);
+ if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
+ return 0;
+
ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
true);
if (ret < 0) {
- trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(trans->fs_info);
btrfs_warn(trans->fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
ret);
@@ -2269,7 +2297,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(dst_path);
if (ret < 0)
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -2280,6 +2308,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret = 0;
int level;
+ u8 drop_subtree_thres;
struct extent_buffer *eb = root_eb;
struct btrfs_path *path = NULL;
@@ -2289,8 +2318,25 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
+ spin_lock(&fs_info->qgroup_lock);
+ drop_subtree_thres = fs_info->qgroup_drop_subtree_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ /*
+ * This function only gets called for snapshot drop. If we hit a high
+ * node here, it means we are going to change ownership for quite a lot
+ * of extents, which will greatly slow down btrfs_commit_transaction().
+ *
+ * So if we find a high tree here, we just skip the accounting and mark
+ * the qgroup inconsistent.
+ */
+ if (root_level >= drop_subtree_thres) {
+ qgroup_mark_inconsistent(fs_info);
+ return 0;
+ }
+
if (!extent_buffer_uptodate(root_eb)) {
- ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
+ ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL);
if (ret)
goto out;
}
@@ -2604,7 +2650,8 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
* If quotas get disabled meanwhile, the resources need to be freed and
* we can't just exit here.
*/
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
goto out_free;
if (new_roots) {
@@ -2700,7 +2747,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
num_dirty_extents++;
trace_btrfs_qgroup_account_extents(fs_info, record);
- if (!ret) {
+ if (!ret && !(fs_info->qgroup_flags &
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
/*
* Old roots should be searched when inserting qgroup
* extent record
@@ -2773,12 +2821,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_info_item(trans, qgroup);
if (ret)
- fs_info->qgroup_flags |=
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
ret = update_qgroup_limit_item(trans, qgroup);
if (ret)
- fs_info->qgroup_flags |=
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
spin_lock(&fs_info->qgroup_lock);
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
@@ -2789,7 +2835,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
ret = update_qgroup_status_item(trans);
if (ret)
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -2907,7 +2953,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
ret = update_qgroup_limit_item(trans, dstgroup);
if (ret) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
btrfs_info(fs_info,
"unable to update quota limit for %llu",
dstgroup->qgroupid);
@@ -3013,7 +3059,7 @@ out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (need_rescan)
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -3286,7 +3332,8 @@ static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
return btrfs_fs_closing(fs_info) ||
test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
- !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
}
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -3351,7 +3398,8 @@ out:
}
mutex_lock(&fs_info->qgroup_rescan_lock);
- if (!stopped)
+ if (!stopped ||
+ fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
if (trans) {
ret = update_qgroup_status_item(trans);
@@ -3362,6 +3410,7 @@ out:
}
}
fs_info->qgroup_rescan_running = false;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
complete_all(&fs_info->qgroup_rescan_completion);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3372,6 +3421,8 @@ out:
if (stopped) {
btrfs_info(fs_info, "qgroup scan paused");
+ } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
+ btrfs_info(fs_info, "qgroup scan cancelled");
} else if (err >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
err > 0 ? " (inconsistency flag cleared)" : "");
@@ -3434,6 +3485,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_progress, 0,
sizeof(fs_info->qgroup_rescan_progress));
+ fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
init_completion(&fs_info->qgroup_rescan_completion);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3939,12 +3992,13 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
}
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- enum btrfs_qgroup_rsv_type type, bool enforce)
+ enum btrfs_qgroup_rsv_type type, bool enforce,
+ bool noflush)
{
int ret;
ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
- if (ret <= 0 && ret != -EDQUOT)
+ if ((ret <= 0 && ret != -EDQUOT) || noflush)
return ret;
ret = try_flush_qgroup(root);
@@ -4230,8 +4284,7 @@ out_unlock:
spin_unlock(&blocks->lock);
out:
if (ret < 0)
- fs_info->qgroup_flags |=
- BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
return ret;
}
@@ -4318,7 +4371,7 @@ out:
btrfs_err_rl(fs_info,
"failed to account subtree at bytenr %llu: %d",
subvol_eb->start, ret);
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ qgroup_mark_inconsistent(fs_info);
}
return ret;
}
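
The two new runtime flags share the fs_info->qgroup_flags word with the
on-disk status bits, which is why every writer of the status item above masks
with BTRFS_QGROUP_STATUS_FLAGS_MASK. Schematically:

        /* In-memory state may carry runtime-only bits ... */
        fs_info->qgroup_flags |= BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING;

        /* ... but only the on-disk subset may ever be persisted: */
        btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags &
                                      BTRFS_QGROUP_STATUS_FLAGS_MASK);
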
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 880e9df0dac1..578c77e94200 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -100,6 +100,9 @@
* subtree rescan for them.
*/
+#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3)
+#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4)
+
/*
* Record a dirty extent, and info qgroup to update quota on it
* TODO: Use kmem cache to alloc it.
@@ -364,19 +367,23 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode,
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- enum btrfs_qgroup_rsv_type type, bool enforce);
+ enum btrfs_qgroup_rsv_type type, bool enforce,
+ bool noflush);
/* Reserve metadata space for pertrans and prealloc type */
static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
int num_bytes, bool enforce)
{
return __btrfs_qgroup_reserve_meta(root, num_bytes,
- BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
+ BTRFS_QGROUP_RSV_META_PERTRANS,
+ enforce, false);
}
static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
- int num_bytes, bool enforce)
+ int num_bytes, bool enforce,
+ bool noflush)
{
return __btrfs_qgroup_reserve_meta(root, num_bytes,
- BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
+ BTRFS_QGROUP_RSV_META_PREALLOC,
+ enforce, noflush);
}
void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
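
A hedged sketch of how a nowait path would use the new noflush argument;
demo_reserve() is hypothetical:

        static int demo_reserve(struct btrfs_root *root, int bytes, bool nowait)
        {
                int ret;

                /* With noflush set, the call never triggers qgroup flushing. */
                ret = btrfs_qgroup_reserve_meta_prealloc(root, bytes, true,
                                                         nowait);
                if (ret == -EDQUOT && nowait)
                        return -EAGAIN; /* retry where blocking is allowed */
                return ret;
        }
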
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0e239a4c3b26..f6395e8288d6 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -52,132 +52,21 @@ struct btrfs_stripe_hash_table {
struct btrfs_stripe_hash table[];
};
-enum btrfs_rbio_ops {
- BTRFS_RBIO_WRITE,
- BTRFS_RBIO_READ_REBUILD,
- BTRFS_RBIO_PARITY_SCRUB,
- BTRFS_RBIO_REBUILD_MISSING,
-};
-
-struct btrfs_raid_bio {
- struct btrfs_io_context *bioc;
-
- /* while we're doing rmw on a stripe
- * we put it into a hash table so we can
- * lock the stripe and merge more rbios
- * into it.
- */
- struct list_head hash_list;
-
- /*
- * LRU list for the stripe cache
- */
- struct list_head stripe_cache;
-
- /*
- * for scheduling work in the helper threads
- */
- struct btrfs_work work;
-
- /*
- * bio list and bio_list_lock are used
- * to add more bios into the stripe
- * in hopes of avoiding the full rmw
- */
- struct bio_list bio_list;
- spinlock_t bio_list_lock;
-
- /* also protected by the bio_list_lock, the
- * plug list is used by the plugging code
- * to collect partial bios while plugged. The
- * stripe locking code also uses it to hand off
- * the stripe lock to the next pending IO
- */
- struct list_head plug_list;
-
- /*
- * flags that tell us if it is safe to
- * merge with this bio
- */
- unsigned long flags;
-
- /* size of each individual stripe on disk */
- int stripe_len;
-
- /* number of data stripes (no p/q) */
- int nr_data;
-
- int real_stripes;
-
- int stripe_npages;
- /*
- * set if we're doing a parity rebuild
- * for a read from higher up, which is handled
- * differently from a parity rebuild as part of
- * rmw
- */
- enum btrfs_rbio_ops operation;
-
- /* first bad stripe */
- int faila;
-
- /* second bad stripe (for raid6 use) */
- int failb;
-
- int scrubp;
- /*
- * number of pages needed to represent the full
- * stripe
- */
- int nr_pages;
-
- /*
- * size of all the bios in the bio_list. This
- * helps us decide if the rbio maps to a full
- * stripe or not
- */
- int bio_list_bytes;
-
- int generic_bio_cnt;
-
- refcount_t refs;
-
- atomic_t stripes_pending;
-
- atomic_t error;
- /*
- * these are two arrays of pointers. We allocate the
- * rbio big enough to hold them both and setup their
- * locations when the rbio is allocated
- */
-
- /* pointers to pages that we allocated for
- * reading/writing stripes directly from the disk (including P/Q)
- */
- struct page **stripe_pages;
-
- /*
- * pointers to the pages in the bio_list. Stored
- * here for faster lookup
- */
- struct page **bio_pages;
-
- /*
- * bitmap to record which horizontal stripe has data
- */
- unsigned long *dbitmap;
-
- /* allocated with real_stripes-many pointers for finish_*() calls */
- void **finish_pointers;
-
- /* allocated with stripe_npages-many bits for finish_*() calls */
- unsigned long *finish_pbitmap;
+/*
+ * A bvec-like structure to represent a sector inside a page.
+ *
+ * Unlike bvec we don't need bv_len, as it's fixed to sectorsize.
+ */
+struct sector_ptr {
+ struct page *page;
+ unsigned int pgoff:24;
+ unsigned int uptodate:8;
};
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
-static void rmw_work(struct btrfs_work *work);
-static void read_rebuild_work(struct btrfs_work *work);
+static void rmw_work(struct work_struct *work);
+static void read_rebuild_work(struct work_struct *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
@@ -186,12 +75,12 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check);
-static void scrub_parity_work(struct btrfs_work *work);
+static void scrub_parity_work(struct work_struct *work);
-static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
+static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
- btrfs_init_work(&rbio->work, work_func, NULL, NULL);
- btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
+ INIT_WORK(&rbio->work, work_func);
+ queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
/*
@@ -239,7 +128,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
/*
* caching an rbio means to copy anything from the
- * bio_pages array into the stripe_pages array. We
+ * bio_sectors array into the stripe_pages array. We
* use the page uptodate bit in the stripe cache array
* to indicate if it has valid data
*
@@ -255,12 +144,18 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
if (ret)
return;
- for (i = 0; i < rbio->nr_pages; i++) {
- if (!rbio->bio_pages[i])
+ for (i = 0; i < rbio->nr_sectors; i++) {
+ /* Some range not covered by bio (partial write), skip it */
+ if (!rbio->bio_sectors[i].page)
continue;
- copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
- SetPageUptodate(rbio->stripe_pages[i]);
+ ASSERT(rbio->stripe_sectors[i].page);
+ memcpy_page(rbio->stripe_sectors[i].page,
+ rbio->stripe_sectors[i].pgoff,
+ rbio->bio_sectors[i].page,
+ rbio->bio_sectors[i].pgoff,
+ rbio->bioc->fs_info->sectorsize);
+ rbio->stripe_sectors[i].uptodate = 1;
}
set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}
@@ -283,32 +178,86 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio)
return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
+static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
+ unsigned int page_nr)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ int i;
+
+ ASSERT(page_nr < rbio->nr_pages);
+
+ for (i = sectors_per_page * page_nr;
+ i < sectors_per_page * page_nr + sectors_per_page;
+ i++) {
+ if (!rbio->stripe_sectors[i].uptodate)
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Update the stripe_sectors[] array to use the correct page and pgoff.
+ *
+ * Should be called every time any page pointer in stripe_pages[] is modified.
+ */
+static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ u32 offset;
+ int i;
+
+ for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
+ int page_index = offset >> PAGE_SHIFT;
+
+ ASSERT(page_index < rbio->nr_pages);
+ rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
+ rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
+ }
+}
+
+static void steal_rbio_page(struct btrfs_raid_bio *src,
+ struct btrfs_raid_bio *dest, int page_nr)
+{
+ const u32 sectorsize = src->bioc->fs_info->sectorsize;
+ const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+ int i;
+
+ if (dest->stripe_pages[page_nr])
+ __free_page(dest->stripe_pages[page_nr]);
+ dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
+ src->stripe_pages[page_nr] = NULL;
+
+ /* Also update the sector->uptodate bits. */
+ for (i = sectors_per_page * page_nr;
+ i < sectors_per_page * page_nr + sectors_per_page; i++)
+ dest->stripe_sectors[i].uptodate = true;
+}
+
/*
- * stealing an rbio means taking all the uptodate pages from the stripe
- * array in the source rbio and putting them into the destination rbio
+ * Stealing an rbio means taking all the uptodate pages from the stripe array
+ * in the source rbio and putting them into the destination rbio.
+ *
+ * This will also update the involved stripe_sectors[] which are referring to
+ * the old pages.
*/
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
int i;
struct page *s;
- struct page *d;
if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
return;
for (i = 0; i < dest->nr_pages; i++) {
s = src->stripe_pages[i];
- if (!s || !PageUptodate(s)) {
+ if (!s || !full_page_sectors_uptodate(src, i))
continue;
- }
-
- d = dest->stripe_pages[i];
- if (d)
- __free_page(d);
- dest->stripe_pages[i] = s;
- src->stripe_pages[i] = NULL;
+ steal_rbio_page(src, dest, i);
}
+ index_stripe_sectors(dest);
+ index_stripe_sectors(src);
}
/*
@@ -323,7 +272,9 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
{
bio_list_merge(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
- dest->generic_bio_cnt += victim->generic_bio_cnt;
+ /* Also inherit the bitmaps from @victim. */
+ bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
+ dest->stripe_nsectors);
bio_list_init(&victim->bio_list);
}
@@ -522,9 +473,9 @@ static int rbio_is_full(struct btrfs_raid_bio *rbio)
int ret = 1;
spin_lock_irqsave(&rbio->bio_list_lock, flags);
- if (size != rbio->nr_data * rbio->stripe_len)
+ if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
ret = 0;
- BUG_ON(size > rbio->nr_data * rbio->stripe_len);
+ BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
return ret;
@@ -600,39 +551,39 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
return 1;
}
-static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
- int index)
+static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr,
+ unsigned int sector_nr)
{
- return stripe * rbio->stripe_npages + index;
+ ASSERT(stripe_nr < rbio->real_stripes);
+ ASSERT(sector_nr < rbio->stripe_nsectors);
+
+ return stripe_nr * rbio->stripe_nsectors + sector_nr;
}
-/*
- * these are just the pages from the rbio array, not from anything
- * the FS sent down to us
- */
-static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
- int index)
+/* Return a sector from rbio->stripe_sectors, not from the bio list */
+static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int stripe_nr,
+ unsigned int sector_nr)
{
- return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
+ return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
+ sector_nr)];
}
-/*
- * helper to index into the pstripe
- */
-static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
+/* Grab a sector inside P stripe */
+static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr)
{
- return rbio_stripe_page(rbio, rbio->nr_data, index);
+ return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}
-/*
- * helper to index into the qstripe, returns null
- * if there is no qstripe
- */
-static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
+/* Grab a sector inside Q stripe, return NULL if not RAID6 */
+static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
+ unsigned int sector_nr)
{
if (rbio->nr_data + 1 == rbio->real_stripes)
return NULL;
- return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
+ return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}
/*
@@ -862,8 +813,12 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *extra;
- if (rbio->generic_bio_cnt)
- btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
+ /*
+ * Clear the data bitmap, as the rbio may be cached for later usage.
+ * Do this before unlock_stripe() so there will be no new bio for
+ * this rbio.
+ */
+ bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
@@ -911,47 +866,43 @@ static void raid_write_end_io(struct bio *bio)
rbio_orig_end_io(rbio, err);
}
-/*
- * the read/modify/write code wants to use the original bio for
- * any pages it included, and then use the rbio for everything
- * else. This function decides if a given index (stripe number)
- * and page number in that stripe fall inside the original bio
- * or the rbio.
- *
- * if you set bio_list_only, you'll get a NULL back for any ranges
- * that are outside the bio_list
+/**
+ * Get a sector pointer specified by its @stripe_nr and @sector_nr
*
- * This doesn't take any refs on anything, you get a bare page pointer
- * and the caller must bump refs as required.
+ * @rbio: The raid bio
+ * @stripe_nr: Stripe number, valid range [0, real_stripes)
+ * @sector_nr: Sector number inside the stripe,
+ * valid range [0, stripe_nsectors)
+ * @bio_list_only: Whether to use sectors inside the bio list only.
*
- * You must call index_rbio_pages once before you can trust
- * the answers from this function.
+ * The read/modify/write code wants to reuse the original bio page as much
+ * as possible, and only use stripe_sectors as fallback.
*/
-static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
- int index, int pagenr, int bio_list_only)
+static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
+ int stripe_nr, int sector_nr,
+ bool bio_list_only)
{
- int chunk_page;
- struct page *p = NULL;
+ struct sector_ptr *sector;
+ int index;
- chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
+ ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
+ ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
+
+ index = stripe_nr * rbio->stripe_nsectors + sector_nr;
+ ASSERT(index >= 0 && index < rbio->nr_sectors);
spin_lock_irq(&rbio->bio_list_lock);
- p = rbio->bio_pages[chunk_page];
+ sector = &rbio->bio_sectors[index];
+ if (sector->page || bio_list_only) {
+ /* Don't return sector without a valid page pointer */
+ if (!sector->page)
+ sector = NULL;
+ spin_unlock_irq(&rbio->bio_list_lock);
+ return sector;
+ }
spin_unlock_irq(&rbio->bio_list_lock);
- if (p || bio_list_only)
- return p;
-
- return rbio->stripe_pages[chunk_page];
-}
-
-/*
- * number of pages we need for the entire stripe across all the
- * drives
- */
-static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
-{
- return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
+ return &rbio->stripe_sectors[index];
}
/*
@@ -959,23 +910,30 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
* this does not allocate any pages for rbio->pages.
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
- struct btrfs_io_context *bioc,
- u64 stripe_len)
-{
+ struct btrfs_io_context *bioc)
+{
+ const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
+ const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
+ const unsigned int num_pages = stripe_npages * real_stripes;
+ const unsigned int stripe_nsectors =
+ BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
+ const unsigned int num_sectors = stripe_nsectors * real_stripes;
struct btrfs_raid_bio *rbio;
- int nr_data = 0;
- int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
- int num_pages = rbio_nr_pages(stripe_len, real_stripes);
- int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
void *p;
+ /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
+ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
+ /*
+ * Our current stripe len should be fixed to 64k thus stripe_nsectors
+ * (at most 16) should be no larger than BITS_PER_LONG.
+ */
+ ASSERT(stripe_nsectors <= BITS_PER_LONG);
+
rbio = kzalloc(sizeof(*rbio) +
sizeof(*rbio->stripe_pages) * num_pages +
- sizeof(*rbio->bio_pages) * num_pages +
- sizeof(*rbio->finish_pointers) * real_stripes +
- sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
- sizeof(*rbio->finish_pbitmap) *
- BITS_TO_LONGS(stripe_npages),
+ sizeof(*rbio->bio_sectors) * num_sectors +
+ sizeof(*rbio->stripe_sectors) * num_sectors +
+ sizeof(*rbio->finish_pointers) * real_stripes,
GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
@@ -985,11 +943,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
spin_lock_init(&rbio->bio_list_lock);
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
+ btrfs_get_bioc(bioc);
rbio->bioc = bioc;
- rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
+ rbio->nr_sectors = num_sectors;
rbio->real_stripes = real_stripes;
rbio->stripe_npages = stripe_npages;
+ rbio->stripe_nsectors = stripe_nsectors;
rbio->faila = -1;
rbio->failb = -1;
refcount_set(&rbio->refs, 1);
@@ -997,8 +957,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
atomic_set(&rbio->stripes_pending, 0);
/*
- * the stripe_pages, bio_pages, etc arrays point to the extra
- * memory we allocated past the end of the rbio
+ * The stripe_pages, bio_sectors, etc arrays point to the extra memory
+ * we allocated past the end of the rbio.
*/
p = rbio + 1;
#define CONSUME_ALLOC(ptr, count) do { \
@@ -1006,79 +966,76 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
} while (0)
CONSUME_ALLOC(rbio->stripe_pages, num_pages);
- CONSUME_ALLOC(rbio->bio_pages, num_pages);
+ CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
+ CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
- CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
- CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
#undef CONSUME_ALLOC
- if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
- nr_data = real_stripes - 1;
- else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
- nr_data = real_stripes - 2;
- else
- BUG();
+ ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
+ rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
- rbio->nr_data = nr_data;
return rbio;
}
/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
- int i;
- struct page *page;
+ int ret;
- for (i = 0; i < rbio->nr_pages; i++) {
- if (rbio->stripe_pages[i])
- continue;
- page = alloc_page(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- rbio->stripe_pages[i] = page;
- }
+ ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
+ if (ret < 0)
+ return ret;
+ /* Mapping all sectors */
+ index_stripe_sectors(rbio);
return 0;
}
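
Editor's note: the open-coded page allocation loop is replaced by btrfs_alloc_page_array(). Judging from the loop it replaces, the helper fills only the NULL slots of the array and returns a negative errno on failure; a hedged user-space sketch of that assumed contract (demo_* names are hypothetical):

#include <errno.h>
#include <stdlib.h>

static int demo_alloc_page_array(unsigned int nr_pages, void **page_array)
{
	unsigned int i;

	for (i = 0; i < nr_pages; i++) {
		/* Keep pages that were already attached, e.g. by the cache. */
		if (page_array[i])
			continue;
		page_array[i] = malloc(4096);
		if (!page_array[i])
			return -ENOMEM;
	}
	return 0;
}
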
/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
- int i;
- struct page *page;
+ const int data_pages = rbio->nr_data * rbio->stripe_npages;
+ int ret;
- i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
+ ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
+ rbio->stripe_pages + data_pages);
+ if (ret < 0)
+ return ret;
- for (; i < rbio->nr_pages; i++) {
- if (rbio->stripe_pages[i])
- continue;
- page = alloc_page(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- rbio->stripe_pages[i] = page;
- }
+ index_stripe_sectors(rbio);
return 0;
}
/*
- * add a single page from a specific stripe into our list of bios for IO
- * this will try to merge into existing bios if possible, and returns
- * zero if all went well.
+ * Add a single sector @sector into our list of bios for IO.
+ *
+ * Return 0 if everything went well.
+ * Return <0 for error.
*/
-static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
- struct bio_list *bio_list,
- struct page *page,
- int stripe_nr,
- unsigned long page_index,
- unsigned long bio_max_len)
-{
+static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
+ struct bio_list *bio_list,
+ struct sector_ptr *sector,
+ unsigned int stripe_nr,
+ unsigned int sector_nr,
+ enum req_op op)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
struct bio *last = bio_list->tail;
int ret;
struct bio *bio;
struct btrfs_io_stripe *stripe;
u64 disk_start;
+ /*
+ * Note: here stripe_nr has taken device replace into consideration,
+ * thus it can be larger than rbio->real_stripes.
+ * So here we check against bioc->num_stripes, not rbio->real_stripes.
+ */
+ ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
+ ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
+ ASSERT(sector->page);
+
stripe = &rbio->bioc->stripes[stripe_nr];
- disk_start = stripe->physical + (page_index << PAGE_SHIFT);
+ disk_start = stripe->physical + sector_nr * sectorsize;
/* if the device is missing, just fail this stripe */
if (!stripe->dev->bdev)
@@ -1095,20 +1052,21 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
*/
if (last_end == disk_start && !last->bi_status &&
last->bi_bdev == stripe->dev->bdev) {
- ret = bio_add_page(last, page, PAGE_SIZE, 0);
- if (ret == PAGE_SIZE)
+ ret = bio_add_page(last, sector->page, sectorsize,
+ sector->pgoff);
+ if (ret == sectorsize)
return 0;
}
}
/* put a new bio on the list */
- bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
- btrfs_bio(bio)->device = stripe->dev;
- bio->bi_iter.bi_size = 0;
- bio_set_dev(bio, stripe->dev->bdev);
+ bio = bio_alloc(stripe->dev->bdev,
+ max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
+ op, GFP_NOFS);
bio->bi_iter.bi_sector = disk_start >> 9;
+ bio->bi_private = rbio;
- bio_add_page(bio, page, PAGE_SIZE, 0);
+ bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
bio_list_add(bio_list, bio);
return 0;
}
@@ -1130,6 +1088,29 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
}
}
+static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
+{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+ rbio->bioc->raid_map[0];
+
+ bio_for_each_segment(bvec, bio, iter) {
+ u32 bvec_offset;
+
+ for (bvec_offset = 0; bvec_offset < bvec.bv_len;
+ bvec_offset += sectorsize, offset += sectorsize) {
+ int index = offset / sectorsize;
+ struct sector_ptr *sector = &rbio->bio_sectors[index];
+
+ sector->page = bvec.bv_page;
+ sector->pgoff = bvec.bv_offset + bvec_offset;
+ ASSERT(sector->pgoff < PAGE_SIZE);
+ }
+ }
+}
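
Editor's note: index_one_bio() turns a bio's position inside the full stripe into indexes of bio_sectors[]. A worked example with made-up values, a 4K sectorsize and a bio that starts 24K past raid_map[0]:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t sectorsize = 4096;
	const uint64_t full_stripe_start = 1ULL << 30;	/* raid_map[0] */
	const uint64_t bio_logical = full_stripe_start + 24 * 1024;
	uint32_t offset = (uint32_t)(bio_logical - full_stripe_start);

	/* The first sector of the bio lands at bio_sectors[6]. */
	printf("first index = %u\n", offset / sectorsize);
	return 0;
}

Each further sectorsize step through the bio advances the index by one, exactly as the nested loop above does.
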
+
/*
* helper function to walk our bio list and populate the bio_pages array with
* the result. This seems expensive, but it is faster than constantly
@@ -1141,29 +1122,40 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
struct bio *bio;
- u64 start;
- unsigned long stripe_offset;
- unsigned long page_index;
spin_lock_irq(&rbio->bio_list_lock);
- bio_list_for_each(bio, &rbio->bio_list) {
- struct bio_vec bvec;
- struct bvec_iter iter;
- int i = 0;
+ bio_list_for_each(bio, &rbio->bio_list)
+ index_one_bio(rbio, bio);
+
+ spin_unlock_irq(&rbio->bio_list_lock);
+}
+
+static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
+ struct raid56_bio_trace_info *trace_info)
+{
+ const struct btrfs_io_context *bioc = rbio->bioc;
+ int i;
- start = bio->bi_iter.bi_sector << 9;
- stripe_offset = start - rbio->bioc->raid_map[0];
- page_index = stripe_offset >> PAGE_SHIFT;
+ ASSERT(bioc);
- if (bio_flagged(bio, BIO_CLONED))
- bio->bi_iter = btrfs_bio(bio)->iter;
+ /* We rely on bio->bi_bdev to find the stripe number. */
+ if (!bio->bi_bdev)
+ goto not_found;
- bio_for_each_segment(bvec, bio, iter) {
- rbio->bio_pages[page_index + i] = bvec.bv_page;
- i++;
- }
+ for (i = 0; i < bioc->num_stripes; i++) {
+ if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
+ continue;
+ trace_info->stripe_nr = i;
+ trace_info->devid = bioc->stripes[i].dev->devid;
+ trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
+ bioc->stripes[i].physical;
+ return;
}
- spin_unlock_irq(&rbio->bio_list_lock);
+
+not_found:
+ trace_info->devid = -1;
+ trace_info->offset = -1;
+ trace_info->stripe_nr = -1;
}
/*
@@ -1177,10 +1169,14 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
struct btrfs_io_context *bioc = rbio->bioc;
+ const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
int nr_data = rbio->nr_data;
+ /* The total sector number inside the full stripe. */
+ int total_sector_nr;
int stripe;
- int pagenr;
+ /* Sector number inside a stripe. */
+ int sectornr;
bool has_qstripe;
struct bio_list bio_list;
struct bio *bio;
@@ -1195,6 +1191,9 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
else
BUG();
+ /* We should have at least one data sector. */
+ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
+
/* at this point we either have a full stripe,
* or we've read the full stripe from the drive.
* recalculate the parity and write the new results.
@@ -1224,86 +1223,108 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *p;
- /* first collect one page from each data stripe */
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
+ /* First collect one sector from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
- p = page_in_rbio(rbio, stripe, pagenr, 0);
- pointers[stripe] = kmap_local_page(p);
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
}
- /* then add the parity stripe */
- p = rbio_pstripe_page(rbio, pagenr);
- SetPageUptodate(p);
- pointers[stripe++] = kmap_local_page(p);
+ /* Then add the parity stripe */
+ sector = rbio_pstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
if (has_qstripe) {
-
/*
- * raid6, add the qstripe and call the
- * library function to fill in our p/q
+ * RAID6, add the qstripe and call the library function
+ * to fill in our p/q
*/
- p = rbio_qstripe_page(rbio, pagenr);
- SetPageUptodate(p);
- pointers[stripe++] = kmap_local_page(p);
+ sector = rbio_qstripe_sector(rbio, sectornr);
+ sector->uptodate = 1;
+ pointers[stripe++] = kmap_local_page(sector->page) +
+ sector->pgoff;
- raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
pointers);
} else {
/* raid5 */
- copy_page(pointers[nr_data], pointers[0]);
- run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
+ memcpy(pointers[nr_data], pointers[0], sectorsize);
+ run_xor(pointers + 1, nr_data - 1, sectorsize);
}
for (stripe = stripe - 1; stripe >= 0; stripe--)
kunmap_local(pointers[stripe]);
}
/*
- * time to start writing. Make bios for everything from the
- * higher layers (the bio_list in our rbio) and our p/q. Ignore
- * everything else.
+ * Start writing. Make bios for everything from the higher layers (the
+ * bio_list in our rbio) and our P/Q. Ignore everything else.
*/
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *page;
- if (stripe < rbio->nr_data) {
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (!page)
- continue;
- } else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
- }
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+
+ stripe = total_sector_nr / rbio->stripe_nsectors;
+ sectornr = total_sector_nr % rbio->stripe_nsectors;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
- ret = rbio_add_io_page(rbio, &bio_list,
- page, stripe, pagenr, rbio->stripe_len);
- if (ret)
- goto cleanup;
+ if (stripe < rbio->nr_data) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
+ continue;
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_WRITE);
+ if (ret)
+ goto cleanup;
}
if (likely(!bioc->num_tgtdevs))
goto write_data;
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- if (!bioc->tgtdev_map[stripe])
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+
+ stripe = total_sector_nr / rbio->stripe_nsectors;
+ sectornr = total_sector_nr % rbio->stripe_nsectors;
+
+ if (!bioc->tgtdev_map[stripe]) {
+ /*
+ * We can skip the whole stripe completely; note that the
+ * loop will still increase total_sector_nr by one.
+ */
+ ASSERT(sectornr == 0);
+ total_sector_nr += rbio->stripe_nsectors - 1;
continue;
+ }
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *page;
- if (stripe < rbio->nr_data) {
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (!page)
- continue;
- } else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
- }
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
- ret = rbio_add_io_page(rbio, &bio_list, page,
- rbio->bioc->tgtdev_map[stripe],
- pagenr, rbio->stripe_len);
- if (ret)
- goto cleanup;
+ if (stripe < rbio->nr_data) {
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (!sector)
+ continue;
+ } else {
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ rbio->bioc->tgtdev_map[stripe],
+ sectornr, REQ_OP_WRITE);
+ if (ret)
+ goto cleanup;
}
write_data:
@@ -1311,10 +1332,14 @@ write_data:
BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
- bio->bi_opf = REQ_OP_WRITE;
+ if (trace_raid56_write_stripe_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_write_stripe(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
return;
@@ -1342,7 +1367,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
for (i = 0; i < rbio->bioc->num_stripes; i++) {
stripe = &rbio->bioc->stripes[i];
- if (in_range(physical, stripe->physical, rbio->stripe_len) &&
+ if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) &&
stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
return i;
}
@@ -1364,7 +1389,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
for (i = 0; i < rbio->nr_data; i++) {
u64 stripe_start = rbio->bioc->raid_map[i];
- if (in_range(logical, stripe_start, rbio->stripe_len))
+ if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN))
return i;
}
return -1;
@@ -1417,56 +1442,89 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
}
/*
+ * For the subpage case we can no longer set the page Uptodate bit directly
+ * for stripe_pages[]; we need to locate the individual sector instead.
+ */
+static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
+ struct page *page,
+ unsigned int pgoff)
+{
+ int i;
+
+ for (i = 0; i < rbio->nr_sectors; i++) {
+ struct sector_ptr *sector = &rbio->stripe_sectors[i];
+
+ if (sector->page == page && sector->pgoff == pgoff)
+ return sector;
+ }
+ return NULL;
+}
+
+/*
* this sets each page in the bio uptodate. It should only be used on private
* rbio pages, nothing that comes in from the higher layers
*/
-static void set_bio_pages_uptodate(struct bio *bio)
+static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all)
- SetPageUptodate(bvec->bv_page);
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct sector_ptr *sector;
+ int pgoff;
+
+ for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
+ pgoff += sectorsize) {
+ sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
+ ASSERT(sector);
+ if (sector)
+ sector->uptodate = 1;
+ }
+ }
}
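
Editor's note: with subpage support a single bio_vec can cover many sectors of one page, which is why the helper walks bv_len in sectorsize steps. For a 64K page filled with 4K sectors the inner loop visits 16 page offsets; an illustrative stand-alone version of that walk:

#include <stdio.h>

int main(void)
{
	const unsigned int sectorsize = 4096;
	const unsigned int bv_offset = 0, bv_len = 65536;	/* one 64K page */
	unsigned int pgoff;

	/* One sector_ptr lookup per 4K chunk, 16 in total. */
	for (pgoff = bv_offset; pgoff - bv_offset < bv_len; pgoff += sectorsize)
		printf("sector at pgoff %u\n", pgoff);
	return 0;
}
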
-/*
- * end io for the read phase of the rmw cycle. All the bios here are physical
- * stripe bios we've read from the disk so we can recalculate the parity of the
- * stripe.
- *
- * This will usually kick off finish_rmw once all the bios are read in, but it
- * may trigger parity reconstruction if we had any errors along the way
- */
-static void raid_rmw_end_io(struct bio *bio)
+static void raid56_bio_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
- set_bio_pages_uptodate(bio);
+ set_bio_pages_uptodate(rbio, bio);
bio_put(bio);
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
+ if (atomic_dec_and_test(&rbio->stripes_pending))
+ queue_work(rbio->bioc->fs_info->endio_raid56_workers,
+ &rbio->end_io_work);
+}
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
- goto cleanup;
+/*
+ * End io handler for the read phase of the RMW cycle. All the bios here are
+ * physical stripe bios we've read from the disk so we can recalculate the
+ * parity of the stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid56_rmw_end_io_work(struct work_struct *work)
+{
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
+
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
+ rbio_orig_end_io(rbio, BLK_STS_IOERR);
+ return;
+ }
/*
- * this will normally call finish_rmw to start our write
- * but if there are any failed stripes we'll reconstruct
- * from parity first
+ * This will normally call finish_rmw to start our write but if there
+ * are any failed stripes we'll reconstruct from parity first.
*/
validate_rbio_for_rmw(rbio);
- return;
-
-cleanup:
-
- rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
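
Editor's note: the conversion from btrfs_work to a plain work_struct relies on container_of() to get from the embedded end_io_work back to the owning rbio. A self-contained sketch of that pattern (struct names are illustrative):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work { int pending; };

struct demo_rbio {
	int error;
	struct work end_io_work;	/* embedded, like in btrfs_raid_bio */
};

static void demo_handler(struct work *w)
{
	/* Recover the containing rbio from the embedded member. */
	struct demo_rbio *rbio = container_of(w, struct demo_rbio, end_io_work);

	printf("error count: %d\n", rbio->error);
}

int main(void)
{
	struct demo_rbio rbio = { .error = 2 };

	demo_handler(&rbio.end_io_work);	/* prints "error count: 2" */
	return 0;
}
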
/*
@@ -1477,9 +1535,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
int bios_to_read = 0;
struct bio_list bio_list;
+ const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data;
int ret;
- int pagenr;
- int stripe;
+ int total_sector_nr;
struct bio *bio;
bio_list_init(&bio_list);
@@ -1491,36 +1549,34 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio);
atomic_set(&rbio->error, 0);
- /*
- * build a list of bios to read all the missing parts of this
- * stripe
- */
- for (stripe = 0; stripe < rbio->nr_data; stripe++) {
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *page;
- /*
- * we want to find all the pages missing from
- * the rbio and read them from the disk. If
- * page_in_rbio finds a page in the bio list
- * we don't need to read it off the stripe.
- */
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (page)
- continue;
+ /* Build a list of bios to read all the missing data sectors. */
+ for (total_sector_nr = 0; total_sector_nr < nr_data_sectors;
+ total_sector_nr++) {
+ struct sector_ptr *sector;
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
- page = rbio_stripe_page(rbio, stripe, pagenr);
- /*
- * the bio cache may have handed us an uptodate
- * page. If so, be happy and use it
- */
- if (PageUptodate(page))
- continue;
+ /*
+ * We want to find all the sectors missing from the rbio and
+ * read them from the disk. If sector_in_rbio() finds a sector
+ * in the bio list we don't need to read it off the stripe.
+ */
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (sector)
+ continue;
- ret = rbio_add_io_page(rbio, &bio_list, page,
- stripe, pagenr, rbio->stripe_len);
- if (ret)
- goto cleanup;
- }
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ /*
+ * The bio cache may have handed us an uptodate sector. If so,
+ * use it.
+ */
+ if (sector->uptodate)
+ continue;
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
+ stripe, sectornr, REQ_OP_READ);
+ if (ret)
+ goto cleanup;
}
bios_to_read = bio_list_size(&bio_list);
@@ -1539,13 +1595,16 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
* touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
- bio->bi_end_io = raid_rmw_end_io;
- bio->bi_opf = REQ_OP_READ;
+ bio->bi_end_io = raid56_bio_end_io;
- btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ if (trace_raid56_read_partial_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_read_partial(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
/* the actual write will happen once the reads are done */
@@ -1624,7 +1683,7 @@ struct btrfs_plug_cb {
struct blk_plug_cb cb;
struct btrfs_fs_info *info;
struct list_head rbio_list;
- struct btrfs_work work;
+ struct work_struct work;
};
/*
@@ -1692,7 +1751,7 @@ static void run_plug(struct btrfs_plug_cb *plug)
* if the unplug comes from schedule, we have to push the
* work off to a helper thread
*/
-static void unplug_work(struct btrfs_work *work)
+static void unplug_work(struct work_struct *work)
{
struct btrfs_plug_cb *plug;
plug = container_of(work, struct btrfs_plug_cb, work);
@@ -1705,37 +1764,58 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
- btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
- btrfs_queue_work(plug->info->rmw_workers,
- &plug->work);
+ INIT_WORK(&plug->work, unplug_work);
+ queue_work(plug->info->rmw_workers, &plug->work);
return;
}
run_plug(plug);
}
+/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
+static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
+{
+ const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+ const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ const u64 full_stripe_start = rbio->bioc->raid_map[0];
+ const u32 orig_len = orig_bio->bi_iter.bi_size;
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 cur_logical;
+
+ ASSERT(orig_logical >= full_stripe_start &&
+ orig_logical + orig_len <= full_stripe_start +
+ rbio->nr_data * BTRFS_STRIPE_LEN);
+
+ bio_list_add(&rbio->bio_list, orig_bio);
+ rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
+
+ /* Update the dbitmap. */
+ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
+ cur_logical += sectorsize) {
+ int bit = ((u32)(cur_logical - full_stripe_start) >>
+ fs_info->sectorsize_bits) % rbio->stripe_nsectors;
+
+ set_bit(bit, &rbio->dbitmap);
+ }
+}
+
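
Editor's note: rbio_add_bio() folds a logical range into per-vertical-stripe bits. The sector number is taken modulo stripe_nsectors, so every data stripe covering a given vertical stripe sets the same bit. A worked example with made-up numbers (4K sectors, 64K stripes, full stripe starting at 0):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t sectorsize_bits = 12;	/* 4K sectors */
	const uint32_t stripe_nsectors = 16;	/* 64K / 4K */
	unsigned long dbitmap = 0;
	uint64_t cur;

	/* A bio covering logical 60K..72K crosses a stripe boundary. */
	for (cur = 60 * 1024; cur < 72 * 1024; cur += 4096) {
		int bit = (uint32_t)(cur >> sectorsize_bits) % stripe_nsectors;

		dbitmap |= 1UL << bit;
	}
	/* Bits 15, 0 and 1 end up set: dbitmap = 0x8003. */
	printf("dbitmap = 0x%lx\n", dbitmap);
	return 0;
}
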
/*
* our main entry point for writes from the rest of the FS.
*/
-int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
- u64 stripe_len)
+void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
- int ret;
+ int ret = 0;
- rbio = alloc_rbio(fs_info, bioc, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
- btrfs_put_bioc(bioc);
- return PTR_ERR(rbio);
+ ret = PTR_ERR(rbio);
+ goto fail;
}
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
rbio->operation = BTRFS_RBIO_WRITE;
-
- btrfs_bio_counter_inc_noblocked(fs_info);
- rbio->generic_bio_cnt = 1;
+ rbio_add_bio(rbio, bio);
/*
* don't plug on full rbios, just get them out the door
@@ -1744,8 +1824,8 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
if (rbio_is_full(rbio)) {
ret = full_stripe_write(rbio);
if (ret)
- btrfs_bio_counter_dec(fs_info);
- return ret;
+ goto fail;
+ return;
}
cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
@@ -1756,13 +1836,17 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
INIT_LIST_HEAD(&plug->rbio_list);
}
list_add_tail(&rbio->plug_list, &plug->rbio_list);
- ret = 0;
} else {
ret = __raid56_parity_write(rbio);
if (ret)
- btrfs_bio_counter_dec(fs_info);
+ goto fail;
}
- return ret;
+
+ return;
+
+fail:
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
}
/*
@@ -1772,14 +1856,18 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
*/
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
- int pagenr, stripe;
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ int sectornr, stripe;
void **pointers;
void **unmap_array;
int faila = -1, failb = -1;
- struct page *page;
blk_status_t err;
int i;
+ /*
+ * This array stores a pointer for each sector; each pointer already
+ * has the sector's pgoff added to it.
+ */
pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
if (!pointers) {
err = BLK_STS_RESOURCE;
@@ -1808,43 +1896,44 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio);
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ struct sector_ptr *sector;
+
/*
* Now we just use bitmap to mark the horizontal stripes in
* which we have data when doing parity scrub.
*/
if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
- !test_bit(pagenr, rbio->dbitmap))
+ !test_bit(sectornr, &rbio->dbitmap))
continue;
/*
- * Setup our array of pointers with pages from each stripe
+ * Setup our array of pointers with sectors from each stripe
*
* NOTE: store a duplicate array of pointers to preserve the
* pointer order
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
/*
- * if we're rebuilding a read, we have to use
+ * If we're rebuilding a read, we have to use
* pages from the bio list
*/
if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
(stripe == faila || stripe == failb)) {
- page = page_in_rbio(rbio, stripe, pagenr, 0);
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
} else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
}
- pointers[stripe] = kmap_local_page(page);
+ ASSERT(sector->page);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
unmap_array[stripe] = pointers[stripe];
}
- /* all raid6 handling here */
+ /* All raid6 handling here */
if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
- /*
- * single failure, rebuild from parity raid5
- * style
- */
+ /* Single failure, rebuild from parity raid5 style */
if (failb < 0) {
if (faila == rbio->nr_data) {
/*
@@ -1887,10 +1976,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
raid6_datap_recov(rbio->real_stripes,
- PAGE_SIZE, faila, pointers);
+ sectorsize, faila, pointers);
} else {
raid6_2data_recov(rbio->real_stripes,
- PAGE_SIZE, faila, failb,
+ sectorsize, faila, failb,
pointers);
}
} else {
@@ -1900,7 +1989,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
BUG_ON(failb != -1);
pstripe:
/* Copy parity block into failed block to start with */
- copy_page(pointers[faila], pointers[rbio->nr_data]);
+ memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
/* rearrange the pointer array */
p = pointers[faila];
@@ -1909,7 +1998,7 @@ pstripe:
pointers[rbio->nr_data - 1] = p;
/* xor in the rest */
- run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
+ run_xor(pointers, rbio->nr_data - 1, sectorsize);
}
/* if we're doing this rebuild as part of an rmw, go through
* and set all of our private rbio pages in the
@@ -1918,14 +2007,14 @@ pstripe:
* other endio functions will fiddle the uptodate bits
*/
if (rbio->operation == BTRFS_RBIO_WRITE) {
- for (i = 0; i < rbio->stripe_npages; i++) {
+ for (i = 0; i < rbio->stripe_nsectors; i++) {
if (faila != -1) {
- page = rbio_stripe_page(rbio, faila, i);
- SetPageUptodate(page);
+ sector = rbio_stripe_sector(rbio, faila, i);
+ sector->uptodate = 1;
}
if (failb != -1) {
- page = rbio_stripe_page(rbio, failb, i);
- SetPageUptodate(page);
+ sector = rbio_stripe_sector(rbio, failb, i);
+ sector->uptodate = 1;
}
}
}
@@ -1984,25 +2073,13 @@ cleanup_io:
}
/*
- * This is called only for stripes we've read from disk to
- * reconstruct the parity.
+ * This is called only for stripes we've read from disk to reconstruct the
+ * parity.
*/
-static void raid_recover_end_io(struct bio *bio)
+static void raid_recover_end_io_work(struct work_struct *work)
{
- struct btrfs_raid_bio *rbio = bio->bi_private;
-
- /*
- * we only read stripe pages off the disk, set them
- * up to date if there were no errors
- */
- if (bio->bi_status)
- fail_bio_stripe(rbio, bio);
- else
- set_bio_pages_uptodate(bio);
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
rbio_orig_end_io(rbio, BLK_STS_IOERR);
@@ -2023,8 +2100,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int pagenr;
- int stripe;
+ int total_sector_nr;
struct bio *bio;
bio_list_init(&bio_list);
@@ -2036,33 +2112,31 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
atomic_set(&rbio->error, 0);
/*
- * read everything that hasn't failed. Thanks to the
- * stripe cache, it is possible that some or all of these
- * pages are going to be uptodate.
+ * Read everything that hasn't failed. However, this time we will
+ * not trust any cached sector: a cached sector may contain stale
+ * data that the higher layer is not reading.
+ *
+ * So in the recovery path we always re-read everything.
*/
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ struct sector_ptr *sector;
+
if (rbio->faila == stripe || rbio->failb == stripe) {
atomic_inc(&rbio->error);
+ /* Skip the current stripe. */
+ ASSERT(sectornr == 0);
+ total_sector_nr += rbio->stripe_nsectors - 1;
continue;
}
-
- for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *p;
-
- /*
- * the rmw code may have already read this
- * page in
- */
- p = rbio_stripe_page(rbio, stripe, pagenr);
- if (PageUptodate(p))
- continue;
-
- ret = rbio_add_io_page(rbio, &bio_list,
- rbio_stripe_page(rbio, stripe, pagenr),
- stripe, pagenr, rbio->stripe_len);
- if (ret < 0)
- goto cleanup;
- }
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_READ);
+ if (ret < 0)
+ goto cleanup;
}
bios_to_read = bio_list_size(&bio_list);
@@ -2085,13 +2159,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
- bio->bi_end_io = raid_recover_end_io;
- bio->bi_opf = REQ_OP_READ;
+ bio->bi_end_io = raid56_bio_end_io;
- btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ if (trace_raid56_scrub_read_recover_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
@@ -2114,28 +2191,20 @@ cleanup:
* so we assume the bio they send down corresponds to a failed part
* of the drive.
*/
-int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- u64 stripe_len, int mirror_num, int generic_io)
+void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ int mirror_num)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- int ret;
-
- if (generic_io) {
- ASSERT(bioc->mirror_num == mirror_num);
- btrfs_bio(bio)->mirror_num = mirror_num;
- }
- rbio = alloc_rbio(fs_info, bioc, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio)) {
- if (generic_io)
- btrfs_put_bioc(bioc);
- return PTR_ERR(rbio);
+ bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
+ goto out_end_bio;
}
rbio->operation = BTRFS_RBIO_READ_REBUILD;
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
+ rbio_add_bio(rbio, bio);
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
@@ -2143,17 +2212,9 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
__func__, bio->bi_iter.bi_sector << 9,
(u64)bio->bi_iter.bi_size, bioc->map_type);
- if (generic_io)
- btrfs_put_bioc(bioc);
- kfree(rbio);
- return -EIO;
- }
-
- if (generic_io) {
- btrfs_bio_counter_inc_noblocked(fs_info);
- rbio->generic_bio_cnt = 1;
- } else {
- btrfs_get_bioc(bioc);
+ __free_raid_bio(rbio);
+ bio->bi_status = BLK_STS_IOERR;
+ goto out_end_bio;
}
/*
@@ -2173,27 +2234,21 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
rbio->failb--;
}
- ret = lock_stripe_add(rbio);
+ if (lock_stripe_add(rbio))
+ return;
/*
- * __raid56_parity_recover will end the bio with
- * any errors it hits. We don't want to return
- * its error value up the stack because our caller
- * will end up calling bio_endio with any nonzero
- * return
- */
- if (ret == 0)
- __raid56_parity_recover(rbio);
- /*
- * our rbio has been added to the list of
- * rbios that will be handled after the
- * currently lock owner is done
+ * This adds our rbio to the list of rbios that will be handled after
+ * the current lock owner is done.
*/
- return 0;
+ __raid56_parity_recover(rbio);
+ return;
+out_end_bio:
+ bio_endio(bio);
}
-static void rmw_work(struct btrfs_work *work)
+static void rmw_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
@@ -2201,7 +2256,7 @@ static void rmw_work(struct btrfs_work *work)
raid56_rmw_stripe(rbio);
}
-static void read_rebuild_work(struct btrfs_work *work)
+static void read_rebuild_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
@@ -2221,14 +2276,14 @@ static void read_rebuild_work(struct btrfs_work *work)
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
struct btrfs_io_context *bioc,
- u64 stripe_len, struct btrfs_device *scrub_dev,
+ struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
int i;
- rbio = alloc_rbio(fs_info, bioc, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio))
return NULL;
bio_list_add(&rbio->bio_list, bio);
@@ -2252,33 +2307,25 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
}
ASSERT(i < rbio->real_stripes);
- /* Now we just support the sectorsize equals to page size */
- ASSERT(fs_info->sectorsize == PAGE_SIZE);
- ASSERT(rbio->stripe_npages == stripe_nsectors);
- bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
-
- /*
- * We have already increased bio_counter when getting bioc, record it
- * so we can free it at rbio_orig_end_io().
- */
- rbio->generic_bio_cnt = 1;
-
+ bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
return rbio;
}
/* Used for both parity scrub and missing. */
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
- u64 logical)
+ unsigned int pgoff, u64 logical)
{
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
int stripe_offset;
int index;
ASSERT(logical >= rbio->bioc->raid_map[0]);
- ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
- rbio->stripe_len * rbio->nr_data);
+ ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
+ BTRFS_STRIPE_LEN * rbio->nr_data);
stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
- index = stripe_offset >> PAGE_SHIFT;
- rbio->bio_pages[index] = page;
+ index = stripe_offset / sectorsize;
+ rbio->bio_sectors[index].page = page;
+ rbio->bio_sectors[index].pgoff = pgoff;
}
/*
@@ -2287,23 +2334,25 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
*/
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
- int i;
- int bit;
- int index;
- struct page *page;
+ const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+ int total_sector_nr;
- for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
- for (i = 0; i < rbio->real_stripes; i++) {
- index = i * rbio->stripe_npages + bit;
- if (rbio->stripe_pages[index])
- continue;
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ struct page *page;
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
- page = alloc_page(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- rbio->stripe_pages[index] = page;
- }
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
+ if (rbio->stripe_pages[index])
+ continue;
+ page = alloc_page(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ rbio->stripe_pages[index] = page;
}
+ index_stripe_sectors(rbio);
return 0;
}
@@ -2311,14 +2360,15 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check)
{
struct btrfs_io_context *bioc = rbio->bioc;
+ const u32 sectorsize = bioc->fs_info->sectorsize;
void **pointers = rbio->finish_pointers;
- unsigned long *pbitmap = rbio->finish_pbitmap;
+ unsigned long *pbitmap = &rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
int stripe;
- int pagenr;
+ int sectornr;
bool has_qstripe;
- struct page *p_page = NULL;
- struct page *q_page = NULL;
+ struct sector_ptr p_sector = { 0 };
+ struct sector_ptr q_sector = { 0 };
struct bio_list bio_list;
struct bio *bio;
int is_replace = 0;
@@ -2335,7 +2385,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
- bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
+ bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
}
/*
@@ -2348,54 +2398,59 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
if (!need_check)
goto writeback;
- p_page = alloc_page(GFP_NOFS);
- if (!p_page)
+ p_sector.page = alloc_page(GFP_NOFS);
+ if (!p_sector.page)
goto cleanup;
- SetPageUptodate(p_page);
+ p_sector.pgoff = 0;
+ p_sector.uptodate = 1;
if (has_qstripe) {
/* RAID6, allocate and map temp space for the Q stripe */
- q_page = alloc_page(GFP_NOFS);
- if (!q_page) {
- __free_page(p_page);
+ q_sector.page = alloc_page(GFP_NOFS);
+ if (!q_sector.page) {
+ __free_page(p_sector.page);
+ p_sector.page = NULL;
goto cleanup;
}
- SetPageUptodate(q_page);
- pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
+ q_sector.pgoff = 0;
+ q_sector.uptodate = 1;
+ pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
}
atomic_set(&rbio->error, 0);
/* Map the parity stripe just once */
- pointers[nr_data] = kmap_local_page(p_page);
+ pointers[nr_data] = kmap_local_page(p_sector.page);
- for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
- struct page *p;
+ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
void *parity;
+
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
- p = page_in_rbio(rbio, stripe, pagenr, 0);
- pointers[stripe] = kmap_local_page(p);
+ sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+ pointers[stripe] = kmap_local_page(sector->page) +
+ sector->pgoff;
}
if (has_qstripe) {
/* RAID6, call the library function to fill in our P/Q */
- raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
+ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
pointers);
} else {
/* raid5 */
- copy_page(pointers[nr_data], pointers[0]);
- run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
+ memcpy(pointers[nr_data], pointers[0], sectorsize);
+ run_xor(pointers + 1, nr_data - 1, sectorsize);
}
/* Check scrubbing parity and repair it */
- p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
- parity = kmap_local_page(p);
- if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
- copy_page(parity, pointers[rbio->scrubp]);
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ parity = kmap_local_page(sector->page) + sector->pgoff;
+ if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
+ memcpy(parity, pointers[rbio->scrubp], sectorsize);
else
/* Parity is right, needn't writeback */
- bitmap_clear(rbio->dbitmap, pagenr, 1);
+ bitmap_clear(&rbio->dbitmap, sectornr, 1);
kunmap_local(parity);
for (stripe = nr_data - 1; stripe >= 0; stripe--)
@@ -2403,10 +2458,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
}
kunmap_local(pointers[nr_data]);
- __free_page(p_page);
- if (q_page) {
+ __free_page(p_sector.page);
+ p_sector.page = NULL;
+ if (q_sector.page) {
kunmap_local(pointers[rbio->real_stripes - 1]);
- __free_page(q_page);
+ __free_page(q_sector.page);
+ q_sector.page = NULL;
}
writeback:
@@ -2415,12 +2472,12 @@ writeback:
* higher layers (the bio_list in our rbio) and our p/q. Ignore
* everything else.
*/
- for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
- struct page *page;
+ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
- page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
- ret = rbio_add_io_page(rbio, &bio_list,
- page, rbio->scrubp, pagenr, rbio->stripe_len);
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
+ sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -2428,13 +2485,13 @@ writeback:
if (!is_replace)
goto submit_write;
- for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
- struct page *page;
+ for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
+ struct sector_ptr *sector;
- page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
- ret = rbio_add_io_page(rbio, &bio_list, page,
+ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
+ ret = rbio_add_io_sector(rbio, &bio_list, sector,
bioc->tgtdev_map[rbio->scrubp],
- pagenr, rbio->stripe_len);
+ sectornr, REQ_OP_WRITE);
if (ret)
goto cleanup;
}
@@ -2450,10 +2507,14 @@ submit_write:
atomic_set(&rbio->stripes_pending, nr_data);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
- bio->bi_opf = REQ_OP_WRITE;
+ if (trace_raid56_scrub_write_stripe_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
return;
@@ -2541,24 +2602,14 @@ cleanup:
* This will usually kick off finish_rmw once all the bios are read in, but it
* may trigger parity reconstruction if we had any errors along the way
*/
-static void raid56_parity_scrub_end_io(struct bio *bio)
+static void raid56_parity_scrub_end_io_work(struct work_struct *work)
{
- struct btrfs_raid_bio *rbio = bio->bi_private;
-
- if (bio->bi_status)
- fail_bio_stripe(rbio, bio);
- else
- set_bio_pages_uptodate(bio);
-
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
+ struct btrfs_raid_bio *rbio =
+ container_of(work, struct btrfs_raid_bio, end_io_work);
/*
- * this will normally call finish_rmw to start our write
- * but if there are any failed stripes we'll reconstruct
- * from parity first
+ * This will normally call finish_rmw to start our write, but if there
+ * are any failed stripes we'll reconstruct from parity first
*/
validate_rbio_for_parity_scrub(rbio);
}
@@ -2568,8 +2619,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int pagenr;
- int stripe;
+ int total_sector_nr;
struct bio *bio;
bio_list_init(&bio_list);
@@ -2579,36 +2629,38 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
goto cleanup;
atomic_set(&rbio->error, 0);
- /*
- * build a list of bios to read all the missing parts of this
- * stripe
- */
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
- struct page *page;
- /*
- * we want to find all the pages missing from
- * the rbio and read them from the disk. If
- * page_in_rbio finds a page in the bio list
- * we don't need to read it off the stripe.
- */
- page = page_in_rbio(rbio, stripe, pagenr, 1);
- if (page)
- continue;
+ /* Build a list of bios to read all the missing parts. */
+ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
+ total_sector_nr++) {
+ int sectornr = total_sector_nr % rbio->stripe_nsectors;
+ int stripe = total_sector_nr / rbio->stripe_nsectors;
+ struct sector_ptr *sector;
+
+ /* No data in the vertical stripe, no need to read. */
+ if (!test_bit(sectornr, &rbio->dbitmap))
+ continue;
- page = rbio_stripe_page(rbio, stripe, pagenr);
- /*
- * the bio cache may have handed us an uptodate
- * page. If so, be happy and use it
- */
- if (PageUptodate(page))
- continue;
+ /*
+ * We want to find all the sectors missing from the rbio and
+ * read them from the disk. If sector_in_rbio() finds a sector
+ * in the bio list we don't need to read it off the stripe.
+ */
+ sector = sector_in_rbio(rbio, stripe, sectornr, 1);
+ if (sector)
+ continue;
- ret = rbio_add_io_page(rbio, &bio_list, page,
- stripe, pagenr, rbio->stripe_len);
- if (ret)
- goto cleanup;
- }
+ sector = rbio_stripe_sector(rbio, stripe, sectornr);
+ /*
+ * The bio cache may have handed us an uptodate sector. If so,
+ * use it.
+ */
+ if (sector->uptodate)
+ continue;
+
+ ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+ sectornr, REQ_OP_READ);
+ if (ret)
+ goto cleanup;
}
bios_to_read = bio_list_size(&bio_list);
@@ -2627,13 +2679,16 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
* touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
+ INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
while ((bio = bio_list_pop(&bio_list))) {
- bio->bi_private = rbio;
- bio->bi_end_io = raid56_parity_scrub_end_io;
- bio->bi_opf = REQ_OP_READ;
+ bio->bi_end_io = raid56_bio_end_io;
- btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ if (trace_raid56_scrub_read_enabled()) {
+ struct raid56_bio_trace_info trace_info = { 0 };
+ bio_get_trace_info(rbio, bio, &trace_info);
+ trace_raid56_scrub_read(rbio, bio, &trace_info);
+ }
submit_bio(bio);
}
/* the actual write will happen once the reads are done */
@@ -2651,7 +2706,7 @@ finish:
validate_rbio_for_parity_scrub(rbio);
}
-static void scrub_parity_work(struct btrfs_work *work)
+static void scrub_parity_work(struct work_struct *work)
{
struct btrfs_raid_bio *rbio;
@@ -2668,13 +2723,12 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
/* The following code is used for dev replace of a missing RAID 5/6 device. */
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
- u64 length)
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
{
struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- rbio = alloc_rbio(fs_info, bioc, length);
+ rbio = alloc_rbio(fs_info, bioc);
if (IS_ERR(rbio))
return NULL;
@@ -2693,12 +2747,6 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
return NULL;
}
- /*
- * When we get bioc, we have already increased bio_counter, record it
- * so we can free it at rbio_orig_end_io()
- */
- rbio->generic_bio_cnt = 1;
-
return rbio;
}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 72c00fc284b5..91d5c0adad15 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -7,46 +7,177 @@
#ifndef BTRFS_RAID56_H
#define BTRFS_RAID56_H
-static inline int nr_parity_stripes(const struct map_lookup *map)
-{
- if (map->type & BTRFS_BLOCK_GROUP_RAID5)
- return 1;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- return 2;
- else
- return 0;
-}
+#include <linux/workqueue.h>
+#include "volumes.h"
+
+enum btrfs_rbio_ops {
+ BTRFS_RBIO_WRITE,
+ BTRFS_RBIO_READ_REBUILD,
+ BTRFS_RBIO_PARITY_SCRUB,
+ BTRFS_RBIO_REBUILD_MISSING,
+};
+
+struct btrfs_raid_bio {
+ struct btrfs_io_context *bioc;
+
+ /*
+ * While we're doing RMW on a stripe we put it into a hash table so we
+ * can lock the stripe and merge more rbios into it.
+ */
+ struct list_head hash_list;
+
+ /* LRU list for the stripe cache */
+ struct list_head stripe_cache;
+
+ /* For scheduling work in the helper threads */
+ struct work_struct work;
+
+ /*
+ * bio_list and bio_list_lock are used to add more bios into the stripe
+ * in hopes of avoiding the full RMW
+ */
+ struct bio_list bio_list;
+ spinlock_t bio_list_lock;
+
+ /*
+ * Also protected by the bio_list_lock, the plug list is used by the
+ * plugging code to collect partial bios while plugged. The stripe
+ * locking code also uses it to hand off the stripe lock to the next
+ * pending IO.
+ */
+ struct list_head plug_list;
+
+ /* Flags that tell us if it is safe to merge with this bio. */
+ unsigned long flags;
+
+ /*
+ * Set if we're doing a parity rebuild for a read from higher up, which
+ * is handled differently from a parity rebuild as part of RMW.
+ */
+ enum btrfs_rbio_ops operation;
+
+ /* How many pages there are for the full stripe including P/Q */
+ u16 nr_pages;
+
+ /* How many sectors there are for the full stripe including P/Q */
+ u16 nr_sectors;
+
+ /* Number of data stripes (no p/q) */
+ u8 nr_data;
+
+ /* Number of all stripes (including P/Q) */
+ u8 real_stripes;
+
+ /* How many pages there are for each stripe */
+ u8 stripe_npages;
+
+ /* How many sectors there are for each stripe */
+ u8 stripe_nsectors;
+
+ /* First bad stripe, -1 means no corruption */
+ s8 faila;
+
+ /* Second bad stripe (for RAID6 use) */
+ s8 failb;
+
+ /* Stripe number that we're scrubbing */
+ u8 scrubp;
+
+ /*
+ * Size of all the bios in the bio_list. This helps us decide if the
+ * rbio maps to a full stripe or not.
+ */
+ int bio_list_bytes;
+
+ refcount_t refs;
+
+ atomic_t stripes_pending;
+
+ atomic_t error;
+
+ struct work_struct end_io_work;
+
+ /* Bitmap to record which horizontal stripe has data */
+ unsigned long dbitmap;
+
+ /* Allocated with stripe_nsectors-many bits for finish_*() calls */
+ unsigned long finish_pbitmap;
+
+ /*
+ * These are two arrays of pointers. We allocate the rbio big enough
+ * to hold them both and setup their locations when the rbio is
+ * allocated.
+ */
+
+ /*
+ * Pointers to pages that we allocated for reading/writing stripes
+ * directly from the disk (including P/Q).
+ */
+ struct page **stripe_pages;
+
+ /* Pointers to the sectors in the bio_list, for faster lookup */
+ struct sector_ptr *bio_sectors;
+
+ /*
+ * For subpage support, we need to map each sector to above
+ * stripe_pages.
+ */
+ struct sector_ptr *stripe_sectors;
+
+ /* Allocated with real_stripes-many pointers for finish_*() calls */
+ void **finish_pointers;
+};
+
+/*
+ * For trace event usage only. Records useful debug info for each bio submitted
+ * by RAID56 to each physical device.
+ *
+ * Signed or not, (-1) always indicates that we could not grab the
+ * proper stripe number.
+ */
+struct raid56_bio_trace_info {
+ u64 devid;
+
+ /* The offset inside the stripe. (<= STRIPE_LEN) */
+ u32 offset;
+
+ /*
+ * Stripe number.
+ * 0 is the first data stripe, and nr_data for P stripe,
+ * nr_data + 1 for Q stripe.
+ * >= real_stripes for stripes on the replace target device.
+ */
+ u8 stripe_nr;
+};
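
Editor's note: dbitmap and finish_pbitmap are now plain unsigned long fields rather than separately allocated bitmaps. That is valid because stripe_nsectors is at most 16 (64K stripe / 4K sector), comfortably below BITS_PER_LONG, and alloc_rbio() asserts exactly that. A small stand-alone illustration of iterating such an integrated bitmap, mirroring what the finish_*() paths do with for_each_set_bit():

#include <stdio.h>

int main(void)
{
	unsigned long dbitmap = 0x8003;	/* vertical stripes 0, 1, 15 have data */
	const unsigned int stripe_nsectors = 16;
	unsigned int sectornr;

	for (sectornr = 0; sectornr < stripe_nsectors; sectornr++)
		if (dbitmap & (1UL << sectornr))
			printf("write back vertical stripe %u\n", sectornr);
	return 0;
}
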
static inline int nr_data_stripes(const struct map_lookup *map)
{
- return map->num_stripes - nr_parity_stripes(map);
+ return map->num_stripes - btrfs_nr_parity_stripes(map->type);
}
+
#define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1)
#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
((x) == RAID6_Q_STRIPE))
-struct btrfs_raid_bio;
struct btrfs_device;
-int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
- u64 stripe_len, int mirror_num, int generic_io);
-int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
- u64 stripe_len);
+void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ int mirror_num);
+void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
- u64 logical);
+ unsigned int pgoff, u64 logical);
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
- struct btrfs_io_context *bioc, u64 stripe_len,
+ struct btrfs_io_context *bioc,
struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
- u64 length);
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc);
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 998e3f180d90..f50586ff85c8 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -5,6 +5,7 @@
#include "compression.h"
#include "ctree.h"
#include "delalloc-space.h"
+#include "disk-io.h"
#include "reflink.h"
#include "transaction.h"
#include "subpage.h"
@@ -22,8 +23,10 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
int ret;
inode_inc_iversion(inode);
- if (!no_time_update)
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ if (!no_time_update) {
+ inode->i_mtime = current_time(inode);
+ inode->i_ctime = inode->i_mtime;
+ }
/*
* We round up to the block size at eof when determining which
* extents to clone above, but shouldn't round up the file size.
@@ -89,7 +92,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
clear_extent_bit(&inode->io_tree, file_offset, range_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, NULL);
+ NULL);
ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
@@ -110,7 +113,6 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (comp_type == BTRFS_COMPRESS_NONE) {
memcpy_to_page(page, offset_in_page(file_offset), data_start,
datal);
- flush_dcache_page(page);
} else {
ret = btrfs_decompress(comp_type, data_start, page,
offset_in_page(file_offset),
@@ -132,10 +134,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
*
* So what's in the range [500, 4095] corresponds to zeroes.
*/
- if (datal < block_size) {
+ if (datal < block_size)
memzero_page(page, datal, block_size - datal);
- flush_dcache_page(page);
- }
btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
@@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
int ret;
const u64 len = olen_aligned;
u64 last_dest_end = destoff;
+ u64 prev_extent_end = off;
ret = -ENOMEM;
buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
@@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
key.offset = off;
while (1) {
- u64 next_key_min_offset = key.offset + 1;
struct btrfs_file_extent_item *extent;
u64 extent_gen;
int type;
@@ -431,14 +431,21 @@ process_slot:
* The first search might have left us at an extent item that
* ends before our target range's start, can happen if we have
* holes and NO_HOLES feature enabled.
+ *
+ * Subsequent searches may leave us on a file range we have
+ * processed before - this happens due to a race with ordered
+ * extent completion for a file range that is outside our source
+ * range, but that range was part of a file extent item that
+ * also covered a leading part of our source range.
*/
- if (key.offset + datal <= off) {
+ if (key.offset + datal <= prev_extent_end) {
path->slots[0]++;
goto process_slot;
} else if (key.offset >= off + len) {
break;
}
- next_key_min_offset = key.offset + datal;
+
+ prev_extent_end = key.offset + datal;
size = btrfs_item_size(leaf, slot);
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
size);
@@ -489,6 +496,7 @@ process_slot:
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
clone_info.is_new_extent = false;
+ clone_info.update_times = !no_time_update;
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
drop_start, new_key.offset + datal - 1,
&clone_info, &trans);
@@ -550,7 +558,7 @@ process_slot:
break;
btrfs_release_path(path);
- key.offset = next_key_min_offset;
+ key.offset = prev_extent_end;
if (fatal_signal_pending(current)) {
ret = -EINTR;
@@ -607,21 +615,30 @@ out:
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
struct inode *inode2, u64 loff2, u64 len)
{
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL);
+ unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL);
}
static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
struct inode *inode2, u64 loff2, u64 len)
{
+ u64 range1_end = loff1 + len - 1;
+ u64 range2_end = loff2 + len - 1;
+
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
+ swap(range1_end, range2_end);
} else if (inode1 == inode2 && loff2 < loff1) {
swap(loff1, loff2);
+ swap(range1_end, range2_end);
}
- lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+
+ lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL);
+ lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL);
+
+ btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
+ btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
}
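
Editor's note: btrfs_double_extent_lock() normalizes the two ranges into a fixed order (by inode address, then by offset within a single inode) before locking, so two tasks deduping the same pair in opposite argument order cannot deadlock. A reduced sketch of just the ordering rule, with demo_lock_range() standing in for lock_extent(); purely illustrative:

static void demo_double_lock(void *inode1, unsigned long long loff1,
			     void *inode2, unsigned long long loff2)
{
	/* Any consistent ordering works; this mirrors the code above. */
	if (inode1 < inode2 || (inode1 == inode2 && loff2 < loff1)) {
		void *tmp_inode = inode1;
		unsigned long long tmp_loff = loff1;

		inode1 = inode2;
		loff1 = loff2;
		inode2 = tmp_inode;
		loff2 = tmp_loff;
	}
	/* demo_lock_range(inode1, loff1); demo_lock_range(inode2, loff2); */
}
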
static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
@@ -641,7 +658,8 @@ static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
- const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+ struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
+ const u64 bs = fs_info->sb->s_blocksize;
int ret;
/*
@@ -652,6 +670,8 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+ btrfs_btree_balance_dirty(fs_info);
+
return ret;
}
@@ -761,6 +781,8 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
round_down(destoff, PAGE_SIZE),
round_up(destoff + len, PAGE_SIZE) - 1);
+ btrfs_btree_balance_dirty(fs_info);
+
return ret;
}
@@ -771,7 +793,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
- bool same_inode = inode_out == inode_in;
u64 wb_len;
int ret;
@@ -810,15 +831,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
wb_len = ALIGN(*len, bs);
/*
- * Since we don't lock ranges, wait for ongoing lockless dio writes (as
- * any in progress could create its ordered extents after we wait for
- * existing ordered extents below).
- */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- /*
* Workaround to make sure NOCOW buffered write reach disk as NOCOW.
*
* Btrfs' back references do not have a block level granularity, they
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index fdc2c4b411f0..666a37a0ee89 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -362,7 +362,7 @@ struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
- root = (struct btrfs_root *)node->data;
+ root = node->data;
}
spin_unlock(&rc->reloc_root_tree.lock);
return btrfs_grab_root(root);
@@ -1101,7 +1101,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
continue;
/*
- * if we are modifying block in fs tree, wait for readpage
+	 * if we are modifying a block in the fs tree, wait for read_folio
* to complete and drop the extent cache
*/
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
@@ -1124,10 +1124,10 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
if (!ret)
continue;
- btrfs_drop_extent_cache(BTRFS_I(inode),
- key.offset, end, 1);
+ btrfs_drop_extent_map_range(BTRFS_I(inode),
+ key.offset, end, true);
unlock_extent(&BTRFS_I(inode)->io_tree,
- key.offset, end);
+ key.offset, end, NULL);
}
}
@@ -1326,7 +1326,9 @@ again:
btrfs_release_path(path);
path->lowest_level = level;
+ set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
+ clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
path->lowest_level = 0;
if (ret) {
if (ret > 0)
@@ -1563,10 +1565,10 @@ static int invalidate_extent_cache(struct btrfs_root *root,
end = (u64)-1;
}
- /* the lock_extent waits for readpage to complete */
- lock_extent(&BTRFS_I(inode)->io_tree, start, end);
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ /* the lock_extent waits for read_folio to complete */
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
}
return 0;
}
@@ -2818,7 +2820,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
* Subpage can't handle page with DIRTY but without UPTODATE
* bit as it can lead to the following deadlock:
*
- * btrfs_readpage()
+ * btrfs_read_folio()
* | Page already *locked*
* |- btrfs_lock_and_flush_ordered_range()
* |- btrfs_start_ordered_extent()
@@ -2867,13 +2869,13 @@ static noinline_for_stack int prealloc_file_extent_cluster(
else
end = cluster->end - offset;
- lock_extent(&inode->io_tree, start, end);
+ lock_extent(&inode->io_tree, start, end, NULL);
num_bytes = end + 1 - start;
ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
cur_offset = end + 1;
- unlock_extent(&inode->io_tree, start, end);
+ unlock_extent(&inode->io_tree, start, end, NULL);
if (ret)
break;
}
@@ -2888,7 +2890,6 @@ static noinline_for_stack int prealloc_file_extent_cluster(
static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
u64 start, u64 end, u64 block_start)
{
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
int ret = 0;
@@ -2902,18 +2903,11 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
em->block_start = block_start;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
- lock_extent(&BTRFS_I(inode)->io_tree, start, end);
- while (1) {
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
- write_unlock(&em_tree->lock);
- if (ret != -EEXIST) {
- free_extent_map(em);
- break;
- }
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
- }
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+ free_extent_map(em);
+
return ret;
}
@@ -2967,11 +2961,12 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
goto release_page;
if (PageReadahead(page))
- page_cache_async_readahead(inode->i_mapping, ra, NULL, page,
- page_index, last_index + 1 - page_index);
+ page_cache_async_readahead(inode->i_mapping, ra, NULL,
+ page_folio(page), page_index,
+ last_index + 1 - page_index);
if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
+ btrfs_read_folio(NULL, page_folio(page));
lock_page(page);
if (!PageUptodate(page)) {
ret = -EIO;
@@ -2997,12 +2992,13 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
/* Reserve metadata for this range */
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- clamped_len, clamped_len);
+ clamped_len, clamped_len,
+ false);
if (ret)
goto release_page;
/* Mark the range delalloc and dirty for later writeback */
- lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL);
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
clamped_end, 0, NULL);
if (ret) {
@@ -3035,7 +3031,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
boundary_start, boundary_end,
EXTENT_BOUNDARY);
}
- unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+ unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL);
btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
cur += clamped_len;
@@ -3571,7 +3567,12 @@ int prepare_to_relocate(struct reloc_control *rc)
*/
return PTR_ERR(trans);
}
- return btrfs_commit_transaction(trans);
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ unset_reloc_control(rc);
+
+ return ret;
}
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
@@ -3845,8 +3846,7 @@ out:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
if (err) {
- if (inode)
- iput(inode);
+ iput(inode);
inode = ERR_PTR(err);
}
return inode;
@@ -3977,6 +3977,17 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
if (!bg)
return -ENOENT;
+ /*
+ * Relocation of a data block group creates ordered extents. Without
+	 * sb_start_write(), the filesystem could be frozen while unfinished
+	 * ordered extents remain. Such ordered extents can deadlock, e.g. when
+	 * syncfs() waits for their completion but they cannot finish, because
+	 * joining a transaction blocks while the freeze locks are held in
+	 * write mode.
+ */
+ if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
+ ASSERT(sb_write_started(fs_info->sb));
+
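The assertion expects callers to already hold freeze protection. A simplified caller-side sketch (an assumed call site, not the literal balance/device-remove paths):

/* Hold freeze protection across relocation of a data block group. */
sb_start_write(fs_info->sb);
ret = btrfs_relocate_block_group(fs_info, group_start);
sb_end_write(fs_info->sb);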
if (btrfs_pinned_by_swapfile(fs_info, bg)) {
btrfs_put_block_group(bg);
return -ETXTBSY;
@@ -4320,7 +4331,7 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
disk_bytenr = file_pos + inode->index_cnt;
csum_root = btrfs_csum_root(fs_info, disk_bytenr);
ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
- disk_bytenr + len - 1, &list, 0);
+ disk_bytenr + len - 1, &list, 0, false);
if (ret)
goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index ca7426ef61c8..e1f599d7a916 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -337,7 +337,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
struct extent_buffer *leaf;
struct btrfs_key key;
unsigned long ptr;
- int err = 0;
int ret;
path = btrfs_alloc_path();
@@ -349,9 +348,9 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- if (ret < 0)
+ if (ret < 0) {
goto out;
- if (ret == 0) {
+ } else if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
@@ -359,18 +358,18 @@ again:
if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
(btrfs_root_ref_name_len(leaf, ref) != name_len) ||
memcmp_extent_buffer(leaf, name, ptr, name_len)) {
- err = -ENOENT;
+ ret = -ENOENT;
goto out;
}
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
- } else
- err = -ENOENT;
+ } else {
+ ret = -ENOENT;
+ goto out;
+ }
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(path);
@@ -382,7 +381,7 @@ again:
out:
btrfs_free_path(path);
- return err;
+ return ret;
}
/*
@@ -509,7 +508,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
/* One for parent inode, two for dir entries */
qgroup_num_bytes = 3 * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_prealloc(root,
- qgroup_num_bytes, true);
+ qgroup_num_bytes, true,
+ false);
if (ret)
return ret;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 11089568b287..f260c53829e5 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -45,14 +45,16 @@ struct scrub_ctx;
* operations. The first one configures an upper limit for the number
* of (dynamically allocated) pages that are added to a bio.
*/
-#define SCRUB_PAGES_PER_BIO 32 /* 128KiB per bio for x86 */
-#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for x86 */
+#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */
+#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */
/*
* The following value times PAGE_SIZE needs to be large enough to match the
* largest node/leaf/sector size that shall be supported.
*/
-#define SCRUB_MAX_PAGES_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
+#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
+
+#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))
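With BTRFS_MAX_METADATA_BLOCKSIZE at 64KiB, the new limits work out as:

/*
 * SCRUB_MAX_SECTORS_PER_BLOCK   = 64K / 4K               = 16
 * SCRUB_MAX_PAGES (4K pages)    = DIV_ROUND_UP(64K, 4K)  = 16
 * SCRUB_MAX_PAGES (64K pages)   = DIV_ROUND_UP(64K, 64K) = 1
 */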
struct scrub_recover {
refcount_t refs;
@@ -60,18 +62,14 @@ struct scrub_recover {
u64 map_length;
};
-struct scrub_page {
+struct scrub_sector {
struct scrub_block *sblock;
- struct page *page;
- struct btrfs_device *dev;
struct list_head list;
u64 flags; /* extent flags */
u64 generation;
- u64 logical;
- u64 physical;
- u64 physical_for_dev_replace;
+	/* Offset in bytes from the start of @sblock. */
+ u32 offset;
atomic_t refs;
- u8 mirror_num;
unsigned int have_csum:1;
unsigned int io_error:1;
u8 csum[BTRFS_CSUM_SIZE];
@@ -87,16 +85,30 @@ struct scrub_bio {
blk_status_t status;
u64 logical;
u64 physical;
- struct scrub_page *pagev[SCRUB_PAGES_PER_BIO];
- int page_count;
+ struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
+ int sector_count;
int next_free;
- struct btrfs_work work;
+ struct work_struct work;
};
struct scrub_block {
- struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
- int page_count;
- atomic_t outstanding_pages;
+ /*
+ * Each page will have its page::private used to record the logical
+ * bytenr.
+ */
+ struct page *pages[SCRUB_MAX_PAGES];
+ struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
+ struct btrfs_device *dev;
+ /* Logical bytenr of the sblock */
+ u64 logical;
+ u64 physical;
+ u64 physical_for_dev_replace;
+ /* Length of sblock in bytes */
+ u32 len;
+ int sector_count;
+ int mirror_num;
+
+ atomic_t outstanding_sectors;
refcount_t refs; /* free mem on transition to zero */
struct scrub_ctx *sctx;
struct scrub_parity *sparity;
@@ -110,7 +122,7 @@ struct scrub_block {
/* It is for the data with checksum */
unsigned int data_corrected:1;
};
- struct btrfs_work work;
+ struct work_struct work;
};
/* Used for the chunks with parity stripe such as RAID5/6 */
@@ -129,21 +141,19 @@ struct scrub_parity {
refcount_t refs;
- struct list_head spages;
+ struct list_head sectors_list;
/* Work of parity check and repair */
- struct btrfs_work work;
+ struct work_struct work;
/* Mark the parity blocks which have data */
- unsigned long *dbitmap;
+ unsigned long dbitmap;
/*
* Mark the parity blocks which have data, but errors happen when
* read data or check data
*/
- unsigned long *ebitmap;
-
- unsigned long bitmap[];
+ unsigned long ebitmap;
};
struct scrub_ctx {
@@ -158,7 +168,7 @@ struct scrub_ctx {
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
- int pages_per_bio;
+ int sectors_per_bio;
/* State of IO submission throttling affecting the associated device */
ktime_t throttle_deadline;
@@ -204,51 +214,217 @@ struct full_stripe_lock {
struct mutex mutex;
};
+#ifndef CONFIG_64BIT
+/* This structure is for architectures whose (void *) is smaller than u64 */
+struct scrub_page_private {
+ u64 logical;
+};
+#endif
+
+static int attach_scrub_page_private(struct page *page, u64 logical)
+{
+#ifdef CONFIG_64BIT
+ attach_page_private(page, (void *)logical);
+ return 0;
+#else
+ struct scrub_page_private *spp;
+
+ spp = kmalloc(sizeof(*spp), GFP_KERNEL);
+ if (!spp)
+ return -ENOMEM;
+ spp->logical = logical;
+ attach_page_private(page, (void *)spp);
+ return 0;
+#endif
+}
+
+static void detach_scrub_page_private(struct page *page)
+{
+#ifdef CONFIG_64BIT
+ detach_page_private(page);
+ return;
+#else
+ struct scrub_page_private *spp;
+
+ spp = detach_page_private(page);
+ kfree(spp);
+ return;
+#endif
+}
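For symmetry, a hypothetical reader of the stored value (not part of this patch) would look like:

static u64 scrub_page_private_logical(struct page *page)
{
#ifdef CONFIG_64BIT
	/* The logical bytenr is stored directly in page::private. */
	return (u64)page_private(page);
#else
	/* On 32-bit, page::private points to the heap-allocated wrapper. */
	struct scrub_page_private *spp = (void *)page_private(page);

	return spp->logical;
#endif
}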
+
+static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
+ struct btrfs_device *dev,
+ u64 logical, u64 physical,
+ u64 physical_for_dev_replace,
+ int mirror_num)
+{
+ struct scrub_block *sblock;
+
+ sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+ if (!sblock)
+ return NULL;
+ refcount_set(&sblock->refs, 1);
+ sblock->sctx = sctx;
+ sblock->logical = logical;
+ sblock->physical = physical;
+ sblock->physical_for_dev_replace = physical_for_dev_replace;
+ sblock->dev = dev;
+ sblock->mirror_num = mirror_num;
+ sblock->no_io_error_seen = 1;
+ /*
+	 * Scrub_block::pages will be allocated by alloc_scrub_sector() the
+	 * first time a sector landing in that page is added.
+ */
+ return sblock;
+}
+
+/*
+ * Allocate a new scrub sector and attach it to @sblock.
+ *
+ * Will also allocate new pages for @sblock if needed.
+ */
+static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
+ u64 logical, gfp_t gfp)
+{
+ const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
+ struct scrub_sector *ssector;
+
+ /* We must never have scrub_block exceed U32_MAX in size. */
+ ASSERT(logical - sblock->logical < U32_MAX);
+
+ ssector = kzalloc(sizeof(*ssector), gfp);
+ if (!ssector)
+ return NULL;
+
+ /* Allocate a new page if the slot is not allocated */
+ if (!sblock->pages[page_index]) {
+ int ret;
+
+ sblock->pages[page_index] = alloc_page(gfp);
+ if (!sblock->pages[page_index]) {
+ kfree(ssector);
+ return NULL;
+ }
+ ret = attach_scrub_page_private(sblock->pages[page_index],
+ sblock->logical + (page_index << PAGE_SHIFT));
+ if (ret < 0) {
+ kfree(ssector);
+ __free_page(sblock->pages[page_index]);
+ sblock->pages[page_index] = NULL;
+ return NULL;
+ }
+ }
+
+ atomic_set(&ssector->refs, 1);
+ ssector->sblock = sblock;
+	/* The slot for the sector to be added must not be occupied yet */
+ ASSERT(sblock->sectors[sblock->sector_count] == NULL);
+ ssector->offset = logical - sblock->logical;
+
+ /* The sector count must be smaller than the limit */
+ ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK);
+
+ sblock->sectors[sblock->sector_count] = ssector;
+ sblock->sector_count++;
+ sblock->len += sblock->sctx->fs_info->sectorsize;
+
+ return ssector;
+}
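A usage sketch for the helper (hypothetical caller; the scrub paths below follow the same shape):

sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
if (!sector)
	return -ENOMEM;
/* The page memory is owned by sblock; the sector only records an offset. */
sector->have_csum = have_csum;
if (have_csum)
	memcpy(sector->csum, csum, fs_info->csum_size);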
+
+static struct page *scrub_sector_get_page(struct scrub_sector *ssector)
+{
+ struct scrub_block *sblock = ssector->sblock;
+ pgoff_t index;
+ /*
+	 * When calling this function, ssector must be already attached to the
+ * parent sblock.
+ */
+ ASSERT(sblock);
+
+ /* The range should be inside the sblock range */
+ ASSERT(ssector->offset < sblock->len);
+
+ index = ssector->offset >> PAGE_SHIFT;
+ ASSERT(index < SCRUB_MAX_PAGES);
+ ASSERT(sblock->pages[index]);
+ ASSERT(PagePrivate(sblock->pages[index]));
+ return sblock->pages[index];
+}
+
+static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector)
+{
+ struct scrub_block *sblock = ssector->sblock;
+
+ /*
+ * When calling this function, ssector must be already attached to the
+ * parent sblock.
+ */
+ ASSERT(sblock);
+
+ /* The range should be inside the sblock range */
+ ASSERT(ssector->offset < sblock->len);
+
+ return offset_in_page(ssector->offset);
+}
+
+static char *scrub_sector_get_kaddr(struct scrub_sector *ssector)
+{
+ return page_address(scrub_sector_get_page(ssector)) +
+ scrub_sector_get_page_offset(ssector);
+}
+
+static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector,
+ unsigned int len)
+{
+ return bio_add_page(bio, scrub_sector_get_page(ssector), len,
+ scrub_sector_get_page_offset(ssector));
+}
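The accessors compose as in this sketch (helper names from the patch; shash, bio, csum and the other locals are assumed context):

/* Checksum a sector in place... */
char *kaddr = scrub_sector_get_kaddr(sector);

crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);

/* ...or queue it for I/O without touching the backing page directly. */
bio_add_scrub_sector(bio, sector, fs_info->sectorsize);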
+
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
- struct scrub_block *sblocks_for_recheck);
+ struct scrub_block *sblocks_for_recheck[]);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good);
-static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
- int page_num, int force_write);
+ int sector_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
-static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
- int page_num);
+static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
+ int sector_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
-static void scrub_page_get(struct scrub_page *spage);
-static void scrub_page_put(struct scrub_page *spage);
+static void scrub_sector_get(struct scrub_sector *sector);
+static void scrub_sector_put(struct scrub_sector *sector);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
-static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
- u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u8 *csum,
- u64 physical_for_dev_replace);
+static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
+ u64 physical, struct btrfs_device *dev, u64 flags,
+ u64 gen, int mirror_num, u8 *csum,
+ u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
-static void scrub_bio_end_io_worker(struct btrfs_work *work);
+static void scrub_bio_end_io_worker(struct work_struct *work);
static void scrub_block_complete(struct scrub_block *sblock);
-static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u32 extent_len,
- u64 *extent_physical,
- struct btrfs_device **extent_dev,
- int *extent_mirror_num);
-static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage);
+static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u32 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num);
+static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
-static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
+static void scrub_wr_bio_end_io_worker(struct work_struct *work);
static void scrub_put_ctx(struct scrub_ctx *sctx);
-static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
+static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
{
- return spage->recover &&
- (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ return sector->recover &&
+ (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -535,10 +711,8 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
if (sctx->curr != -1) {
struct scrub_bio *sbio = sctx->bios[sctx->curr];
- for (i = 0; i < sbio->page_count; i++) {
- WARN_ON(!sbio->pagev[i]->page);
- scrub_block_put(sbio->pagev[i]->sblock);
- }
+ for (i = 0; i < sbio->sector_count; i++)
+ scrub_block_put(sbio->sectors[i]->sblock);
bio_put(sbio->bio);
}
@@ -572,7 +746,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
goto nomem;
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
- sctx->pages_per_bio = SCRUB_PAGES_PER_BIO;
+ sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
sctx->curr = -1;
sctx->fs_info = fs_info;
INIT_LIST_HEAD(&sctx->csum_list);
@@ -586,9 +760,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
sbio->index = i;
sbio->sctx = sctx;
- sbio->page_count = 0;
- btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
- NULL);
+ sbio->sector_count = 0;
+ INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
if (i != SCRUB_BIOS_PER_SCTX - 1)
sctx->bios[i]->next_free = i + 1;
@@ -728,16 +901,23 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
u8 ref_level = 0;
int ret;
- WARN_ON(sblock->page_count < 1);
- dev = sblock->pagev[0]->dev;
+ WARN_ON(sblock->sector_count < 1);
+ dev = sblock->dev;
fs_info = sblock->sctx->fs_info;
+ /* Super block error, no need to search extent tree. */
+ if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
+ errstr, rcu_str_deref(dev->name),
+ sblock->physical);
+ return;
+ }
path = btrfs_alloc_path();
if (!path)
return;
- swarn.physical = sblock->pagev[0]->physical;
- swarn.logical = sblock->pagev[0]->logical;
+ swarn.physical = sblock->physical;
+ swarn.logical = sblock->logical;
swarn.errstr = errstr;
swarn.dev = NULL;
@@ -798,8 +978,8 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
/*
* scrub_handle_errored_block gets called when either verification of the
- * pages failed or the bio failed to read, e.g. with EIO. In the latter
- * case, this function handles all pages in the bio, even though only one
+ * sectors failed or the bio failed to read, e.g. with EIO. In the latter
+ * case, this function handles all sectors in the bio, even though only one
* may be bad.
* The goal of this function is to repair the errored block by using the
* contents of one of the mirrors.
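In outline, the repair strategy the comment describes is (simplified pseudo-flow, not the literal control flow of the function):

/*
 * for each mirror:
 *     read every sector of the block and verify csums/headers
 * if the failed mirror now reads clean:
 *     count it as a transient error and stop
 * else if some mirror is fully good:
 *     rewrite the whole bad block from it
 * else:
 *     repair sector by sector from any mirror whose copy of that
 *     sector read without an I/O error
 */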
@@ -807,43 +987,45 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
struct scrub_ctx *sctx = sblock_to_check->sctx;
- struct btrfs_device *dev;
+ struct btrfs_device *dev = sblock_to_check->dev;
struct btrfs_fs_info *fs_info;
u64 logical;
unsigned int failed_mirror_index;
unsigned int is_metadata;
unsigned int have_csum;
- struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
+ /* One scrub_block for each mirror */
+ struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 };
struct scrub_block *sblock_bad;
int ret;
int mirror_index;
- int page_num;
+ int sector_num;
int success;
bool full_stripe_locked;
unsigned int nofs_flag;
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- BUG_ON(sblock_to_check->page_count < 1);
+ BUG_ON(sblock_to_check->sector_count < 1);
fs_info = sctx->fs_info;
- if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+ if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
/*
- * if we find an error in a super block, we just report it.
+ * If we find an error in a super block, we just report it.
* They will get written with the next transaction commit
* anyway
*/
+ scrub_print_warning("super block error", sblock_to_check);
spin_lock(&sctx->stat_lock);
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
return 0;
}
- logical = sblock_to_check->pagev[0]->logical;
- BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
- failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
- is_metadata = !(sblock_to_check->pagev[0]->flags &
+ logical = sblock_to_check->logical;
+ ASSERT(sblock_to_check->mirror_num);
+ failed_mirror_index = sblock_to_check->mirror_num - 1;
+ is_metadata = !(sblock_to_check->sectors[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
- have_csum = sblock_to_check->pagev[0]->have_csum;
- dev = sblock_to_check->pagev[0]->dev;
+ have_csum = sblock_to_check->sectors[0]->have_csum;
if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
return 0;
@@ -854,7 +1036,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* might be waiting the scrub task to pause (which needs to wait for all
* the worker tasks to complete before pausing).
* We do allocations in the workers through insert_full_stripe_lock()
- * and scrub_add_page_to_wr_bio(), which happens down the call chain of
+ * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
* this function.
*/
nofs_flag = memalloc_nofs_save();
@@ -905,20 +1087,31 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* repaired area is verified in order to correctly maintain
* the statistics.
*/
-
- sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
- sizeof(*sblocks_for_recheck), GFP_KERNEL);
- if (!sblocks_for_recheck) {
- spin_lock(&sctx->stat_lock);
- sctx->stat.malloc_errors++;
- sctx->stat.read_errors++;
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
- goto out;
+ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
+ /*
+ * Note: the two members refs and outstanding_sectors are not
+ * used in the blocks that are used for the recheck procedure.
+ *
+		 * But alloc_scrub_block() will initialize sblock::refs anyway,
+ * so we can use scrub_block_put() to clean them up.
+ *
+		 * And here we don't set up the physical/dev for the sblock yet;
+		 * they will be correctly initialized in scrub_setup_recheck_block().
+ */
+ sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL,
+ logical, 0, 0, mirror_index);
+ if (!sblocks_for_recheck[mirror_index]) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ sctx->stat.read_errors++;
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ goto out;
+ }
}
- /* setup the context, map the logical blocks and alloc the pages */
+ /* Setup the context, map the logical blocks and alloc the sectors */
ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
if (ret) {
spin_lock(&sctx->stat_lock);
@@ -929,7 +1122,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
goto out;
}
BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
- sblock_bad = sblocks_for_recheck + failed_mirror_index;
+ sblock_bad = sblocks_for_recheck[failed_mirror_index];
/* build and submit the bios for the failed mirror, check checksums */
scrub_recheck_block(fs_info, sblock_bad, 1);
@@ -937,7 +1130,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
/*
- * the error disappeared after reading page by page, or
+ * The error disappeared after reading sector by sector, or
* the area was part of a huge bio and other parts of the
* bio caused I/O errors, or the block layer merged several
* read requests into one and the error is caused by a
@@ -998,10 +1191,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* that is known to contain an error is rewritten. Afterwards
* the block is known to be corrected.
* If a mirror is found which is completely correct, and no
- * checksum is present, only those pages are rewritten that had
+ * checksum is present, only those sectors are rewritten that had
* an I/O error in the block to be repaired, since it cannot be
- * determined, which copy of the other pages is better (and it
- * could happen otherwise that a correct page would be
+ * determined, which copy of the other sectors is better (and it
+ * could happen otherwise that a correct sector would be
* overwritten by a bad one).
*/
for (mirror_index = 0; ;mirror_index++) {
@@ -1011,25 +1204,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
continue;
/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
- if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
+ if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
if (mirror_index >= BTRFS_MAX_MIRRORS)
break;
- if (!sblocks_for_recheck[mirror_index].page_count)
+ if (!sblocks_for_recheck[mirror_index]->sector_count)
break;
- sblock_other = sblocks_for_recheck + mirror_index;
+ sblock_other = sblocks_for_recheck[mirror_index];
} else {
- struct scrub_recover *r = sblock_bad->pagev[0]->recover;
+ struct scrub_recover *r = sblock_bad->sectors[0]->recover;
int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
if (mirror_index >= max_allowed)
break;
- if (!sblocks_for_recheck[1].page_count)
+ if (!sblocks_for_recheck[1]->sector_count)
break;
ASSERT(failed_mirror_index == 0);
- sblock_other = sblocks_for_recheck + 1;
- sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
+ sblock_other = sblocks_for_recheck[1];
+ sblock_other->mirror_num = 1 + mirror_index;
}
/* build and submit the bios, check checksums */
@@ -1078,16 +1271,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* area are unreadable.
*/
success = 1;
- for (page_num = 0; page_num < sblock_bad->page_count;
- page_num++) {
- struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
+ for (sector_num = 0; sector_num < sblock_bad->sector_count;
+ sector_num++) {
+ struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
struct scrub_block *sblock_other = NULL;
- /* skip no-io-error page in scrub */
- if (!spage_bad->io_error && !sctx->is_dev_replace)
+ /* Skip no-io-error sectors in scrub */
+ if (!sector_bad->io_error && !sctx->is_dev_replace)
continue;
- if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
+ if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
/*
* In case of dev replace, if raid56 rebuild process
* didn't work out correct data, then copy the content
@@ -1096,16 +1289,15 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* sblock_for_recheck array to target device.
*/
sblock_other = NULL;
- } else if (spage_bad->io_error) {
- /* try to find no-io-error page in mirrors */
+ } else if (sector_bad->io_error) {
+ /* Try to find no-io-error sector in mirrors */
for (mirror_index = 0;
mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].page_count > 0;
+ sblocks_for_recheck[mirror_index]->sector_count > 0;
mirror_index++) {
- if (!sblocks_for_recheck[mirror_index].
- pagev[page_num]->io_error) {
- sblock_other = sblocks_for_recheck +
- mirror_index;
+ if (!sblocks_for_recheck[mirror_index]->
+ sectors[sector_num]->io_error) {
+ sblock_other = sblocks_for_recheck[mirror_index];
break;
}
}
@@ -1115,27 +1307,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (sctx->is_dev_replace) {
/*
- * did not find a mirror to fetch the page
- * from. scrub_write_page_to_dev_replace()
- * handles this case (page->io_error), by
- * filling the block with zeros before
- * submitting the write request
+ * Did not find a mirror to fetch the sector from.
+ * scrub_write_sector_to_dev_replace() handles this
+ * case (sector->io_error), by filling the block with
+ * zeros before submitting the write request
*/
if (!sblock_other)
sblock_other = sblock_bad;
- if (scrub_write_page_to_dev_replace(sblock_other,
- page_num) != 0) {
+ if (scrub_write_sector_to_dev_replace(sblock_other,
+ sector_num) != 0) {
atomic64_inc(
&fs_info->dev_replace.num_write_errors);
success = 0;
}
} else if (sblock_other) {
- ret = scrub_repair_page_from_good_copy(sblock_bad,
- sblock_other,
- page_num, 0);
+ ret = scrub_repair_sector_from_good_copy(sblock_bad,
+ sblock_other,
+ sector_num, 0);
if (0 == ret)
- spage_bad->io_error = 0;
+ sector_bad->io_error = 0;
else
success = 0;
}
@@ -1180,27 +1371,28 @@ did_not_correct_error:
}
out:
- if (sblocks_for_recheck) {
- for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
- mirror_index++) {
- struct scrub_block *sblock = sblocks_for_recheck +
- mirror_index;
- struct scrub_recover *recover;
- int page_index;
-
- for (page_index = 0; page_index < sblock->page_count;
- page_index++) {
- sblock->pagev[page_index]->sblock = NULL;
- recover = sblock->pagev[page_index]->recover;
- if (recover) {
- scrub_put_recover(fs_info, recover);
- sblock->pagev[page_index]->recover =
- NULL;
- }
- scrub_page_put(sblock->pagev[page_index]);
+ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
+ struct scrub_block *sblock = sblocks_for_recheck[mirror_index];
+ struct scrub_recover *recover;
+ int sector_index;
+
+ /* Not allocated, continue checking the next mirror */
+ if (!sblock)
+ continue;
+
+ for (sector_index = 0; sector_index < sblock->sector_count;
+ sector_index++) {
+ /*
+			 * Here we just clean up the recover; each sector will be
+			 * properly cleaned up by the later scrub_block_put()
+ */
+ recover = sblock->sectors[sector_index]->recover;
+ if (recover) {
+ scrub_put_recover(fs_info, recover);
+ sblock->sectors[sector_index]->recover = NULL;
}
}
- kfree(sblocks_for_recheck);
+ scrub_block_put(sblock);
}
ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
@@ -1222,7 +1414,6 @@ static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
u64 *raid_map,
- u64 mapped_length,
int nstripes, int mirror,
int *stripe_index,
u64 *stripe_offset)
@@ -1237,7 +1428,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
continue;
if (logical >= raid_map[i] &&
- logical < raid_map[i] + mapped_length)
+ logical < raid_map[i] + BTRFS_STRIPE_LEN)
break;
}
@@ -1251,32 +1442,26 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
}
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
- struct scrub_block *sblocks_for_recheck)
+ struct scrub_block *sblocks_for_recheck[])
{
struct scrub_ctx *sctx = original_sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 length = original_sblock->page_count * fs_info->sectorsize;
- u64 logical = original_sblock->pagev[0]->logical;
- u64 generation = original_sblock->pagev[0]->generation;
- u64 flags = original_sblock->pagev[0]->flags;
- u64 have_csum = original_sblock->pagev[0]->have_csum;
+ u64 logical = original_sblock->logical;
+ u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
+ u64 generation = original_sblock->sectors[0]->generation;
+ u64 flags = original_sblock->sectors[0]->flags;
+ u64 have_csum = original_sblock->sectors[0]->have_csum;
struct scrub_recover *recover;
struct btrfs_io_context *bioc;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
int stripe_index;
- int page_index = 0;
+ int sector_index = 0;
int mirror_index;
int nmirrors;
int ret;
- /*
- * note: the two members refs and outstanding_pages
- * are not used (and not set) in the blocks that are used for
- * the recheck procedure
- */
-
while (length > 0) {
sublen = min_t(u64, length, fs_info->sectorsize);
mapped_length = sublen;
@@ -1306,70 +1491,63 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
recover->bioc = bioc;
recover->map_length = mapped_length;
- ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK);
+ ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
- struct scrub_page *spage;
+ struct scrub_sector *sector;
- sblock = sblocks_for_recheck + mirror_index;
+ sblock = sblocks_for_recheck[mirror_index];
sblock->sctx = sctx;
- spage = kzalloc(sizeof(*spage), GFP_NOFS);
- if (!spage) {
-leave_nomem:
+ sector = alloc_scrub_sector(sblock, logical, GFP_NOFS);
+ if (!sector) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_put_recover(fs_info, recover);
return -ENOMEM;
}
- scrub_page_get(spage);
- sblock->pagev[page_index] = spage;
- spage->sblock = sblock;
- spage->flags = flags;
- spage->generation = generation;
- spage->logical = logical;
- spage->have_csum = have_csum;
+ sector->flags = flags;
+ sector->generation = generation;
+ sector->have_csum = have_csum;
if (have_csum)
- memcpy(spage->csum,
- original_sblock->pagev[0]->csum,
+ memcpy(sector->csum,
+ original_sblock->sectors[0]->csum,
sctx->fs_info->csum_size);
scrub_stripe_index_and_offset(logical,
bioc->map_type,
bioc->raid_map,
- mapped_length,
bioc->num_stripes -
bioc->num_tgtdevs,
mirror_index,
&stripe_index,
&stripe_offset);
- spage->physical = bioc->stripes[stripe_index].physical +
- stripe_offset;
- spage->dev = bioc->stripes[stripe_index].dev;
-
- BUG_ON(page_index >= original_sblock->page_count);
- spage->physical_for_dev_replace =
- original_sblock->pagev[page_index]->
- physical_for_dev_replace;
- /* for missing devices, dev->bdev is NULL */
- spage->mirror_num = mirror_index + 1;
- sblock->page_count++;
- spage->page = alloc_page(GFP_NOFS);
- if (!spage->page)
- goto leave_nomem;
+ /*
+			 * We're at the first sector; also populate the @sblock
+			 * physical and dev fields.
+ */
+ if (sector_index == 0) {
+ sblock->physical =
+ bioc->stripes[stripe_index].physical +
+ stripe_offset;
+ sblock->dev = bioc->stripes[stripe_index].dev;
+ sblock->physical_for_dev_replace =
+ original_sblock->physical_for_dev_replace;
+ }
+ BUG_ON(sector_index >= original_sblock->sector_count);
scrub_get_recover(recover);
- spage->recover = recover;
+ sector->recover = recover;
}
scrub_put_recover(fs_info, recover);
length -= sublen;
logical += sublen;
- page_index++;
+ sector_index++;
}
return 0;
@@ -1382,22 +1560,15 @@ static void scrub_bio_wait_endio(struct bio *bio)
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct bio *bio,
- struct scrub_page *spage)
+ struct scrub_sector *sector)
{
DECLARE_COMPLETION_ONSTACK(done);
- int ret;
- int mirror_num;
- bio->bi_iter.bi_sector = spage->logical >> 9;
+ bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >>
+ SECTOR_SHIFT;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
-
- mirror_num = spage->sblock->pagev[0]->mirror_num;
- ret = raid56_parity_recover(bio, spage->recover->bioc,
- spage->recover->map_length,
- mirror_num, 0);
- if (ret)
- return ret;
+ raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num);
wait_for_completion_io(&done);
return blk_status_to_errno(bio->bi_status);
@@ -1406,26 +1577,24 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock)
{
- struct scrub_page *first_page = sblock->pagev[0];
+ struct scrub_sector *first_sector = sblock->sectors[0];
struct bio *bio;
- int page_num;
+ int i;
- /* All pages in sblock belong to the same stripe on the same device. */
- ASSERT(first_page->dev);
- if (!first_page->dev->bdev)
+ /* All sectors in sblock belong to the same stripe on the same device. */
+ ASSERT(sblock->dev);
+ if (!sblock->dev->bdev)
goto out;
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
- bio_set_dev(bio, first_page->dev->bdev);
+ bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
- for (page_num = 0; page_num < sblock->page_count; page_num++) {
- struct scrub_page *spage = sblock->pagev[page_num];
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
- WARN_ON(!spage->page);
- bio_add_page(bio, spage->page, PAGE_SIZE, 0);
+ bio_add_scrub_sector(bio, sector, fs_info->sectorsize);
}
- if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
+ if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
bio_put(bio);
goto out;
}
@@ -1436,65 +1605,63 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
return;
out:
- for (page_num = 0; page_num < sblock->page_count; page_num++)
- sblock->pagev[page_num]->io_error = 1;
+ for (i = 0; i < sblock->sector_count; i++)
+ sblock->sectors[i]->io_error = 1;
sblock->no_io_error_seen = 0;
}
/*
- * this function will check the on disk data for checksum errors, header
- * errors and read I/O errors. If any I/O errors happen, the exact pages
- * which are errored are marked as being bad. The goal is to enable scrub
- * to take those pages that are not errored from all the mirrors so that
- * the pages that are errored in the just handled mirror can be repaired.
+ * This function will check the on disk data for checksum errors, header errors
+ * and read I/O errors. If any I/O errors happen, the exact sectors which are
+ * errored are marked as being bad. The goal is to enable scrub to take those
+ * sectors that are not errored from all the mirrors so that the sectors that
+ * are errored in the just handled mirror can be repaired.
*/
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int retry_failed_mirror)
{
- int page_num;
+ int i;
sblock->no_io_error_seen = 1;
/* short cut for raid56 */
- if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
+ if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
return scrub_recheck_block_on_raid56(fs_info, sblock);
- for (page_num = 0; page_num < sblock->page_count; page_num++) {
- struct bio *bio;
- struct scrub_page *spage = sblock->pagev[page_num];
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
+ struct bio bio;
+ struct bio_vec bvec;
- if (spage->dev->bdev == NULL) {
- spage->io_error = 1;
+ if (sblock->dev->bdev == NULL) {
+ sector->io_error = 1;
sblock->no_io_error_seen = 0;
continue;
}
- WARN_ON(!spage->page);
- bio = btrfs_bio_alloc(1);
- bio_set_dev(bio, spage->dev->bdev);
+ bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ);
+ bio_add_scrub_sector(&bio, sector, fs_info->sectorsize);
+ bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >>
+ SECTOR_SHIFT;
- bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
- bio->bi_iter.bi_sector = spage->physical >> 9;
- bio->bi_opf = REQ_OP_READ;
-
- if (btrfsic_submit_bio_wait(bio)) {
- spage->io_error = 1;
+ btrfsic_check_bio(&bio);
+ if (submit_bio_wait(&bio)) {
+ sector->io_error = 1;
sblock->no_io_error_seen = 0;
}
- bio_put(bio);
+ bio_uninit(&bio);
}
if (sblock->no_io_error_seen)
scrub_recheck_block_checksum(sblock);
}
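The recheck path now uses the on-stack bio pattern for short synchronous reads. A minimal standalone sketch (bdev, page, disk_bytenr and sectorsize are assumed context):

struct bio bio;
struct bio_vec bvec;
int ret;

/* Single-vector synchronous read with no heap-allocated bio. */
bio_init(&bio, bdev, &bvec, 1, REQ_OP_READ);
bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bio_add_page(&bio, page, sectorsize, 0);
ret = submit_bio_wait(&bio);
bio_uninit(&bio);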
-static inline int scrub_check_fsid(u8 fsid[],
- struct scrub_page *spage)
+static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
{
- struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
+ struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices;
int ret;
ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -1507,7 +1674,7 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock)
sblock->checksum_error = 0;
sblock->generation_error = 0;
- if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
+ if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
scrub_checksum_data(sblock);
else
scrub_checksum_tree_block(sblock);
@@ -1516,15 +1683,14 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock)
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good)
{
- int page_num;
+ int i;
int ret = 0;
- for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+ for (i = 0; i < sblock_bad->sector_count; i++) {
int ret_sub;
- ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
- sblock_good,
- page_num, 1);
+ ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
+ sblock_good, i, 1);
if (ret_sub)
ret = ret_sub;
}
@@ -1532,47 +1698,42 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
return ret;
}
-static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good,
- int page_num, int force_write)
+static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
+ struct scrub_block *sblock_good,
+ int sector_num, int force_write)
{
- struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
- struct scrub_page *spage_good = sblock_good->pagev[page_num];
+ struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
+ struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
const u32 sectorsize = fs_info->sectorsize;
- BUG_ON(spage_bad->page == NULL);
- BUG_ON(spage_good->page == NULL);
if (force_write || sblock_bad->header_error ||
- sblock_bad->checksum_error || spage_bad->io_error) {
- struct bio *bio;
+ sblock_bad->checksum_error || sector_bad->io_error) {
+ struct bio bio;
+ struct bio_vec bvec;
int ret;
- if (!spage_bad->dev->bdev) {
+ if (!sblock_bad->dev->bdev) {
btrfs_warn_rl(fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
return -EIO;
}
- bio = btrfs_bio_alloc(1);
- bio_set_dev(bio, spage_bad->dev->bdev);
- bio->bi_iter.bi_sector = spage_bad->physical >> 9;
- bio->bi_opf = REQ_OP_WRITE;
+ bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
+ bio.bi_iter.bi_sector = (sblock_bad->physical +
+ sector_bad->offset) >> SECTOR_SHIFT;
+ ret = bio_add_scrub_sector(&bio, sector_good, sectorsize);
- ret = bio_add_page(bio, spage_good->page, sectorsize, 0);
- if (ret != sectorsize) {
- bio_put(bio);
- return -EIO;
- }
+ btrfsic_check_bio(&bio);
+ ret = submit_bio_wait(&bio);
+ bio_uninit(&bio);
- if (btrfsic_submit_bio_wait(bio)) {
- btrfs_dev_stat_inc_and_print(spage_bad->dev,
+ if (ret) {
+ btrfs_dev_stat_inc_and_print(sblock_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
atomic64_inc(&fs_info->dev_replace.num_write_errors);
- bio_put(bio);
return -EIO;
}
- bio_put(bio);
}
return 0;
@@ -1581,7 +1742,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
{
struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
- int page_num;
+ int i;
/*
* This block is used for the check of the parity on the source device,
@@ -1590,25 +1751,24 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
if (sblock->sparity)
return;
- for (page_num = 0; page_num < sblock->page_count; page_num++) {
+ for (i = 0; i < sblock->sector_count; i++) {
int ret;
- ret = scrub_write_page_to_dev_replace(sblock, page_num);
+ ret = scrub_write_sector_to_dev_replace(sblock, i);
if (ret)
atomic64_inc(&fs_info->dev_replace.num_write_errors);
}
}
-static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
- int page_num)
+static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
{
- struct scrub_page *spage = sblock->pagev[page_num];
+ const u32 sectorsize = sblock->sctx->fs_info->sectorsize;
+ struct scrub_sector *sector = sblock->sectors[sector_num];
- BUG_ON(spage->page == NULL);
- if (spage->io_error)
- clear_page(page_address(spage->page));
+ if (sector->io_error)
+ memset(scrub_sector_get_kaddr(sector), 0, sectorsize);
- return scrub_add_page_to_wr_bio(sblock->sctx, spage);
+ return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
}
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
@@ -1633,9 +1793,15 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
return ret;
}
-static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage)
+static void scrub_block_get(struct scrub_block *sblock)
{
+ refcount_inc(&sblock->refs);
+}
+
+static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector)
+{
+ struct scrub_block *sblock = sector->sblock;
struct scrub_bio *sbio;
int ret;
const u32 sectorsize = sctx->fs_info->sectorsize;
@@ -1650,45 +1816,39 @@ again:
return -ENOMEM;
}
sctx->wr_curr_bio->sctx = sctx;
- sctx->wr_curr_bio->page_count = 0;
+ sctx->wr_curr_bio->sector_count = 0;
}
sbio = sctx->wr_curr_bio;
- if (sbio->page_count == 0) {
- struct bio *bio;
-
- ret = fill_writer_pointer_gap(sctx,
- spage->physical_for_dev_replace);
+ if (sbio->sector_count == 0) {
+ ret = fill_writer_pointer_gap(sctx, sector->offset +
+ sblock->physical_for_dev_replace);
if (ret) {
mutex_unlock(&sctx->wr_lock);
return ret;
}
- sbio->physical = spage->physical_for_dev_replace;
- sbio->logical = spage->logical;
+ sbio->physical = sblock->physical_for_dev_replace + sector->offset;
+ sbio->logical = sblock->logical + sector->offset;
sbio->dev = sctx->wr_tgtdev;
- bio = sbio->bio;
- if (!bio) {
- bio = btrfs_bio_alloc(sctx->pages_per_bio);
- sbio->bio = bio;
+ if (!sbio->bio) {
+ sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
+ REQ_OP_WRITE, GFP_NOFS);
}
-
- bio->bi_private = sbio;
- bio->bi_end_io = scrub_wr_bio_end_io;
- bio_set_dev(bio, sbio->dev->bdev);
- bio->bi_iter.bi_sector = sbio->physical >> 9;
- bio->bi_opf = REQ_OP_WRITE;
+ sbio->bio->bi_private = sbio;
+ sbio->bio->bi_end_io = scrub_wr_bio_end_io;
+ sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->status = 0;
- } else if (sbio->physical + sbio->page_count * sectorsize !=
- spage->physical_for_dev_replace ||
- sbio->logical + sbio->page_count * sectorsize !=
- spage->logical) {
+ } else if (sbio->physical + sbio->sector_count * sectorsize !=
+ sblock->physical_for_dev_replace + sector->offset ||
+ sbio->logical + sbio->sector_count * sectorsize !=
+ sblock->logical + sector->offset) {
scrub_wr_submit(sctx);
goto again;
}
- ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+ ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
if (ret != sectorsize) {
- if (sbio->page_count < 1) {
+ if (sbio->sector_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
mutex_unlock(&sctx->wr_lock);
@@ -1698,10 +1858,17 @@ again:
goto again;
}
- sbio->pagev[sbio->page_count] = spage;
- scrub_page_get(spage);
- sbio->page_count++;
- if (sbio->page_count == sctx->pages_per_bio)
+ sbio->sectors[sbio->sector_count] = sector;
+ scrub_sector_get(sector);
+ /*
+ * Since ssector no longer holds a page, but uses sblock::pages, we
+	 * have to ensure the sblock is not freed before our write bio
+	 * finishes.
+ */
+ scrub_block_get(sector->sblock);
+
+ sbio->sector_count++;
+ if (sbio->sector_count == sctx->sectors_per_bio)
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
@@ -1717,16 +1884,16 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
sbio = sctx->wr_curr_bio;
sctx->wr_curr_bio = NULL;
- WARN_ON(!sbio->bio->bi_bdev);
scrub_pending_bio_inc(sctx);
/* process all writes in a single worker thread. Then the block layer
* orders the requests before sending them to the driver which
* doubled the write performance on spinning disks when measured
* with Linux 3.5 */
- btrfsic_submit_bio(sbio->bio);
+ btrfsic_check_bio(sbio->bio);
+ submit_bio(sbio->bio);
if (btrfs_is_zoned(sctx->fs_info))
- sctx->write_pointer = sbio->physical + sbio->page_count *
+ sctx->write_pointer = sbio->physical + sbio->sector_count *
sctx->fs_info->sectorsize;
}
@@ -1738,31 +1905,37 @@ static void scrub_wr_bio_end_io(struct bio *bio)
sbio->status = bio->bi_status;
sbio->bio = bio;
- btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
- btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
+ INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
+ queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}
-static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
+static void scrub_wr_bio_end_io_worker(struct work_struct *work)
{
struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
struct scrub_ctx *sctx = sbio->sctx;
int i;
- ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
+ ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
if (sbio->status) {
struct btrfs_dev_replace *dev_replace =
&sbio->sctx->fs_info->dev_replace;
- for (i = 0; i < sbio->page_count; i++) {
- struct scrub_page *spage = sbio->pagev[i];
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
- spage->io_error = 1;
+ sector->io_error = 1;
atomic64_inc(&dev_replace->num_write_errors);
}
}
- for (i = 0; i < sbio->page_count; i++)
- scrub_page_put(sbio->pagev[i]);
+ /*
+ * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in
+ * endio we should put the sblock.
+ */
+ for (i = 0; i < sbio->sector_count; i++) {
+ scrub_block_put(sbio->sectors[i]->sblock);
+ scrub_sector_put(sbio->sectors[i]);
+ }
bio_put(sbio->bio);
kfree(sbio);
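The new sector/block lifetime pairing, in short (both halves appear in the hunks above):

/*
 * scrub_add_sector_to_wr_bio():
 *     scrub_sector_get(sector)          pins the sector
 *     scrub_block_get(sector->sblock)   pins the pages backing it
 *
 * scrub_wr_bio_end_io_worker():
 *     scrub_block_put(sblock)           may free sblock->pages[]
 *     scrub_sector_put(sector)          may free the sector
 */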
@@ -1786,15 +1959,15 @@ static int scrub_checksum(struct scrub_block *sblock)
sblock->generation_error = 0;
sblock->checksum_error = 0;
- WARN_ON(sblock->page_count < 1);
- flags = sblock->pagev[0]->flags;
+ WARN_ON(sblock->sector_count < 1);
+ flags = sblock->sectors[0]->flags;
ret = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA)
ret = scrub_checksum_data(sblock);
else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
ret = scrub_checksum_tree_block(sblock);
else if (flags & BTRFS_EXTENT_FLAG_SUPER)
- (void)scrub_checksum_super(sblock);
+ ret = scrub_checksum_super(sblock);
else
WARN_ON(1);
if (ret)
@@ -1809,26 +1982,22 @@ static int scrub_checksum_data(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 csum[BTRFS_CSUM_SIZE];
- struct scrub_page *spage;
+ struct scrub_sector *sector;
char *kaddr;
- BUG_ON(sblock->page_count < 1);
- spage = sblock->pagev[0];
- if (!spage->have_csum)
+ BUG_ON(sblock->sector_count < 1);
+ sector = sblock->sectors[0];
+ if (!sector->have_csum)
return 0;
- kaddr = page_address(spage->page);
+ kaddr = scrub_sector_get_kaddr(sector);
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- /*
- * In scrub_pages() and scrub_pages_for_parity() we ensure each spage
- * only contains one sector of data.
- */
crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
- if (memcmp(csum, spage->csum, fs_info->csum_size))
+ if (memcmp(csum, sector->csum, fs_info->csum_size))
sblock->checksum_error = 1;
return sblock->checksum_error;
}
@@ -1849,16 +2018,16 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
const u32 sectorsize = sctx->fs_info->sectorsize;
const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
int i;
- struct scrub_page *spage;
+ struct scrub_sector *sector;
char *kaddr;
- BUG_ON(sblock->page_count < 1);
+ BUG_ON(sblock->sector_count < 1);
- /* Each member in pagev is just one block, not a full page */
- ASSERT(sblock->page_count == num_sectors);
+ /* Each member in sectors is just one sector */
+ ASSERT(sblock->sector_count == num_sectors);
- spage = sblock->pagev[0];
- kaddr = page_address(spage->page);
+ sector = sblock->sectors[0];
+ kaddr = scrub_sector_get_kaddr(sector);
h = (struct btrfs_header *)kaddr;
memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
@@ -1867,15 +2036,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
- if (spage->logical != btrfs_stack_header_bytenr(h))
+ if (sblock->logical != btrfs_stack_header_bytenr(h))
sblock->header_error = 1;
- if (spage->generation != btrfs_stack_header_generation(h)) {
+ if (sector->generation != btrfs_stack_header_generation(h)) {
sblock->header_error = 1;
sblock->generation_error = 1;
}
- if (!scrub_check_fsid(h->fsid, spage))
+ if (!scrub_check_fsid(h->fsid, sector))
sblock->header_error = 1;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
@@ -1888,7 +2057,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
sectorsize - BTRFS_CSUM_SIZE);
for (i = 1; i < num_sectors; i++) {
- kaddr = page_address(sblock->pagev[i]->page);
+ kaddr = scrub_sector_get_kaddr(sblock->sectors[i]);
crypto_shash_update(shash, kaddr, sectorsize);
}
@@ -1906,23 +2075,23 @@ static int scrub_checksum_super(struct scrub_block *sblock)
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
- struct scrub_page *spage;
+ struct scrub_sector *sector;
char *kaddr;
int fail_gen = 0;
int fail_cor = 0;
- BUG_ON(sblock->page_count < 1);
- spage = sblock->pagev[0];
- kaddr = page_address(spage->page);
+ BUG_ON(sblock->sector_count < 1);
+ sector = sblock->sectors[0];
+ kaddr = scrub_sector_get_kaddr(sector);
s = (struct btrfs_super_block *)kaddr;
- if (spage->logical != btrfs_super_bytenr(s))
+ if (sblock->logical != btrfs_super_bytenr(s))
++fail_cor;
- if (spage->generation != btrfs_super_generation(s))
+ if (sector->generation != btrfs_super_generation(s))
++fail_gen;
- if (!scrub_check_fsid(s->fsid, spage))
+ if (!scrub_check_fsid(s->fsid, sector))
++fail_cor;
shash->tfm = fs_info->csum_shash;
@@ -1933,31 +2102,9 @@ static int scrub_checksum_super(struct scrub_block *sblock)
if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
++fail_cor;
- if (fail_cor + fail_gen) {
- /*
- * if we find an error in a super block, we just report it.
- * They will get written with the next transaction commit
- * anyway
- */
- spin_lock(&sctx->stat_lock);
- ++sctx->stat.super_errors;
- spin_unlock(&sctx->stat_lock);
- if (fail_cor)
- btrfs_dev_stat_inc_and_print(spage->dev,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- else
- btrfs_dev_stat_inc_and_print(spage->dev,
- BTRFS_DEV_STAT_GENERATION_ERRS);
- }
-
return fail_cor + fail_gen;
}
-static void scrub_block_get(struct scrub_block *sblock)
-{
- refcount_inc(&sblock->refs);
-}
-
static void scrub_block_put(struct scrub_block *sblock)
{
if (refcount_dec_and_test(&sblock->refs)) {
@@ -1966,24 +2113,27 @@ static void scrub_block_put(struct scrub_block *sblock)
if (sblock->sparity)
scrub_parity_put(sblock->sparity);
- for (i = 0; i < sblock->page_count; i++)
- scrub_page_put(sblock->pagev[i]);
+ for (i = 0; i < sblock->sector_count; i++)
+ scrub_sector_put(sblock->sectors[i]);
+ for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) {
+ if (sblock->pages[i]) {
+ detach_scrub_page_private(sblock->pages[i]);
+ __free_page(sblock->pages[i]);
+ }
+ }
kfree(sblock);
}
}
-static void scrub_page_get(struct scrub_page *spage)
+static void scrub_sector_get(struct scrub_sector *sector)
{
- atomic_inc(&spage->refs);
+ atomic_inc(&sector->refs);
}
-static void scrub_page_put(struct scrub_page *spage)
+static void scrub_sector_put(struct scrub_sector *sector)
{
- if (atomic_dec_and_test(&spage->refs)) {
- if (spage->page)
- __free_page(spage->page);
- kfree(spage);
- }
+ if (atomic_dec_and_test(&sector->refs))
+ kfree(sector);
}
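With the backing pages now owned by the scrub_block, a scrub_sector is a plain refcounted object and the final put only frees the struct. A minimal userspace sketch of that get/put lifetime using C11 atomics follows; all names are illustrative.

#include <stdatomic.h>
#include <stdlib.h>

struct sector {
	atomic_int refs;
	/* payload omitted; the page now lives on the block */
};

static struct sector *sector_alloc(void)
{
	struct sector *s = calloc(1, sizeof(*s));

	if (s)
		atomic_init(&s->refs, 1);	/* caller's reference */
	return s;
}

static void sector_get(struct sector *s)
{
	atomic_fetch_add(&s->refs, 1);
}

static void sector_put(struct sector *s)
{
	/* the last put frees only the struct, never a page */
	if (atomic_fetch_sub(&s->refs, 1) == 1)
		free(s);
}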
/*
@@ -2057,13 +2207,14 @@ static void scrub_submit(struct scrub_ctx *sctx)
sbio = sctx->bios[sctx->curr];
sctx->curr = -1;
scrub_pending_bio_inc(sctx);
- btrfsic_submit_bio(sbio->bio);
+ btrfsic_check_bio(sbio->bio);
+ submit_bio(sbio->bio);
}
-static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage)
+static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
+ struct scrub_sector *sector)
{
- struct scrub_block *sblock = spage->sblock;
+ struct scrub_block *sblock = sector->sblock;
struct scrub_bio *sbio;
const u32 sectorsize = sctx->fs_info->sectorsize;
int ret;
@@ -2078,7 +2229,7 @@ again:
if (sctx->curr != -1) {
sctx->first_free = sctx->bios[sctx->curr]->next_free;
sctx->bios[sctx->curr]->next_free = -1;
- sctx->bios[sctx->curr]->page_count = 0;
+ sctx->bios[sctx->curr]->sector_count = 0;
spin_unlock(&sctx->list_lock);
} else {
spin_unlock(&sctx->list_lock);
@@ -2086,37 +2237,31 @@ again:
}
}
sbio = sctx->bios[sctx->curr];
- if (sbio->page_count == 0) {
- struct bio *bio;
-
- sbio->physical = spage->physical;
- sbio->logical = spage->logical;
- sbio->dev = spage->dev;
- bio = sbio->bio;
- if (!bio) {
- bio = btrfs_bio_alloc(sctx->pages_per_bio);
- sbio->bio = bio;
+ if (sbio->sector_count == 0) {
+ sbio->physical = sblock->physical + sector->offset;
+ sbio->logical = sblock->logical + sector->offset;
+ sbio->dev = sblock->dev;
+ if (!sbio->bio) {
+ sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
+ REQ_OP_READ, GFP_NOFS);
}
-
- bio->bi_private = sbio;
- bio->bi_end_io = scrub_bio_end_io;
- bio_set_dev(bio, sbio->dev->bdev);
- bio->bi_iter.bi_sector = sbio->physical >> 9;
- bio->bi_opf = REQ_OP_READ;
+ sbio->bio->bi_private = sbio;
+ sbio->bio->bi_end_io = scrub_bio_end_io;
+ sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->status = 0;
- } else if (sbio->physical + sbio->page_count * sectorsize !=
- spage->physical ||
- sbio->logical + sbio->page_count * sectorsize !=
- spage->logical ||
- sbio->dev != spage->dev) {
+ } else if (sbio->physical + sbio->sector_count * sectorsize !=
+ sblock->physical + sector->offset ||
+ sbio->logical + sbio->sector_count * sectorsize !=
+ sblock->logical + sector->offset ||
+ sbio->dev != sblock->dev) {
scrub_submit(sctx);
goto again;
}
- sbio->pagev[sbio->page_count] = spage;
- ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+ sbio->sectors[sbio->sector_count] = sector;
+ ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
if (ret != sectorsize) {
- if (sbio->page_count < 1) {
+ if (sbio->sector_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
return -EIO;
@@ -2126,9 +2271,9 @@ again:
}
scrub_block_get(sblock); /* one for the page added to the bio */
- atomic_inc(&sblock->outstanding_pages);
- sbio->page_count++;
- if (sbio->page_count == sctx->pages_per_bio)
+ atomic_inc(&sblock->outstanding_sectors);
+ sbio->sector_count++;
+ if (sbio->sector_count == sctx->sectors_per_bio)
scrub_submit(sctx);
return 0;
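A sector is appended to the bio being built only when it is physically and logically contiguous with what is already queued and targets the same device; otherwise the bio is submitted and the add retried with a fresh one. A self-contained sketch of that contiguity test, with struct and field names that are illustrative rather than the kernel's:

#include <stdbool.h>
#include <stdint.h>

struct pending_bio {
	uint64_t physical;	/* start of the queued run */
	uint64_t logical;
	int dev;		/* device id, illustrative */
	uint32_t sector_count;	/* sectors already queued */
};

/* true when the next sector can extend the queued bio */
static bool can_append(const struct pending_bio *b, uint32_t sectorsize,
		       uint64_t phys, uint64_t log, int dev)
{
	uint64_t run = (uint64_t)b->sector_count * sectorsize;

	return b->physical + run == phys &&
	       b->logical + run == log &&
	       b->dev == dev;
}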
@@ -2139,15 +2284,16 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
struct scrub_block *sblock = bio->bi_private;
struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
+ btrfs_bio_counter_dec(fs_info);
if (bio->bi_status)
sblock->no_io_error_seen = 0;
bio_put(bio);
- btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
+ queue_work(fs_info->scrub_workers, &sblock->work);
}
-static void scrub_missing_raid56_worker(struct btrfs_work *work)
+static void scrub_missing_raid56_worker(struct work_struct *work)
{
struct scrub_block *sblock = container_of(work, struct scrub_block, work);
struct scrub_ctx *sctx = sblock->sctx;
@@ -2155,8 +2301,8 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
u64 logical;
struct btrfs_device *dev;
- logical = sblock->pagev[0]->logical;
- dev = sblock->pagev[0]->dev;
+ logical = sblock->logical;
+ dev = sblock->dev;
if (sblock->no_io_error_seen)
scrub_recheck_block_checksum(sblock);
@@ -2193,8 +2339,8 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
{
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 length = sblock->page_count * PAGE_SIZE;
- u64 logical = sblock->pagev[0]->logical;
+ u64 length = sblock->sector_count << fs_info->sectorsize_bits;
+ u64 logical = sblock->logical;
struct btrfs_io_context *bioc = NULL;
struct bio *bio;
struct btrfs_raid_bio *rbio;
@@ -2213,30 +2359,33 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
* We shouldn't be scrubbing a missing device. Even for dev
* replace, we should only get here for RAID 5/6. We either
* managed to mount something with no mirrors remaining or
- * there's a bug in scrub_remap_extent()/btrfs_map_block().
+ * there's a bug in scrub_find_good_copy()/btrfs_map_block().
*/
goto bioc_out;
}
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = logical >> 9;
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
- rbio = raid56_alloc_missing_rbio(bio, bioc, length);
+ rbio = raid56_alloc_missing_rbio(bio, bioc);
if (!rbio)
goto rbio_out;
- for (i = 0; i < sblock->page_count; i++) {
- struct scrub_page *spage = sblock->pagev[i];
+ for (i = 0; i < sblock->sector_count; i++) {
+ struct scrub_sector *sector = sblock->sectors[i];
- raid56_add_scrub_pages(rbio, spage->page, spage->logical);
+ raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector),
+ scrub_sector_get_page_offset(sector),
+ sector->offset + sector->sblock->logical);
}
- btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
+ INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
scrub_block_get(sblock);
scrub_pending_bio_inc(sctx);
raid56_submit_missing_rbio(rbio);
+ btrfs_put_bioc(bioc);
return;
rbio_out:
@@ -2249,7 +2398,7 @@ bioc_out:
spin_unlock(&sctx->stat_lock);
}
-static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
+static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum,
u64 physical_for_dev_replace)
@@ -2258,7 +2407,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
const u32 sectorsize = sctx->fs_info->sectorsize;
int index;
- sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+ sblock = alloc_scrub_block(sctx, dev, logical, physical,
+ physical_for_dev_replace, mirror_num);
if (!sblock) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2266,14 +2416,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
return -ENOMEM;
}
- /* one ref inside this function, plus one for each page added to
- * a bio later on */
- refcount_set(&sblock->refs, 1);
- sblock->sctx = sctx;
- sblock->no_io_error_seen = 1;
-
for (index = 0; len > 0; index++) {
- struct scrub_page *spage;
+ struct scrub_sector *sector;
/*
* Here we will allocate one page for one sector to scrub.
* This is fine if PAGE_SIZE == sectorsize, but will cost
* more memory for the PAGE_SIZE > sectorsize case.
@@ -2281,43 +2425,29 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
*/
u32 l = min(sectorsize, len);
- spage = kzalloc(sizeof(*spage), GFP_KERNEL);
- if (!spage) {
-leave_nomem:
+ sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
+ if (!sector) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_block_put(sblock);
return -ENOMEM;
}
- ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
- scrub_page_get(spage);
- sblock->pagev[index] = spage;
- spage->sblock = sblock;
- spage->dev = dev;
- spage->flags = flags;
- spage->generation = gen;
- spage->logical = logical;
- spage->physical = physical;
- spage->physical_for_dev_replace = physical_for_dev_replace;
- spage->mirror_num = mirror_num;
+ sector->flags = flags;
+ sector->generation = gen;
if (csum) {
- spage->have_csum = 1;
- memcpy(spage->csum, csum, sctx->fs_info->csum_size);
+ sector->have_csum = 1;
+ memcpy(sector->csum, csum, sctx->fs_info->csum_size);
} else {
- spage->have_csum = 0;
+ sector->have_csum = 0;
}
- sblock->page_count++;
- spage->page = alloc_page(GFP_KERNEL);
- if (!spage->page)
- goto leave_nomem;
len -= l;
logical += l;
physical += l;
physical_for_dev_replace += l;
}
- WARN_ON(sblock->page_count == 0);
+ WARN_ON(sblock->sector_count == 0);
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
/*
* This case should only be hit for RAID 5/6 device replace. See
@@ -2325,11 +2455,11 @@ leave_nomem:
*/
scrub_missing_raid56_pages(sblock);
} else {
- for (index = 0; index < sblock->page_count; index++) {
- struct scrub_page *spage = sblock->pagev[index];
+ for (index = 0; index < sblock->sector_count; index++) {
+ struct scrub_sector *sector = sblock->sectors[index];
int ret;
- ret = scrub_add_page_to_rd_bio(sctx, spage);
+ ret = scrub_add_sector_to_rd_bio(sctx, sector);
if (ret) {
scrub_block_put(sblock);
return ret;
@@ -2353,31 +2483,31 @@ static void scrub_bio_end_io(struct bio *bio)
sbio->status = bio->bi_status;
sbio->bio = bio;
- btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
+ queue_work(fs_info->scrub_workers, &sbio->work);
}
-static void scrub_bio_end_io_worker(struct btrfs_work *work)
+static void scrub_bio_end_io_worker(struct work_struct *work)
{
struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
struct scrub_ctx *sctx = sbio->sctx;
int i;
- ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
+ ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
if (sbio->status) {
- for (i = 0; i < sbio->page_count; i++) {
- struct scrub_page *spage = sbio->pagev[i];
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
- spage->io_error = 1;
- spage->sblock->no_io_error_seen = 0;
+ sector->io_error = 1;
+ sector->sblock->no_io_error_seen = 0;
}
}
- /* now complete the scrub_block items that have all pages completed */
- for (i = 0; i < sbio->page_count; i++) {
- struct scrub_page *spage = sbio->pagev[i];
- struct scrub_block *sblock = spage->sblock;
+ /* Now complete the scrub_block items that have all sectors completed */
+ for (i = 0; i < sbio->sector_count; i++) {
+ struct scrub_sector *sector = sbio->sectors[i];
+ struct scrub_block *sblock = sector->sblock;
- if (atomic_dec_and_test(&sblock->outstanding_pages))
+ if (atomic_dec_and_test(&sblock->outstanding_sectors))
scrub_block_complete(sblock);
scrub_block_put(sblock);
}
@@ -2428,13 +2558,13 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
u64 start, u32 len)
{
- __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
+ __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
}
static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
u64 start, u32 len)
{
- __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
+ __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
}
static void scrub_block_complete(struct scrub_block *sblock)
@@ -2456,8 +2586,9 @@ static void scrub_block_complete(struct scrub_block *sblock)
}
if (sblock->sparity && corrupted && !sblock->data_corrected) {
- u64 start = sblock->pagev[0]->logical;
- u64 end = sblock->pagev[sblock->page_count - 1]->logical +
+ u64 start = sblock->logical;
+ u64 end = sblock->logical +
+ sblock->sectors[sblock->sector_count - 1]->offset +
sblock->sctx->fs_info->sectorsize;
ASSERT(end - start <= U32_MAX);
@@ -2532,17 +2663,26 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u64 physical_for_dev_replace)
+ u64 gen, int mirror_num)
{
+ struct btrfs_device *src_dev = dev;
+ u64 src_physical = physical;
+ int src_mirror = mirror_num;
int ret;
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
+ /*
+ * Block size determines how many scrub_blocks will be allocated. Here
+ * we use BTRFS_STRIPE_LEN (64KiB) as the default limit, so we won't
+ * allocate too many scrub_blocks while still avoiding overly large
+ * bios for large extents.
+ */
if (flags & BTRFS_EXTENT_FLAG_DATA) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
blocksize = map->stripe_len;
else
- blocksize = sctx->fs_info->sectorsize;
+ blocksize = BTRFS_STRIPE_LEN;
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed++;
sctx->stat.data_bytes_scrubbed += len;
@@ -2561,6 +2701,18 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
WARN_ON(1);
}
+ /*
+ * For the dev-replace case, @dev can be a missing device.
+ * Regular scrub avoids running on a missing device entirely, as
+ * reading from one would only trigger a flood of read errors and
+ * inflate the read error counters unnecessarily.
+ * So here we change the read source to a good mirror.
+ */
+ if (sctx->is_dev_replace && !dev->bdev)
+ scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
+ &src_dev, &src_mirror);
while (len) {
u32 l = min(len, blocksize);
int have_csum = 0;
@@ -2571,20 +2723,20 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
if (have_csum == 0)
++sctx->stat.no_csum;
}
- ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
- mirror_num, have_csum ? csum : NULL,
- physical_for_dev_replace);
+ ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
+ flags, gen, src_mirror,
+ have_csum ? csum : NULL, physical);
if (ret)
return ret;
len -= l;
logical += l;
physical += l;
- physical_for_dev_replace += l;
+ src_physical += l;
}
return 0;
}
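With the BTRFS_STRIPE_LEN default above, scrub_extent() chops a data extent into at most 64KiB scrub blocks; for example a 200KiB extent becomes 64K + 64K + 64K + 8K. A runnable sketch of the splitting loop, with example values rather than data from a real filesystem:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t len = 200 * 1024;		/* example extent length */
	const uint32_t blocksize = 64 * 1024;	/* BTRFS_STRIPE_LEN default */
	uint64_t logical = 0;

	while (len) {
		uint32_t l = len < blocksize ? len : blocksize;

		printf("scrub block at %llu, len %u\n",
		       (unsigned long long)logical, l);
		len -= l;
		logical += l;
	}
	return 0;
}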
-static int scrub_pages_for_parity(struct scrub_parity *sparity,
+static int scrub_sectors_for_parity(struct scrub_parity *sparity,
u64 logical, u32 len,
u64 physical, struct btrfs_device *dev,
u64 flags, u64 gen, int mirror_num, u8 *csum)
@@ -2596,7 +2748,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
ASSERT(IS_ALIGNED(len, sectorsize));
- sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+ sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num);
if (!sblock) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2604,51 +2756,32 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
return -ENOMEM;
}
- /* one ref inside this function, plus one for each page added to
- * a bio later on */
- refcount_set(&sblock->refs, 1);
- sblock->sctx = sctx;
- sblock->no_io_error_seen = 1;
sblock->sparity = sparity;
scrub_parity_get(sparity);
for (index = 0; len > 0; index++) {
- struct scrub_page *spage;
+ struct scrub_sector *sector;
- spage = kzalloc(sizeof(*spage), GFP_KERNEL);
- if (!spage) {
-leave_nomem:
+ sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
+ if (!sector) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_block_put(sblock);
return -ENOMEM;
}
- ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
- /* For scrub block */
- scrub_page_get(spage);
- sblock->pagev[index] = spage;
+ sblock->sectors[index] = sector;
/* For scrub parity */
- scrub_page_get(spage);
- list_add_tail(&spage->list, &sparity->spages);
- spage->sblock = sblock;
- spage->dev = dev;
- spage->flags = flags;
- spage->generation = gen;
- spage->logical = logical;
- spage->physical = physical;
- spage->mirror_num = mirror_num;
+ scrub_sector_get(sector);
+ list_add_tail(&sector->list, &sparity->sectors_list);
+ sector->flags = flags;
+ sector->generation = gen;
if (csum) {
- spage->have_csum = 1;
- memcpy(spage->csum, csum, sctx->fs_info->csum_size);
+ sector->have_csum = 1;
+ memcpy(sector->csum, csum, sctx->fs_info->csum_size);
} else {
- spage->have_csum = 0;
+ sector->have_csum = 0;
}
- sblock->page_count++;
- spage->page = alloc_page(GFP_KERNEL);
- if (!spage->page)
- goto leave_nomem;
-
/* Iterate over the stripe range in sectorsize steps */
len -= sectorsize;
@@ -2656,19 +2789,19 @@ leave_nomem:
physical += sectorsize;
}
- WARN_ON(sblock->page_count == 0);
- for (index = 0; index < sblock->page_count; index++) {
- struct scrub_page *spage = sblock->pagev[index];
+ WARN_ON(sblock->sector_count == 0);
+ for (index = 0; index < sblock->sector_count; index++) {
+ struct scrub_sector *sector = sblock->sectors[index];
int ret;
- ret = scrub_add_page_to_rd_bio(sctx, spage);
+ ret = scrub_add_sector_to_rd_bio(sctx, sector);
if (ret) {
scrub_block_put(sblock);
return ret;
}
}
- /* last one frees, either here or in bio completion for last page */
+ /* Last one frees, either here or in bio completion for last sector */
scrub_block_put(sblock);
return 0;
}
@@ -2707,7 +2840,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
if (have_csum == 0)
goto skip;
}
- ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
+ ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
flags, gen, mirror_num,
have_csum ? csum : NULL);
if (ret)
@@ -2767,10 +2900,10 @@ static int get_raid56_logic_offset(u64 physical, int num,
static void scrub_free_parity(struct scrub_parity *sparity)
{
struct scrub_ctx *sctx = sparity->sctx;
- struct scrub_page *curr, *next;
+ struct scrub_sector *curr, *next;
int nbits;
- nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
+ nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
if (nbits) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors += nbits;
@@ -2778,38 +2911,38 @@ static void scrub_free_parity(struct scrub_parity *sparity)
spin_unlock(&sctx->stat_lock);
}
- list_for_each_entry_safe(curr, next, &sparity->spages, list) {
+ list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
list_del_init(&curr->list);
- scrub_page_put(curr);
+ scrub_sector_put(curr);
}
kfree(sparity);
}
-static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
+static void scrub_parity_bio_endio_worker(struct work_struct *work)
{
struct scrub_parity *sparity = container_of(work, struct scrub_parity,
work);
struct scrub_ctx *sctx = sparity->sctx;
+ btrfs_bio_counter_dec(sctx->fs_info);
scrub_free_parity(sparity);
scrub_pending_bio_dec(sctx);
}
static void scrub_parity_bio_endio(struct bio *bio)
{
- struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
+ struct scrub_parity *sparity = bio->bi_private;
struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
if (bio->bi_status)
- bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
- sparity->nsectors);
+ bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
+ &sparity->dbitmap, sparity->nsectors);
bio_put(bio);
- btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
- NULL);
- btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
+ INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
+ queue_work(fs_info->scrub_parity_workers, &sparity->work);
}
static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
@@ -2822,8 +2955,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
u64 length;
int ret;
- if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
- sparity->nsectors))
+ if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
+ &sparity->ebitmap, sparity->nsectors))
goto out;
length = sparity->logic_end - sparity->logic_start;
@@ -2834,15 +2967,16 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
if (ret || !bioc || !bioc->raid_map)
goto bioc_out;
- bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = sparity->logic_start >> 9;
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
- rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
+ rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
sparity->scrub_dev,
- sparity->dbitmap,
+ &sparity->dbitmap,
sparity->nsectors);
+ btrfs_put_bioc(bioc);
if (!rbio)
goto rbio_out;
@@ -2854,8 +2988,7 @@ rbio_out:
bio_put(bio);
bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bioc(bioc);
- bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
+ bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2864,11 +2997,6 @@ out:
scrub_free_parity(sparity);
}
-static inline int scrub_calc_parity_bitmap_len(int nsectors)
-{
- return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
-}
-
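The removed helper sized the two parity bitmaps dynamically. Since stripe_len is capped at 64KiB, with a 4KiB sector size nsectors is at most 16, so a single unsigned long per bitmap suffices; that is what the ASSERT(nsectors <= BITS_PER_LONG) added further down relies on. A worked sketch of the old size computation (userspace, assuming a 4KiB sector size):

#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG		(sizeof(long) * CHAR_BIT)
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int nsectors = 64 * 1024 / 4096;	/* 64K stripe / 4K sectors = 16 */
	size_t len = DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);

	/* 16 bits always fit in a single long */
	printf("per-bitmap bytes: %zu\n", len);
	return 0;
}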
static void scrub_parity_get(struct scrub_parity *sparity)
{
refcount_inc(&sparity->refs);
@@ -2882,6 +3010,251 @@ static void scrub_parity_put(struct scrub_parity *sparity)
scrub_parity_check_and_repair(sparity);
}
+/*
+ * Return 0 if the extent item range covers any byte of the search range.
+ * Return <0 if the extent item is before @search_start.
+ * Return >0 if the extent item is after @search_start + @search_len.
+ */
+static int compare_extent_item_range(struct btrfs_path *path,
+ u64 search_start, u64 search_len)
+{
+ struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
+ u64 len;
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY);
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ len = fs_info->nodesize;
+ else
+ len = key.offset;
+
+ if (key.objectid + len <= search_start)
+ return -1;
+ if (key.objectid >= search_start + search_len)
+ return 1;
+ return 0;
+}
+
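compare_extent_item_range() is a three-way interval-overlap test. A self-contained userspace sketch with worked calls, using illustrative types:

#include <assert.h>
#include <stdint.h>

/*
 * -1: item ends at or before the start of the search window
 *  0: item overlaps [start, start + len)
 *  1: item starts at or after the end of the window
 */
static int cmp_range(uint64_t item_start, uint64_t item_len,
		     uint64_t start, uint64_t len)
{
	if (item_start + item_len <= start)
		return -1;
	if (item_start >= start + len)
		return 1;
	return 0;
}

int main(void)
{
	assert(cmp_range(0, 8, 4, 8) == 0);	/* item covers bytes 4..7 */
	assert(cmp_range(0, 4, 4, 8) == -1);	/* ends exactly at start */
	assert(cmp_range(12, 4, 4, 8) == 1);	/* begins exactly at end */
	return 0;
}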
+/*
+ * Locate one extent item which covers any byte in range
+ * [@search_start, @search_start + @search_len)
+ *
+ * If the path is not initialized, we will initialize the search by doing
+ * a btrfs_search_slot().
+ * If the path is already initialized, we will use the path as the initial
+ * slot, to avoid duplicated btrfs_search_slot() calls.
+ *
+ * NOTE: If an extent item starts before @search_start, we will still
+ * return the extent item. This is for data extents crossing stripe boundaries.
+ *
+ * Return 0 if we found such extent item, and @path will point to the extent item.
+ * Return >0 if no such extent item can be found, and @path will be released.
+ * Return <0 if hit fatal error, and @path will be released.
+ */
+static int find_first_extent_item(struct btrfs_root *extent_root,
+ struct btrfs_path *path,
+ u64 search_start, u64 search_len)
+{
+ struct btrfs_fs_info *fs_info = extent_root->fs_info;
+ struct btrfs_key key;
+ int ret;
+
+ /* Continue using the existing path */
+ if (path->nodes[0])
+ goto search_forward;
+
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.objectid = search_start;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ ASSERT(ret > 0);
+ /*
+ * Here we intentionally pass 0 as @min_objectid, as there could be
+ * an extent item starting before @search_start.
+ */
+ ret = btrfs_previous_extent_item(extent_root, path, 0);
+ if (ret < 0)
+ return ret;
+ /*
+ * No matter whether we have found an extent item, the next loop will
+ * properly do every check on the key.
+ */
+search_forward:
+ while (true) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid >= search_start + search_len)
+ break;
+ if (key.type != BTRFS_METADATA_ITEM_KEY &&
+ key.type != BTRFS_EXTENT_ITEM_KEY)
+ goto next;
+
+ ret = compare_extent_item_range(path, search_start, search_len);
+ if (ret == 0)
+ return ret;
+ if (ret > 0)
+ break;
+next:
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(extent_root, path);
+ if (ret) {
+ /* Either no more item or fatal error */
+ btrfs_release_path(path);
+ return ret;
+ }
+ }
+ }
+ btrfs_release_path(path);
+ return 1;
+}
+
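find_first_extent_item() is effectively a resumable iterator: the caller keeps the path across calls, so only the first lookup pays for a full tree search. A userspace analogue over a sorted extent array, with the cursor playing the role of the btrfs_path; all names are illustrative.

#include <stddef.h>
#include <stdint.h>

struct extent { uint64_t start, len; };

/*
 * Find the first extent (sorted by start) overlapping
 * [search_start, search_start + search_len); *pos is the cursor
 * kept across calls, as the kernel keeps the btrfs_path.
 * Returns 0 and sets *out on success, 1 when none is left.
 */
static int first_overlapping(const struct extent *ext, size_t n, size_t *pos,
			     uint64_t search_start, uint64_t search_len,
			     const struct extent **out)
{
	for (; *pos < n; (*pos)++) {
		const struct extent *e = &ext[*pos];

		if (e->start >= search_start + search_len)
			return 1;	/* past the window */
		if (e->start + e->len > search_start) {
			*out = e;	/* may start before search_start */
			return 0;
		}
	}
	return 1;
}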
+static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
+ u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
+{
+ struct btrfs_key key;
+ struct btrfs_extent_item *ei;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
+ key.type == BTRFS_EXTENT_ITEM_KEY);
+ *extent_start_ret = key.objectid;
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ *size_ret = path->nodes[0]->fs_info->nodesize;
+ else
+ *size_ret = key.offset;
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
+ *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
+ *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
+}
+
+static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
+ u64 boundary_start, u64 boundary_len)
+{
+ return (extent_start < boundary_start &&
+ extent_start + extent_len > boundary_start) ||
+ (extent_start < boundary_start + boundary_len &&
+ extent_start + extent_len > boundary_start + boundary_len);
+}
+
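does_range_cross_boundary() flags an extent that pokes out of either end of a boundary window; for example a 16KiB tree block at 60KiB crosses the [0, 64K) stripe. A runnable check of both cases, with example values:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool crosses(uint64_t es, uint64_t el, uint64_t bs, uint64_t bl)
{
	return (es < bs && es + el > bs) ||
	       (es < bs + bl && es + el > bs + bl);
}

int main(void)
{
	/* a 16K tree block at 60K pokes out of the [0, 64K) stripe */
	assert(crosses(60 << 10, 16 << 10, 0, 64 << 10));
	/* one fully inside does not */
	assert(!crosses(0, 16 << 10, 0, 64 << 10));
	return 0;
}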
+static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
+ struct scrub_parity *sparity,
+ struct map_lookup *map,
+ struct btrfs_device *sdev,
+ struct btrfs_path *path,
+ u64 logical)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
+ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
+ u64 cur_logical = logical;
+ int ret;
+
+ ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+
+ /* Path must not be populated */
+ ASSERT(!path->nodes[0]);
+
+ while (cur_logical < logical + map->stripe_len) {
+ struct btrfs_io_context *bioc = NULL;
+ struct btrfs_device *extent_dev;
+ u64 extent_start;
+ u64 extent_size;
+ u64 mapped_length;
+ u64 extent_flags;
+ u64 extent_gen;
+ u64 extent_physical;
+ u64 extent_mirror_num;
+
+ ret = find_first_extent_item(extent_root, path, cur_logical,
+ logical + map->stripe_len - cur_logical);
+ /* No more extent item in this data stripe */
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ if (ret < 0)
+ break;
+ get_extent_info(path, &extent_start, &extent_size, &extent_flags,
+ &extent_gen);
+
+ /* Metadata should not cross stripe boundaries */
+ if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ does_range_cross_boundary(extent_start, extent_size,
+ logical, map->stripe_len)) {
+ btrfs_err(fs_info,
+ "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+ extent_start, logical);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ cur_logical += extent_size;
+ continue;
+ }
+
+ /* Skip hole range which doesn't have any extent */
+ cur_logical = max(extent_start, cur_logical);
+
+ /* Truncate the range inside this data stripe */
+ extent_size = min(extent_start + extent_size,
+ logical + map->stripe_len) - cur_logical;
+ extent_start = cur_logical;
+ ASSERT(extent_size <= U32_MAX);
+
+ scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
+
+ mapped_length = extent_size;
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
+ &mapped_length, &bioc, 0);
+ if (!ret && (!bioc || mapped_length < extent_size))
+ ret = -EIO;
+ if (ret) {
+ btrfs_put_bioc(bioc);
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+ extent_physical = bioc->stripes[0].physical;
+ extent_mirror_num = bioc->mirror_num;
+ extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
+
+ ret = btrfs_lookup_csums_range(csum_root, extent_start,
+ extent_start + extent_size - 1,
+ &sctx->csum_list, 1, false);
+ if (ret) {
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+
+ ret = scrub_extent_for_parity(sparity, extent_start,
+ extent_size, extent_physical,
+ extent_dev, extent_flags,
+ extent_gen, extent_mirror_num);
+ scrub_free_csums(sctx);
+
+ if (ret) {
+ scrub_parity_mark_sectors_error(sparity, extent_start,
+ extent_size);
+ break;
+ }
+
+ cond_resched();
+ cur_logical += extent_size;
+ }
+ btrfs_release_path(path);
+ return ret;
+}
+
static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *sdev,
@@ -2889,28 +3262,11 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
u64 logic_end)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start);
- struct btrfs_root *csum_root;
- struct btrfs_extent_item *extent;
- struct btrfs_io_context *bioc = NULL;
struct btrfs_path *path;
- u64 flags;
+ u64 cur_logical;
int ret;
- int slot;
- struct extent_buffer *l;
- struct btrfs_key key;
- u64 generation;
- u64 extent_logical;
- u64 extent_physical;
- /* Check the comment in scrub_stripe() for why u32 is enough here */
- u32 extent_len;
- u64 mapped_length;
- struct btrfs_device *extent_dev;
struct scrub_parity *sparity;
int nsectors;
- int bitmap_len;
- int extent_mirror_num;
- int stop_loop = 0;
path = btrfs_alloc_path();
if (!path) {
@@ -2924,9 +3280,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
ASSERT(map->stripe_len <= U32_MAX);
nsectors = map->stripe_len >> fs_info->sectorsize_bits;
- bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
- sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
- GFP_NOFS);
+ ASSERT(nsectors <= BITS_PER_LONG);
+ sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
if (!sparity) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2943,178 +3298,17 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
sparity->logic_start = logic_start;
sparity->logic_end = logic_end;
refcount_set(&sparity->refs, 1);
- INIT_LIST_HEAD(&sparity->spages);
- sparity->dbitmap = sparity->bitmap;
- sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
+ INIT_LIST_HEAD(&sparity->sectors_list);
ret = 0;
- while (logic_start < logic_end) {
- if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
- key.type = BTRFS_METADATA_ITEM_KEY;
- else
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = logic_start;
- key.offset = (u64)-1;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ for (cur_logical = logic_start; cur_logical < logic_end;
+ cur_logical += map->stripe_len) {
+ ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
+ sdev, path, cur_logical);
if (ret < 0)
- goto out;
-
- if (ret > 0) {
- ret = btrfs_previous_extent_item(root, path, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- btrfs_release_path(path);
- ret = btrfs_search_slot(NULL, root, &key,
- path, 0, 0);
- if (ret < 0)
- goto out;
- }
- }
-
- stop_loop = 0;
- while (1) {
- u64 bytes;
-
- l = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
-
- stop_loop = 1;
- break;
- }
- btrfs_item_key_to_cpu(l, &key, slot);
-
- if (key.type != BTRFS_EXTENT_ITEM_KEY &&
- key.type != BTRFS_METADATA_ITEM_KEY)
- goto next;
-
- if (key.type == BTRFS_METADATA_ITEM_KEY)
- bytes = fs_info->nodesize;
- else
- bytes = key.offset;
-
- if (key.objectid + bytes <= logic_start)
- goto next;
-
- if (key.objectid >= logic_end) {
- stop_loop = 1;
- break;
- }
-
- while (key.objectid >= logic_start + map->stripe_len)
- logic_start += map->stripe_len;
-
- extent = btrfs_item_ptr(l, slot,
- struct btrfs_extent_item);
- flags = btrfs_extent_flags(l, extent);
- generation = btrfs_extent_generation(l, extent);
-
- if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
- (key.objectid < logic_start ||
- key.objectid + bytes >
- logic_start + map->stripe_len)) {
- btrfs_err(fs_info,
- "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
- key.objectid, logic_start);
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- goto next;
- }
-again:
- extent_logical = key.objectid;
- ASSERT(bytes <= U32_MAX);
- extent_len = bytes;
-
- if (extent_logical < logic_start) {
- extent_len -= logic_start - extent_logical;
- extent_logical = logic_start;
- }
-
- if (extent_logical + extent_len >
- logic_start + map->stripe_len)
- extent_len = logic_start + map->stripe_len -
- extent_logical;
-
- scrub_parity_mark_sectors_data(sparity, extent_logical,
- extent_len);
-
- mapped_length = extent_len;
- bioc = NULL;
- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
- extent_logical, &mapped_length, &bioc,
- 0);
- if (!ret) {
- if (!bioc || mapped_length < extent_len)
- ret = -EIO;
- }
- if (ret) {
- btrfs_put_bioc(bioc);
- goto out;
- }
- extent_physical = bioc->stripes[0].physical;
- extent_mirror_num = bioc->mirror_num;
- extent_dev = bioc->stripes[0].dev;
- btrfs_put_bioc(bioc);
-
- csum_root = btrfs_csum_root(fs_info, extent_logical);
- ret = btrfs_lookup_csums_range(csum_root,
- extent_logical,
- extent_logical + extent_len - 1,
- &sctx->csum_list, 1);
- if (ret)
- goto out;
-
- ret = scrub_extent_for_parity(sparity, extent_logical,
- extent_len,
- extent_physical,
- extent_dev, flags,
- generation,
- extent_mirror_num);
-
- scrub_free_csums(sctx);
-
- if (ret)
- goto out;
-
- if (extent_logical + extent_len <
- key.objectid + bytes) {
- logic_start += map->stripe_len;
-
- if (logic_start >= logic_end) {
- stop_loop = 1;
- break;
- }
-
- if (logic_start < key.objectid + bytes) {
- cond_resched();
- goto again;
- }
- }
-next:
- path->slots[0]++;
- }
-
- btrfs_release_path(path);
-
- if (stop_loop)
break;
-
- logic_start += map->stripe_len;
- }
-out:
- if (ret < 0) {
- ASSERT(logic_end - logic_start <= U32_MAX);
- scrub_parity_mark_sectors_error(sparity, logic_start,
- logic_end - logic_start);
}
+
scrub_parity_put(sparity);
scrub_submit(sctx);
mutex_lock(&sctx->wr_lock);
@@ -3165,69 +3359,234 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
return ret;
}
+/*
+ * Scrub one range which can only have a simple mirror based profile.
+ * (This covers all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
+ * RAID0/RAID10).
+ *
+ * Since we may need to handle a subset of a block group, we need the
+ * @logical_start and @logical_length parameters.
+ */
+static int scrub_simple_mirror(struct scrub_ctx *sctx,
+ struct btrfs_root *extent_root,
+ struct btrfs_root *csum_root,
+ struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ u64 logical_start, u64 logical_length,
+ struct btrfs_device *device,
+ u64 physical, int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ const u64 logical_end = logical_start + logical_length;
+ /* An artificial limit, inherit from old scrub behavior */
+ const u32 max_length = SZ_64K;
+ struct btrfs_path path = { 0 };
+ u64 cur_logical = logical_start;
+ int ret;
+
+ /* The range must be inside the bg */
+ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
+
+ path.search_commit_root = 1;
+ path.skip_locking = 1;
+ /* Go through each extent items inside the logical range */
+ while (cur_logical < logical_end) {
+ u64 extent_start;
+ u64 extent_len;
+ u64 extent_flags;
+ u64 extent_gen;
+ u64 scrub_len;
+
+ /* Canceled? */
+ if (atomic_read(&fs_info->scrub_cancel_req) ||
+ atomic_read(&sctx->cancel_req)) {
+ ret = -ECANCELED;
+ break;
+ }
+ /* Paused? */
+ if (atomic_read(&fs_info->scrub_pause_req)) {
+ /* Push queued extents */
+ sctx->flush_all_writes = true;
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+ wait_event(sctx->list_wait,
+ atomic_read(&sctx->bios_in_flight) == 0);
+ sctx->flush_all_writes = false;
+ scrub_blocked_if_needed(fs_info);
+ }
+ /* Block group removed? */
+ spin_lock(&bg->lock);
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
+ spin_unlock(&bg->lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&bg->lock);
+
+ ret = find_first_extent_item(extent_root, &path, cur_logical,
+ logical_end - cur_logical);
+ if (ret > 0) {
+ /* No more extent, just update the accounting */
+ sctx->stat.last_physical = physical + logical_length;
+ ret = 0;
+ break;
+ }
+ if (ret < 0)
+ break;
+ get_extent_info(&path, &extent_start, &extent_len,
+ &extent_flags, &extent_gen);
+ /* Skip hole range which doesn't have any extent */
+ cur_logical = max(extent_start, cur_logical);
+
+ /*
+ * Scrub len has three limits:
+ * - Extent size limit
+ * - Scrub range limit
+ * This is especially important for RAID0/RAID10 to reuse
+ * this function
+ * - Max scrub size limit
+ */
+ scrub_len = min(min(extent_start + extent_len,
+ logical_end), cur_logical + max_length) -
+ cur_logical;
+
+ if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
+ ret = btrfs_lookup_csums_range(csum_root, cur_logical,
+ cur_logical + scrub_len - 1,
+ &sctx->csum_list, 1, false);
+ if (ret)
+ break;
+ }
+ if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ does_range_cross_boundary(extent_start, extent_len,
+ logical_start, logical_length)) {
+ btrfs_err(fs_info,
+"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
+ extent_start, logical_start, logical_end);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ cur_logical += scrub_len;
+ continue;
+ }
+ ret = scrub_extent(sctx, map, cur_logical, scrub_len,
+ cur_logical - logical_start + physical,
+ device, extent_flags, extent_gen,
+ mirror_num);
+ scrub_free_csums(sctx);
+ if (ret)
+ break;
+ if (sctx->is_dev_replace)
+ sync_replace_for_zoned(sctx);
+ cur_logical += scrub_len;
+ /* Don't hold CPU for too long time */
+ cond_resched();
+ }
+ btrfs_release_path(&path);
+ return ret;
+}
+
+/* Calculate the full stripe length for simple stripe based profiles */
+static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
+
+ return map->num_stripes / map->sub_stripes * map->stripe_len;
+}
+
+/* Get the logical bytenr for the stripe */
+static u64 simple_stripe_get_logical(struct map_lookup *map,
+ struct btrfs_block_group *bg,
+ int stripe_index)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
+ ASSERT(stripe_index < map->num_stripes);
+
+ /*
+ * (stripe_index / sub_stripes) gives how many data stripes we need to
+ * skip.
+ */
+ return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
+}
+
+/* Get the mirror number for the stripe */
+static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
+{
+ ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10));
+ ASSERT(stripe_index < map->num_stripes);
+
+ /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */
+ return stripe_index % map->sub_stripes + 1;
+}
+
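Worked numbers for the three helpers above, for a RAID10 chunk with num_stripes=4, sub_stripes=2 and stripe_len=64KiB: the full stripe is 128KiB, device stripes 0/1 are the two mirrors (mirror 1 and 2) of the first data stripe, and stripes 2/3 mirror the second. A runnable sketch with that example geometry, not read from a real chunk:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const int num_stripes = 4, sub_stripes = 2;	/* RAID10 example */
	const uint64_t stripe_len = 64 * 1024, bg_start = 0;

	/* full stripe length, as in simple_stripe_full_stripe_len() */
	printf("full stripe: %llu\n", (unsigned long long)
	       (num_stripes / sub_stripes * stripe_len));

	for (int i = 0; i < num_stripes; i++)
		printf("stripe %d: logical %llu, mirror %d\n", i,
		       (unsigned long long)
		       ((i / sub_stripes) * stripe_len + bg_start),
		       i % sub_stripes + 1);
	return 0;
}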
+static int scrub_simple_stripe(struct scrub_ctx *sctx,
+ struct btrfs_root *extent_root,
+ struct btrfs_root *csum_root,
+ struct btrfs_block_group *bg,
+ struct map_lookup *map,
+ struct btrfs_device *device,
+ int stripe_index)
+{
+ const u64 logical_increment = simple_stripe_full_stripe_len(map);
+ const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
+ const u64 orig_physical = map->stripes[stripe_index].physical;
+ const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
+ u64 cur_logical = orig_logical;
+ u64 cur_physical = orig_physical;
+ int ret = 0;
+
+ while (cur_logical < bg->start + bg->length) {
+ /*
+ * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
+ * just RAID1, so we can reuse scrub_simple_mirror() to scrub
+ * this stripe.
+ */
+ ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
+ cur_logical, map->stripe_len, device,
+ cur_physical, mirror_num);
+ if (ret)
+ return ret;
+ /* Skip to next stripe which belongs to the target device */
+ cur_logical += logical_increment;
+ /* For physical offset, we just go to next stripe */
+ cur_physical += map->stripe_len;
+ }
+ return ret;
+}
+
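scrub_simple_stripe() visits only the parts of the chunk that live on the chosen device stripe: the logical cursor jumps a full stripe per iteration while the physical cursor advances one stripe_len. A worked example for device stripe 0 of a 512KiB RAID0 chunk with two stripes; the values are illustrative.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t stripe_len = 64 * 1024, bg_len = 512 * 1024;
	const int num_stripes = 2, sub_stripes = 1;	/* RAID0 example */
	const int stripe_index = 0;
	uint64_t increment = (uint64_t)(num_stripes / sub_stripes) * stripe_len;
	uint64_t logical = (uint64_t)(stripe_index / sub_stripes) * stripe_len;
	uint64_t physical = 0;

	/* logical ranges this device stripe contributes to the chunk */
	for (; logical < bg_len; logical += increment, physical += stripe_len)
		printf("logical [%llu, %llu) -> physical %llu\n",
		       (unsigned long long)logical,
		       (unsigned long long)(logical + stripe_len),
		       (unsigned long long)physical);
	return 0;
}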
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct extent_map *em,
struct btrfs_device *scrub_dev,
- int stripe_index, u64 dev_extent_len)
+ int stripe_index)
{
struct btrfs_path *path;
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root;
struct btrfs_root *csum_root;
- struct btrfs_extent_item *extent;
struct blk_plug plug;
+ struct map_lookup *map = em->map_lookup;
+ const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
const u64 chunk_logical = bg->start;
- u64 flags;
int ret;
- int slot;
- u64 nstripes;
- struct extent_buffer *l;
- u64 physical;
+ u64 physical = map->stripes[stripe_index].physical;
+ const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
+ const u64 physical_end = physical + dev_stripe_len;
u64 logical;
u64 logic_end;
- u64 physical_end;
- u64 generation;
- int mirror_num;
- struct btrfs_key key;
+ /* The logical increment after finishing one stripe */
u64 increment;
+ /* Offset inside the chunk */
u64 offset;
- u64 extent_logical;
- u64 extent_physical;
- /*
- * Unlike chunk length, extent length should never go beyond
- * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here.
- */
- u32 extent_len;
u64 stripe_logical;
u64 stripe_end;
- struct btrfs_device *extent_dev;
- int extent_mirror_num;
int stop_loop = 0;
- physical = map->stripes[stripe_index].physical;
- offset = 0;
- nstripes = div64_u64(dev_extent_len, map->stripe_len);
- mirror_num = 1;
- increment = map->stripe_len;
- if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- offset = map->stripe_len * stripe_index;
- increment = map->stripe_len * map->num_stripes;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- int factor = map->num_stripes / map->sub_stripes;
- offset = map->stripe_len * (stripe_index / map->sub_stripes);
- increment = map->stripe_len * factor;
- mirror_num = stripe_index % map->sub_stripes + 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- mirror_num = stripe_index % map->num_stripes + 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- mirror_num = stripe_index % map->num_stripes + 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- get_raid56_logic_offset(physical, stripe_index, map, &offset,
- NULL);
- increment = map->stripe_len * nr_data_stripes(map);
- }
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -3241,21 +3600,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
path->skip_locking = 1;
path->reada = READA_FORWARD;
- logical = chunk_logical + offset;
- physical_end = physical + nstripes * map->stripe_len;
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- get_raid56_logic_offset(physical_end, stripe_index,
- map, &logic_end, NULL);
- logic_end += chunk_logical;
- } else {
- logic_end = logical + increment * nstripes;
- }
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
scrub_blocked_if_needed(fs_info);
- root = btrfs_extent_root(fs_info, logical);
- csum_root = btrfs_csum_root(fs_info, logical);
+ root = btrfs_extent_root(fs_info, bg->start);
+ csum_root = btrfs_csum_root(fs_info, bg->start);
/*
* collect all data csums for the stripe to avoid seeking during
@@ -3272,247 +3622,89 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
}
/*
- * now find all extents for each stripe and scrub them
+ * There used to be a big double loop to handle all profiles using the
+ * same routine, which grew larger and more gross over time.
+ *
+ * So here we handle each profile differently, so simpler profiles
+ * have simpler scrubbing function.
*/
- ret = 0;
- while (physical < physical_end) {
+ if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID56_MASK))) {
/*
- * canceled?
- */
- if (atomic_read(&fs_info->scrub_cancel_req) ||
- atomic_read(&sctx->cancel_req)) {
- ret = -ECANCELED;
- goto out;
- }
- /*
- * check to see if we have to pause
+ * The above check rules out all complex profiles; the remaining
+ * profiles are SINGLE|DUP|RAID1|RAID1C*, which are simple
+ * mirrored duplication without striping.
+ *
+ * Only @physical and @mirror_num need to be calculated using
+ * @stripe_index.
*/
- if (atomic_read(&fs_info->scrub_pause_req)) {
- /* push queued extents */
- sctx->flush_all_writes = true;
- scrub_submit(sctx);
- mutex_lock(&sctx->wr_lock);
- scrub_wr_submit(sctx);
- mutex_unlock(&sctx->wr_lock);
- wait_event(sctx->list_wait,
- atomic_read(&sctx->bios_in_flight) == 0);
- sctx->flush_all_writes = false;
- scrub_blocked_if_needed(fs_info);
- }
-
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- ret = get_raid56_logic_offset(physical, stripe_index,
- map, &logical,
- &stripe_logical);
- logical += chunk_logical;
- if (ret) {
- /* it is parity strip */
- stripe_logical += chunk_logical;
- stripe_end = stripe_logical + increment;
- ret = scrub_raid56_parity(sctx, map, scrub_dev,
- stripe_logical,
- stripe_end);
- if (ret)
- goto out;
- goto skip;
- }
- }
-
- if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
- key.type = BTRFS_METADATA_ITEM_KEY;
- else
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = logical;
- key.offset = (u64)-1;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- if (ret > 0) {
- ret = btrfs_previous_extent_item(root, path, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- /* there's no smaller item, so stick with the
- * larger one */
- btrfs_release_path(path);
- ret = btrfs_search_slot(NULL, root, &key,
- path, 0, 0);
- if (ret < 0)
- goto out;
- }
- }
-
- stop_loop = 0;
- while (1) {
- u64 bytes;
-
- l = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
-
- stop_loop = 1;
- break;
- }
- btrfs_item_key_to_cpu(l, &key, slot);
-
- if (key.type != BTRFS_EXTENT_ITEM_KEY &&
- key.type != BTRFS_METADATA_ITEM_KEY)
- goto next;
-
- if (key.type == BTRFS_METADATA_ITEM_KEY)
- bytes = fs_info->nodesize;
- else
- bytes = key.offset;
-
- if (key.objectid + bytes <= logical)
- goto next;
-
- if (key.objectid >= logical + map->stripe_len) {
- /* out of this device extent */
- if (key.objectid >= logic_end)
- stop_loop = 1;
- break;
- }
-
- /*
- * If our block group was removed in the meanwhile, just
- * stop scrubbing since there is no point in continuing.
- * Continuing would prevent reusing its device extents
- * for new block groups for a long time.
- */
- spin_lock(&bg->lock);
- if (bg->removed) {
- spin_unlock(&bg->lock);
- ret = 0;
- goto out;
- }
- spin_unlock(&bg->lock);
-
- extent = btrfs_item_ptr(l, slot,
- struct btrfs_extent_item);
- flags = btrfs_extent_flags(l, extent);
- generation = btrfs_extent_generation(l, extent);
-
- if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
- (key.objectid < logical ||
- key.objectid + bytes >
- logical + map->stripe_len)) {
- btrfs_err(fs_info,
- "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
- key.objectid, logical);
- spin_lock(&sctx->stat_lock);
- sctx->stat.uncorrectable_errors++;
- spin_unlock(&sctx->stat_lock);
- goto next;
- }
-
-again:
- extent_logical = key.objectid;
- ASSERT(bytes <= U32_MAX);
- extent_len = bytes;
-
- /*
- * trim extent to this stripe
- */
- if (extent_logical < logical) {
- extent_len -= logical - extent_logical;
- extent_logical = logical;
- }
- if (extent_logical + extent_len >
- logical + map->stripe_len) {
- extent_len = logical + map->stripe_len -
- extent_logical;
- }
+ ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
+ bg->start, bg->length, scrub_dev,
+ map->stripes[stripe_index].physical,
+ stripe_index + 1);
+ offset = 0;
+ goto out;
+ }
+ if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+ ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
+ scrub_dev, stripe_index);
+ offset = map->stripe_len * (stripe_index / map->sub_stripes);
+ goto out;
+ }
- extent_physical = extent_logical - logical + physical;
- extent_dev = scrub_dev;
- extent_mirror_num = mirror_num;
- if (sctx->is_dev_replace)
- scrub_remap_extent(fs_info, extent_logical,
- extent_len, &extent_physical,
- &extent_dev,
- &extent_mirror_num);
-
- if (flags & BTRFS_EXTENT_FLAG_DATA) {
- ret = btrfs_lookup_csums_range(csum_root,
- extent_logical,
- extent_logical + extent_len - 1,
- &sctx->csum_list, 1);
- if (ret)
- goto out;
- }
+ /* Only RAID56 goes through the old code */
+ ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ ret = 0;
- ret = scrub_extent(sctx, map, extent_logical, extent_len,
- extent_physical, extent_dev, flags,
- generation, extent_mirror_num,
- extent_logical - logical + physical);
+ /* Calculate the logical end of the stripe */
+ get_raid56_logic_offset(physical_end, stripe_index,
+ map, &logic_end, NULL);
+ logic_end += chunk_logical;
- scrub_free_csums(sctx);
+ /* Initialize @offset in case we need to go to out: label */
+ get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
+ increment = map->stripe_len * nr_data_stripes(map);
+ /*
+ * Due to the rotation, for RAID56 it's better to iterate each stripe
+ * using its physical offset.
+ */
+ while (physical < physical_end) {
+ ret = get_raid56_logic_offset(physical, stripe_index, map,
+ &logical, &stripe_logical);
+ logical += chunk_logical;
+ if (ret) {
+ /* it is a parity stripe */
+ stripe_logical += chunk_logical;
+ stripe_end = stripe_logical + increment;
+ ret = scrub_raid56_parity(sctx, map, scrub_dev,
+ stripe_logical,
+ stripe_end);
if (ret)
goto out;
+ goto next;
+ }
- if (sctx->is_dev_replace)
- sync_replace_for_zoned(sctx);
-
- if (extent_logical + extent_len <
- key.objectid + bytes) {
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- /*
- * loop until we find next data stripe
- * or we have finished all stripes.
- */
-loop:
- physical += map->stripe_len;
- ret = get_raid56_logic_offset(physical,
- stripe_index, map,
- &logical, &stripe_logical);
- logical += chunk_logical;
-
- if (ret && physical < physical_end) {
- stripe_logical += chunk_logical;
- stripe_end = stripe_logical +
- increment;
- ret = scrub_raid56_parity(sctx,
- map, scrub_dev,
- stripe_logical,
- stripe_end);
- if (ret)
- goto out;
- goto loop;
- }
- } else {
- physical += map->stripe_len;
- logical += increment;
- }
- if (logical < key.objectid + bytes) {
- cond_resched();
- goto again;
- }
-
- if (physical >= physical_end) {
- stop_loop = 1;
- break;
- }
- }
+ /*
+ * Now we're at a data stripe, scrub each extent in the range.
+ *
+ * At this stage, if we ignore the repair part, inside each data
+ * stripe it is no different than SINGLE profile.
+ * We can reuse scrub_simple_mirror() here, as the repair part
+ * is still based on @mirror_num.
+ */
+ ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
+ logical, map->stripe_len,
+ scrub_dev, physical, 1);
+ if (ret < 0)
+ goto out;
next:
- path->slots[0]++;
- }
- btrfs_release_path(path);
-skip:
logical += increment;
physical += map->stripe_len;
spin_lock(&sctx->stat_lock);
if (stop_loop)
- sctx->stat.last_physical = map->stripes[stripe_index].physical +
- dev_extent_len;
+ sctx->stat.last_physical =
+ map->stripes[stripe_index].physical + dev_stripe_len;
else
sctx->stat.last_physical = physical;
spin_unlock(&sctx->stat_lock);
@@ -3566,7 +3758,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
* kthread or relocation.
*/
spin_lock(&bg->lock);
- if (!bg->removed)
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
ret = -EINVAL;
spin_unlock(&bg->lock);
@@ -3581,8 +3773,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
for (i = 0; i < map->num_stripes; ++i) {
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
- ret = scrub_stripe(sctx, bg, map, scrub_dev, i,
- dev_extent_len);
+ ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
if (ret)
goto out;
}
@@ -3699,14 +3890,37 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache)
goto skip;
+ ASSERT(cache->start <= chunk_offset);
+ /*
+ * We are using the commit root to search for device extents, so
+ * that means we could have found a device extent item from a
+ * block group that was deleted in the current transaction. The
+ * logical start offset of the deleted block group, stored at
+ * @chunk_offset, might be part of the logical address range of
+ * a new block group (which uses different physical extents).
+ * In this case btrfs_lookup_block_group() has returned the new
+ * block group, and its start address is less than @chunk_offset.
+ *
+ * We skip such new block groups, because it's pointless to
+ * process them, as we won't find their extents, since we search
+ * for them using the commit root of the extent tree. For a device
+ * replace it's also fine to skip it, we won't miss copying them
+ * to the target device because we have the write duplication
+ * setup through the regular write path (by btrfs_map_block()),
+ * and we have committed a transaction when we started the device
+ * replace, right after setting up the device replace state.
+ */
+ if (cache->start < chunk_offset) {
+ btrfs_put_block_group(cache);
+ goto skip;
+ }
+
if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
- spin_lock(&cache->lock);
- if (!cache->to_copy) {
+ if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
- spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
goto skip;
}
- spin_unlock(&cache->lock);
}
/*
@@ -3718,7 +3932,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* repair extents.
*/
spin_lock(&cache->lock);
- if (cache->removed) {
+ if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
goto skip;
@@ -3822,7 +4036,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->item_needs_writeback = 1;
up_write(&dev_replace->rwsem);
- ASSERT(cache->start == chunk_offset);
ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
dev_extent_len);
@@ -3879,8 +4092,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* balance is triggered or it becomes used and unused again.
*/
spin_lock(&cache->lock);
- if (!cache->removed && !cache->ro && cache->reserved == 0 &&
- cache->used == 0) {
+ if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
+ !cache->ro && cache->reserved == 0 && cache->used == 0) {
spin_unlock(&cache->lock);
if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_queue_work(&fs_info->discard_ctl,
@@ -3940,9 +4153,9 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (!btrfs_check_super_location(scrub_dev, bytenr))
continue;
- ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
- scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
- NULL, bytenr);
+ ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+ scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
+ NULL, bytenr);
if (ret)
return ret;
}
@@ -3955,22 +4168,23 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
&fs_info->scrub_lock)) {
- struct btrfs_workqueue *scrub_workers = NULL;
- struct btrfs_workqueue *scrub_wr_comp = NULL;
- struct btrfs_workqueue *scrub_parity = NULL;
-
- scrub_workers = fs_info->scrub_workers;
- scrub_wr_comp = fs_info->scrub_wr_completion_workers;
- scrub_parity = fs_info->scrub_parity_workers;
+ struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
+ struct workqueue_struct *scrub_wr_comp =
+ fs_info->scrub_wr_completion_workers;
+ struct workqueue_struct *scrub_parity =
+ fs_info->scrub_parity_workers;
fs_info->scrub_workers = NULL;
fs_info->scrub_wr_completion_workers = NULL;
fs_info->scrub_parity_workers = NULL;
mutex_unlock(&fs_info->scrub_lock);
- btrfs_destroy_workqueue(scrub_workers);
- btrfs_destroy_workqueue(scrub_wr_comp);
- btrfs_destroy_workqueue(scrub_parity);
+ if (scrub_workers)
+ destroy_workqueue(scrub_workers);
+ if (scrub_wr_comp)
+ destroy_workqueue(scrub_wr_comp);
+ if (scrub_parity)
+ destroy_workqueue(scrub_parity);
}
}
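The refcount_dec_and_mutex_lock() idiom above deserves a note: drop one
reference and, only when it was the last, return with the mutex held so the
caller can detach the shared pointers before destroying them outside the
lock. A simplified userspace analogue (the real primitive also avoids
taking the mutex on the non-final fast path):

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

static int refcnt = 1;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *workers;	/* stands in for fs_info->scrub_workers */

static bool dec_and_lock(int *r, pthread_mutex_t *m)
{
	pthread_mutex_lock(m);
	if (--*r > 0) {
		pthread_mutex_unlock(m);
		return false;
	}
	return true;	/* count hit zero, mutex still held */
}

static void put_workers(void)
{
	if (dec_and_lock(&refcnt, &lock)) {
		void *w = workers;	/* detach under the lock */

		workers = NULL;
		pthread_mutex_unlock(&lock);
		free(w);		/* destroy outside the lock */
	}
}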
@@ -3980,9 +4194,9 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info)
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
- struct btrfs_workqueue *scrub_workers = NULL;
- struct btrfs_workqueue *scrub_wr_comp = NULL;
- struct btrfs_workqueue *scrub_parity = NULL;
+ struct workqueue_struct *scrub_workers = NULL;
+ struct workqueue_struct *scrub_wr_comp = NULL;
+ struct workqueue_struct *scrub_parity = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
int ret = -ENOMEM;
@@ -3990,18 +4204,16 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
return 0;
- scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
- is_dev_replace ? 1 : max_active, 4);
+ scrub_workers = alloc_workqueue("btrfs-scrub", flags,
+ is_dev_replace ? 1 : max_active);
if (!scrub_workers)
goto fail_scrub_workers;
- scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
- max_active, 2);
+ scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
if (!scrub_wr_comp)
goto fail_scrub_wr_completion_workers;
- scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
- max_active, 2);
+ scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
if (!scrub_parity)
goto fail_scrub_parity_workers;
@@ -4022,11 +4234,11 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->scrub_lock);
ret = 0;
- btrfs_destroy_workqueue(scrub_parity);
+ destroy_workqueue(scrub_parity);
fail_scrub_parity_workers:
- btrfs_destroy_workqueue(scrub_wr_comp);
+ destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
- btrfs_destroy_workqueue(scrub_workers);
+ destroy_workqueue(scrub_workers);
fail_scrub_workers:
return ret;
}
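The fail_* labels above form the usual goto-unwind ladder for multi-step
allocation, and scrub_workers_get() reuses the very same ladder on the path
where another thread won the race: it sets ret to 0 and falls through the
destroy_workqueue() calls to discard its now-redundant local allocations.
A minimal standalone sketch of the ladder itself:

#include <stdlib.h>

struct ctx { void *a, *b, *c; };

/* Allocate three resources; on failure, unwind only what succeeded. */
static int ctx_init(struct ctx *ctx)
{
	ctx->a = malloc(64);
	if (!ctx->a)
		goto fail_a;
	ctx->b = malloc(64);
	if (!ctx->b)
		goto fail_b;
	ctx->c = malloc(64);
	if (!ctx->c)
		goto fail_c;
	return 0;

fail_c:
	free(ctx->b);
fail_b:
	free(ctx->a);
fail_a:
	return -1;	/* the kernel code returns -ENOMEM */
}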
@@ -4040,38 +4252,21 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
+ bool need_commit = false;
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
- if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
- /*
- * in this case scrub is unable to calculate the checksum
- * the way scrub is implemented. Do not handle this
- * situation at all because it won't ever happen.
- */
- btrfs_err(fs_info,
- "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
- fs_info->nodesize,
- BTRFS_STRIPE_LEN);
- return -EINVAL;
- }
+ /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
+ ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
- if (fs_info->nodesize >
- PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
- fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
- /*
- * would exhaust the array bounds of pagev member in
- * struct scrub_block
- */
- btrfs_err(fs_info,
- "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
- fs_info->nodesize,
- SCRUB_MAX_PAGES_PER_BLOCK,
- fs_info->sectorsize,
- SCRUB_MAX_PAGES_PER_BLOCK);
- return -EINVAL;
- }
+ /*
+ * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
+ * value (max nodesize / min sectorsize), thus nodesize should always
+ * be fine.
+ */
+ ASSERT(fs_info->nodesize <=
+ SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);
/* Allocate outside of device_list_mutex */
sctx = scrub_setup_ctx(fs_info, is_dev_replace);
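The two ASSERT()s above replace runtime -EINVAL checks with a size argument
that can be worked out from the btrfs limits (assumed here: maximum nodesize
64K, minimum sectorsize 4K): SCRUB_MAX_SECTORS_PER_BLOCK = 64K / 4K = 16,
and 16 << sectorsize_bits is at least 64K for every supported sector size,
so any valid nodesize fits. The same arithmetic as a runnable check:

#include <assert.h>

#define MAX_NODESIZE		(64 * 1024)	/* btrfs maximum */
#define MIN_SECTORSIZE		(4 * 1024)	/* btrfs minimum */
#define MAX_SECTORS_PER_BLOCK	(MAX_NODESIZE / MIN_SECTORSIZE)	/* 16 */

int main(void)
{
	unsigned int sectorsize_bits = 12;	/* log2(4K), tightest case */
	unsigned int nodesize = 64 * 1024;	/* largest allowed node */

	assert(nodesize <= (MAX_SECTORS_PER_BLOCK << sectorsize_bits));
	return 0;
}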
@@ -4137,7 +4332,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
/*
* In order to avoid deadlock with reclaim when there is a transaction
* trying to pause scrub, make sure we use GFP_NOFS for all the
- * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
+ * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
* invoked by our callees. The pausing request is done when the
* transaction commit starts, and it blocks the transaction until scrub
* is paused (done at specific points at scrub_stripe() or right above
@@ -4145,6 +4340,12 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
*/
nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
+ u64 old_super_errors;
+
+ spin_lock(&sctx->stat_lock);
+ old_super_errors = sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+
btrfs_info(fs_info, "scrub: started on devid %llu", devid);
/*
* by holding device list mutex, we can
@@ -4153,6 +4354,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->fs_devices->device_list_mutex);
ret = scrub_supers(sctx, dev);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ spin_lock(&sctx->stat_lock);
+ /*
+	 * Super block errors were found, but we cannot commit a
+	 * transaction in the current context, since
+	 * btrfs_commit_transaction() needs to pause the currently
+	 * running scrub (held by ourselves).
+ */
+ if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
+ need_commit = true;
+ spin_unlock(&sctx->stat_lock);
}
if (!ret)
@@ -4179,6 +4390,25 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
scrub_workers_put(fs_info);
scrub_put_ctx(sctx);
+ /*
+ * We found some super block errors before, now try to force a
+ * transaction commit, as scrub has finished.
+ */
+ if (need_commit) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_err(fs_info,
+ "scrub: failed to start transaction to fix super block errors: %d", ret);
+ return ret;
+ }
+ ret = btrfs_commit_transaction(trans);
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "scrub: failed to commit transaction to fix super block errors: %d", ret);
+ }
return ret;
out:
scrub_workers_put(fs_info);
@@ -4271,11 +4501,11 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}
-static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u32 extent_len,
- u64 *extent_physical,
- struct btrfs_device **extent_dev,
- int *extent_mirror_num)
+static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
+ u64 extent_logical, u32 extent_len,
+ u64 *extent_physical,
+ struct btrfs_device **extent_dev,
+ int *extent_mirror_num)
{
u64 mapped_length;
struct btrfs_io_context *bioc = NULL;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 7d1642937274..4ef4167072b8 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -15,8 +15,10 @@
#include <linux/string.h>
#include <linux/compat.h>
#include <linux/crc32c.h>
+#include <linux/fsverity.h>
#include "send.h"
+#include "ctree.h"
#include "backref.h"
#include "locking.h"
#include "disk-io.h"
@@ -82,8 +84,12 @@ struct send_ctx {
char *send_buf;
u32 send_size;
u32 send_max_size;
- u64 total_send_size;
- u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
+ /*
+	 * Whether the BTRFS_SEND_A_DATA attribute was already added to the
+	 * current command (since protocol v2, data must be the last attribute).
+ */
+ bool put_data;
+ struct page **send_buf_pages;
u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
/* Protocol version compatibility requested */
u32 proto;
@@ -113,15 +119,17 @@ struct send_ctx {
*/
u64 cur_ino;
u64 cur_inode_gen;
- int cur_inode_new;
- int cur_inode_new_gen;
- int cur_inode_deleted;
u64 cur_inode_size;
u64 cur_inode_mode;
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset;
+ bool cur_inode_new;
+ bool cur_inode_new_gen;
+ bool cur_inode_deleted;
bool ignore_cur_inode;
+ bool cur_inode_needs_verity;
+ void *verity_descriptor;
u64 send_progress;
@@ -132,7 +140,14 @@ struct send_ctx {
struct list_head name_cache_list;
int name_cache_size;
+ /*
+ * The inode we are currently processing. It's not NULL only when we
+ * need to issue write commands for data extents from this inode.
+ */
+ struct inode *cur_inode;
struct file_ra_state ra;
+ u64 page_cache_clear_start;
+ bool clean_page_cache;
/*
* We process inodes by their increasing order, so if before an
@@ -228,6 +243,9 @@ struct send_ctx {
* Indexed by the inode number of the directory to be deleted.
*/
struct rb_root orphan_dirs;
+
+ struct rb_root rbtree_new_refs;
+ struct rb_root rbtree_deleted_refs;
};
struct pending_dir_move {
@@ -328,8 +346,8 @@ __maybe_unused
static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
{
switch (sctx->proto) {
- case 1: return cmd < __BTRFS_SEND_C_MAX_V1;
- case 2: return cmd < __BTRFS_SEND_C_MAX_V2;
+ case 1: return cmd <= BTRFS_SEND_C_MAX_V1;
+ case 2: return cmd <= BTRFS_SEND_C_MAX_V2;
default: return false;
}
}
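The fix above swaps a strict comparison against an internal sentinel for an
inclusive one against the public maximum. A reduced hypothetical enum shows
why both spellings accept the same command set when the sentinel is defined
as one past the last valid value; the patch simply standardizes on the
public *_MAX names:

#include <stdbool.h>

enum send_cmd {
	SEND_C_UNSPEC,
	SEND_C_WRITE,
	SEND_C_ENCODED_WRITE,
	__SEND_C_MAX,			/* internal sentinel: one past the end */
	SEND_C_MAX = __SEND_C_MAX - 1,	/* last valid command */
};

/* "cmd < __SEND_C_MAX" and "cmd <= SEND_C_MAX" are equivalent here. */
static bool cmd_ok(int cmd)
{
	return cmd <= SEND_C_MAX;
}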
@@ -570,15 +588,10 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
while (pos < len) {
ret = kernel_write(filp, buf + pos, len - pos, off);
- /* TODO handle that correctly */
- /*if (ret == -ERESTARTSYS) {
- continue;
- }*/
if (ret < 0)
return ret;
- if (ret == 0) {
+ if (ret == 0)
return -EIO;
- }
pos += ret;
}
@@ -591,6 +604,9 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
int total_len = sizeof(*hdr) + len;
int left = sctx->send_max_size - sctx->send_size;
+ if (WARN_ON_ONCE(sctx->put_data))
+ return -EINVAL;
+
if (unlikely(left < total_len))
return -EOVERFLOW;
@@ -611,6 +627,8 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
}
+TLV_PUT_DEFINE_INT(8)
+TLV_PUT_DEFINE_INT(32)
TLV_PUT_DEFINE_INT(64)
static int tlv_put_string(struct send_ctx *sctx, u16 attr,
@@ -686,8 +704,7 @@ static int send_header(struct send_ctx *sctx)
struct btrfs_stream_header hdr;
strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
- hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
-
+ hdr.version = cpu_to_le32(sctx->proto);
return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
&sctx->send_off);
}
@@ -727,9 +744,8 @@ static int send_cmd(struct send_ctx *sctx)
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
&sctx->send_off);
- sctx->total_send_size += sctx->send_size;
- sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size;
sctx->send_size = 0;
+ sctx->put_data = false;
return ret;
}
@@ -830,17 +846,32 @@ out:
return ret;
}
+struct btrfs_inode_info {
+ u64 size;
+ u64 gen;
+ u64 mode;
+ u64 uid;
+ u64 gid;
+ u64 rdev;
+ u64 fileattr;
+ u64 nlink;
+};
+
/*
* Helper function to retrieve some fields from an inode item.
*/
-static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
- u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
- u64 *gid, u64 *rdev)
+static int get_inode_info(struct btrfs_root *root, u64 ino,
+ struct btrfs_inode_info *info)
{
int ret;
+ struct btrfs_path *path;
struct btrfs_inode_item *ii;
struct btrfs_key key;
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
key.objectid = ino;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
@@ -848,41 +879,43 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
if (ret) {
if (ret > 0)
ret = -ENOENT;
- return ret;
+ goto out;
}
+ if (!info)
+ goto out;
+
ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
- if (size)
- *size = btrfs_inode_size(path->nodes[0], ii);
- if (gen)
- *gen = btrfs_inode_generation(path->nodes[0], ii);
- if (mode)
- *mode = btrfs_inode_mode(path->nodes[0], ii);
- if (uid)
- *uid = btrfs_inode_uid(path->nodes[0], ii);
- if (gid)
- *gid = btrfs_inode_gid(path->nodes[0], ii);
- if (rdev)
- *rdev = btrfs_inode_rdev(path->nodes[0], ii);
+ info->size = btrfs_inode_size(path->nodes[0], ii);
+ info->gen = btrfs_inode_generation(path->nodes[0], ii);
+ info->mode = btrfs_inode_mode(path->nodes[0], ii);
+ info->uid = btrfs_inode_uid(path->nodes[0], ii);
+ info->gid = btrfs_inode_gid(path->nodes[0], ii);
+ info->rdev = btrfs_inode_rdev(path->nodes[0], ii);
+ info->nlink = btrfs_inode_nlink(path->nodes[0], ii);
+ /*
+	 * Transfer the unchanged u64 value of btrfs_inode_item::flags; it is
+	 * otherwise logically split into two 32-bit parts.
+ */
+ info->fileattr = btrfs_inode_flags(path->nodes[0], ii);
+out:
+ btrfs_free_path(path);
return ret;
}
-static int get_inode_info(struct btrfs_root *root,
- u64 ino, u64 *size, u64 *gen,
- u64 *mode, u64 *uid, u64 *gid,
- u64 *rdev)
+static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
{
- struct btrfs_path *path;
int ret;
+ struct btrfs_inode_info info;
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
- ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
- rdev);
- btrfs_free_path(path);
+ if (!gen)
+ return -EPERM;
+
+ ret = get_inode_info(root, ino, &info);
+ if (!ret)
+ *gen = info.gen;
return ret;
}
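The refactor above replaces a getter with eight nullable out-parameters by
one that fills a btrfs_inode_info struct, plus thin wrappers such as
get_inode_gen() for the common single-field callers. The shape as a tiny
self-contained sketch (get_info() is a stub):

#include <stdint.h>

struct inode_info {
	uint64_t size, gen, mode, uid, gid, rdev, fileattr, nlink;
};

/* Stub standing in for the real lookup. */
static int get_info(uint64_t ino, struct inode_info *info)
{
	info->gen = ino ^ 0xabcd;	/* fake data for the sketch */
	return 0;
}

/* Wrapper mirroring get_inode_gen(): callers that want one field no
 * longer pass seven NULLs. */
static int get_gen(uint64_t ino, uint64_t *gen)
{
	struct inode_info info;
	int ret = get_info(ino, &info);

	if (!ret)
		*gen = info.gen;
	return ret;
}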
@@ -1625,21 +1658,22 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
int right_ret;
u64 left_gen;
u64 right_gen;
+ struct btrfs_inode_info info;
- ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
- NULL, NULL);
+ ret = get_inode_info(sctx->send_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
goto out;
- left_ret = ret;
+ left_ret = (info.nlink == 0) ? -ENOENT : ret;
+ left_gen = info.gen;
if (!sctx->parent_root) {
right_ret = -ENOENT;
} else {
- ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_info(sctx->parent_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
goto out;
- right_ret = ret;
+ right_ret = (info.nlink == 0) ? -ENOENT : ret;
+ right_gen = info.gen;
}
if (!left_ret && !right_ret) {
@@ -1798,8 +1832,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
btrfs_release_path(path);
if (dir_gen) {
- ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
- NULL, NULL, NULL);
+ ret = get_inode_gen(root, parent_dir, dir_gen);
if (ret < 0)
goto out;
}
@@ -1856,6 +1889,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
int ret = 0;
u64 gen;
u64 other_inode = 0;
+ struct btrfs_inode_info info;
if (!sctx->parent_root)
goto out;
@@ -1870,8 +1904,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
* and we can just unlink this entry.
*/
if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
- ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
- NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, dir, &gen);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1898,13 +1931,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
*/
if (other_inode > sctx->send_progress ||
is_waiting_for_move(sctx, other_inode)) {
- ret = get_inode_info(sctx->parent_root, other_inode, NULL,
- who_gen, who_mode, NULL, NULL, NULL);
+ ret = get_inode_info(sctx->parent_root, other_inode, &info);
if (ret < 0)
goto out;
ret = 1;
*who_ino = other_inode;
+ *who_gen = info.gen;
+ *who_mode = info.mode;
} else {
ret = 0;
}
@@ -1937,8 +1971,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
goto out;
if (dir != BTRFS_FIRST_FREE_OBJECTID) {
- ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL,
- NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, dir, &gen);
if (ret < 0 && ret != -ENOENT)
goto out;
if (ret) {
@@ -1960,8 +1993,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
goto out;
}
- ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
- NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, ow_inode, &gen);
if (ret < 0)
goto out;
@@ -2177,7 +2209,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
/*
* If the inode is not existent yet, add the orphan name and return 1.
* This should only happen for the parent dir that we determine in
- * __record_new_ref
+ * record_new_ref_if_needed().
*/
ret = is_inode_existent(sctx, ino, gen);
if (ret < 0)
@@ -2492,6 +2524,39 @@ out:
return ret;
}
+static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
+{
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ int ret = 0;
+ struct fs_path *p;
+
+ if (sctx->proto < 2)
+ return 0;
+
+ btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, ino, gen, p);
+ if (ret < 0)
+ goto out;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(p);
+ return ret;
+}
+
static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
{
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
@@ -2571,7 +2636,8 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
- /* TODO Add otime support when the otime patches get into upstream */
+ if (sctx->proto >= 2)
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime);
ret = send_cmd(sctx);
@@ -2593,6 +2659,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
int ret = 0;
struct fs_path *p;
int cmd;
+ struct btrfs_inode_info info;
u64 gen;
u64 mode;
u64 rdev;
@@ -2604,10 +2671,12 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
return -ENOMEM;
if (ino != sctx->cur_ino) {
- ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
- NULL, NULL, &rdev);
+ ret = get_inode_info(sctx->send_root, ino, &info);
if (ret < 0)
goto out;
+ gen = info.gen;
+ mode = info.mode;
+ rdev = info.rdev;
} else {
gen = sctx->cur_inode_gen;
mode = sctx->cur_inode_mode;
@@ -2675,61 +2744,43 @@ out:
static int did_create_dir(struct send_ctx *sctx, u64 dir)
{
int ret = 0;
+ int iter_ret = 0;
struct btrfs_path *path = NULL;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_key di_key;
- struct extent_buffer *eb;
struct btrfs_dir_item *di;
- int slot;
path = alloc_path_for_send();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(sctx->send_root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
+ btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *eb = path->nodes[0];
- btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
- goto out;
+ break;
}
- di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+ di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
di_key.objectid < sctx->send_progress) {
ret = 1;
- goto out;
+ break;
}
-
- path->slots[0]++;
}
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
-out:
btrfs_free_path(path);
return ret;
}
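This is the first of several conversions in the patch from open-coded
btrfs_search_slot()/btrfs_next_leaf() cursor loops to btrfs_for_each_slot(),
which hands the loop body only valid keys and parks any iteration error in
iter_ret for a single check after the loop. A self-contained sketch of an
iterator macro with the same contract:

#include <stddef.h>

struct cursor {
	const int *items;
	size_t n, pos;
};

/* Returns 1 and stores an item while one exists, 0 at the end, and would
 * return <0 on error (this toy cursor cannot fail). */
static int cursor_next(struct cursor *c, int *out)
{
	if (c->pos >= c->n)
		return 0;
	*out = c->items[c->pos++];
	return 1;
}

#define for_each_item(c, out, iter_ret)					\
	for ((iter_ret) = cursor_next((c), (out)); (iter_ret) > 0;	\
	     (iter_ret) = cursor_next((c), (out)))

After the loop, "if (iter_ret < 0) ret = iter_ret;" mirrors the "Catch
error found during iteration" checks at the converted call sites.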
@@ -2762,48 +2813,50 @@ struct recorded_ref {
u64 dir;
u64 dir_gen;
int name_len;
+ struct rb_node node;
+ struct rb_root *root;
};
-static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
+static struct recorded_ref *recorded_ref_alloc(void)
{
- ref->full_path = path;
- ref->name = (char *)kbasename(ref->full_path->start);
- ref->name_len = ref->full_path->end - ref->name;
+ struct recorded_ref *ref;
+
+ ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+ if (!ref)
+ return NULL;
+ RB_CLEAR_NODE(&ref->node);
+ INIT_LIST_HEAD(&ref->list);
+ return ref;
}
-/*
- * We need to process new refs before deleted refs, but compare_tree gives us
- * everything mixed. So we first record all refs and later process them.
- * This function is a helper to record one ref.
- */
-static int __record_ref(struct list_head *head, u64 dir,
- u64 dir_gen, struct fs_path *path)
+static void recorded_ref_free(struct recorded_ref *ref)
{
- struct recorded_ref *ref;
-
- ref = kmalloc(sizeof(*ref), GFP_KERNEL);
if (!ref)
- return -ENOMEM;
+ return;
+ if (!RB_EMPTY_NODE(&ref->node))
+ rb_erase(&ref->node, ref->root);
+ list_del(&ref->list);
+ fs_path_free(ref->full_path);
+ kfree(ref);
+}
- ref->dir = dir;
- ref->dir_gen = dir_gen;
- set_ref_path(ref, path);
- list_add_tail(&ref->list, head);
- return 0;
+static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
+{
+ ref->full_path = path;
+ ref->name = (char *)kbasename(ref->full_path->start);
+ ref->name_len = ref->full_path->end - ref->name;
}
static int dup_ref(struct recorded_ref *ref, struct list_head *list)
{
struct recorded_ref *new;
- new = kmalloc(sizeof(*ref), GFP_KERNEL);
+ new = recorded_ref_alloc();
if (!new)
return -ENOMEM;
new->dir = ref->dir;
new->dir_gen = ref->dir_gen;
- new->full_path = NULL;
- INIT_LIST_HEAD(&new->list);
list_add_tail(&new->list, list);
return 0;
}
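recorded_ref_free() above is safe to call whether or not the ref was ever
inserted into a tree, because recorded_ref_alloc() pre-marks the node as
unlinked with RB_CLEAR_NODE() and the free path tests RB_EMPTY_NODE()
before rb_erase(). The self-pointer sentinel behind those macros, sketched
with hypothetical names:

struct node {
	struct node *parent;	/* linkage; pointing at itself = unlinked */
};

static void node_clear(struct node *n)	/* like RB_CLEAR_NODE() */
{
	n->parent = n;
}

static int node_is_unlinked(const struct node *n)	/* RB_EMPTY_NODE() */
{
	return n->parent == n;
}

static void node_unlink(struct node *n)
{
	if (!node_is_unlinked(n)) {
		/* detach from the container (rb_erase() in the patch) */
		node_clear(n);
	}
}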
@@ -2814,9 +2867,7 @@ static void __free_recorded_refs(struct list_head *head)
while (!list_empty(head)) {
cur = list_entry(head->next, struct recorded_ref, list);
- fs_path_free(cur->full_path);
- list_del(&cur->list);
- kfree(cur);
+ recorded_ref_free(cur);
}
}
@@ -2933,6 +2984,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
u64 send_progress)
{
int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root = sctx->parent_root;
struct btrfs_path *path;
struct btrfs_key key;
@@ -2959,23 +3011,9 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
if (odi)
key.offset = odi->last_dir_index_offset;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct waiting_dir_move *dm;
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
- continue;
- }
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
if (found_key.objectid != key.objectid ||
found_key.type != key.type)
break;
@@ -3010,8 +3048,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
ret = 0;
goto out;
}
-
- path->slots[0]++;
+ }
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
}
free_orphan_dir_info(sctx, odi);
@@ -3336,8 +3376,7 @@ finish:
/*
* The parent inode might have been deleted in the send snapshot
*/
- ret = get_inode_info(sctx->send_root, cur->dir, NULL,
- NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_info(sctx->send_root, cur->dir, NULL);
if (ret == -ENOENT) {
ret = 0;
continue;
@@ -3511,12 +3550,10 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
goto out;
}
- ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
- &left_gen, NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen);
if (ret < 0)
goto out;
- ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
- &right_gen, NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
@@ -3579,7 +3616,7 @@ static int check_ino_in_path(struct btrfs_root *root,
}
/*
- * Check if ino ino1 is an ancestor of inode ino2 in the given root for any
+ * Check if inode ino1 is an ancestor of inode ino2 in the given root for any
* possible path (in case ino2 is not a directory and has multiple hard links).
* Return 1 if true, 0 if false and < 0 on error.
*/
@@ -3591,6 +3628,7 @@ static int is_ancestor(struct btrfs_root *root,
{
bool free_fs_path = false;
int ret = 0;
+ int iter_ret = 0;
struct btrfs_path *path = NULL;
struct btrfs_key key;
@@ -3611,26 +3649,12 @@ static int is_ancestor(struct btrfs_root *root,
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (true) {
+ btrfs_for_each_slot(root, &key, &key, path, iter_ret) {
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
u32 cur_offset = 0;
u32 item_size;
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != ino2)
break;
if (key.type != BTRFS_INODE_REF_KEY &&
@@ -3659,8 +3683,7 @@ static int is_ancestor(struct btrfs_root *root,
cur_offset = item_size;
}
- ret = get_inode_info(root, parent, NULL, &parent_gen,
- NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(root, parent, &parent_gen);
if (ret < 0)
goto out;
ret = check_ino_in_path(root, ino1, ino1_gen,
@@ -3668,10 +3691,12 @@ static int is_ancestor(struct btrfs_root *root,
if (ret)
goto out;
}
- path->slots[0]++;
}
ret = 0;
- out:
+ if (iter_ret < 0)
+ ret = iter_ret;
+
+out:
btrfs_free_path(path);
if (free_fs_path)
fs_path_free(fs_path);
@@ -3748,9 +3773,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
memcmp(path_before->start, path_after->start, len1))) {
u64 parent_ino_gen;
- ret = get_inode_info(sctx->parent_root, ino, NULL,
- &parent_ino_gen, NULL, NULL, NULL,
- NULL);
+ ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen);
if (ret < 0)
goto out;
if (ino_gen == parent_ino_gen) {
@@ -4344,185 +4367,167 @@ out:
return ret;
}
-static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name,
- void *ctx, struct list_head *refs)
+static int rbtree_ref_comp(const void *k, const struct rb_node *node)
+{
+ const struct recorded_ref *data = k;
+ const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
+ int result;
+
+ if (data->dir > ref->dir)
+ return 1;
+ if (data->dir < ref->dir)
+ return -1;
+ if (data->dir_gen > ref->dir_gen)
+ return 1;
+ if (data->dir_gen < ref->dir_gen)
+ return -1;
+ if (data->name_len > ref->name_len)
+ return 1;
+ if (data->name_len < ref->name_len)
+ return -1;
+ result = strcmp(data->name, ref->name);
+ if (result > 0)
+ return 1;
+ if (result < 0)
+ return -1;
+ return 0;
+}
+
+static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent)
+{
+ const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node);
+
+ return rbtree_ref_comp(entry, parent) < 0;
+}
+
+static int record_ref_in_tree(struct rb_root *root, struct list_head *refs,
+ struct fs_path *name, u64 dir, u64 dir_gen,
+ struct send_ctx *sctx)
{
int ret = 0;
- struct send_ctx *sctx = ctx;
- struct fs_path *p;
- u64 gen;
+ struct fs_path *path = NULL;
+ struct recorded_ref *ref = NULL;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ path = fs_path_alloc();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
- ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
- NULL, NULL);
- if (ret < 0)
+ ref = recorded_ref_alloc();
+ if (!ref) {
+ ret = -ENOMEM;
goto out;
+ }
- ret = get_cur_path(sctx, dir, gen, p);
+ ret = get_cur_path(sctx, dir, dir_gen, path);
if (ret < 0)
goto out;
- ret = fs_path_add_path(p, name);
+ ret = fs_path_add_path(path, name);
if (ret < 0)
goto out;
- ret = __record_ref(refs, dir, gen, p);
-
+ ref->dir = dir;
+ ref->dir_gen = dir_gen;
+ set_ref_path(ref, path);
+ list_add_tail(&ref->list, refs);
+ rb_add(&ref->node, root, rbtree_ref_less);
+ ref->root = root;
out:
- if (ret)
- fs_path_free(p);
+ if (ret) {
+ if (path && (!ref || !ref->full_path))
+ fs_path_free(path);
+ recorded_ref_free(ref);
+ }
return ret;
}
-static int __record_new_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
-{
- struct send_ctx *sctx = ctx;
- return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs);
-}
-
-
-static int __record_deleted_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
+static int record_new_ref_if_needed(int num, u64 dir, int index,
+ struct fs_path *name, void *ctx)
{
+ int ret = 0;
struct send_ctx *sctx = ctx;
- return record_ref(sctx->parent_root, dir, name, ctx,
- &sctx->deleted_refs);
-}
-
-static int record_new_ref(struct send_ctx *sctx)
-{
- int ret;
+ struct rb_node *node = NULL;
+ struct recorded_ref data;
+ struct recorded_ref *ref;
+ u64 dir_gen;
- ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, __record_new_ref, sctx);
+ ret = get_inode_gen(sctx->send_root, dir, &dir_gen);
if (ret < 0)
goto out;
- ret = 0;
+ data.dir = dir;
+ data.dir_gen = dir_gen;
+ set_ref_path(&data, name);
+ node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp);
+ if (node) {
+ ref = rb_entry(node, struct recorded_ref, node);
+ recorded_ref_free(ref);
+ } else {
+ ret = record_ref_in_tree(&sctx->rbtree_new_refs,
+ &sctx->new_refs, name, dir, dir_gen,
+ sctx);
+ }
out:
return ret;
}
-static int record_deleted_ref(struct send_ctx *sctx)
+static int record_deleted_ref_if_needed(int num, u64 dir, int index,
+ struct fs_path *name, void *ctx)
{
- int ret;
+ int ret = 0;
+ struct send_ctx *sctx = ctx;
+ struct rb_node *node = NULL;
+ struct recorded_ref data;
+ struct recorded_ref *ref;
+ u64 dir_gen;
- ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, __record_deleted_ref, sctx);
+ ret = get_inode_gen(sctx->parent_root, dir, &dir_gen);
if (ret < 0)
goto out;
- ret = 0;
+ data.dir = dir;
+ data.dir_gen = dir_gen;
+ set_ref_path(&data, name);
+ node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp);
+ if (node) {
+ ref = rb_entry(node, struct recorded_ref, node);
+ recorded_ref_free(ref);
+ } else {
+ ret = record_ref_in_tree(&sctx->rbtree_deleted_refs,
+ &sctx->deleted_refs, name, dir,
+ dir_gen, sctx);
+ }
out:
return ret;
}
-struct find_ref_ctx {
- u64 dir;
- u64 dir_gen;
- struct btrfs_root *root;
- struct fs_path *name;
- int found_idx;
-};
-
-static int __find_iref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx_)
-{
- struct find_ref_ctx *ctx = ctx_;
- u64 dir_gen;
- int ret;
-
- if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) &&
- strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
- /*
- * To avoid doing extra lookups we'll only do this if everything
- * else matches.
- */
- ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL);
- if (ret)
- return ret;
- if (dir_gen != ctx->dir_gen)
- return 0;
- ctx->found_idx = num;
- return 1;
- }
- return 0;
-}
-
-static int find_iref(struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *key,
- u64 dir, u64 dir_gen, struct fs_path *name)
+static int record_new_ref(struct send_ctx *sctx)
{
int ret;
- struct find_ref_ctx ctx;
- ctx.dir = dir;
- ctx.name = name;
- ctx.dir_gen = dir_gen;
- ctx.found_idx = -1;
- ctx.root = root;
-
- ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
+ ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+ sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
if (ret < 0)
- return ret;
-
- if (ctx.found_idx == -1)
- return -ENOENT;
-
- return ctx.found_idx;
-}
-
-static int __record_changed_new_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
-{
- u64 dir_gen;
- int ret;
- struct send_ctx *sctx = ctx;
-
- ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL);
- if (ret)
- return ret;
-
- ret = find_iref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, dir, dir_gen, name);
- if (ret == -ENOENT)
- ret = __record_new_ref(num, dir, index, name, sctx);
- else if (ret > 0)
- ret = 0;
+ goto out;
+ ret = 0;
+out:
return ret;
}
-static int __record_changed_deleted_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
+static int record_deleted_ref(struct send_ctx *sctx)
{
- u64 dir_gen;
int ret;
- struct send_ctx *sctx = ctx;
-
- ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
- NULL, NULL, NULL);
- if (ret)
- return ret;
- ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
- dir, dir_gen, name);
- if (ret == -ENOENT)
- ret = __record_deleted_ref(num, dir, index, name, sctx);
- else if (ret > 0)
- ret = 0;
+ ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+ sctx->cmp_key, 0, record_deleted_ref_if_needed,
+ sctx);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+out:
return ret;
}
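The comparator pair above follows the <linux/rbtree.h> convention: rb_find()
takes a three-way cmp(key, node) while rb_add() takes a boolean
less(new, existing), and the two must induce the same order, which
rbtree_ref_less() guarantees by delegating to rbtree_ref_comp(). The payoff
is in record_new_ref_if_needed()/record_deleted_ref_if_needed(): a ref found
in the opposite tree cancels out instead of turning into an unlink plus link
pair. A kernel-style sketch of the pairing on a trivial key (in-tree only):

#include <linux/rbtree.h>

struct item {
	u64 key;
	struct rb_node node;
};

static int item_cmp(const void *k, const struct rb_node *n)
{
	const struct item *key = k;
	const struct item *it = rb_entry(n, struct item, node);

	if (key->key > it->key)
		return 1;
	if (key->key < it->key)
		return -1;
	return 0;
}

/* less() for insertion must agree with cmp() for lookup. */
static bool item_less(struct rb_node *a, const struct rb_node *b)
{
	return item_cmp(rb_entry(a, struct item, node), b) < 0;
}

/*
 * Insert:  rb_add(&it->node, &root, item_less);
 * Lookup:  struct rb_node *n = rb_find(&key_item, &root, item_cmp);
 */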
@@ -4531,11 +4536,11 @@ static int record_changed_ref(struct send_ctx *sctx)
int ret = 0;
ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
- sctx->cmp_key, 0, __record_changed_new_ref, sctx);
+ sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
if (ret < 0)
goto out;
ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
- sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
+ sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx);
if (ret < 0)
goto out;
ret = 0;
@@ -4551,13 +4556,12 @@ out:
static int process_all_refs(struct send_ctx *sctx,
enum btrfs_compare_tree_result cmd)
{
- int ret;
+ int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
- struct extent_buffer *eb;
- int slot;
iterate_inode_ref_t cb;
int pending_move = 0;
@@ -4567,10 +4571,10 @@ static int process_all_refs(struct send_ctx *sctx,
if (cmd == BTRFS_COMPARE_TREE_NEW) {
root = sctx->send_root;
- cb = __record_new_ref;
+ cb = record_new_ref_if_needed;
} else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
root = sctx->parent_root;
- cb = __record_deleted_ref;
+ cb = record_deleted_ref_if_needed;
} else {
btrfs_err(sctx->send_root->fs_info,
"Wrong command %d in process_all_refs", cmd);
@@ -4581,24 +4585,7 @@ static int process_all_refs(struct send_ctx *sctx,
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &found_key, slot);
-
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
(found_key.type != BTRFS_INODE_REF_KEY &&
found_key.type != BTRFS_INODE_EXTREF_KEY))
@@ -4607,8 +4594,11 @@ static int process_all_refs(struct send_ctx *sctx,
ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
if (ret < 0)
goto out;
-
- path->slots[0]++;
+ }
+ /* Catch error found during iteration */
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto out;
}
btrfs_release_path(path);
@@ -4870,13 +4860,12 @@ out:
static int process_all_new_xattrs(struct send_ctx *sctx)
{
- int ret;
+ int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
- struct extent_buffer *eb;
- int slot;
path = alloc_path_for_send();
if (!path)
@@ -4887,43 +4876,103 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &found_key, slot);
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
- goto out;
+ break;
}
ret = iterate_dir_item(root, path, __process_new_xattr, sctx);
if (ret < 0)
- goto out;
-
- path->slots[0]++;
+ break;
}
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
-out:
btrfs_free_path(path);
return ret;
}
+static int send_verity(struct send_ctx *sctx, struct fs_path *path,
+ struct fsverity_descriptor *desc)
+{
+ int ret;
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY);
+ if (ret < 0)
+ goto out;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
+ le8_to_cpu(desc->hash_algorithm));
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE,
+ 1U << le8_to_cpu(desc->log_blocksize));
+ TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt,
+ le8_to_cpu(desc->salt_size));
+ TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature,
+ le32_to_cpu(desc->sig_size));
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ return ret;
+}
+
+static int process_verity(struct send_ctx *sctx)
+{
+ int ret = 0;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+ struct inode *inode;
+ struct fs_path *p;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ ret = btrfs_get_verity_descriptor(inode, NULL, 0);
+ if (ret < 0)
+ goto iput;
+
+ if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
+ ret = -EMSGSIZE;
+ goto iput;
+ }
+ if (!sctx->verity_descriptor) {
+ sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE,
+ GFP_KERNEL);
+ if (!sctx->verity_descriptor) {
+ ret = -ENOMEM;
+ goto iput;
+ }
+ }
+
+ ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret);
+ if (ret < 0)
+ goto iput;
+
+ p = fs_path_alloc();
+ if (!p) {
+ ret = -ENOMEM;
+ goto iput;
+ }
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto free_path;
+
+ ret = send_verity(sctx, p, sctx->verity_descriptor);
+ if (ret < 0)
+ goto free_path;
+
+free_path:
+ fs_path_free(p);
+iput:
+ iput(inode);
+ return ret;
+}
+
static inline u64 max_send_read_size(const struct send_ctx *sctx)
{
return sctx->send_max_size - SZ_16K;
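process_verity() above uses a common probe-then-fetch convention:
btrfs_get_verity_descriptor(inode, NULL, 0) returns the descriptor size and
a second call fills a buffer of that size (cached in sctx->verity_descriptor
across inodes, capped at FS_VERITY_MAX_DESCRIPTOR_SIZE). The convention as a
self-contained sketch with a stub getter:

#include <stdlib.h>
#include <string.h>

/* Stub with the same convention: buf == NULL returns the size, otherwise
 * the bytes are copied; negative means error. */
static int get_descriptor(void *buf, size_t buf_size)
{
	static const char desc[] = "fake-descriptor";

	if (!buf)
		return sizeof(desc);
	if (buf_size < sizeof(desc))
		return -1;
	memcpy(buf, desc, sizeof(desc));
	return sizeof(desc);
}

static void *fetch_descriptor(int *len)
{
	int ret = get_descriptor(NULL, 0);	/* pass 1: probe the size */
	void *buf;

	if (ret < 0)
		return NULL;
	buf = malloc(ret);
	if (!buf)
		return NULL;
	ret = get_descriptor(buf, ret);		/* pass 2: fetch the bytes */
	if (ret < 0) {
		free(buf);
		return NULL;
	}
	*len = ret;
	return buf;
}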
@@ -4931,14 +4980,28 @@ static inline u64 max_send_read_size(const struct send_ctx *sctx)
static int put_data_header(struct send_ctx *sctx, u32 len)
{
- struct btrfs_tlv_header *hdr;
+ if (WARN_ON_ONCE(sctx->put_data))
+ return -EINVAL;
+ sctx->put_data = true;
+ if (sctx->proto >= 2) {
+ /*
+		 * Since v2, the data attribute header doesn't include a length;
+		 * the data implicitly extends to the end of the command.
+ */
+ if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)
+ return -EOVERFLOW;
+ put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size);
+ sctx->send_size += sizeof(__le16);
+ } else {
+ struct btrfs_tlv_header *hdr;
- if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
- return -EOVERFLOW;
- hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
- put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
- put_unaligned_le16(len, &hdr->tlv_len);
- sctx->send_size += sizeof(*hdr);
+ if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
+ return -EOVERFLOW;
+ hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
+ put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
+ sctx->send_size += sizeof(*hdr);
+ }
return 0;
}
@@ -4946,7 +5009,6 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
struct page *page;
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
@@ -4957,40 +5019,33 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
if (ret)
return ret;
- inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
last_index = (offset + len - 1) >> PAGE_SHIFT;
- /* initial readahead */
- memset(&sctx->ra, 0, sizeof(struct file_ra_state));
- file_ra_state_init(&sctx->ra, inode->i_mapping);
-
while (index <= last_index) {
unsigned cur_len = min_t(unsigned, len,
PAGE_SIZE - pg_offset);
- page = find_lock_page(inode->i_mapping, index);
+ page = find_lock_page(sctx->cur_inode->i_mapping, index);
if (!page) {
- page_cache_sync_readahead(inode->i_mapping, &sctx->ra,
- NULL, index, last_index + 1 - index);
+ page_cache_sync_readahead(sctx->cur_inode->i_mapping,
+ &sctx->ra, NULL, index,
+ last_index + 1 - index);
- page = find_or_create_page(inode->i_mapping, index,
- GFP_KERNEL);
+ page = find_or_create_page(sctx->cur_inode->i_mapping,
+ index, GFP_KERNEL);
if (!page) {
ret = -ENOMEM;
break;
}
}
- if (PageReadahead(page)) {
- page_cache_async_readahead(inode->i_mapping, &sctx->ra,
- NULL, page, index, last_index + 1 - index);
- }
+ if (PageReadahead(page))
+ page_cache_async_readahead(sctx->cur_inode->i_mapping,
+ &sctx->ra, NULL, page_folio(page),
+ index, last_index + 1 - index);
if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
+ btrfs_read_folio(NULL, page_folio(page));
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
@@ -5013,7 +5068,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
len -= cur_len;
sctx->send_size += cur_len;
}
- iput(inode);
+
return ret;
}
@@ -5088,8 +5143,7 @@ static int send_clone(struct send_ctx *sctx,
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
if (clone_root->root == sctx->send_root) {
- ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
- &gen, NULL, NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
if (ret < 0)
goto out;
ret = get_cur_path(sctx, clone_root->ino, gen, p);
@@ -5216,16 +5270,250 @@ tlv_put_failure:
return ret;
}
-static int send_extent_data(struct send_ctx *sctx,
- const u64 offset,
- const u64 len)
+static int send_encoded_inline_extent(struct send_ctx *sctx,
+ struct btrfs_path *path, u64 offset,
+ u64 len)
{
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode *inode;
+ struct fs_path *fspath;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *ei;
+ u64 ram_bytes;
+ size_t inline_size;
+ int ret;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ fspath = fs_path_alloc();
+ if (!fspath) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei);
+ inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
+ min(key.offset + ram_bytes - offset, len));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset);
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, ei));
+ if (ret < 0)
+ goto out;
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
+
+ ret = put_data_header(sctx, inline_size);
+ if (ret < 0)
+ goto out;
+ read_extent_buffer(leaf, sctx->send_buf + sctx->send_size,
+ btrfs_file_extent_inline_start(ei), inline_size);
+ sctx->send_size += inline_size;
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+ fs_path_free(fspath);
+ iput(inode);
+ return ret;
+}
+
+static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
+ u64 offset, u64 len)
+{
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode *inode;
+ struct fs_path *fspath;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_key key;
+ struct btrfs_file_extent_item *ei;
+ u64 disk_bytenr, disk_num_bytes;
+ u32 data_offset;
+ struct btrfs_cmd_header *hdr;
+ u32 crc;
+ int ret;
+
+ inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ fspath = fs_path_alloc();
+ if (!fspath) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei);
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
+ min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset,
+ len));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN,
+ btrfs_file_extent_ram_bytes(leaf, ei));
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET,
+ offset - key.offset + btrfs_file_extent_offset(leaf, ei));
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, ei));
+ if (ret < 0)
+ goto out;
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0);
+
+ ret = put_data_header(sctx, disk_num_bytes);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * We want to do I/O directly into the send buffer, so get the next page
+ * boundary in the send buffer. This means that there may be a gap
+ * between the beginning of the command and the file data.
+ */
+ data_offset = ALIGN(sctx->send_size, PAGE_SIZE);
+ if (data_offset > sctx->send_max_size ||
+ sctx->send_max_size - data_offset < disk_num_bytes) {
+ ret = -EOVERFLOW;
+ goto out;
+ }
+
+ /*
+ * Note that send_buf is a mapping of send_buf_pages, so this is really
+ * reading into send_buf.
+ */
+ ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset,
+ disk_bytenr, disk_num_bytes,
+ sctx->send_buf_pages +
+ (data_offset >> PAGE_SHIFT));
+ if (ret)
+ goto out;
+
+ hdr = (struct btrfs_cmd_header *)sctx->send_buf;
+ hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
+ hdr->crc = 0;
+ crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size);
+ crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
+ hdr->crc = cpu_to_le32(crc);
+
+ ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
+ &sctx->send_off);
+ if (!ret) {
+ ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset,
+ disk_num_bytes, &sctx->send_off);
+ }
+ sctx->send_size = 0;
+ sctx->put_data = false;
+
+tlv_put_failure:
+out:
+ fs_path_free(fspath);
+ iput(inode);
+ return ret;
+}
+
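send_encoded_extent() above places the payload at the next page boundary
past the command header so that the encoded read can do page-aligned I/O
straight into send_buf_pages, then issues two write_buf() calls and a
single checksum that has to cover both discontiguous regions. The crc32c
seed-chaining that makes that work, sketched standalone (crc32c() is
assumed to follow the usual seed convention):

#include <stddef.h>
#include <stdint.h>

uint32_t crc32c(uint32_t seed, const void *data, size_t len);	/* assumed */

/* One checksum over two discontiguous regions: feed the first result in
 * as the seed of the second call, as the code above does. */
static uint32_t cmd_crc(const void *hdr, size_t hdr_len,
			const void *data, size_t data_len)
{
	uint32_t crc = crc32c(0, hdr, hdr_len);

	return crc32c(crc, data, data_len);
}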
+static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
+ const u64 offset, const u64 len)
+{
+ const u64 end = offset + len;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *ei;
u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, len);
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
+ btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
+ bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
+ BTRFS_FILE_EXTENT_INLINE);
+
+ /*
+ * Send the compressed extent unless the compressed data is
+ * larger than the decompressed data. This can happen if we're
+ * not sending the entire extent, either because it has been
+ * partially overwritten/truncated or because this is a part of
+ * the extent that we couldn't clone in clone_range().
+ */
+ if (is_inline &&
+ btrfs_file_extent_inline_item_len(leaf,
+ path->slots[0]) <= len) {
+ return send_encoded_inline_extent(sctx, path, offset,
+ len);
+ } else if (!is_inline &&
+ btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) {
+ return send_encoded_extent(sctx, path, offset, len);
+ }
+ }
+
+ if (sctx->cur_inode == NULL) {
+ struct btrfs_root *root = sctx->send_root;
+
+ sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
+ if (IS_ERR(sctx->cur_inode)) {
+ int err = PTR_ERR(sctx->cur_inode);
+
+ sctx->cur_inode = NULL;
+ return err;
+ }
+ memset(&sctx->ra, 0, sizeof(struct file_ra_state));
+ file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
+
+ /*
+ * It's very likely there are no pages from this inode in the page
+ * cache, so after reading extents and sending their data, we clean
+ * the page cache to avoid trashing the page cache (adding pressure
+ * to the page cache and forcing eviction of other data more useful
+ * for applications).
+ *
+		 * We decide if we should clean the page cache simply by checking
+		 * if the inode's mapping nrpages is 0 when we first open it.
+		 * Something like filemap_range_has_page(), checked before reading
+		 * each extent, would not be reliable: when we ask the readahead
+		 * code to read a given file range, it may (and almost always does)
+		 * read pages from beyond that range (see the documentation for
+		 * page_cache_sync_readahead()), so after reading the first extent
+		 * later calls to filemap_range_has_page() would return true
+		 * because the readahead on the previous extent already pulled in
+		 * pages of the current extent.
+ */
+ sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
+ sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
+ }
+
while (sent < len) {
u64 size = min(len - sent, read_size);
int ret;
@@ -5235,6 +5523,37 @@ static int send_extent_data(struct send_ctx *sctx,
return ret;
sent += size;
}
+
+ if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) {
+ /*
+ * Always operate only on ranges that are a multiple of the page
+ * size. This is not only to prevent zeroing parts of a page in
+ * the case of subpage sector size, but also to guarantee we evict
+ * pages, as passing a range that is smaller than page size does
+ * not evict the respective page (only zeroes part of its content).
+ *
+ * Always start from the end offset of the last range cleared.
+ * This is because the readahead code may (and very often does)
+	 * read pages beyond the range we request for readahead. So if
+ * we have an extent layout like this:
+ *
+ * [ extent A ] [ extent B ] [ extent C ]
+ *
+ * When we ask page_cache_sync_readahead() to read extent A, it
+ * may also trigger reads for pages of extent B. If we are doing
+ * an incremental send and extent B has not changed between the
+ * parent and send snapshots, some or all of its pages may end
+ * up being read and placed in the page cache. So when truncating
+ * the page cache we always start from the end offset of the
+ * previously processed extent up to the end of the current
+ * extent.
+ */
+ truncate_inode_pages_range(&sctx->cur_inode->i_data,
+ sctx->page_cache_clear_start,
+ end - 1);
+ sctx->page_cache_clear_start = end;
+ }
+
return 0;
}
@@ -5296,16 +5615,14 @@ out:
return ret;
}
-static int clone_range(struct send_ctx *sctx,
- struct clone_root *clone_root,
- const u64 disk_byte,
- u64 data_offset,
- u64 offset,
- u64 len)
+static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
+ struct clone_root *clone_root, const u64 disk_byte,
+ u64 data_offset, u64 offset, u64 len)
{
struct btrfs_path *path;
struct btrfs_key key;
int ret;
+ struct btrfs_inode_info info;
u64 clone_src_i_size = 0;
/*
@@ -5325,7 +5642,7 @@ static int clone_range(struct send_ctx *sctx,
*/
if (clone_root->offset == 0 &&
len == sctx->send_root->fs_info->sectorsize)
- return send_extent_data(sctx, offset, len);
+ return send_extent_data(sctx, dst_path, offset, len);
path = alloc_path_for_send();
if (!path)
@@ -5335,11 +5652,11 @@ static int clone_range(struct send_ctx *sctx,
* There are inodes that have extents that lie behind its i_size. Don't
* accept clones from these extents.
*/
- ret = __get_inode_info(clone_root->root, path, clone_root->ino,
- &clone_src_i_size, NULL, NULL, NULL, NULL, NULL);
+ ret = get_inode_info(clone_root->root, clone_root->ino, &info);
btrfs_release_path(path);
if (ret < 0)
goto out;
+ clone_src_i_size = info.size;
/*
* We can't send a clone operation for the entire range if we find
@@ -5422,7 +5739,8 @@ static int clone_range(struct send_ctx *sctx,
if (hole_len > len)
hole_len = len;
- ret = send_extent_data(sctx, offset, hole_len);
+ ret = send_extent_data(sctx, dst_path, offset,
+ hole_len);
if (ret < 0)
goto out;
@@ -5495,14 +5813,16 @@ static int clone_range(struct send_ctx *sctx,
if (ret < 0)
goto out;
}
- ret = send_extent_data(sctx, offset + slen,
+ ret = send_extent_data(sctx, dst_path,
+ offset + slen,
clone_len - slen);
} else {
ret = send_clone(sctx, offset, clone_len,
clone_root);
}
} else {
- ret = send_extent_data(sctx, offset, clone_len);
+ ret = send_extent_data(sctx, dst_path, offset,
+ clone_len);
}
if (ret < 0)
@@ -5534,7 +5854,7 @@ next:
}
if (len > 0)
- ret = send_extent_data(sctx, offset, len);
+ ret = send_extent_data(sctx, dst_path, offset, len);
else
ret = 0;
out:
@@ -5565,10 +5885,10 @@ static int send_write_or_clone(struct send_ctx *sctx,
struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
- ret = clone_range(sctx, clone_root, disk_byte, data_offset,
- offset, end - offset);
+ ret = clone_range(sctx, path, clone_root, disk_byte,
+ data_offset, offset, end - offset);
} else {
- ret = send_extent_data(sctx, offset, end - offset);
+ ret = send_extent_data(sctx, path, offset, end - offset);
}
sctx->cur_inode_next_write_offset = end;
return ret;
@@ -5965,13 +6285,12 @@ out:
static int process_all_extents(struct send_ctx *sctx)
{
- int ret;
+ int ret = 0;
+ int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
- struct extent_buffer *eb;
- int slot;
root = sctx->send_root;
path = alloc_path_for_send();
@@ -5981,41 +6300,21 @@ static int process_all_extents(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- eb = path->nodes[0];
- slot = path->slots[0];
-
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &found_key, slot);
-
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
- goto out;
+ break;
}
ret = process_extent(sctx, path, &found_key);
if (ret < 0)
- goto out;
-
- path->slots[0]++;
+ break;
}
+ /* Catch error found during iteration */
+ if (iter_ret < 0)
+ ret = iter_ret;
-out:
btrfs_free_path(path);
return ret;
}
@@ -6046,14 +6345,18 @@ out:
static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
{
int ret = 0;
+ struct btrfs_inode_info info;
u64 left_mode;
u64 left_uid;
u64 left_gid;
+ u64 left_fileattr;
u64 right_mode;
u64 right_uid;
u64 right_gid;
+ u64 right_fileattr;
int need_chmod = 0;
int need_chown = 0;
+ bool need_fileattr = false;
int need_truncate = 1;
int pending_move = 0;
int refs_processed = 0;
@@ -6085,11 +6388,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
goto out;
if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
goto out;
-
- ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
- &left_mode, &left_uid, &left_gid, NULL);
+ ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info);
if (ret < 0)
goto out;
+ left_mode = info.mode;
+ left_uid = info.uid;
+ left_gid = info.gid;
+ left_fileattr = info.fileattr;
if (!sctx->parent_root || sctx->cur_inode_new) {
need_chown = 1;
@@ -6100,16 +6405,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
} else {
u64 old_size;
- ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
- &old_size, NULL, &right_mode, &right_uid,
- &right_gid, NULL);
+ ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info);
if (ret < 0)
goto out;
+ old_size = info.size;
+ right_mode = info.mode;
+ right_uid = info.uid;
+ right_gid = info.gid;
+ right_fileattr = info.fileattr;
if (left_uid != right_uid || left_gid != right_gid)
need_chown = 1;
if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
need_chmod = 1;
+ if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr)
+ need_fileattr = true;
if ((old_size == sctx->cur_inode_size) ||
(sctx->cur_inode_size > old_size &&
sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
@@ -6153,6 +6463,17 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
if (ret < 0)
goto out;
}
+ if (need_fileattr) {
+ ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ left_fileattr);
+ if (ret < 0)
+ goto out;
+ }
+ if (sctx->cur_inode_needs_verity) {
+ ret = process_verity(sctx);
+ if (ret < 0)
+ goto out;
+ }
ret = send_capabilities(sctx);
if (ret < 0)
@@ -6183,91 +6504,28 @@ out:
return ret;
}
-struct parent_paths_ctx {
- struct list_head *refs;
- struct send_ctx *sctx;
-};
-
-static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name,
- void *ctx)
+static void close_current_inode(struct send_ctx *sctx)
{
- struct parent_paths_ctx *ppctx = ctx;
+ u64 i_size;
- return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx,
- ppctx->refs);
-}
-
-/*
- * Issue unlink operations for all paths of the current inode found in the
- * parent snapshot.
- */
-static int btrfs_unlink_all_paths(struct send_ctx *sctx)
-{
- LIST_HEAD(deleted_refs);
- struct btrfs_path *path;
- struct btrfs_key key;
- struct parent_paths_ctx ctx;
- int ret;
-
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
-
- key.objectid = sctx->cur_ino;
- key.type = BTRFS_INODE_REF_KEY;
- key.offset = 0;
- ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- ctx.refs = &deleted_refs;
- ctx.sctx = sctx;
-
- while (true) {
- struct extent_buffer *eb = path->nodes[0];
- int slot = path->slots[0];
-
- if (slot >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(sctx->parent_root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(eb, &key, slot);
- if (key.objectid != sctx->cur_ino)
- break;
- if (key.type != BTRFS_INODE_REF_KEY &&
- key.type != BTRFS_INODE_EXTREF_KEY)
- break;
-
- ret = iterate_inode_ref(sctx->parent_root, path, &key, 1,
- record_parent_ref, &ctx);
- if (ret < 0)
- goto out;
+ if (sctx->cur_inode == NULL)
+ return;
- path->slots[0]++;
- }
+ i_size = i_size_read(sctx->cur_inode);
- while (!list_empty(&deleted_refs)) {
- struct recorded_ref *ref;
+ /*
+ * If we are doing an incremental send, we may have extents between the
+ * last processed extent and the i_size that have not been processed
+ * because they haven't changed, but we may have read some of their pages
+ * through readahead; see the comments at send_extent_data().
+ */
+ if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
+ truncate_inode_pages_range(&sctx->cur_inode->i_data,
+ sctx->page_cache_clear_start,
+ round_up(i_size, PAGE_SIZE) - 1);
- ref = list_first_entry(&deleted_refs, struct recorded_ref, list);
- ret = send_unlink(sctx, ref->full_path);
- if (ret < 0)
- goto out;
- fs_path_free(ref->full_path);
- list_del(&ref->list);
- kfree(ref);
- }
- ret = 0;
-out:
- btrfs_free_path(path);
- if (ret)
- __free_recorded_refs(&deleted_refs);
- return ret;
+ iput(sctx->cur_inode);
+ sctx->cur_inode = NULL;
}
static int changed_inode(struct send_ctx *sctx,
@@ -6280,8 +6538,10 @@ static int changed_inode(struct send_ctx *sctx,
u64 left_gen = 0;
u64 right_gen = 0;
+ close_current_inode(sctx);
+
sctx->cur_ino = key->objectid;
- sctx->cur_inode_new_gen = 0;
+ sctx->cur_inode_new_gen = false;
sctx->cur_inode_last_extent = (u64)-1;
sctx->cur_inode_next_write_offset = 0;
sctx->ignore_cur_inode = false;
@@ -6322,7 +6582,7 @@ static int changed_inode(struct send_ctx *sctx,
*/
if (left_gen != right_gen &&
sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
- sctx->cur_inode_new_gen = 1;
+ sctx->cur_inode_new_gen = true;
}
/*
@@ -6334,28 +6594,39 @@ static int changed_inode(struct send_ctx *sctx,
* file descriptor against it or turning a RO snapshot into RW mode,
* keep an open file descriptor against a file, delete it and then
* turn the snapshot back to RO mode before using it for a send
- * operation. So if we find such cases, ignore the inode and all its
- * items completely if it's a new inode, or if it's a changed inode
- * make sure all its previous paths (from the parent snapshot) are all
- * unlinked and all other the inode items are ignored.
+ * operation. The former is what the receiver operation does.
+ * Therefore, if we want to send these snapshots soon after they're
+ * received, we need to handle orphan inodes as well. Moreover, orphans
+ * can appear not only in the send snapshot but also in the parent
+ * snapshot. Here are several cases:
+ *
+ * Case 1: BTRFS_COMPARE_TREE_NEW
+ * | send snapshot | action
+ * --------------------------------
+ * nlink | 0 | ignore
+ *
+ * Case 2: BTRFS_COMPARE_TREE_DELETED
+ * | parent snapshot | action
+ * ----------------------------------
+ * nlink | 0 | as usual
+ * Note: No unlinks will be sent because there are no paths for it.
+ *
+ * Case 3: BTRFS_COMPARE_TREE_CHANGED
+ * | | parent snapshot | send snapshot | action
+ * -----------------------------------------------------------------------
+ * subcase 1 | nlink | 0 | 0 | ignore
+ * subcase 2 | nlink | >0 | 0 | new_gen(deletion)
+ * subcase 3 | nlink | 0 | >0 | new_gen(creation)
+ *
*/
- if (result == BTRFS_COMPARE_TREE_NEW ||
- result == BTRFS_COMPARE_TREE_CHANGED) {
- u32 nlinks;
-
- nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
- if (nlinks == 0) {
+ if (result == BTRFS_COMPARE_TREE_NEW) {
+ if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) {
sctx->ignore_cur_inode = true;
- if (result == BTRFS_COMPARE_TREE_CHANGED)
- ret = btrfs_unlink_all_paths(sctx);
goto out;
}
- }
-
- if (result == BTRFS_COMPARE_TREE_NEW) {
sctx->cur_inode_gen = left_gen;
- sctx->cur_inode_new = 1;
- sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_new = true;
+ sctx->cur_inode_deleted = false;
sctx->cur_inode_size = btrfs_inode_size(
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6366,13 +6637,23 @@ static int changed_inode(struct send_ctx *sctx,
ret = send_create_inode_if_needed(sctx);
} else if (result == BTRFS_COMPARE_TREE_DELETED) {
sctx->cur_inode_gen = right_gen;
- sctx->cur_inode_new = 0;
- sctx->cur_inode_deleted = 1;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_deleted = true;
sctx->cur_inode_size = btrfs_inode_size(
sctx->right_path->nodes[0], right_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->right_path->nodes[0], right_ii);
} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+ u32 new_nlinks, old_nlinks;
+
+ new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
+ old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii);
+ if (new_nlinks == 0 && old_nlinks == 0) {
+ sctx->ignore_cur_inode = true;
+ goto out;
+ } else if (new_nlinks == 0 || old_nlinks == 0) {
+ sctx->cur_inode_new_gen = true;
+ }
/*
* We need to do some special handling in case the inode was
* reported as changed with a changed generation number. This
@@ -6385,8 +6666,8 @@ static int changed_inode(struct send_ctx *sctx,
* First, process the inode as if it was deleted.
*/
sctx->cur_inode_gen = right_gen;
- sctx->cur_inode_new = 0;
- sctx->cur_inode_deleted = 1;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_deleted = true;
sctx->cur_inode_size = btrfs_inode_size(
sctx->right_path->nodes[0], right_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6399,43 +6680,49 @@ static int changed_inode(struct send_ctx *sctx,
/*
* Now process the inode as if it was new.
*/
- sctx->cur_inode_gen = left_gen;
- sctx->cur_inode_new = 1;
- sctx->cur_inode_deleted = 0;
- sctx->cur_inode_size = btrfs_inode_size(
- sctx->left_path->nodes[0], left_ii);
- sctx->cur_inode_mode = btrfs_inode_mode(
- sctx->left_path->nodes[0], left_ii);
- sctx->cur_inode_rdev = btrfs_inode_rdev(
- sctx->left_path->nodes[0], left_ii);
- ret = send_create_inode_if_needed(sctx);
- if (ret < 0)
- goto out;
+ if (new_nlinks > 0) {
+ sctx->cur_inode_gen = left_gen;
+ sctx->cur_inode_new = true;
+ sctx->cur_inode_deleted = false;
+ sctx->cur_inode_size = btrfs_inode_size(
+ sctx->left_path->nodes[0],
+ left_ii);
+ sctx->cur_inode_mode = btrfs_inode_mode(
+ sctx->left_path->nodes[0],
+ left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0],
+ left_ii);
+ ret = send_create_inode_if_needed(sctx);
+ if (ret < 0)
+ goto out;
- ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
- if (ret < 0)
- goto out;
- /*
- * Advance send_progress now as we did not get into
- * process_recorded_refs_if_needed in the new_gen case.
- */
- sctx->send_progress = sctx->cur_ino + 1;
+ ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
+ if (ret < 0)
+ goto out;
+ /*
+ * Advance send_progress now as we did not get
+ * into process_recorded_refs_if_needed in the
+ * new_gen case.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
- /*
- * Now process all extents and xattrs of the inode as if
- * they were all new.
- */
- ret = process_all_extents(sctx);
- if (ret < 0)
- goto out;
- ret = process_all_new_xattrs(sctx);
- if (ret < 0)
- goto out;
+ /*
+ * Now process all extents and xattrs of the
+ * inode as if they were all new.
+ */
+ ret = process_all_extents(sctx);
+ if (ret < 0)
+ goto out;
+ ret = process_all_new_xattrs(sctx);
+ if (ret < 0)
+ goto out;
+ }
} else {
sctx->cur_inode_gen = left_gen;
- sctx->cur_inode_new = 0;
- sctx->cur_inode_new_gen = 0;
- sctx->cur_inode_deleted = 0;
+ sctx->cur_inode_new = false;
+ sctx->cur_inode_new_gen = false;
+ sctx->cur_inode_deleted = false;
sctx->cur_inode_size = btrfs_inode_size(
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
@@ -6542,18 +6829,27 @@ static int changed_extent(struct send_ctx *sctx,
return ret;
}
+static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result)
+{
+ int ret = 0;
+
+ if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+ if (result == BTRFS_COMPARE_TREE_NEW)
+ sctx->cur_inode_needs_verity = true;
+ }
+ return ret;
+}
+
static int dir_changed(struct send_ctx *sctx, u64 dir)
{
u64 orig_gen, new_gen;
int ret;
- ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL,
- NULL, NULL);
+ ret = get_inode_gen(sctx->send_root, dir, &new_gen);
if (ret)
return ret;
- ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL,
- NULL, NULL, NULL);
+ ret = get_inode_gen(sctx->parent_root, dir, &orig_gen);
if (ret)
return ret;
@@ -6696,6 +6992,9 @@ static int changed_cb(struct btrfs_path *left_path,
ret = changed_xattr(sctx, result);
else if (key->type == BTRFS_EXTENT_DATA_KEY)
ret = changed_extent(sctx, result);
+ else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY &&
+ key->offset == 0)
+ ret = changed_verity(sctx, result);
}
out:
@@ -7549,6 +7848,10 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
} else {
sctx->proto = 1;
}
+ if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) {
+ ret = -EINVAL;
+ goto out;
+ }
sctx->send_filp = fget(arg->send_fd);
if (!sctx->send_filp) {
@@ -7568,8 +7871,31 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
sctx->clone_roots_cnt = arg->clone_sources_count;
- sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
- sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
+ if (sctx->proto >= 2) {
+ u32 send_buf_num_pages;
+
+ sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE);
+ sctx->send_buf = vmalloc(sctx->send_max_size);
+ if (!sctx->send_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
+ sctx->send_buf_pages = kcalloc(send_buf_num_pages,
+ sizeof(*sctx->send_buf_pages),
+ GFP_KERNEL);
+ if (!sctx->send_buf_pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < send_buf_num_pages; i++) {
+ sctx->send_buf_pages[i] =
+ vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT));
+ }
+ } else {
+ sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
+ sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
+ }
if (!sctx->send_buf) {
ret = -ENOMEM;
goto out;
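
The proto >= 2 branch above has to use vmalloc() for the larger send buffer and then record the buffer's backing pages, because encoded writes are later handed to page-based interfaces. A minimal sketch of that pattern follows; the vbuf struct and helper name are hypothetical, not btrfs API:

#include <linux/types.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/mm.h>

struct vbuf {
	void *data;
	struct page **pages;
	u32 nr_pages;
};

static int vbuf_alloc(struct vbuf *vb, size_t size)
{
	u32 i;

	size = ALIGN(size, PAGE_SIZE);
	vb->data = vmalloc(size);
	if (!vb->data)
		return -ENOMEM;

	vb->nr_pages = size >> PAGE_SHIFT;
	vb->pages = kcalloc(vb->nr_pages, sizeof(*vb->pages), GFP_KERNEL);
	if (!vb->pages) {
		vfree(vb->data);
		return -ENOMEM;
	}

	/* vmalloc memory is not physically contiguous: look up each page */
	for (i = 0; i < vb->nr_pages; i++)
		vb->pages[i] = vmalloc_to_page(vb->data + (i << PAGE_SHIFT));
	return 0;
}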
@@ -7578,6 +7904,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
sctx->orphan_dirs = RB_ROOT;
+ sctx->rbtree_new_refs = RB_ROOT;
+ sctx->rbtree_deleted_refs = RB_ROOT;
sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
arg->clone_sources_count + 1,
@@ -7762,10 +8090,14 @@ out:
fput(sctx->send_filp);
kvfree(sctx->clone_roots);
+ kfree(sctx->send_buf_pages);
kvfree(sctx->send_buf);
+ kvfree(sctx->verity_descriptor);
name_cache_free(sctx);
+ close_current_inode(sctx);
+
kfree(sctx);
}
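
The nlink case table in the changed_inode() comment above condenses to a small decision function. A hedged sketch for the BTRFS_COMPARE_TREE_CHANGED case only; the enum and function names are hypothetical:

#include <linux/types.h>

enum orphan_action { ORPHAN_IGNORE, ORPHAN_NEW_GEN, ORPHAN_AS_USUAL };

static enum orphan_action classify_changed_inode(u32 old_nlinks, u32 new_nlinks)
{
	/* subcase 1: orphan on both sides, nothing to send */
	if (old_nlinks == 0 && new_nlinks == 0)
		return ORPHAN_IGNORE;
	/* subcases 2 and 3: orphan on one side, treat as delete + create */
	if (old_nlinks == 0 || new_nlinks == 0)
		return ORPHAN_NEW_GEN;
	return ORPHAN_AS_USUAL;
}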
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 08602fdd600a..0a4537775e0c 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -7,12 +7,19 @@
#ifndef BTRFS_SEND_H
#define BTRFS_SEND_H
-#include "ctree.h"
+#include <linux/types.h>
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
-#define BTRFS_SEND_STREAM_VERSION 1
+#define BTRFS_SEND_STREAM_VERSION 2
-#define BTRFS_SEND_BUF_SIZE SZ_64K
+/*
+ * In send stream v1, no command is larger than 64K. In send stream v2, no limit
+ * should be assumed.
+ */
+#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K
+
+struct inode;
+struct btrfs_ioctl_send_args;
enum btrfs_tlv_type {
BTRFS_TLV_U8,
@@ -46,87 +53,126 @@ struct btrfs_tlv_header {
/* commands */
enum btrfs_send_cmd {
- BTRFS_SEND_C_UNSPEC,
+ BTRFS_SEND_C_UNSPEC = 0,
/* Version 1 */
- BTRFS_SEND_C_SUBVOL,
- BTRFS_SEND_C_SNAPSHOT,
+ BTRFS_SEND_C_SUBVOL = 1,
+ BTRFS_SEND_C_SNAPSHOT = 2,
- BTRFS_SEND_C_MKFILE,
- BTRFS_SEND_C_MKDIR,
- BTRFS_SEND_C_MKNOD,
- BTRFS_SEND_C_MKFIFO,
- BTRFS_SEND_C_MKSOCK,
- BTRFS_SEND_C_SYMLINK,
+ BTRFS_SEND_C_MKFILE = 3,
+ BTRFS_SEND_C_MKDIR = 4,
+ BTRFS_SEND_C_MKNOD = 5,
+ BTRFS_SEND_C_MKFIFO = 6,
+ BTRFS_SEND_C_MKSOCK = 7,
+ BTRFS_SEND_C_SYMLINK = 8,
- BTRFS_SEND_C_RENAME,
- BTRFS_SEND_C_LINK,
- BTRFS_SEND_C_UNLINK,
- BTRFS_SEND_C_RMDIR,
+ BTRFS_SEND_C_RENAME = 9,
+ BTRFS_SEND_C_LINK = 10,
+ BTRFS_SEND_C_UNLINK = 11,
+ BTRFS_SEND_C_RMDIR = 12,
- BTRFS_SEND_C_SET_XATTR,
- BTRFS_SEND_C_REMOVE_XATTR,
+ BTRFS_SEND_C_SET_XATTR = 13,
+ BTRFS_SEND_C_REMOVE_XATTR = 14,
- BTRFS_SEND_C_WRITE,
- BTRFS_SEND_C_CLONE,
+ BTRFS_SEND_C_WRITE = 15,
+ BTRFS_SEND_C_CLONE = 16,
- BTRFS_SEND_C_TRUNCATE,
- BTRFS_SEND_C_CHMOD,
- BTRFS_SEND_C_CHOWN,
- BTRFS_SEND_C_UTIMES,
+ BTRFS_SEND_C_TRUNCATE = 17,
+ BTRFS_SEND_C_CHMOD = 18,
+ BTRFS_SEND_C_CHOWN = 19,
+ BTRFS_SEND_C_UTIMES = 20,
- BTRFS_SEND_C_END,
- BTRFS_SEND_C_UPDATE_EXTENT,
- __BTRFS_SEND_C_MAX_V1,
+ BTRFS_SEND_C_END = 21,
+ BTRFS_SEND_C_UPDATE_EXTENT = 22,
+ BTRFS_SEND_C_MAX_V1 = 22,
/* Version 2 */
- __BTRFS_SEND_C_MAX_V2,
-
+ BTRFS_SEND_C_FALLOCATE = 23,
+ BTRFS_SEND_C_FILEATTR = 24,
+ BTRFS_SEND_C_ENCODED_WRITE = 25,
+ BTRFS_SEND_C_MAX_V2 = 25,
+
+ /* Version 3 */
+ BTRFS_SEND_C_ENABLE_VERITY = 26,
+ BTRFS_SEND_C_MAX_V3 = 26,
/* End */
- __BTRFS_SEND_C_MAX,
+ BTRFS_SEND_C_MAX = 26,
};
-#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
/* attributes in send stream */
enum {
- BTRFS_SEND_A_UNSPEC,
-
- BTRFS_SEND_A_UUID,
- BTRFS_SEND_A_CTRANSID,
-
- BTRFS_SEND_A_INO,
- BTRFS_SEND_A_SIZE,
- BTRFS_SEND_A_MODE,
- BTRFS_SEND_A_UID,
- BTRFS_SEND_A_GID,
- BTRFS_SEND_A_RDEV,
- BTRFS_SEND_A_CTIME,
- BTRFS_SEND_A_MTIME,
- BTRFS_SEND_A_ATIME,
- BTRFS_SEND_A_OTIME,
-
- BTRFS_SEND_A_XATTR_NAME,
- BTRFS_SEND_A_XATTR_DATA,
-
- BTRFS_SEND_A_PATH,
- BTRFS_SEND_A_PATH_TO,
- BTRFS_SEND_A_PATH_LINK,
-
- BTRFS_SEND_A_FILE_OFFSET,
- BTRFS_SEND_A_DATA,
-
- BTRFS_SEND_A_CLONE_UUID,
- BTRFS_SEND_A_CLONE_CTRANSID,
- BTRFS_SEND_A_CLONE_PATH,
- BTRFS_SEND_A_CLONE_OFFSET,
- BTRFS_SEND_A_CLONE_LEN,
-
- __BTRFS_SEND_A_MAX,
+ BTRFS_SEND_A_UNSPEC = 0,
+
+ /* Version 1 */
+ BTRFS_SEND_A_UUID = 1,
+ BTRFS_SEND_A_CTRANSID = 2,
+
+ BTRFS_SEND_A_INO = 3,
+ BTRFS_SEND_A_SIZE = 4,
+ BTRFS_SEND_A_MODE = 5,
+ BTRFS_SEND_A_UID = 6,
+ BTRFS_SEND_A_GID = 7,
+ BTRFS_SEND_A_RDEV = 8,
+ BTRFS_SEND_A_CTIME = 9,
+ BTRFS_SEND_A_MTIME = 10,
+ BTRFS_SEND_A_ATIME = 11,
+ BTRFS_SEND_A_OTIME = 12,
+
+ BTRFS_SEND_A_XATTR_NAME = 13,
+ BTRFS_SEND_A_XATTR_DATA = 14,
+
+ BTRFS_SEND_A_PATH = 15,
+ BTRFS_SEND_A_PATH_TO = 16,
+ BTRFS_SEND_A_PATH_LINK = 17,
+
+ BTRFS_SEND_A_FILE_OFFSET = 18,
+ /*
+ * As of send stream v2, this attribute is special: it must be the last
+ * attribute in a command, its header contains only the type, and its
+ * length is implicitly the remaining length of the command.
+ */
+ BTRFS_SEND_A_DATA = 19,
+
+ BTRFS_SEND_A_CLONE_UUID = 20,
+ BTRFS_SEND_A_CLONE_CTRANSID = 21,
+ BTRFS_SEND_A_CLONE_PATH = 22,
+ BTRFS_SEND_A_CLONE_OFFSET = 23,
+ BTRFS_SEND_A_CLONE_LEN = 24,
+
+ BTRFS_SEND_A_MAX_V1 = 24,
+
+ /* Version 2 */
+ BTRFS_SEND_A_FALLOCATE_MODE = 25,
+
+ /*
+ * File attributes from the FS_*_FL namespace (i_flags, xflags),
+ * translated to BTRFS_INODE_* bits (BTRFS_INODE_FLAG_MASK) and stored
+ * in btrfs_inode_item::flags (represented by btrfs_inode::flags and
+ * btrfs_inode::ro_flags).
+ */
+ BTRFS_SEND_A_FILEATTR = 26,
+
+ BTRFS_SEND_A_UNENCODED_FILE_LEN = 27,
+ BTRFS_SEND_A_UNENCODED_LEN = 28,
+ BTRFS_SEND_A_UNENCODED_OFFSET = 29,
+ /*
+ * COMPRESSION and ENCRYPTION default to NONE (0) if omitted from
+ * BTRFS_SEND_C_ENCODED_WRITE.
+ */
+ BTRFS_SEND_A_COMPRESSION = 30,
+ BTRFS_SEND_A_ENCRYPTION = 31,
+ BTRFS_SEND_A_MAX_V2 = 31,
+
+ /* Version 3 */
+ BTRFS_SEND_A_VERITY_ALGORITHM = 32,
+ BTRFS_SEND_A_VERITY_BLOCK_SIZE = 33,
+ BTRFS_SEND_A_VERITY_SALT_DATA = 34,
+ BTRFS_SEND_A_VERITY_SIG_DATA = 35,
+ BTRFS_SEND_A_MAX_V3 = 35,
+
+ __BTRFS_SEND_A_MAX = 35,
};
-#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
-#ifdef __KERNEL__
long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg);
-#endif
#endif
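
Since BTRFS_SEND_A_DATA (value 19 above) carries no tlv_len field in stream v2, a receiver must derive its payload length from the enclosing command. A hedged userspace sketch, assuming a little-endian host; the function name is illustrative:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define SEND_A_DATA 19	/* BTRFS_SEND_A_DATA from the enum above */

/* Return the payload length of a trailing DATA attribute, or -1. */
static long parse_data_attr(const uint8_t *cmd, size_t cmd_len, size_t off)
{
	uint16_t type;

	if (off + sizeof(type) > cmd_len)
		return -1;
	memcpy(&type, cmd + off, sizeof(type));
	if (type != SEND_A_DATA)
		return -1;
	/* v2: no length field, the payload runs to the end of the command */
	return (long)(cmd_len - off - sizeof(type));
}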
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index b87931a458eb..f171bf875633 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -9,6 +9,7 @@
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"
+#include "zoned.h"
/*
* HOW DOES SPACE RESERVATION WORK
@@ -181,6 +182,43 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
found->full = 0;
}
+/*
+ * Block groups with more than this value (percent) of unusable space will be
+ * scheduled for background reclaim.
+ */
+#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
+
+/*
+ * Calculate chunk size depending on volume type (regular or zoned).
+ */
+static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
+{
+ if (btrfs_is_zoned(fs_info))
+ return fs_info->zone_size;
+
+ ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ return BTRFS_MAX_DATA_CHUNK_SIZE;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return SZ_32M;
+
+ /* Handle BTRFS_BLOCK_GROUP_METADATA */
+ if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
+ return SZ_1G;
+
+ return SZ_256M;
+}
+
+/*
+ * Update default chunk size.
+ */
+void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
+ u64 chunk_size)
+{
+ WRITE_ONCE(space_info->chunk_size, chunk_size);
+}
+
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
@@ -202,6 +240,10 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->tickets);
INIT_LIST_HEAD(&space_info->priority_tickets);
space_info->clamp = 1;
+ btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));
+
+ if (btrfs_is_zoned(info))
+ space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
ret = btrfs_sysfs_add_space_info_type(info, space_info);
if (ret)
@@ -251,30 +293,36 @@ out:
return ret;
}
-void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
- u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly, u64 bytes_zone_unusable,
- struct btrfs_space_info **space_info)
+void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+ struct btrfs_block_group *block_group)
{
struct btrfs_space_info *found;
- int factor;
+ int factor, index;
- factor = btrfs_bg_type_to_factor(flags);
+ factor = btrfs_bg_type_to_factor(block_group->flags);
- found = btrfs_find_space_info(info, flags);
+ found = btrfs_find_space_info(info, block_group->flags);
ASSERT(found);
spin_lock(&found->lock);
- found->total_bytes += total_bytes;
- found->disk_total += total_bytes * factor;
- found->bytes_used += bytes_used;
- found->disk_used += bytes_used * factor;
- found->bytes_readonly += bytes_readonly;
- found->bytes_zone_unusable += bytes_zone_unusable;
- if (total_bytes > 0)
+ found->total_bytes += block_group->length;
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
+ found->active_total_bytes += block_group->length;
+ found->disk_total += block_group->length * factor;
+ found->bytes_used += block_group->used;
+ found->disk_used += block_group->used * factor;
+ found->bytes_readonly += block_group->bytes_super;
+ found->bytes_zone_unusable += block_group->zone_unusable;
+ if (block_group->length > 0)
found->full = 0;
btrfs_try_granting_tickets(info, found);
spin_unlock(&found->lock);
- *space_info = found;
+
+ block_group->space_info = found;
+
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+ down_write(&found->groups_sem);
+ list_add_tail(&block_group->list, &found->block_groups[index]);
+ up_write(&found->groups_sem);
}
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
@@ -328,6 +376,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
return avail;
}
+static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ /*
+ * On a regular filesystem, all of total_bytes is always writable. On a
+ * zoned filesystem, there may be a limitation imposed by max_active_zones.
+ * For metadata allocation, we cannot finish an existing active block
+ * group to avoid a deadlock. Thus, we need to consider only the active
+ * block groups as writable for metadata space.
+ */
+ if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
+ return space_info->total_bytes;
+
+ return space_info->active_total_bytes;
+}
+
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
@@ -340,9 +404,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
return 0;
used = btrfs_space_info_used(space_info, true);
- avail = calc_available_free_space(fs_info, space_info, flush);
+ if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA))
+ avail = 0;
+ else
+ avail = calc_available_free_space(fs_info, space_info, flush);
- if (used + bytes < space_info->total_bytes + avail)
+ if (used + bytes < writable_total_bytes(fs_info, space_info) + avail)
return 1;
return 0;
}
@@ -378,7 +445,7 @@ again:
ticket = list_first_entry(head, struct reserve_ticket, list);
/* Check and see if our ticket can be satisfied now. */
- if ((used + ticket->bytes <= space_info->total_bytes) ||
+ if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) ||
btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
flush)) {
btrfs_space_info_update_bytes_may_use(fs_info,
@@ -409,28 +476,47 @@ do { \
spin_unlock(&__rsv->lock); \
} while (0)
+static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info)
+{
+ switch (space_info->flags) {
+ case BTRFS_BLOCK_GROUP_SYSTEM:
+ return "SYSTEM";
+ case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
+ return "DATA+METADATA";
+ case BTRFS_BLOCK_GROUP_DATA:
+ return "DATA";
+ case BTRFS_BLOCK_GROUP_METADATA:
+ return "METADATA";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+}
+
static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info)
{
+ const char *flag_str = space_info_flag_to_str(info);
lockdep_assert_held(&info->lock);
/* The free space could be negative in case of overcommit */
- btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull",
- info->flags,
+ btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
+ flag_str,
(s64)(info->total_bytes - btrfs_space_info_used(info, true)),
info->full ? "" : "not ");
btrfs_info(fs_info,
- "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
+"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
info->total_bytes, info->bytes_used, info->bytes_pinned,
info->bytes_reserved, info->bytes_may_use,
info->bytes_readonly, info->bytes_zone_unusable);
-
- DUMP_BLOCK_RSV(fs_info, global_block_rsv);
- DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
- DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
- DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
- DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
-
}
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
@@ -442,6 +528,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
spin_lock(&info->lock);
__btrfs_dump_space_info(fs_info, info);
+ dump_global_block_rsv(fs_info);
spin_unlock(&info->lock);
if (!dump_block_groups)
@@ -519,7 +606,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
}
- trans = (struct btrfs_trans_handle *)current->journal_info;
+ trans = current->journal_info;
/*
* If we are doing more ordered than delalloc we need to just wait on
@@ -662,6 +749,18 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case ALLOC_CHUNK:
case ALLOC_CHUNK_FORCE:
+ /*
+ * For metadata space on a zoned filesystem, reaching here means we
+ * don't have enough space left in active_total_bytes. Try to
+ * activate a block group first, because we may already have an
+ * inactive block group allocated.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false);
+ if (ret < 0)
+ break;
+ else if (ret == 1)
+ break;
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -672,6 +771,23 @@ static void flush_space(struct btrfs_fs_info *fs_info,
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
+
+ /*
+ * For metadata space on a zoned filesystem, allocating a new chunk
+ * is not enough. We still need to activate the block group.
+ * Activate the newly allocated block group by (maybe) finishing
+ * a block group.
+ */
+ if (ret == 1) {
+ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true);
+ /*
+ * Revert to the original ret regardless of whether we could
+ * finish one block group or not.
+ */
+ if (ret >= 0)
+ ret = 1;
+ }
+
if (ret > 0 || ret == -ENOSPC)
ret = 0;
break;
@@ -709,6 +825,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
{
u64 used;
u64 avail;
+ u64 total;
u64 to_reclaim = space_info->reclaim_size;
lockdep_assert_held(&space_info->lock);
@@ -723,8 +840,9 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
* space. If that's the case add in our overage so we make sure to put
* appropriate pressure on the flushing state machine.
*/
- if (space_info->total_bytes + avail < used)
- to_reclaim += used - (space_info->total_bytes + avail);
+ total = writable_total_bytes(fs_info, space_info);
+ if (total + avail < used)
+ to_reclaim += used - (total + avail);
return to_reclaim;
}
@@ -734,9 +852,12 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
{
u64 global_rsv_size = fs_info->global_block_rsv.reserved;
u64 ordered, delalloc;
- u64 thresh = div_factor_fine(space_info->total_bytes, 90);
+ u64 total = writable_total_bytes(fs_info, space_info);
+ u64 thresh;
u64 used;
+ thresh = div_factor_fine(total, 90);
+
lockdep_assert_held(&space_info->lock);
/* If we're just plain full then async reclaim just slows us down. */
@@ -798,8 +919,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
BTRFS_RESERVE_FLUSH_ALL);
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_readonly + global_rsv_size;
- if (used < space_info->total_bytes)
- thresh += space_info->total_bytes - used;
+ if (used < total)
+ thresh += total - used;
thresh >>= space_info->clamp;
used = space_info->bytes_pinned;
@@ -1271,7 +1392,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
/*
* This is the priority reclaim path, so to_reclaim could be >0 still
- * because we may have only satisified the priority tickets and still
+ * because we may have only satisfied the priority tickets and still
* left non priority tickets on the list. We would then have
* to_reclaim but ->bytes == 0.
*/
@@ -1516,7 +1637,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* can_overcommit() to ensure we can overcommit to continue.
*/
if (!pending_tickets &&
- ((used + orig_bytes <= space_info->total_bytes) ||
+ ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) ||
btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
orig_bytes);
@@ -1565,7 +1686,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
&space_info->priority_tickets);
}
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
- used += orig_bytes;
/*
* We will do the space reservation dance during log replay,
* which means we won't have fs_info->fs_root set, so don't do
@@ -1640,7 +1760,8 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
int ret;
ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
- flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
+ flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
+ flush == BTRFS_RESERVE_NO_FLUSH);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
@@ -1652,3 +1773,17 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
}
return ret;
}
+
+/* Dump all the space infos when we abort a transaction due to ENOSPC. */
+__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+
+ btrfs_info(fs_info, "dumping space info:");
+ list_for_each_entry(space_info, &fs_info->space_info, list) {
+ spin_lock(&space_info->lock);
+ __btrfs_dump_space_info(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+ }
+ dump_global_block_rsv(fs_info);
+}
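
The zoned changes above all funnel into one rule: metadata reservations may neither overcommit nor count inactive block groups. A condensed sketch of that rule, not kernel API (in-tree it would need "space-info.h" and "zoned.h"; the kernel actually splits this across writable_total_bytes() and btrfs_can_overcommit(), collapsed here for brevity):

static bool can_reserve_sketch(struct btrfs_fs_info *fs_info,
			       struct btrfs_space_info *si,
			       u64 used, u64 bytes, u64 avail)
{
	u64 total = si->total_bytes;

	if (btrfs_is_zoned(fs_info) &&
	    (si->flags & BTRFS_BLOCK_GROUP_METADATA)) {
		total = si->active_total_bytes;	/* only active zones count */
		avail = 0;			/* and no overcommit */
	}
	return used + bytes < total + avail;
}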
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index d841fed73492..ce66023a9eb8 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_SPACE_INFO_H
#define BTRFS_SPACE_INFO_H
+#include "volumes.h"
+
struct btrfs_space_info {
spinlock_t lock;
@@ -17,12 +19,22 @@ struct btrfs_space_info {
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
u64 bytes_readonly; /* total bytes that are read only */
+ /* Total bytes in the space, but only accounts active block groups. */
+ u64 active_total_bytes;
u64 bytes_zone_unusable; /* total bytes that are unusable until
resetting the device zone */
u64 max_extent_size; /* This will hold the maximum extent size of
the space info if we had an ENOSPC in the
allocator. */
+ /* Chunk size in bytes */
+ u64 chunk_size;
+
+ /*
+ * Once a block group drops below this threshold (percent) we'll
+ * schedule it for reclaim.
+ */
+ int bg_reclaim_threshold;
int clamp; /* Used to scale our threshold for preemptive
flushing. The value is >> clamp, so turns
@@ -111,10 +123,10 @@ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");
DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
-void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
- u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly, u64 bytes_zone_unusable,
- struct btrfs_space_info **space_info);
+void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+ struct btrfs_block_group *block_group);
+void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
+ u64 chunk_size);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags);
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
@@ -145,4 +157,7 @@ static inline void btrfs_space_info_free_bytes_may_use(
}
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
+void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
+
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index f429256f56db..12455b2b41de 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -12,15 +12,10 @@ static bool check_setget_bounds(const struct extent_buffer *eb,
{
const unsigned long member_offset = (unsigned long)ptr + off;
- if (member_offset > eb->len) {
+ if (unlikely(member_offset + size > eb->len)) {
btrfs_warn(eb->fs_info,
- "bad eb member start: ptr 0x%lx start %llu member offset %lu size %d",
- (unsigned long)ptr, eb->start, member_offset, size);
- return false;
- }
- if (member_offset + size > eb->len) {
- btrfs_warn(eb->fs_info,
- "bad eb member end: ptr 0x%lx start %llu member offset %lu size %d",
+ "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d",
+ (member_offset > eb->len ? "start" : "end"),
(unsigned long)ptr, eb->start, member_offset, size);
return false;
}
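
The merged check works because, for a non-zero size, member_offset > eb->len already implies member_offset + size > eb->len, so a single comparison catches both failure modes and the message only has to classify them afterwards. A standalone demonstration:

#include <stdio.h>

static void check(unsigned long off, int size, unsigned long len)
{
	if (off + size > len)
		printf("bad eb member %s: offset %lu size %d len %lu\n",
		       off > len ? "start" : "end", off, size, len);
}

int main(void)
{
	check(5000, 8, 4096);	/* start is already beyond the buffer */
	check(4092, 8, 4096);	/* start fits, but the end overflows */
	return 0;
}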
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index ef7ae20d2b77..6fc2b77ae5c3 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -63,6 +63,29 @@
* This means a slightly higher tree locking latency.
*/
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
+{
+ if (fs_info->sectorsize >= PAGE_SIZE)
+ return false;
+
+ /*
+ * Only data pages (either through DIO or compression) can have no
+ * mapping. And if page->mapping->host is a data inode, it's a subpage
+ * page, as we have already ruled out the sectorsize >= PAGE_SIZE case.
+ */
+ if (!page->mapping || !page->mapping->host ||
+ is_data_inode(page->mapping->host))
+ return true;
+
+ /*
+ * Now the only remaining case is metadata, which goes through the
+ * subpage routine only if nodesize < PAGE_SIZE.
+ */
+ if (fs_info->nodesize < PAGE_SIZE)
+ return true;
+ return false;
+}
+
void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
{
unsigned int cur = 0;
@@ -100,14 +123,14 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct btrfs_subpage *subpage;
/*
- * We have cases like a dummy extent buffer page, which is not mappped
+ * We have cases like a dummy extent buffer page, which is not mapped
* and doesn't need to be locked.
*/
if (page->mapping)
ASSERT(PageLocked(page));
/* Either not subpage, or the page already has private attached */
- if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
+ if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page))
return 0;
subpage = btrfs_alloc_subpage(fs_info, type);
@@ -124,10 +147,10 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
struct btrfs_subpage *subpage;
/* Either not subpage, or already detached */
- if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
+ if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page))
return;
- subpage = (struct btrfs_subpage *)detach_page_private(page);
+ subpage = detach_page_private(page);
ASSERT(subpage);
btrfs_free_subpage(subpage);
}
@@ -175,7 +198,7 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
{
struct btrfs_subpage *subpage;
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->mapping);
@@ -190,7 +213,7 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
{
struct btrfs_subpage *subpage;
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->mapping);
@@ -319,7 +342,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {
lock_page(page);
return 0;
}
@@ -336,7 +359,7 @@ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))
return unlock_page(page);
btrfs_subpage_clamp_range(page, &start, &len);
if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
@@ -620,7 +643,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
set_page_func(page); \
return; \
} \
@@ -629,7 +652,7 @@ void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
clear_page_func(page); \
return; \
} \
@@ -638,14 +661,14 @@ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
return test_page_func(page); \
return btrfs_subpage_test_##name(fs_info, page, start, len); \
} \
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
set_page_func(page); \
return; \
} \
@@ -655,7 +678,7 @@ void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
clear_page_func(page); \
return; \
} \
@@ -665,7 +688,7 @@ void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
return test_page_func(page); \
btrfs_subpage_clamp_range(page, &start, &len); \
return btrfs_subpage_test_##name(fs_info, page, start, len); \
@@ -694,7 +717,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
return;
ASSERT(!PageDirty(page));
- if (fs_info->sectorsize == PAGE_SIZE)
+ if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->private);
@@ -708,7 +731,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
* It should not have any subpage::writers count.
* Can be unlocked by unlock_page().
* This is the most common locked page for __extent_writepage() called
- * inside extent_write_cache_pages() or extent_write_full_page().
+ * inside extent_write_cache_pages().
* Rarer cases include the @locked_page from extent_write_locked_range().
*
* - Page locked by lock_delalloc_pages()
@@ -722,8 +745,8 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
struct btrfs_subpage *subpage;
ASSERT(PageLocked(page));
- /* For regular page size case, we just unlock the page */
- if (fs_info->sectorsize == PAGE_SIZE)
+ /* For non-subpage case, we just unlock the page */
+ if (!btrfs_is_subpage(fs_info, page))
return unlock_page(page);
ASSERT(PagePrivate(page) && page->private);
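
The new btrfs_is_subpage() splits the decision by page type: data pages are subpage whenever sectorsize < PAGE_SIZE, while metadata pages follow nodesize. A hedged sketch with the page-cache lookups replaced by an explicit flag:

static bool is_subpage_sketch(u32 sectorsize, u32 nodesize, u32 page_size,
			      bool is_data)
{
	if (sectorsize >= page_size)
		return false;		/* regular case, never subpage */
	if (is_data)
		return true;		/* data sectors smaller than a page */
	return nodesize < page_size;	/* metadata follows nodesize */
}

For example, with 4K sectors and 64K nodes on a 64K-page machine, data pages take the subpage path while extent buffer pages do not.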
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 7accb5c40d33..0e80ad336904 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -74,6 +74,8 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page);
+
void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b228efe8ab6e..9be4fd2db0f4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -48,6 +48,7 @@
#include "block-group.h"
#include "discard.h"
#include "qgroup.h"
+#include "raid56.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -72,7 +73,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data);
#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
/*
- * Characters to print to indicate error conditions or uncommon filesystem sate.
+ * Characters to print to indicate error conditions or uncommon filesystem state.
* RO is not an error.
*/
static const char fs_state_chars[] = {
@@ -261,7 +262,7 @@ static struct ratelimit_state printk_limits[] = {
RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
};
-void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
@@ -292,10 +293,10 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, .
char statestr[STATE_STRING_BUF_LEN];
btrfs_state_to_string(fs_info, statestr);
- printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
+ _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
fs_info->sb->s_id, statestr, &vaf);
} else {
- printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
+ _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
}
}
@@ -345,12 +346,14 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
- unsigned int line, int errno)
+ unsigned int line, int errno, bool first_hit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
WRITE_ONCE(trans->aborted, errno);
WRITE_ONCE(trans->transaction->aborted, errno);
+ if (first_hit && errno == -ENOSPC)
+ btrfs_dump_space_info_for_trans_abort(fs_info);
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
wake_up(&fs_info->transaction_blocked_wait);
@@ -625,6 +628,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
int saved_compress_level;
bool saved_compress_force;
int no_compress = 0;
+ const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state);
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
@@ -763,6 +767,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
compress_force = false;
no_compress++;
} else {
+ btrfs_err(info, "unrecognized compression value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -821,8 +827,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_thread_pool:
ret = match_int(&args[0], &intarg);
if (ret) {
+ btrfs_err(info, "unrecognized thread_pool value %s",
+ args[0].from);
goto out;
} else if (intarg == 0) {
+ btrfs_err(info, "invalid value 0 for thread_pool");
ret = -EINVAL;
goto out;
}
@@ -883,8 +892,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_ratio:
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info, "unrecognized metadata_ratio value %s",
+ args[0].from);
goto out;
+ }
info->metadata_ratio = intarg;
btrfs_info(info, "metadata ratio %u",
info->metadata_ratio);
@@ -901,6 +913,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_and_info(info, DISCARD_ASYNC,
"turning on async discard");
} else {
+ btrfs_err(info, "unrecognized discard mode value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -933,6 +947,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_and_info(info, FREE_SPACE_TREE,
"enabling free space tree");
} else {
+ btrfs_err(info, "unrecognized space_cache value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -1014,8 +1030,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_check_integrity_print_mask:
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info,
+ "unrecognized check_integrity_print_mask value %s",
+ args[0].from);
goto out;
+ }
info->check_integrity_print_mask = intarg;
btrfs_info(info, "check_integrity_print_mask 0x%x",
info->check_integrity_print_mask);
@@ -1030,13 +1050,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
goto out;
#endif
case Opt_fatal_errors:
- if (strcmp(args[0].from, "panic") == 0)
+ if (strcmp(args[0].from, "panic") == 0) {
btrfs_set_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
- else if (strcmp(args[0].from, "bug") == 0)
+ } else if (strcmp(args[0].from, "bug") == 0) {
btrfs_clear_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
- else {
+ } else {
+ btrfs_err(info, "unrecognized fatal_errors value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -1044,8 +1066,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_commit_interval:
intarg = 0;
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info, "unrecognized commit_interval value %s",
+ args[0].from);
+ ret = -EINVAL;
goto out;
+ }
if (intarg == 0) {
btrfs_info(info,
"using default commit interval %us",
@@ -1059,8 +1085,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_rescue:
ret = parse_rescue_options(info, args[0].from);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_err(info, "unrecognized rescue value %s",
+ args[0].from);
goto out;
+ }
break;
#ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
@@ -1111,10 +1140,12 @@ out:
}
if (!ret)
ret = btrfs_check_mountopts_zoned(info);
- if (!ret && btrfs_test_opt(info, SPACE_CACHE))
- btrfs_info(info, "disk space caching is enabled");
- if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
- btrfs_info(info, "using free space tree");
+ if (!ret && !remounting) {
+ if (btrfs_test_opt(info, SPACE_CACHE))
+ btrfs_info(info, "disk space caching is enabled");
+ if (btrfs_test_opt(info, FREE_SPACE_TREE))
+ btrfs_info(info, "using free space tree");
+ }
return ret;
}
@@ -1790,6 +1821,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
error = -EBUSY;
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
+ s->s_id);
btrfs_sb(s)->bdev_holder = fs_type;
if (!strstr(crc32c_impl(), "generic"))
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
@@ -1903,17 +1936,12 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
old_pool_size, new_pool_size);
btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
- new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
- new_pool_size);
}
static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
@@ -1986,6 +2014,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto restore;
+ ret = btrfs_check_features(fs_info, sb);
+ if (ret < 0)
+ goto restore;
+
btrfs_remount_begin(fs_info, old_opts, *flags);
btrfs_resize_thread_pool(fs_info,
fs_info->thread_pool_size, old_thread_pool_size);
@@ -2214,12 +2246,8 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
if (type & BTRFS_BLOCK_GROUP_RAID0)
num_stripes = nr_devices;
- else if (type & BTRFS_BLOCK_GROUP_RAID1)
- num_stripes = 2;
- else if (type & BTRFS_BLOCK_GROUP_RAID1C3)
- num_stripes = 3;
- else if (type & BTRFS_BLOCK_GROUP_RAID1C4)
- num_stripes = 4;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1_MASK)
+ num_stripes = rattr->ncopies;
else if (type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = 4;
@@ -2243,17 +2271,13 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN);
/*
- * In order to avoid overwriting the superblock on the drive,
- * btrfs starts at an offset of at least 1MB when doing chunk
- * allocation.
- *
- * This ensures we have at least min_stripe_size free space
- * after excluding 1MB.
+ * Ensure we have at least min_stripe_size on top of the
+ * reserved space on the device.
*/
- if (avail_space <= SZ_1M + min_stripe_size)
+ if (avail_space <= BTRFS_DEVICE_RANGE_RESERVED + min_stripe_size)
continue;
- avail_space -= SZ_1M;
+ avail_space -= BTRFS_DEVICE_RANGE_RESERVED;
devices_info[i].dev = device;
devices_info[i].max_avail = avail_space;
@@ -2527,11 +2551,71 @@ static int btrfs_freeze(struct super_block *sb)
return btrfs_commit_transaction(trans);
}
+static int check_dev_super(struct btrfs_device *dev)
+{
+ struct btrfs_fs_info *fs_info = dev->fs_info;
+ struct btrfs_super_block *sb;
+ int ret = 0;
+
+ /* This should be called with fs still frozen. */
+ ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags));
+
+ /* Missing dev, no need to check. */
+ if (!dev->bdev)
+ return 0;
+
+ /* Only need to check the primary super block. */
+ sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
+ if (IS_ERR(sb))
+ return PTR_ERR(sb);
+
+ /* btrfs_validate_super() includes the fsid check against super->fsid. */
+ ret = btrfs_validate_super(fs_info, sb, 0);
+ if (ret < 0)
+ goto out;
+
+ if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
+ btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
+ btrfs_super_generation(sb),
+ fs_info->last_trans_committed);
+ ret = -EUCLEAN;
+ goto out;
+ }
+out:
+ btrfs_release_disk_super(sb);
+ return ret;
+}
+
static int btrfs_unfreeze(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_device *device;
+ int ret = 0;
+ /*
+ * Make sure the fs was not changed by accident (e.g. hibernation and
+ * then modification by another OS).
+ * If we find anything wrong, we mark the fs with an error immediately.
+ *
+ * And since the fs is frozen, no one can modify it yet, thus
+ * we don't need to hold device_list_mutex.
+ */
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ ret = check_dev_super(device);
+ if (ret < 0) {
+ btrfs_handle_fs_error(fs_info, ret,
+ "super block on devid %llu got modified unexpectedly",
+ device->devid);
+ break;
+ }
+ }
clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
+
+ /*
+ * We still return 0 to allow the VFS layer to unfreeze the fs even if
+ * the above checks failed. Since the fs is either fine or read-only,
+ * we're safe to continue without causing further damage.
+ */
return 0;
}
@@ -2639,17 +2723,21 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_compress;
- err = extent_io_init();
+ err = extent_state_init_cachep();
if (err)
goto free_cachep;
- err = extent_state_cache_init();
+ err = extent_buffer_init_cachep();
if (err)
- goto free_extent_io;
+ goto free_extent_cachep;
+
+ err = btrfs_bioset_init();
+ if (err)
+ goto free_eb_cachep;
err = extent_map_init();
if (err)
- goto free_extent_state_cache;
+ goto free_bioset;
err = ordered_data_init();
if (err)
@@ -2671,13 +2759,9 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_delayed_ref;
- err = btrfs_end_io_wq_init();
- if (err)
- goto free_prelim_ref;
-
err = btrfs_interface_init();
if (err)
- goto free_end_io_wq;
+ goto free_prelim_ref;
btrfs_print_mod_info();
@@ -2693,8 +2777,6 @@ static int __init init_btrfs_fs(void)
unregister_ioctl:
btrfs_interface_exit();
-free_end_io_wq:
- btrfs_end_io_wq_exit();
free_prelim_ref:
btrfs_prelim_ref_exit();
free_delayed_ref:
@@ -2707,10 +2789,12 @@ free_ordered_data:
ordered_data_exit();
free_extent_map:
extent_map_exit();
-free_extent_state_cache:
- extent_state_cache_exit();
-free_extent_io:
- extent_io_exit();
+free_bioset:
+ btrfs_bioset_exit();
+free_eb_cachep:
+ extent_buffer_free_cachep();
+free_extent_cachep:
+ extent_state_free_cachep();
free_cachep:
btrfs_destroy_cachep();
free_compress:
@@ -2729,10 +2813,10 @@ static void __exit exit_btrfs_fs(void)
btrfs_prelim_ref_exit();
ordered_data_exit();
extent_map_exit();
- extent_state_cache_exit();
- extent_io_exit();
+ btrfs_bioset_exit();
+ extent_state_free_cachep();
+ extent_buffer_free_cachep();
btrfs_interface_exit();
- btrfs_end_io_wq_exit();
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
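
The new check_dev_super() runs from btrfs_unfreeze(), i.e. on the thaw side of a freeze/thaw cycle. From userspace that cycle is driven by the generic FIFREEZE/FITHAW ioctls; a hedged sketch (requires CAP_SYS_ADMIN, the mount path is an example):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs", O_RDONLY);

	if (fd < 0)
		return 1;
	ioctl(fd, FIFREEZE, 0);	/* freeze: sets BTRFS_FS_FROZEN */
	/* ... hibernate, clone the block device, etc. ... */
	ioctl(fd, FITHAW, 0);	/* thaw: verifies each device's super block */
	close(fd);
	return 0;
}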
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 17389a42a3ab..699b54b3acaa 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -21,6 +21,7 @@
#include "space-info.h"
#include "block-group.h"
#include "qgroup.h"
+#include "misc.h"
/*
* Structure name Path
@@ -34,12 +35,12 @@
* qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>
* space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>
* raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>
+ * discard_attrs /sys/fs/btrfs/<uuid>/discard
*
* When built with BTRFS_CONFIG_DEBUG:
*
* btrfs_debug_feature_attrs /sys/fs/btrfs/debug
* btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug
- * discard_debug_attrs /sys/fs/btrfs/<uuid>/debug/discard
*/
struct btrfs_feature_attr {
@@ -61,6 +62,10 @@ struct raid_kobject {
.store = _store, \
}
+#define BTRFS_ATTR_W(_prefix, _name, _store) \
+ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store)
+
#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \
static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \
__INIT_KOBJ_ATTR(_name, 0644, _show, _store)
@@ -92,6 +97,7 @@ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
+static struct kobject *get_btrfs_kobj(struct kobject *kobj);
static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a)
{
@@ -270,22 +276,22 @@ static umode_t btrfs_feature_visible(struct kobject *kobj,
return mode;
}
-BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF);
BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL);
BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS);
BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO);
BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD);
-BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA);
BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
+BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
-#ifdef CONFIG_BTRFS_DEBUG
-/* Remove once support for zoned allocation is feature complete */
+#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
+#endif
+#ifdef CONFIG_BTRFS_DEBUG
/* Remove once support for extent tree v2 is feature complete */
BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
#endif
@@ -296,17 +302,15 @@ BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
/*
* Features which depend on feature bits and may differ between each fs.
*
- * /sys/fs/btrfs/features - all available features implemeted by this version
+ * /sys/fs/btrfs/features - all available features implemented by this version
* /sys/fs/btrfs/UUID/features - features of the fs which are enabled or
* can be changed on a mounted filesystem.
*/
static struct attribute *btrfs_supported_feature_attrs[] = {
- BTRFS_FEAT_ATTR_PTR(mixed_backref),
BTRFS_FEAT_ATTR_PTR(default_subvol),
BTRFS_FEAT_ATTR_PTR(mixed_groups),
BTRFS_FEAT_ATTR_PTR(compress_lzo),
BTRFS_FEAT_ATTR_PTR(compress_zstd),
- BTRFS_FEAT_ATTR_PTR(big_metadata),
BTRFS_FEAT_ATTR_PTR(extended_iref),
BTRFS_FEAT_ATTR_PTR(raid56),
BTRFS_FEAT_ATTR_PTR(skinny_metadata),
@@ -314,8 +318,11 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(metadata_uuid),
BTRFS_FEAT_ATTR_PTR(free_space_tree),
BTRFS_FEAT_ATTR_PTR(raid1c34),
-#ifdef CONFIG_BTRFS_DEBUG
+ BTRFS_FEAT_ATTR_PTR(block_group_tree),
+#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_PTR(zoned),
+#endif
+#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
#endif
#ifdef CONFIG_FS_VERITY
@@ -394,11 +401,9 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
{
ssize_t ret = 0;
- /* 4K sector size is also supported with 64K page size */
- if (PAGE_SIZE == SZ_64K)
+ /* An artificial limit to only support 4K and PAGE_SIZE */
+ if (PAGE_SIZE > SZ_4K)
ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
-
- /* Only sectorsize == PAGE_SIZE is now supported */
ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
return ret;
@@ -426,12 +431,10 @@ static const struct attribute_group btrfs_static_feature_attr_group = {
.attrs = btrfs_supported_static_feature_attrs,
};
-#ifdef CONFIG_BTRFS_DEBUG
-
/*
* Discard statistics and tunables
*/
-#define discard_to_fs_info(_kobj) to_fs_info((_kobj)->parent->parent)
+#define discard_to_fs_info(_kobj) to_fs_info(get_btrfs_kobj(_kobj))
static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
struct kobj_attribute *a,
@@ -580,11 +583,11 @@ BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show,
btrfs_discard_max_discard_size_store);
/*
- * Per-filesystem debugging of discard (when mounted with discard=async).
+ * Per-filesystem stats for discard (when mounted with discard=async).
*
- * Path: /sys/fs/btrfs/<uuid>/debug/discard/
+ * Path: /sys/fs/btrfs/<uuid>/discard/
*/
-static const struct attribute *discard_debug_attrs[] = {
+static const struct attribute *discard_attrs[] = {
BTRFS_ATTR_PTR(discard, discardable_bytes),
BTRFS_ATTR_PTR(discard, discardable_extents),
BTRFS_ATTR_PTR(discard, discard_bitmap_bytes),
@@ -596,6 +599,8 @@ static const struct attribute *discard_debug_attrs[] = {
NULL,
};
+#ifdef CONFIG_BTRFS_DEBUG
+
/*
* Per-filesystem runtime debugging exported via sysfs.
*
@@ -711,6 +716,112 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
} \
BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field)
+static ssize_t btrfs_chunk_size_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj);
+
+ return sysfs_emit(buf, "%llu\n", READ_ONCE(sinfo->chunk_size));
+}
+
+/*
+ * Store new chunk size in space info. Can be called on a read-only filesystem.
+ *
+ * If the new chunk size value is larger than 10% of free space it is reduced
+ * to match that limit. Alignment must be to 256M and the system chunk size
+ * cannot be set.
+ */
+static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj));
+ char *retptr;
+ u64 val;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!fs_info->fs_devices)
+ return -EINVAL;
+
+ if (btrfs_is_zoned(fs_info))
+ return -EINVAL;
+
+ /* System block type must not be changed. */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return -EPERM;
+
+ val = memparse(buf, &retptr);
+ /* There could be trailing '\n', also catch any typos after the value */
+ retptr = skip_spaces(retptr);
+ if (*retptr != 0 || val == 0)
+ return -EINVAL;
+
+ val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE);
+
+ /* Limit stripe size to 10% of available space. */
+ val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val);
+
+ /* Must be multiple of 256M. */
+ val &= ~((u64)SZ_256M - 1);
+
+ /* Must be at least 256M. */
+ if (val < SZ_256M)
+ return -EINVAL;
+
+ btrfs_update_space_info_chunk_size(space_info, val);
+
+ return len;
+}
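To make the clamping in btrfs_chunk_size_store() concrete, here is a worked userspace example; parse_size() is only a rough stand-in for the kernel's memparse(), and the 10% free-space cap is taken as a given input rather than computed:

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define SZ_256M (256ULL * 1024 * 1024)

/* Rough userspace stand-in for the kernel's memparse(). */
static uint64_t parse_size(const char *s)
{
	char *end;
	uint64_t val = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': val <<= 30; break;
	case 'M': case 'm': val <<= 20; break;
	case 'K': case 'k': val <<= 10; break;
	}
	return val;
}

int main(void)
{
	uint64_t val = parse_size("1500M");

	/* Round down to a 256M boundary, as the store handler does. */
	val &= ~(SZ_256M - 1);
	assert(val == 1280ULL * 1024 * 1024);

	/* Anything that rounds below 256M is rejected with -EINVAL. */
	assert((parse_size("100M") & ~(SZ_256M - 1)) < SZ_256M);

	printf("aligned chunk size: %" PRIu64 "\n", val);
	return 0;
}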
+
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Request chunk allocation with current chunk size.
+ */
+static ssize_t btrfs_force_chunk_alloc_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj));
+ struct btrfs_trans_handle *trans;
+ bool val;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (sb_rdonly(fs_info->sb))
+ return -EROFS;
+
+ ret = kstrtobool(buf, &val);
+ if (ret)
+ return ret;
+
+ if (!val)
+ return -EINVAL;
+
+ /*
+ * This is unsafe to be called from sysfs context and may cause
+ * unexpected problems.
+ */
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ ret = btrfs_force_chunk_alloc(trans, space_info->flags);
+ btrfs_end_transaction(trans);
+
+ if (ret == 1)
+ return len;
+
+ return -ENOSPC;
+}
+BTRFS_ATTR_W(space_info, force_chunk_alloc, btrfs_force_chunk_alloc_store);
+
+#endif
+
SPACE_INFO_ATTR(flags);
SPACE_INFO_ATTR(total_bytes);
SPACE_INFO_ATTR(bytes_used);
@@ -721,6 +832,40 @@ SPACE_INFO_ATTR(bytes_readonly);
SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
+BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
+
+static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
+}
+
+static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ int thresh;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &thresh);
+ if (ret)
+ return ret;
+
+ if (thresh < 0 || thresh > 100)
+ return -EINVAL;
+
+ WRITE_ONCE(space_info->bg_reclaim_threshold, thresh);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(space_info, bg_reclaim_threshold,
+ btrfs_sinfo_bg_reclaim_threshold_show,
+ btrfs_sinfo_bg_reclaim_threshold_store);
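Both threshold handlers publish a plain int with WRITE_ONCE() and read it back with READ_ONCE(), which is sufficient for a word-sized tunable with no dependent state. A rough userspace analogue with C11 relaxed atomics (an approximation of those kernel macros, not their implementation):

#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for a space_info tunable updated from sysfs. */
static _Atomic int bg_reclaim_threshold = 75;

static void store_threshold(int thresh)
{
	if (thresh < 0 || thresh > 100)
		return;		/* the kernel handler returns -EINVAL */
	atomic_store_explicit(&bg_reclaim_threshold, thresh,
			      memory_order_relaxed);
}

static int show_threshold(void)
{
	return atomic_load_explicit(&bg_reclaim_threshold,
				    memory_order_relaxed);
}

int main(void)
{
	store_threshold(90);
	printf("%d\n", show_threshold());
	return 0;
}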
/*
* Allocation information about block group types.
@@ -738,6 +883,11 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
+ BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
+ BTRFS_ATTR_PTR(space_info, chunk_size),
+#ifdef CONFIG_BTRFS_DEBUG
+ BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(space_info);
@@ -836,6 +986,48 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
+static ssize_t btrfs_commit_stats_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return sysfs_emit(buf,
+ "commits %llu\n"
+ "last_commit_ms %llu\n"
+ "max_commit_ms %llu\n"
+ "total_commit_ms %llu\n",
+ fs_info->commit_stats.commit_count,
+ div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC),
+ div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC),
+ div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC));
+}
+
+static ssize_t btrfs_commit_stats_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ unsigned long val;
+ int ret;
+
+ if (!fs_info)
+ return -EPERM;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 10, &val);
+ if (ret)
+ return ret;
+ if (val)
+ return -EINVAL;
+
+ WRITE_ONCE(fs_info->commit_stats.max_commit_dur, 0);
+
+ return len;
+}
+BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store);
+
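Given the four "name value" lines emitted by btrfs_commit_stats_show() and the reset-on-zero behaviour of the store handler, a userspace consumer could look like the following sketch; the sysfs path keeps the <uuid> placeholder and error handling is minimal:

#include <stdio.h>

/* Placeholder path; substitute the filesystem's UUID. */
#define COMMIT_STATS "/sys/fs/btrfs/<uuid>/commit_stats"

int main(void)
{
	char name[32];
	unsigned long long value;
	FILE *f = fopen(COMMIT_STATS, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Four "name value" lines: commits, last/max/total commit ms. */
	while (fscanf(f, "%31s %llu", name, &value) == 2)
		printf("%s = %llu\n", name, value);
	fclose(f);

	/* Writing 0 (with CAP_SYS_RESOURCE) resets max_commit_ms. */
	f = fopen(COMMIT_STATS, "w");
	if (f) {
		fputs("0\n", f);
		fclose(f);
	}
	return 0;
}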
static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
@@ -922,6 +1114,9 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
case BTRFS_EXCLOP_BALANCE:
str = "balance\n";
break;
+ case BTRFS_EXCLOP_BALANCE_PAUSED:
+ str = "balance paused\n";
+ break;
case BTRFS_EXCLOP_DEV_ADD:
str = "device add\n";
break;
@@ -954,25 +1149,6 @@ static ssize_t btrfs_generation_show(struct kobject *kobj,
}
BTRFS_ATTR(, generation, btrfs_generation_show);
-/*
- * Look for an exact string @string in @buffer with possible leading or
- * trailing whitespace
- */
-static bool strmatch(const char *buffer, const char *string)
-{
- const size_t len = strlen(string);
-
- /* Skip leading whitespace */
- buffer = skip_spaces(buffer);
-
- /* Match entire string, check if the rest is whitespace or empty */
- if (strncmp(string, buffer, len) == 0 &&
- strlen(skip_spaces(buffer + len)) == 0)
- return true;
-
- return false;
-}
-
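The removed strmatch() is replaced by the stock sysfs_streq(), which compares two strings while treating a trailing newline on either side as the end of the string. A userspace re-implementation of that semantic, for illustration only:

#include <assert.h>
#include <stdbool.h>

/* Mirrors the documented behaviour of the kernel's sysfs_streq(). */
static bool my_sysfs_streq(const char *s1, const char *s2)
{
	while (*s1 && *s1 == *s2) {
		s1++;
		s2++;
	}
	if (*s1 == *s2)
		return true;
	if (!*s1 && *s2 == '\n' && !s2[1])
		return true;
	if (*s1 == '\n' && !s1[1] && !*s2)
		return true;
	return false;
}

int main(void)
{
	assert(my_sysfs_streq("pid\n", "pid"));
	assert(my_sysfs_streq("pid", "pid"));
	assert(!my_sysfs_streq("pid x", "pid"));
	return 0;
}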
static const char * const btrfs_read_policy_name[] = { "pid" };
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
@@ -1006,7 +1182,7 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
int i;
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
- if (strmatch(buf, btrfs_read_policy_name[i])) {
+ if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
if (i != fs_devices->read_policy) {
fs_devices->read_policy = i;
btrfs_info(fs_devices->fs_info,
@@ -1026,11 +1202,8 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
char *buf)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- ssize_t ret;
-
- ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
- return ret;
+ return sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
}
static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
@@ -1072,6 +1245,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, generation),
BTRFS_ATTR_PTR(, read_policy),
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
+ BTRFS_ATTR_PTR(, commit_stats),
NULL,
};
@@ -1102,6 +1276,16 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
return to_fs_devs(kobj)->fs_info;
}
+static struct kobject *get_btrfs_kobj(struct kobject *kobj)
+{
+ while (kobj) {
+ if (kobj->ktype == &btrfs_ktype)
+ return kobj;
+ kobj = kobj->parent;
+ }
+ return NULL;
+}
+
#define NUM_FEATURE_BITS 64
#define BTRFS_FEATURE_NAME_MAX 13
static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
@@ -1220,13 +1404,12 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
kobject_del(fs_info->space_info_kobj);
kobject_put(fs_info->space_info_kobj);
}
-#ifdef CONFIG_BTRFS_DEBUG
- if (fs_info->discard_debug_kobj) {
- sysfs_remove_files(fs_info->discard_debug_kobj,
- discard_debug_attrs);
- kobject_del(fs_info->discard_debug_kobj);
- kobject_put(fs_info->discard_debug_kobj);
+ if (fs_info->discard_kobj) {
+ sysfs_remove_files(fs_info->discard_kobj, discard_attrs);
+ kobject_del(fs_info->discard_kobj);
+ kobject_put(fs_info->discard_kobj);
}
+#ifdef CONFIG_BTRFS_DEBUG
if (fs_info->debug_kobj) {
sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
kobject_del(fs_info->debug_kobj);
@@ -1794,20 +1977,18 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
if (error)
goto failure;
+#endif
/* Discard directory */
- fs_info->discard_debug_kobj = kobject_create_and_add("discard",
- fs_info->debug_kobj);
- if (!fs_info->discard_debug_kobj) {
+ fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj);
+ if (!fs_info->discard_kobj) {
error = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->discard_debug_kobj,
- discard_debug_attrs);
+ error = sysfs_create_files(fs_info->discard_kobj, discard_attrs);
if (error)
goto failure;
-#endif
error = addrm_unknown_feature_attrs(fs_info, true);
if (error)
@@ -1834,6 +2015,98 @@ failure:
return error;
}
+static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ bool enabled;
+
+ spin_lock(&fs_info->qgroup_lock);
+ enabled = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", enabled);
+}
+BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show);
+
+static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ bool inconsistent;
+
+ spin_lock(&fs_info->qgroup_lock);
+ inconsistent = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT);
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", inconsistent);
+}
+BTRFS_ATTR(qgroups, inconsistent, qgroup_inconsistent_show);
+
+static ssize_t qgroup_drop_subtree_thres_show(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ u8 result;
+
+ spin_lock(&fs_info->qgroup_lock);
+ result = fs_info->qgroup_drop_subtree_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return sysfs_emit(buf, "%d\n", result);
+}
+
+static ssize_t qgroup_drop_subtree_thres_store(struct kobject *qgroups_kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+ u8 new_thres;
+ int ret;
+
+ ret = kstrtou8(buf, 10, &new_thres);
+ if (ret)
+ return -EINVAL;
+
+ if (new_thres > BTRFS_MAX_LEVEL)
+ return -EINVAL;
+
+ spin_lock(&fs_info->qgroup_lock);
+ fs_info->qgroup_drop_subtree_thres = new_thres;
+ spin_unlock(&fs_info->qgroup_lock);
+
+ return len;
+}
+BTRFS_ATTR_RW(qgroups, drop_subtree_threshold, qgroup_drop_subtree_thres_show,
+ qgroup_drop_subtree_thres_store);
+
+/*
+ * Qgroups global info
+ *
+ * Path: /sys/fs/btrfs/<uuid>/qgroups/
+ */
+static struct attribute *qgroups_attrs[] = {
+ BTRFS_ATTR_PTR(qgroups, enabled),
+ BTRFS_ATTR_PTR(qgroups, inconsistent),
+ BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold),
+ NULL
+};
+ATTRIBUTE_GROUPS(qgroups);
+
+static void qgroups_release(struct kobject *kobj)
+{
+ kfree(kobj);
+}
+
+static struct kobj_type qgroups_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = qgroups_groups,
+ .release = qgroups_release,
+};
+
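The qgroups kobject now follows the canonical pattern for dynamically allocated kobjects: kzalloc(), then kobject_init_and_add() with a ktype that supplies default_groups and a release() which frees the memory, so the final kobject_put() is what triggers the kfree(). Reduced to a single-threaded userspace refcount toy (hypothetical names, not kernel APIs):

#include <stdio.h>
#include <stdlib.h>

/* Toy object with an embedded refcount and a release callback,
 * mirroring kobject_init_and_add() / kobject_put(). */
struct obj {
	int refcount;
	void (*release)(struct obj *obj);
};

static void obj_release(struct obj *obj)
{
	puts("released");
	free(obj);		/* like qgroups_release() -> kfree() */
}

static void obj_put(struct obj *obj)
{
	if (obj && --obj->refcount == 0)
		obj->release(obj);
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	if (!o)
		return 1;
	o->refcount = 1;	/* kobject_init_and_add() takes the first ref */
	o->release = obj_release;

	obj_put(o);		/* last put frees via release() */
	return 0;
}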
static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj)
{
return to_fs_info(kobj->parent->parent);
@@ -1959,11 +2232,15 @@ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
if (fs_info->qgroups_kobj)
return 0;
- fs_info->qgroups_kobj = kobject_create_and_add("qgroups", fsid_kobj);
- if (!fs_info->qgroups_kobj) {
- ret = -ENOMEM;
+ fs_info->qgroups_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ if (!fs_info->qgroups_kobj)
+ return -ENOMEM;
+
+ ret = kobject_init_and_add(fs_info->qgroups_kobj, &qgroups_ktype,
+ fsid_kobj, "qgroups");
+ if (ret < 0)
goto out;
- }
+
rbtree_postorder_for_each_entry_safe(qgroup, next,
&fs_info->qgroup_tree, node) {
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
@@ -2068,4 +2345,3 @@ void __cold btrfs_exit_sysfs(void)
#endif
kset_unregister(btrfs_kset);
}
-
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index d8e56edd6991..9c478fa256f6 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -59,6 +59,7 @@ struct inode *btrfs_new_test_inode(void)
return NULL;
inode->i_mode = S_IFREG;
+ inode->i_ino = BTRFS_FIRST_FREE_OBJECTID;
BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
BTRFS_I(inode)->location.offset = 0;
@@ -242,7 +243,7 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)
{
if (!cache)
return;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
kfree(cache->free_space_ctl);
kfree(cache);
}
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index 51a8b075c259..b7d181a08eab 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -47,7 +47,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
goto out;
}
- path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, nodesize);
+ path->nodes[0] = eb;
if (!eb) {
test_std_err(TEST_ALLOC_EXTENT_BUFFER);
ret = -ENOMEM;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index a232b15b8021..f69ec4d2d6eb 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -80,7 +80,6 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest)
PRINT_ONE_FLAG(state, dest, cur, NODATASUM);
PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV);
PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT);
- PRINT_ONE_FLAG(state, dest, cur, DAMAGED);
PRINT_ONE_FLAG(state, dest, cur, NORESERVE);
PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED);
PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV);
@@ -172,7 +171,7 @@ static int test_find_delalloc(u32 sectorsize)
sectorsize - 1, start, end);
goto out_bits;
}
- unlock_extent(tmp, start, end);
+ unlock_extent(tmp, start, end, NULL);
unlock_page(locked_page);
put_page(locked_page);
@@ -208,7 +207,7 @@ static int test_find_delalloc(u32 sectorsize)
test_err("there were unlocked pages in the range");
goto out_bits;
}
- unlock_extent(tmp, start, end);
+ unlock_extent(tmp, start, end, NULL);
/* locked_page was unlocked above */
put_page(locked_page);
@@ -263,7 +262,7 @@ static int test_find_delalloc(u32 sectorsize)
test_err("pages in range were not all locked");
goto out_bits;
}
- unlock_extent(tmp, start, end);
+ unlock_extent(tmp, start, end, NULL);
/*
* Now to test where we run into a page that is no longer dirty in the
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 5930cdcae5cb..ebf68fcd2149 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -82,7 +82,7 @@ static int test_extents(struct btrfs_block_group *cache)
}
/* Cleanup */
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -149,7 +149,7 @@ static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize)
return -1;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -230,7 +230,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
return -1;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
/* Now with the extent entry offset into the bitmap */
ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1);
@@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
* [ bitmap ]
* [ del ]
*/
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1);
if (ret) {
test_err("couldn't add bitmap %d", ret);
@@ -291,7 +291,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
return -1;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
/*
* This blew up before, we have part of the free space in a bitmap and
@@ -317,7 +317,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
return ret;
}
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -629,7 +629,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
if (ret)
return ret;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
/*
* Now test a similar scenario, but where our extent entry is located
@@ -819,7 +819,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
return ret;
cache->free_space_ctl->op = orig_free_space_ops;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
@@ -868,7 +868,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
}
/* Now validate bitmaps do the correct thing. */
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
for (i = 0; i < 2; i++) {
offset = i * BITS_PER_BITMAP * sectorsize;
bytes = (i + 1) * SZ_1M;
@@ -891,7 +891,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
}
/* Now validate bitmaps with different ->max_extent_size. */
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
orig_free_space_ops = cache->free_space_ctl->op;
cache->free_space_ctl->op = &test_free_space_ops;
@@ -998,7 +998,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
}
cache->free_space_ctl->op = orig_free_space_ops;
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ btrfs_remove_free_space_cache(cache);
return 0;
}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index cac89c388131..625f7d398368 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -267,7 +267,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
free_extent_map(em);
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
+ btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
/*
* All of the magic numbers are based on the mapping setup in
@@ -975,7 +975,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE >> 1,
(BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_UPTODATE, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1043,7 +1043,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
BTRFS_MAX_EXTENT_SIZE + sectorsize,
BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_UPTODATE, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1076,7 +1076,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
/* Empty */
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_UPTODATE, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1092,7 +1092,7 @@ out:
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, 0, 0, NULL);
+ EXTENT_UPTODATE, NULL);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b008c5110958..d1f1da6820fb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
+#include <linux/timekeeping.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
@@ -160,7 +161,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root, *tmp;
- struct btrfs_caching_control *caching_ctl, *next;
/*
* At this point no one can be using this transaction to modify any tree
@@ -195,46 +195,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
}
spin_unlock(&cur_trans->dropped_roots_lock);
- /*
- * We have to update the last_byte_to_unpin under the commit_root_sem,
- * at the same time we swap out the commit roots.
- *
- * This is because we must have a real view of the last spot the caching
- * kthreads were while caching. Consider the following views of the
- * extent tree for a block group
- *
- * commit root
- * +----+----+----+----+----+----+----+
- * |\\\\| |\\\\|\\\\| |\\\\|\\\\|
- * +----+----+----+----+----+----+----+
- * 0 1 2 3 4 5 6 7
- *
- * new commit root
- * +----+----+----+----+----+----+----+
- * | | | |\\\\| | |\\\\|
- * +----+----+----+----+----+----+----+
- * 0 1 2 3 4 5 6 7
- *
- * If the cache_ctl->progress was at 3, then we are only allowed to
- * unpin [0,1) and [2,3], because the caching thread has already
- * processed those extents. We are not allowed to unpin [5,6), because
- * the caching thread will re-start it's search from 3, and thus find
- * the hole from [4,6) to add to the free space cache.
- */
- spin_lock(&fs_info->block_group_cache_lock);
- list_for_each_entry_safe(caching_ctl, next,
- &fs_info->caching_block_groups, list) {
- struct btrfs_block_group *cache = caching_ctl->block_group;
-
- if (btrfs_block_group_done(cache)) {
- cache->last_byte_to_unpin = (u64)-1;
- list_del_init(&caching_ctl->list);
- btrfs_put_caching_control(caching_ctl);
- } else {
- cache->last_byte_to_unpin = caching_ctl->progress;
- }
- }
- spin_unlock(&fs_info->block_group_cache_lock);
up_write(&fs_info->commit_root_sem);
}
@@ -312,6 +272,8 @@ loop:
atomic_inc(&cur_trans->num_writers);
extwriter_counter_inc(cur_trans, type);
spin_unlock(&fs_info->trans_lock);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
return 0;
}
spin_unlock(&fs_info->trans_lock);
@@ -333,16 +295,23 @@ loop:
if (!cur_trans)
return -ENOMEM;
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
+ btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
+
spin_lock(&fs_info->trans_lock);
if (fs_info->running_transaction) {
/*
* someone started a transaction after we unlocked. Make sure
* to redo the checks above
*/
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
kfree(cur_trans);
goto loop;
} else if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
kfree(cur_trans);
return -EROFS;
}
@@ -396,7 +365,7 @@ loop:
spin_lock_init(&cur_trans->releasing_ebs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
- IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
+ IO_TREE_TRANS_DIRTY_PAGES, NULL);
extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
IO_TREE_FS_PINNED_EXTENTS, NULL);
fs_info->generation++;
@@ -540,6 +509,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
refcount_inc(&cur_trans->use_count);
spin_unlock(&fs_info->trans_lock);
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
wait_event(fs_info->transaction_wait,
cur_trans->state >= TRANS_STATE_UNBLOCKED ||
TRANS_ABORTED(cur_trans));
@@ -624,7 +594,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
*/
num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- delayed_refs_rsv->full == 0) {
+ btrfs_block_rsv_full(delayed_refs_rsv) == 0) {
delayed_refs_bytes = num_bytes;
num_bytes <<= 1;
}
@@ -649,7 +619,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
if (rsv->space_info->force_alloc)
do_chunk_alloc = true;
} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
- !delayed_refs_rsv->full) {
+ !btrfs_block_rsv_full(delayed_refs_rsv)) {
/*
* Some people call with btrfs_start_transaction(root, 0)
* because they can be throttled, but have some other mechanism
@@ -858,6 +828,15 @@ static noinline void wait_for_commit(struct btrfs_transaction *commit,
u64 transid = commit->transid;
bool put = false;
+ /*
+ * At the moment this function is called with min_state either being
+ * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED.
+ */
+ if (min_state == TRANS_STATE_COMPLETED)
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
+ else
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+
while (1) {
wait_event(commit->commit_wait, commit->state >= min_state);
if (put)
@@ -1021,6 +1000,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
extwriter_counter_dec(cur_trans, trans->type);
cond_wake_up(&cur_trans->writer_wait);
+
+ btrfs_lockdep_release(info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(info, btrfs_trans_num_writers);
+
btrfs_put_transaction(cur_trans);
if (current->journal_info == trans)
@@ -1133,7 +1116,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
* it's safe to do it (through extent_io_tree_release()).
*/
err = clear_extent_bit(dirty_pages, start, end,
- EXTENT_NEED_WAIT, 0, 0, &cached_state);
+ EXTENT_NEED_WAIT, &cached_state);
if (err == -ENOMEM)
err = 0;
if (!err)
@@ -1831,8 +1814,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
dentry->d_name.len * 2);
- parent_inode->i_mtime = parent_inode->i_ctime =
- current_time(parent_inode);
+ parent_inode->i_mtime = current_time(parent_inode);
+ parent_inode->i_ctime = parent_inode->i_mtime;
ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1911,14 +1894,6 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
super->cache_generation = 0;
if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
super->uuid_tree_generation = root_item->generation;
-
- if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
- root_item = &fs_info->block_group_root->root_item;
-
- super->block_group_root = root_item->bytenr;
- super->block_group_root_generation = root_item->generation;
- super->block_group_root_level = root_item->level;
- }
}
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
@@ -1966,6 +1941,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
* Wait for the current transaction commit to start and block
* subsequent transaction joins
*/
+ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
wait_event(fs_info->transaction_blocked_wait,
cur_trans->state >= TRANS_STATE_COMMIT_START ||
TRANS_ABORTED(cur_trans));
@@ -1993,6 +1969,12 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
if (cur_trans == fs_info->running_transaction) {
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * The thread has already released the lockdep map as reader in
+ * btrfs_commit_transaction().
+ */
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
@@ -2098,20 +2080,31 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans)
list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
}
+static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval)
+{
+ fs_info->commit_stats.commit_count++;
+ fs_info->commit_stats.last_commit_dur = interval;
+ fs_info->commit_stats.max_commit_dur =
+ max_t(u64, fs_info->commit_stats.max_commit_dur, interval);
+ fs_info->commit_stats.total_commit_dur += interval;
+}
+
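update_commit_stats() tracks a count, the last and maximum commit durations, and a running total in nanoseconds; the sysfs side divides by NSEC_PER_MSEC for display. The same arithmetic, checked in a few lines of standalone C with made-up numbers:

#include <assert.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL

struct commit_stats {
	uint64_t commit_count;
	uint64_t last_commit_dur;
	uint64_t max_commit_dur;
	uint64_t total_commit_dur;
};

static void update(struct commit_stats *s, uint64_t interval_ns)
{
	s->commit_count++;
	s->last_commit_dur = interval_ns;
	if (interval_ns > s->max_commit_dur)
		s->max_commit_dur = interval_ns;	/* max_t() in the kernel */
	s->total_commit_dur += interval_ns;
}

int main(void)
{
	struct commit_stats s = { 0 };

	update(&s, 12 * NSEC_PER_MSEC);
	update(&s, 5 * NSEC_PER_MSEC);

	assert(s.commit_count == 2);
	assert(s.last_commit_dur / NSEC_PER_MSEC == 5);
	assert(s.max_commit_dur / NSEC_PER_MSEC == 12);
	assert(s.total_commit_dur / NSEC_PER_MSEC == 17);
	return 0;
}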
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
int ret;
+ ktime_t start_time;
+ ktime_t interval;
ASSERT(refcount_read(&trans->use_count) == 1);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
/* Stop the commit early if ->aborted is set */
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
- btrfs_end_transaction(trans);
- return ret;
+ goto lockdep_trans_commit_start_release;
}
btrfs_trans_release_metadata(trans);
@@ -2128,10 +2121,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* Any running threads may add more while we are here.
*/
ret = btrfs_run_delayed_refs(trans, 0);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
+ if (ret)
+ goto lockdep_trans_commit_start_release;
}
btrfs_create_pending_block_groups(trans);
@@ -2160,10 +2151,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (run_it) {
ret = btrfs_start_dirty_block_groups(trans);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
+ if (ret)
+ goto lockdep_trans_commit_start_release;
}
}
@@ -2178,6 +2167,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (trans->in_fsync)
want_state = TRANS_STATE_SUPER_COMMITTED;
+
+ btrfs_trans_state_lockdep_release(fs_info,
+ BTRFS_LOCKDEP_TRANS_COMMIT_START);
ret = btrfs_end_transaction(trans);
wait_for_commit(cur_trans, want_state);
@@ -2191,6 +2183,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&fs_info->transaction_blocked_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
if (cur_trans->list.prev != &fs_info->trans_list) {
enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
@@ -2210,7 +2203,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_put_transaction(prev_trans);
if (ret)
- goto cleanup_transaction;
+ goto lockdep_release;
} else {
spin_unlock(&fs_info->trans_lock);
}
@@ -2224,27 +2217,42 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
if (BTRFS_FS_ERROR(fs_info)) {
ret = -EROFS;
- goto cleanup_transaction;
+ goto lockdep_release;
}
}
+ /*
+ * Get the time spent on the work done by the commit thread and not
+ * the time spent waiting on a previous commit
+ */
+ start_time = ktime_get_ns();
+
extwriter_counter_dec(cur_trans, trans->type);
ret = btrfs_start_delalloc_flush(fs_info);
if (ret)
- goto cleanup_transaction;
+ goto lockdep_release;
ret = btrfs_run_delayed_items(trans);
if (ret)
- goto cleanup_transaction;
+ goto lockdep_release;
+ /*
+ * The thread has started/joined the transaction thus it holds the
+ * lockdep map as a reader. It has to release it before acquiring the
+ * lockdep map as a writer.
+ */
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters);
wait_event(cur_trans->writer_wait,
extwriter_counter_read(cur_trans) == 0);
/* some pending stuffs might be added after the previous flush. */
ret = btrfs_run_delayed_items(trans);
- if (ret)
+ if (ret) {
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
goto cleanup_transaction;
+ }
btrfs_wait_delalloc_flush(fs_info);
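The pattern behind these annotations: every thread counted in num_writers/extwriters holds a lockdep map as a reader, and a committer about to wait_event() on that count briefly takes the same map as a writer, so lockdep can report a transaction writer waiting on itself. A hypothetical sketch built on the generic lockdep primitives; the real btrfs_lockdep_acquire()/btrfs_might_wait_for_event() macros may differ in detail:

#include <linux/lockdep.h>

static struct lock_class_key trans_writers_key;
static struct lockdep_map trans_writers_map;

static void trans_writers_map_init(void)
{
	lockdep_init_map(&trans_writers_map, "btrfs_trans_num_writers",
			 &trans_writers_key, 0);
}

static void writer_enter(void)		/* pairs with joining a transaction */
{
	lock_map_acquire_read(&trans_writers_map);
	/* ... atomic_inc(&num_writers) ... */
}

static void writer_exit(void)		/* pairs with ending a transaction */
{
	/* ... atomic_dec / wake_up ... */
	lock_map_release(&trans_writers_map);
}

static void before_waiting_for_writers(void)
{
	/* Acquire and release as writer: this records the dependency for
	 * lockdep without blocking; the real wait_event() follows. */
	lock_map_acquire(&trans_writers_map);
	lock_map_release(&trans_writers_map);
}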
@@ -2253,6 +2261,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* transaction. Otherwise if this transaction commits before the ordered
* extents complete we lose logged data after a power failure.
*/
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered);
wait_event(cur_trans->pending_wait,
atomic_read(&cur_trans->pending_ordered) == 0);
@@ -2266,10 +2275,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
add_pending_snapshot(trans);
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
+
+ /*
+ * The thread has started/joined the transaction thus it holds the
+ * lockdep map as a reader. It has to release it before acquiring the
+ * lockdep map as a writer.
+ */
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
/*
+ * Make lockdep happy by acquiring the state locks after
+ * btrfs_trans_num_writers is released. If we acquired the state locks
+ * before releasing the btrfs_trans_num_writers lock then lockdep would
+ * complain because we did not follow the reverse order unlocking rule.
+ */
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+
+ /*
* We've started the commit, clear the flag in case we were triggered to
* do an async commit but somebody else started before the transaction
* kthread could do the work.
@@ -2278,6 +2305,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
goto scrub_continue;
}
/*
@@ -2412,6 +2440,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
mutex_unlock(&fs_info->reloc_mutex);
wake_up(&fs_info->transaction_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
ret = btrfs_write_and_wait_transaction(trans);
if (ret) {
@@ -2443,6 +2472,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
wake_up(&cur_trans->commit_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
btrfs_finish_extent_commit(trans);
@@ -2456,6 +2486,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
spin_lock(&fs_info->trans_lock);
list_del_init(&cur_trans->list);
@@ -2469,6 +2500,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
trace_btrfs_transaction_commit(fs_info);
+ interval = ktime_get_ns() - start_time;
+
btrfs_scrub_continue(fs_info);
if (current->journal_info == trans)
@@ -2476,11 +2509,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
+ update_commit_stats(fs_info, interval);
+
return ret;
unlock_reloc:
mutex_unlock(&fs_info->reloc_mutex);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
scrub_continue:
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
btrfs_scrub_continue(fs_info);
cleanup_transaction:
btrfs_trans_release_metadata(trans);
@@ -2493,6 +2531,16 @@ cleanup_transaction:
cleanup_transaction(trans, ret);
return ret;
+
+lockdep_release:
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+ goto cleanup_transaction;
+
+lockdep_trans_commit_start_release:
+ btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+ btrfs_end_transaction(trans);
+ return ret;
}
/*
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index e56c0107eea3..43f905ab0a18 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1233,7 +1233,8 @@ static void extent_err(const struct extent_buffer *eb, int slot,
}
static int check_extent_item(struct extent_buffer *leaf,
- struct btrfs_key *key, int slot)
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_extent_item *ei;
@@ -1453,6 +1454,26 @@ static int check_extent_item(struct extent_buffer *leaf,
total_refs, inline_refs);
return -EUCLEAN;
}
+
+ if ((prev_key->type == BTRFS_EXTENT_ITEM_KEY) ||
+ (prev_key->type == BTRFS_METADATA_ITEM_KEY)) {
+ u64 prev_end = prev_key->objectid;
+
+ if (prev_key->type == BTRFS_METADATA_ITEM_KEY)
+ prev_end += fs_info->nodesize;
+ else
+ prev_end += prev_key->offset;
+
+ if (unlikely(prev_end > key->objectid)) {
+ extent_err(leaf, slot,
+ "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]",
+ prev_key->objectid, prev_key->type,
+ prev_key->offset, key->objectid, key->type,
+ key->offset);
+ return -EUCLEAN;
+ }
+ }
+
return 0;
}
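For the new overlap check: an EXTENT_ITEM key is (bytenr, type, num_bytes) while a METADATA_ITEM key is (bytenr, type, level), so the previous item ends at bytenr + num_bytes for data and bytenr + nodesize for metadata. A standalone rendition of the check with sample keys (struct and constants mirrored here only for illustration):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define EXTENT_ITEM_KEY   168	/* values as in the btrfs on-disk format */
#define METADATA_ITEM_KEY 169

struct key {
	uint64_t objectid;	/* logical bytenr of the extent */
	uint8_t type;
	uint64_t offset;	/* num_bytes, or tree level for metadata */
};

static bool extents_overlap(const struct key *prev, const struct key *cur,
			    uint64_t nodesize)
{
	uint64_t prev_end = prev->objectid;

	if (prev->type == METADATA_ITEM_KEY)
		prev_end += nodesize;	/* metadata items span one node */
	else
		prev_end += prev->offset;

	return prev_end > cur->objectid;
}

int main(void)
{
	struct key meta = { 1048576, METADATA_ITEM_KEY, 0 };
	struct key data = { 1048576 + 8192, EXTENT_ITEM_KEY, 4096 };
	struct key next = { 1048576 + 8192 + 4096, EXTENT_ITEM_KEY, 4096 };

	/* A 16K metadata node [1M, 1M+16K) overlaps a data extent at 1M+8K. */
	assert(extents_overlap(&meta, &data, 16384));
	/* Back-to-back data extents do not overlap. */
	assert(!extents_overlap(&data, &next, 16384));
	return 0;
}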
@@ -1621,7 +1642,7 @@ static int check_leaf_item(struct extent_buffer *leaf,
break;
case BTRFS_EXTENT_ITEM_KEY:
case BTRFS_METADATA_ITEM_KEY:
- ret = check_extent_item(leaf, key, slot);
+ ret = check_extent_item(leaf, key, slot, prev_key);
break;
case BTRFS_TREE_BLOCK_REF_KEY:
case BTRFS_SHARED_DATA_REF_KEY:
@@ -1855,3 +1876,58 @@ out:
return ret;
}
ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO);
+
+int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
+{
+ const bool is_subvol = is_fstree(root_owner);
+ const u64 eb_owner = btrfs_header_owner(eb);
+
+ /*
+ * Skip dummy fs, as selftests don't create unique ebs for each dummy
+ * root.
+ */
+ if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state))
+ return 0;
+ /*
+ * There are several call sites (backref walking, qgroup, and data
+ * reloc) passing 0 as @root_owner, as they are not holding the
+ * tree root. In that case, we can not do a reliable ownership check,
+ * so just exit.
+ */
+ if (root_owner == 0)
+ return 0;
+ /*
+ * These trees use key.offset as their owner, our callers don't have
+ * the extra capacity to pass key.offset here. So we just skip them.
+ */
+ if (root_owner == BTRFS_TREE_LOG_OBJECTID ||
+ root_owner == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
+
+ if (!is_subvol) {
+ /* For non-subvolume trees, the eb owner should match root owner */
+ if (unlikely(root_owner != eb_owner)) {
+ btrfs_crit(eb->fs_info,
+"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect %llu",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ root_owner, btrfs_header_bytenr(eb), eb_owner,
+ root_owner);
+ return -EUCLEAN;
+ }
+ return 0;
+ }
+
+ /*
+ * For subvolume trees, owners can mismatch, but they should all belong
+ * to subvolume trees.
+ */
+ if (unlikely(is_subvol != is_fstree(eb_owner))) {
+ btrfs_crit(eb->fs_info,
+"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ root_owner, btrfs_header_bytenr(eb), eb_owner,
+ BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID);
+ return -EUCLEAN;
+ }
+ return 0;
+}
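The ownership rule above boils down to: non-subvolume trees must own their blocks exactly, while a block in any subvolume tree may carry any subvolume owner, since snapshots share blocks. A compact userspace check with a simplified is_fstree(); the real predicate also special-cases a few reserved object ids, so treat this as an approximation:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define FIRST_FREE_OBJECTID 256ULL	/* first subvolume id */
#define LAST_FREE_OBJECTID  ((uint64_t)-256)

/* Simplified: the kernel's is_fstree() also accepts the default fs tree id. */
static bool is_subvol(uint64_t id)
{
	return id >= FIRST_FREE_OBJECTID && id <= LAST_FREE_OBJECTID;
}

static bool eb_owner_ok(uint64_t root_owner, uint64_t eb_owner)
{
	if (!is_subvol(root_owner))
		return root_owner == eb_owner;	/* exact match required */
	return is_subvol(eb_owner);		/* any subvolume id is fine */
}

int main(void)
{
	assert(eb_owner_ok(2, 2));	/* extent tree owns its own blocks */
	assert(!eb_owner_ok(2, 7));	/* mismatch in a non-subvol tree */
	assert(eb_owner_ok(256, 300));	/* snapshots share subvol blocks */
	assert(!eb_owner_ok(256, 2));	/* subvol block owned by extent tree */
	return 0;
}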
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 32fecc9dc1dd..ece497e26558 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -25,5 +25,6 @@ int btrfs_check_node(struct extent_buffer *node);
int btrfs_check_chunk_valid(struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 logical);
+int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 571dae8ad65e..813986e38258 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -22,6 +22,8 @@
#include "zoned.h"
#include "inode-item.h"
+#define MAX_CONFLICT_INODES 10
+
/* magic values for the inode_only field in btrfs_log_inode:
*
* LOG_INODE_ALL means to log everything
@@ -31,8 +33,6 @@
enum {
LOG_INODE_ALL,
LOG_INODE_EXISTS,
- LOG_OTHER_INODE,
- LOG_OTHER_INODE_ALL,
};
/*
@@ -171,7 +171,7 @@ again:
int index = (root->log_transid + 1) % 2;
if (btrfs_need_log_full_commit(trans)) {
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
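Returning a dedicated BTRFS_LOG_FORCE_COMMIT instead of -EAGAIN lets callers tell "fall back to a full transaction commit" apart from a genuine transient error. A toy sketch of that design choice; it assumes a positive sentinel value, which may not match the exact kernel definition:

#include <assert.h>
#include <errno.h>

/* Hypothetical sketch: a positive sentinel keeps "force a full commit"
 * distinct from real errnos such as -EAGAIN (the old overloaded value). */
#define LOG_FORCE_COMMIT 1

static int try_log(int need_full_commit)
{
	if (need_full_commit)
		return LOG_FORCE_COMMIT;
	return 0;	/* logged fine; negative values remain real errors */
}

int main(void)
{
	assert(try_log(1) > 0);		/* caller falls back to a commit */
	assert(try_log(1) != -EAGAIN);	/* no longer confusable */
	assert(try_log(0) == 0);
	return 0;
}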
@@ -194,7 +194,7 @@ again:
* writing.
*/
if (zoned && !created) {
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
@@ -333,7 +333,7 @@ static int process_one_buffer(struct btrfs_root *log,
* pin down any logged extents, so we have to read the block.
*/
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
- ret = btrfs_read_buffer(eb, gen, level, NULL);
+ ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
if (ret)
return ret;
}
@@ -801,7 +801,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_range(root->log_root,
csum_start, csum_end - 1,
- &ordered_sums, 0);
+ &ordered_sums, 0, false);
if (ret)
goto out;
/*
@@ -894,8 +894,7 @@ update_inode:
btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
- if (inode)
- iput(inode);
+ iput(inode);
return ret;
}
@@ -1064,8 +1063,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
u64 inode_objectid, u64 parent_objectid,
- u64 ref_index, char *name, int namelen,
- int *search_done)
+ u64 ref_index, char *name, int namelen)
{
int ret;
char *victim_name;
@@ -1127,19 +1125,12 @@ again:
kfree(victim_name);
if (ret)
return ret;
- *search_done = 1;
goto again;
}
kfree(victim_name);
ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
}
-
- /*
- * NOTE: we have searched root tree and checked the
- * corresponding ref, it does not need to check again.
- */
- *search_done = 1;
}
btrfs_release_path(path);
@@ -1147,7 +1138,9 @@ again:
extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
inode_objectid, parent_objectid, 0,
0);
- if (!IS_ERR_OR_NULL(extref)) {
+ if (IS_ERR(extref)) {
+ return PTR_ERR(extref);
+ } else if (extref) {
u32 item_size;
u32 cur_offset = 0;
unsigned long base;
@@ -1201,14 +1194,12 @@ again:
kfree(victim_name);
if (ret)
return ret;
- *search_done = 1;
goto again;
}
kfree(victim_name);
next:
cur_offset += victim_name_len + sizeof(*extref);
}
- *search_done = 1;
}
btrfs_release_path(path);
@@ -1372,103 +1363,6 @@ again:
return ret;
}
-static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
- const u8 ref_type, const char *name,
- const int namelen)
-{
- struct btrfs_key key;
- struct btrfs_path *path;
- const u64 parent_id = btrfs_ino(BTRFS_I(dir));
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = btrfs_ino(BTRFS_I(inode));
- key.type = ref_type;
- if (key.type == BTRFS_INODE_REF_KEY)
- key.offset = parent_id;
- else
- key.offset = btrfs_extref_hash(parent_id, name, namelen);
-
- ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- ret = 0;
- goto out;
- }
- if (key.type == BTRFS_INODE_EXTREF_KEY)
- ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
- path->slots[0], parent_id, name, namelen);
- else
- ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
- name, namelen);
-
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-static int add_link(struct btrfs_trans_handle *trans,
- struct inode *dir, struct inode *inode, const char *name,
- int namelen, u64 ref_index)
-{
- struct btrfs_root *root = BTRFS_I(dir)->root;
- struct btrfs_dir_item *dir_item;
- struct btrfs_key key;
- struct btrfs_path *path;
- struct inode *other_inode = NULL;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- dir_item = btrfs_lookup_dir_item(NULL, root, path,
- btrfs_ino(BTRFS_I(dir)),
- name, namelen, 0);
- if (!dir_item) {
- btrfs_release_path(path);
- goto add_link;
- } else if (IS_ERR(dir_item)) {
- ret = PTR_ERR(dir_item);
- goto out;
- }
-
- /*
- * Our inode's dentry collides with the dentry of another inode which is
- * in the log but not yet processed since it has a higher inode number.
- * So delete that other dentry.
- */
- btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
- btrfs_release_path(path);
- other_inode = read_one_inode(root, key.objectid);
- if (!other_inode) {
- ret = -ENOENT;
- goto out;
- }
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode),
- name, namelen);
- if (ret)
- goto out;
- /*
- * If we dropped the link count to 0, bump it so that later the iput()
- * on the inode will not free it. We will fixup the link count later.
- */
- if (other_inode->i_nlink == 0)
- inc_nlink(other_inode);
-add_link:
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- name, namelen, 0, ref_index);
-out:
- iput(other_inode);
- btrfs_free_path(path);
-
- return ret;
-}
-
/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
@@ -1489,7 +1383,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
char *name = NULL;
int namelen;
int ret;
- int search_done = 0;
int log_ref_ver = 0;
u64 parent_objectid;
u64 inode_objectid;
@@ -1564,51 +1457,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
-
- if (!search_done) {
- ret = __add_inode_ref(trans, root, path, log,
- BTRFS_I(dir),
- BTRFS_I(inode),
- inode_objectid,
- parent_objectid,
- ref_index, name, namelen,
- &search_done);
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto out;
- }
- }
-
- /*
- * If a reference item already exists for this inode
- * with the same parent and name, but different index,
- * drop it and the corresponding directory index entries
- * from the parent before adding the new reference item
- * and dir index entries, otherwise we would fail with
- * -EEXIST returned from btrfs_add_link() below.
- */
- ret = btrfs_inode_ref_exists(inode, dir, key->type,
- name, namelen);
- if (ret > 0) {
- ret = unlink_inode_for_log_replay(trans,
- BTRFS_I(dir),
- BTRFS_I(inode),
- name, namelen);
- /*
- * If we dropped the link count to 0, bump it so
- * that later the iput() on the inode will not
- * free it. We will fixup the link count later.
- */
- if (!ret && inode->i_nlink == 0)
- inc_nlink(inode);
- }
- if (ret < 0)
+ ret = __add_inode_ref(trans, root, path, log,
+ BTRFS_I(dir), BTRFS_I(inode),
+ inode_objectid, parent_objectid,
+ ref_index, name, namelen);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
goto out;
+ }
/* insert our name */
- ret = add_link(trans, dir, inode, name, namelen,
- ref_index);
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ name, namelen, 0, ref_index);
if (ret)
goto out;
@@ -2288,7 +2149,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct btrfs_key location;
/*
- * Currenly we only log dir index keys. Even if we replay a log created
+ * Currently we only log dir index keys. Even if we replay a log created
* by an older kernel that logged both dir index and dir item keys, all
* we need to do is process the dir index keys, we (and our caller) can
* safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
@@ -2575,7 +2436,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
int i;
int ret;
- ret = btrfs_read_buffer(eb, gen, level, NULL);
+ ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
if (ret)
return ret;
@@ -2786,7 +2647,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level]++;
if (wc->free) {
- ret = btrfs_read_buffer(next, ptr_gen,
+ ret = btrfs_read_extent_buffer(next, ptr_gen,
*level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
@@ -2815,7 +2676,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
free_extent_buffer(next);
continue;
}
- ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
+ ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -3122,7 +2983,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* bail out if we need to do a full commit */
if (btrfs_need_log_full_commit(trans)) {
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -3188,6 +3049,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
mutex_unlock(&fs_info->tree_root->log_mutex);
+ blk_finish_plug(&plug);
goto out;
}
}
@@ -3222,7 +3084,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
@@ -3261,7 +3123,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
blk_finish_plug(&plug);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
- ret = -EAGAIN;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out_wake_log_root;
}
@@ -3720,11 +3582,29 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
key.offset = first_offset;
key.type = BTRFS_DIR_LOG_INDEX_KEY;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
- if (ret)
+ /*
+ * -EEXIST is fine and can happen sporadically when we are logging a
+ * directory and have concurrent insertions in the subvolume's tree for
+ * items from other inodes that result in pushing off some dir items
+ * from one leaf to another in order to accommodate the new items.
+ * This results in logging the same dir index range key.
+ */
+ if (ret && ret != -EEXIST)
return ret;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
+ if (ret == -EEXIST) {
+ const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
+
+ /*
+ * btrfs_del_dir_entries_in_log() might have been called during
+ * an unlink between the initial insertion of this key and the
+ * current update, or we might be logging a single entry deletion
+ * during a rename, so set the new last_offset to the max value.
+ */
+ last_offset = max(last_offset, curr_end);
+ }
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_release_path(path);
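The -EEXIST branch above effectively merges the new dir log range with the one already in the log: the stored end offset may only grow, never shrink. The merge rule in isolation, with made-up offsets:

#include <assert.h>
#include <stdint.h>

/* A dir log item covers [first_offset, end]; on a duplicate key the new
 * end must not shrink an existing range, hence the max(). */
struct dir_log_item {
	uint64_t first_offset;
	uint64_t end;
};

static void update_range(struct dir_log_item *item, uint64_t last_offset)
{
	if (last_offset < item->end)	/* keep the wider coverage */
		last_offset = item->end;
	item->end = last_offset;
}

int main(void)
{
	struct dir_log_item item = { .first_offset = 10, .end = 35 };

	update_range(&item, 20);	/* re-logged with a smaller range */
	assert(item.end == 35);
	update_range(&item, 50);
	assert(item.end == 50);
	return 0;
}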
@@ -3848,13 +3728,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
ret = insert_dir_log_key(trans, log, dst_path,
ino, *last_old_dentry_offset + 1,
key.offset - 1);
- /*
- * -EEXIST should never happen because when we
- * log a directory in full mode (LOG_INODE_ALL)
- * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from
- * the log tree.
- */
- ASSERT(ret != -EEXIST);
if (ret < 0)
return ret;
}
@@ -3862,6 +3735,11 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
*last_old_dentry_offset = key.offset;
continue;
}
+
+ /* If we logged this dir index item before, we can skip it. */
+ if (key.offset <= inode->last_dir_index_offset)
+ continue;
+
/*
* We must make sure that when we log a directory entry, the
* corresponding inode, after log replay, has a matching link
@@ -3892,51 +3770,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
ctx->log_new_dentries = true;
}
- if (!ctx->logged_before)
- goto add_to_batch;
-
- /*
- * If we were logged before and have logged dir items, we can skip
- * checking if any item with a key offset larger than the last one
- * we logged is in the log tree, saving time and avoiding adding
- * contention on the log tree. We can only rely on the value of
- * last_dir_index_offset when we know for sure that the inode was
- * previously logged in the current transaction.
- */
- if (key.offset > inode->last_dir_index_offset)
- goto add_to_batch;
- /*
- * Check if the key was already logged before. If not we can add
- * it to a batch for bulk insertion.
- */
- ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0);
- if (ret < 0) {
- return ret;
- } else if (ret > 0) {
- btrfs_release_path(dst_path);
- goto add_to_batch;
- }
-
- /*
- * Item exists in the log. Overwrite the item in the log if it
- * has different content or do nothing if it has exactly the same
- * content. And then flush the current batch if any - do it after
- * overwriting the current item, or we would deadlock otherwise,
- * since we are holding a path for the existing item.
- */
- ret = do_overwrite_item(trans, log, dst_path, src, i, &key);
- if (ret < 0)
- return ret;
-
- if (batch_size > 0) {
- ret = flush_dir_items_batch(trans, log, src, dst_path,
- batch_start, batch_size);
- if (ret < 0)
- return ret;
- batch_size = 0;
- }
- continue;
-add_to_batch:
if (batch_size == 0)
batch_start = i;
batch_size++;
@@ -4123,6 +3956,71 @@ done:
}
/*
+ * If the inode was logged before and it was evicted, then its
+ * last_dir_index_offset is (u64)-1, so we don't know the value of the last
+ * index key offset. If that's the case, search for it and update the inode.
+ * This is to avoid lookups in the log tree every time we try to insert a dir
+ * index key from a leaf changed in the current transaction, and to allow us
+ * to always do batch insertions of dir index keys.
+ */
+static int update_last_dir_index_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct btrfs_log_ctx *ctx)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_key key;
+ int ret;
+
+ lockdep_assert_held(&inode->log_mutex);
+
+ if (inode->last_dir_index_offset != (u64)-1)
+ return 0;
+
+ if (!ctx->logged_before) {
+ inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+ return 0;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+ /*
+ * An error happened or we actually have an index key with an offset
+ * value of (u64)-1. Bail out, we're done.
+ */
+ if (ret <= 0)
+ goto out;
+
+ ret = 0;
+ inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+
+ /*
+ * No dir index items, bail out and leave last_dir_index_offset with
+ * the value right before the first valid index value.
+ */
+ if (path->slots[0] == 0)
+ goto out;
+
+ /*
+ * btrfs_search_slot() left us at one slot beyond the slot with the last
+ * index key, or beyond the last key of the directory that is not an
+ * index key. If we have an index key before, set last_dir_index_offset
+ * to its offset value, otherwise leave it with a value right before the
+ * first valid index value, as it means we have an empty directory.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
+ inode->last_dir_index_offset = key.offset;
+
+out:
+ btrfs_release_path(path);
+
+ return ret;
+}
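update_last_dir_index_offset() is a lazy cache refill: (u64)-1 is the "unknown"
sentinel left behind by eviction, and the log tree search runs at most once
before the cached value is reused. A minimal userspace sketch of the pattern,
with illustrative names (the start-index constant below is an assumption for
the example, not the kernel definition):

    #include <stdint.h>

    #define INDEX_UNKNOWN UINT64_MAX   /* mirrors the (u64)-1 sentinel */
    #define DIR_START_INDEX 2          /* assumed value, for illustration */

    struct dir_log_state {
        uint64_t last_dir_index;       /* INDEX_UNKNOWN after eviction/reload */
        int logged_before;
    };

    /* Stand-in for the btrfs_search_slot() lookup in the log tree. */
    static uint64_t slow_lookup_last_index(void)
    {
        return DIR_START_INDEX - 1;    /* e.g. the empty-directory case */
    }

    static uint64_t get_last_dir_index(struct dir_log_state *s)
    {
        if (s->last_dir_index != INDEX_UNKNOWN)
            return s->last_dir_index;  /* cached, no tree lookup needed */
        if (!s->logged_before)
            s->last_dir_index = DIR_START_INDEX - 1;
        else
            s->last_dir_index = slow_lookup_last_index();
        return s->last_dir_index;
    }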
+
+/*
 * Logging directories is very similar to logging inodes. We find all the items
* from the current transaction and write them to the log.
*
@@ -4144,6 +4042,10 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
u64 max_key;
int ret;
+ ret = update_last_dir_index_offset(inode, path, ctx);
+ if (ret)
+ return ret;
+
min_key = BTRFS_DIR_START_INDEX;
max_key = 0;
ctx->last_dir_item_offset = inode->last_dir_index_offset;
@@ -4369,8 +4271,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
* file which happens to refer to the same extent as well. Such races
* can leave checksum items in the log with overlapping ranges.
*/
- ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
- lock_end, &cached_state);
+ ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+ &cached_state);
if (ret)
return ret;
/*
@@ -4386,8 +4288,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
if (!ret)
ret = btrfs_csum_file_blocks(trans, log_root, sums);
- unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
- &cached_state);
+ unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+ &cached_state);
return ret;
}
@@ -4500,7 +4402,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
disk_bytenr += extent_offset;
ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
disk_bytenr + extent_num_bytes - 1,
- &ordered_sums, 0);
+ &ordered_sums, 0, false);
if (ret)
goto out;
@@ -4696,7 +4598,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_range(csum_root,
em->block_start + csum_offset,
em->block_start + csum_offset +
- csum_len - 1, &ordered_sums, 0);
+ csum_len - 1, &ordered_sums, 0, false);
if (ret)
return ret;
@@ -5208,10 +5110,9 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
* leafs from the log root.
*/
btrfs_release_path(path);
- ret = btrfs_insert_file_extent(trans, root->log_root,
- ino, prev_extent_end, 0,
- 0, hole_len, 0, hole_len,
- 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root->log_root,
+ ino, prev_extent_end,
+ hole_len);
if (ret < 0)
return ret;
@@ -5240,10 +5141,8 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
- ret = btrfs_insert_file_extent(trans, root->log_root,
- ino, prev_extent_end, 0, 0,
- hole_len, 0, hole_len,
- 0, 0, 0);
+ ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
+ prev_extent_end, hole_len);
if (ret < 0)
return ret;
}
@@ -5386,111 +5285,461 @@ out:
return ret;
}
+/*
+ * Check if we need to log an inode. This is used in contexts where while
+ * logging an inode we need to log another inode (either that it exists or in
+ * full mode). This is used instead of btrfs_inode_in_log() because the latter
+ * requires the inode to be in the log and have the log transaction committed,
+ * while here we do not care if the log transaction was already committed - our
+ * caller will commit the log later - and we want to avoid logging an inode
+ * multiple times when multiple tasks have joined the same log transaction.
+ */
+static bool need_log_inode(const struct btrfs_trans_handle *trans,
+ const struct btrfs_inode *inode)
+{
+ /*
+ * If a directory was not modified, no dentries added or removed, we can
+ * and should avoid logging it.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
+ return false;
+
+ /*
+ * If this inode does not have new/updated/deleted xattrs since the last
+ * time it was logged and is flagged as logged in the current transaction,
+ * we can skip logging it. As for new/deleted names, those are updated in
+ * the log by link/unlink/rename operations.
+ * In case the inode was logged and then evicted and reloaded, its
+ * logged_trans will be 0, in which case we have to fully log it since
+ * logged_trans is a transient field, not persisted.
+ */
+ if (inode->logged_trans == trans->transid &&
+ !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+ return false;
+
+ return true;
+}
+
+struct btrfs_dir_list {
+ u64 ino;
+ struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory.
+ * See process_dir_items_leaf() for details about why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not acquire their VFS lock, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(sb_internal#2);
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not acquiring the VFS lock of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ * that while logging the inode new references (names) are added or removed
+ * from the inode, leaving the logged inode item with a link count that does
+ * not match the number of logged inode reference items. This is fine because
+ * at log replay time we compute the real number of links and correct the
+ * link count in the inode item (see replay_one_buffer() and
+ * link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ * while logging the inode's items new index items (key type
+ * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
+ * has a size that doesn't match the sum of the lengths of all the logged
+ * names - this is ok, not a problem, because at log replay time we set the
+ * directory's i_size to the correct value (see replay_one_name() and
+ * do_overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *start_inode,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *root = start_inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_path *path;
+ LIST_HEAD(dir_list);
+ struct btrfs_dir_list *dir_elem;
+ u64 ino = btrfs_ino(start_inode);
+ int ret = 0;
+
+ /*
+ * If we are logging a new name, as part of a link or rename operation,
+ * don't bother logging new dentries, as we just want to log the names
+ * of an inode and that any new parents exist.
+ */
+ if (ctx->logging_new_name)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ while (true) {
+ struct extent_buffer *leaf;
+ struct btrfs_key min_key;
+ bool continue_curr_inode = true;
+ int nritems;
+ int i;
+
+ min_key.objectid = ino;
+ min_key.type = BTRFS_DIR_INDEX_KEY;
+ min_key.offset = 0;
+again:
+ btrfs_release_path(path);
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
+ if (ret < 0) {
+ break;
+ } else if (ret > 0) {
+ ret = 0;
+ goto next;
+ }
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ struct inode *di_inode;
+ int log_mode = LOG_INODE_EXISTS;
+ int type;
+
+ btrfs_item_key_to_cpu(leaf, &min_key, i);
+ if (min_key.objectid != ino ||
+ min_key.type != BTRFS_DIR_INDEX_KEY) {
+ continue_curr_inode = false;
+ break;
+ }
+
+ di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+ type = btrfs_dir_type(leaf, di);
+ if (btrfs_dir_transid(leaf, di) < trans->transid)
+ continue;
+ btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+ if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ btrfs_release_path(path);
+ di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ goto out;
+ }
+
+ if (!need_log_inode(trans, BTRFS_I(di_inode))) {
+ btrfs_add_delayed_iput(di_inode);
+ break;
+ }
+
+ ctx->log_new_dentries = false;
+ if (type == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
+ log_mode, ctx);
+ btrfs_add_delayed_iput(di_inode);
+ if (ret)
+ goto out;
+ if (ctx->log_new_dentries) {
+ dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+ if (!dir_elem) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ dir_elem->ino = di_key.objectid;
+ list_add_tail(&dir_elem->list, &dir_list);
+ }
+ break;
+ }
+
+ if (continue_curr_inode && min_key.offset < (u64)-1) {
+ min_key.offset++;
+ goto again;
+ }
+
+next:
+ if (list_empty(&dir_list))
+ break;
+
+ dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
+ ino = dir_elem->ino;
+ list_del(&dir_elem->list);
+ kfree(dir_elem);
+ }
+out:
+ btrfs_free_path(path);
+ if (ret) {
+ struct btrfs_dir_list *next;
+
+ list_for_each_entry_safe(dir_elem, next, &dir_list, list)
+ kfree(dir_elem);
+ }
+
+ return ret;
+}
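log_new_dir_dentries() avoids recursion by turning the directory walk into a
queue drain: the start inode is processed directly, and any subdirectory that
still needs its new dentries logged is appended to dir_list and popped later.
A self-contained C sketch of that shape, with illustrative names and a plain
linked list instead of the kernel's list_head:

    #include <stdlib.h>

    struct work_item {
        unsigned long ino;
        struct work_item *next;
    };

    /* Queue helper a real process_dir() would call per new subdirectory. */
    static int push(struct work_item **list, unsigned long ino)
    {
        struct work_item *it = malloc(sizeof(*it));

        if (!it)
            return -1;
        it->ino = ino;
        it->next = *list;
        *list = it;
        return 0;
    }

    /* Stand-in for logging one directory's new dentries. */
    static int process_dir(unsigned long ino, struct work_item **pending)
    {
        (void)ino;
        (void)pending;   /* would push() newly found directories here */
        return 0;
    }

    static int walk_new_dentries(unsigned long start_ino)
    {
        struct work_item *pending = NULL;
        unsigned long ino = start_ino;
        int ret;

        for (;;) {
            ret = process_dir(ino, &pending);
            if (ret || !pending)
                break;
            struct work_item *done = pending;

            ino = done->ino;         /* pop the next queued directory */
            pending = done->next;
            free(done);
        }
        while (pending) {            /* free leftovers if we bailed on error */
            struct work_item *done = pending;

            pending = pending->next;
            free(done);
        }
        return ret;
    }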
+
struct btrfs_ino_list {
u64 ino;
u64 parent;
struct list_head list;
};
-static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
- u64 ino, u64 parent)
+static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ino_list *curr;
+ struct btrfs_ino_list *next;
+
+ list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
+ list_del(&curr->list);
+ kfree(curr);
+ }
+}
+
+static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
+ struct btrfs_path *path)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (WARN_ON_ONCE(ret > 0)) {
+ /*
+ * We have previously found the inode through the commit root
+ * so this should not happen. If it does, just error out and
+ * fallback to a transaction commit.
+ */
+ ret = -ENOENT;
+ } else if (ret == 0) {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
+ ret = 1;
+ }
+
+ btrfs_release_path(path);
+ path->search_commit_root = 0;
+ path->skip_locking = 0;
+
+ return ret;
+}
+
+static int add_conflicting_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 ino, u64 parent,
+ struct btrfs_log_ctx *ctx)
{
struct btrfs_ino_list *ino_elem;
- LIST_HEAD(inode_list);
- int ret = 0;
+ struct inode *inode;
+
+ /*
+ * It's rare to have a lot of conflicting inodes; in practice it is not
+ * common to have more than 1 or 2. We don't want to collect too many,
+ * as we could end up logging too many inodes (even if only in
+ * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
+ * commits.
+ */
+ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
+ return BTRFS_LOG_FORCE_COMMIT;
+
+ inode = btrfs_iget(root->fs_info->sb, ino, root);
+ /*
+ * If the other inode that had a conflicting dir entry was deleted in
+ * the current transaction then we either:
+ *
+ * 1) Log the parent directory (later after adding it to the list) if
+ * the inode is a directory. This is because it may be a deleted
+ * subvolume/snapshot or it may be a regular directory that had
+ * deleted subvolumes/snapshots (or subdirectories that had them),
+ * and at the moment we can't deal with dropping subvolumes/snapshots
+ * during log replay. So we just log the parent, which will result in
+ * a fallback to a transaction commit if we are dealing with those
+ * cases (last_unlink_trans will match the current transaction);
+ *
+ * 2) Do nothing if it's not a directory. During log replay we simply
+ * unlink the conflicting dentry from the parent directory and then
+ * add the dentry for our inode. Like this we can avoid logging the
+ * parent directory (and maybe fallback to a transaction commit in
+ * case it has a last_unlink_trans == trans->transid, due to moving
+ * some inode from it to some other directory).
+ */
+ if (IS_ERR(inode)) {
+ int ret = PTR_ERR(inode);
+
+ if (ret != -ENOENT)
+ return ret;
+
+ ret = conflicting_inode_is_dir(root, ino, path);
+ /* Not a directory or we got an error. */
+ if (ret <= 0)
+ return ret;
+
+ /* Conflicting inode is a directory, so we'll log its parent. */
+ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+ if (!ino_elem)
+ return -ENOMEM;
+ ino_elem->ino = ino;
+ ino_elem->parent = parent;
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
+
+ return 0;
+ }
+
+ /*
+ * If the inode was already logged skip it - otherwise we can hit an
+ * infinite loop. Example:
+ *
+ * From the commit root (previous transaction) we have the following
+ * inodes:
+ *
+ * inode 257 a directory
+ * inode 258 with references "zz" and "zz_link" on inode 257
+ * inode 259 with reference "a" on inode 257
+ *
+ * And in the current (uncommitted) transaction we have:
+ *
+ * inode 257 a directory, unchanged
+ * inode 258 with references "a" and "a2" on inode 257
+ * inode 259 with reference "zz_link" on inode 257
+ * inode 261 with reference "zz" on inode 257
+ *
+ * When logging inode 261 the following infinite loop could
+ * happen if we don't skip already logged inodes:
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 261
+ * on reference "zz", and log it;
+ *
+ * - we detect inode 259 as a conflicting inode, with inode 258
+ * on reference "a", and log it;
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 259
+ * on reference "zz_link", and log it - again! After this we
+ * repeat the above steps forever.
+ *
+ * Here we can use need_log_inode() because we only need to log the
+ * inode in LOG_INODE_EXISTS mode and rename operations update the log,
+ * so that the log ends up with the new name and without the old name.
+ */
+ if (!need_log_inode(trans, BTRFS_I(inode))) {
+ btrfs_add_delayed_iput(inode);
+ return 0;
+ }
+
+ btrfs_add_delayed_iput(inode);
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
return -ENOMEM;
ino_elem->ino = ino;
ino_elem->parent = parent;
- list_add_tail(&ino_elem->list, &inode_list);
+ list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+ ctx->num_conflict_inodes++;
- while (!list_empty(&inode_list)) {
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_key key;
- struct inode *inode;
+ return 0;
+}
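add_conflicting_inode() caps the amount of deferred work instead of failing
outright: once MAX_CONFLICT_INODES entries are queued it returns
BTRFS_LOG_FORCE_COMMIT, so the fsync degrades to a full transaction commit.
A tiny sketch of that convention (the cap and names are illustrative):

    #define MAX_PENDING 10      /* stands in for MAX_CONFLICT_INODES */
    #define FORCE_FALLBACK 1    /* stands in for BTRFS_LOG_FORCE_COMMIT */

    struct log_ctx {
        unsigned long pending[MAX_PENDING];
        int num_pending;
    };

    static int add_pending(struct log_ctx *ctx, unsigned long ino)
    {
        if (ctx->num_pending >= MAX_PENDING)
            return FORCE_FALLBACK;  /* too many: take the slow, safe path */
        ctx->pending[ctx->num_pending++] = ino;
        return 0;                   /* queued; logged later in one drain pass */
    }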
- ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
- list);
- ino = ino_elem->ino;
- parent = ino_elem->parent;
- list_del(&ino_elem->list);
- kfree(ino_elem);
- if (ret)
- continue;
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
- btrfs_release_path(path);
+ /*
+ * Conflicting inodes are logged by the first call to btrfs_log_inode(),
+ * otherwise we could have unbounded recursion of btrfs_log_inode()
+ * calls. This check guarantees we can have only 1 level of recursion.
+ */
+ if (ctx->logging_conflict_inodes)
+ return 0;
+
+ ctx->logging_conflict_inodes = true;
+
+ /*
+ * New conflicting inodes may be found and added to the list while we
+ * are logging a conflicting inode, so keep iterating while the list is
+ * not empty.
+ */
+ while (!list_empty(&ctx->conflict_inodes)) {
+ struct btrfs_ino_list *curr;
+ struct inode *inode;
+ u64 ino;
+ u64 parent;
+
+ curr = list_first_entry(&ctx->conflict_inodes,
+ struct btrfs_ino_list, list);
+ ino = curr->ino;
+ parent = curr->parent;
+ list_del(&curr->list);
+ kfree(curr);
inode = btrfs_iget(fs_info->sb, ino, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
- * directory.
+ * directory. See the comment at add_conflicting_inode().
*/
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- if (ret == -ENOENT) {
- inode = btrfs_iget(fs_info->sb, parent, root);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- } else {
- ret = btrfs_log_inode(trans,
- BTRFS_I(inode),
- LOG_OTHER_INODE_ALL,
- ctx);
- btrfs_add_delayed_iput(inode);
- }
+ if (ret != -ENOENT)
+ break;
+
+ inode = btrfs_iget(fs_info->sb, parent, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ break;
}
+
+ /*
+ * Always log the directory, we cannot make this
+ * conditional on need_log_inode() because the directory
+ * might have been logged in LOG_INODE_EXISTS mode or
+ * the dir index of the conflicting inode is not in a
+ * dir index key range logged for the directory. So we
+ * must make sure the deletion is recorded.
+ */
+ ret = btrfs_log_inode(trans, BTRFS_I(inode),
+ LOG_INODE_ALL, ctx);
+ btrfs_add_delayed_iput(inode);
+ if (ret)
+ break;
continue;
}
+
/*
- * If the inode was already logged skip it - otherwise we can
- * hit an infinite loop. Example:
- *
- * From the commit root (previous transaction) we have the
- * following inodes:
+ * Here we can use need_log_inode() because we only need to log
+ * the inode in LOG_INODE_EXISTS mode and rename operations
+ * update the log, so that the log ends up with the new name and
+ * without the old name.
*
- * inode 257 a directory
- * inode 258 with references "zz" and "zz_link" on inode 257
- * inode 259 with reference "a" on inode 257
- *
- * And in the current (uncommitted) transaction we have:
- *
- * inode 257 a directory, unchanged
- * inode 258 with references "a" and "a2" on inode 257
- * inode 259 with reference "zz_link" on inode 257
- * inode 261 with reference "zz" on inode 257
- *
- * When logging inode 261 the following infinite loop could
- * happen if we don't skip already logged inodes:
- *
- * - we detect inode 258 as a conflicting inode, with inode 261
- * on reference "zz", and log it;
- *
- * - we detect inode 259 as a conflicting inode, with inode 258
- * on reference "a", and log it;
- *
- * - we detect inode 258 as a conflicting inode, with inode 259
- * on reference "zz_link", and log it - again! After this we
- * repeat the above steps forever.
+ * We did this check at add_conflicting_inode(), but here we do
+ * it again because if some other task logged the inode after
+ * that, we can avoid doing it again.
*/
- spin_lock(&BTRFS_I(inode)->lock);
- /*
- * Check the inode's logged_trans only instead of
- * btrfs_inode_in_log(). This is because the last_log_commit of
- * the inode is not updated when we only log that it exists (see
- * btrfs_log_inode()).
- */
- if (BTRFS_I(inode)->logged_trans == trans->transid) {
- spin_unlock(&BTRFS_I(inode)->lock);
+ if (!need_log_inode(trans, BTRFS_I(inode))) {
btrfs_add_delayed_iput(inode);
continue;
}
- spin_unlock(&BTRFS_I(inode)->lock);
+
/*
* We are safe logging the other inode without acquiring its
* lock as long as we log with the LOG_INODE_EXISTS mode. We
@@ -5498,67 +5747,16 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx);
- if (ret) {
- btrfs_add_delayed_iput(inode);
- continue;
- }
-
- key.objectid = ino;
- key.type = BTRFS_INODE_REF_KEY;
- key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- btrfs_add_delayed_iput(inode);
- continue;
- }
-
- while (true) {
- struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
- u64 other_ino = 0;
- u64 other_parent = 0;
-
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- break;
- } else if (ret > 0) {
- ret = 0;
- break;
- }
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid != ino ||
- (key.type != BTRFS_INODE_REF_KEY &&
- key.type != BTRFS_INODE_EXTREF_KEY)) {
- ret = 0;
- break;
- }
-
- ret = btrfs_check_ref_name_override(leaf, slot, &key,
- BTRFS_I(inode), &other_ino,
- &other_parent);
- if (ret < 0)
- break;
- if (ret > 0) {
- ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
- if (!ino_elem) {
- ret = -ENOMEM;
- break;
- }
- ino_elem->ino = other_ino;
- ino_elem->parent = other_parent;
- list_add_tail(&ino_elem->list, &inode_list);
- ret = 0;
- }
- path->slots[0]++;
- }
+ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
+ if (ret)
+ break;
}
+ ctx->logging_conflict_inodes = false;
+ if (ret)
+ free_conflicting_inodes(ctx);
+
return ret;
}
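Two details of log_conflicting_inodes() generalize well: the
logging_conflict_inodes flag caps recursion at one level, and the drain loop
re-tests list emptiness because logging one entry can append new ones. Sketched
in plain C with illustrative names and a fixed-size array standing in for the
list:

    struct log_ctx {
        int draining;               /* mirrors ctx->logging_conflict_inodes */
        unsigned long pending[16];  /* simplified conflict list */
        int num_pending;
    };

    /* Stand-in for btrfs_log_inode(); it may append new pending entries. */
    static int log_one(struct log_ctx *ctx, unsigned long ino)
    {
        (void)ctx;
        (void)ino;
        return 0;
    }

    static int drain_pending(struct log_ctx *ctx)
    {
        int ret = 0;

        if (ctx->draining)
            return 0;               /* nested call: the outer invocation drains */
        ctx->draining = 1;
        /* Re-test emptiness each pass: log_one() can add new entries. */
        while (ctx->num_pending > 0 && ret == 0)
            ret = log_one(ctx, ctx->pending[--ctx->num_pending]);
        ctx->draining = 0;
        if (ret)
            ctx->num_pending = 0;   /* like free_conflicting_inodes() */
        return ret;
    }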
@@ -5569,7 +5767,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_path *dst_path,
const u64 logged_isize,
- const bool recursive_logging,
const int inode_only,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
@@ -5608,8 +5805,8 @@ again:
break;
} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
min_key->type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
+ (inode->generation == trans->transid ||
+ ctx->logging_conflict_inodes)) {
u64 other_ino = 0;
u64 other_parent = 0;
@@ -5633,11 +5830,12 @@ again:
return ret;
ins_nr = 0;
- ret = log_conflicting_inodes(trans, root, path,
- ctx, other_ino, other_parent);
+ btrfs_release_path(path);
+ ret = add_conflicting_inode(trans, root, path,
+ other_ino,
+ other_parent, ctx);
if (ret)
return ret;
- btrfs_release_path(path);
goto next_key;
}
} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
@@ -5720,6 +5918,371 @@ next_key:
return ret;
}
+static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ const struct btrfs_item_batch *batch,
+ const struct btrfs_delayed_item *first_item)
+{
+ const struct btrfs_delayed_item *curr = first_item;
+ int ret;
+
+ ret = btrfs_insert_empty_items(trans, log, path, batch);
+ if (ret)
+ return ret;
+
+ for (int i = 0; i < batch->nr; i++) {
+ char *data_ptr;
+
+ data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+ write_extent_buffer(path->nodes[0], &curr->data,
+ (unsigned long)data_ptr, curr->data_len);
+ curr = list_next_entry(curr, log_list);
+ path->slots[0]++;
+ }
+
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_ins_list,
+ struct btrfs_log_ctx *ctx)
+{
+ /* 195 items (4095 bytes of keys and sizes) fit in a single 4K page. */
+ const int max_batch_size = 195;
+ const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *log = inode->root->log_root;
+ struct btrfs_item_batch batch = {
+ .nr = 0,
+ .total_data_size = 0,
+ };
+ const struct btrfs_delayed_item *first = NULL;
+ const struct btrfs_delayed_item *curr;
+ char *ins_data;
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+ u64 curr_batch_size = 0;
+ int batch_idx = 0;
+ int ret;
+
+ /* We are adding dir index items to the log tree. */
+ lockdep_assert_held(&inode->log_mutex);
+
+ /*
+ * We collect delayed items before copying index keys from the subvolume
+ * to the log tree. However just after we collected them, they may have
+ * been flushed (all of them or just some of them), and therefore we
+ * could have copied them from the subvolume tree to the log tree.
+ * So find the first delayed item that was not yet logged (they are
+ * sorted by index number).
+ */
+ list_for_each_entry(curr, delayed_ins_list, log_list) {
+ if (curr->index > inode->last_dir_index_offset) {
+ first = curr;
+ break;
+ }
+ }
+
+ /* Empty list or all delayed items were already logged. */
+ if (!first)
+ return 0;
+
+ ins_data = kmalloc(max_batch_size * sizeof(u32) +
+ max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+ ins_sizes = (u32 *)ins_data;
+ batch.data_sizes = ins_sizes;
+ ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
+ batch.keys = ins_keys;
+
+ curr = first;
+ while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
+ const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
+
+ if (curr_batch_size + curr_size > leaf_data_size ||
+ batch.nr == max_batch_size) {
+ ret = insert_delayed_items_batch(trans, log, path,
+ &batch, first);
+ if (ret)
+ goto out;
+ batch_idx = 0;
+ batch.nr = 0;
+ batch.total_data_size = 0;
+ curr_batch_size = 0;
+ first = curr;
+ }
+
+ ins_sizes[batch_idx] = curr->data_len;
+ ins_keys[batch_idx].objectid = ino;
+ ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
+ ins_keys[batch_idx].offset = curr->index;
+ curr_batch_size += curr_size;
+ batch.total_data_size += curr->data_len;
+ batch.nr++;
+ batch_idx++;
+ curr = list_next_entry(curr, log_list);
+ }
+
+ ASSERT(batch.nr >= 1);
+ ret = insert_delayed_items_batch(trans, log, path, &batch, first);
+
+ curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
+ log_list);
+ inode->last_dir_index_offset = curr->index;
+out:
+ kfree(ins_data);
+
+ return ret;
+}
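The batching above flushes whenever either limit is reached: the item count cap
or the leaf data size. A self-contained sketch of that accumulate-and-flush
loop (the size constants are illustrative; the real bound comes from
BTRFS_LEAF_DATA_SIZE()):

    #include <stddef.h>

    #define MAX_BATCH_COUNT 195     /* same spirit as max_batch_size above */
    #define LEAF_DATA_BYTES 16000   /* illustrative; really BTRFS_LEAF_DATA_SIZE() */

    struct item {
        size_t size;                /* data length plus per-item overhead */
    };

    /* Stand-in for insert_delayed_items_batch(). */
    static int flush_batch(const struct item *first, int nr)
    {
        (void)first;
        (void)nr;
        return 0;
    }

    static int batch_insert(const struct item *items, int nr_items)
    {
        size_t batch_bytes = 0;
        int batch_start = 0;
        int batch_nr = 0;

        for (int i = 0; i < nr_items; i++) {
            if (batch_bytes + items[i].size > LEAF_DATA_BYTES ||
                batch_nr == MAX_BATCH_COUNT) {
                int ret = flush_batch(&items[batch_start], batch_nr);

                if (ret)
                    return ret;
                batch_start = i;
                batch_nr = 0;
                batch_bytes = 0;
            }
            batch_bytes += items[i].size;
            batch_nr++;
        }
        /* Flush the final, possibly partial batch. */
        return batch_nr ? flush_batch(&items[batch_start], batch_nr) : 0;
    }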
+
+static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ const u64 ino = btrfs_ino(inode);
+ const struct btrfs_delayed_item *curr;
+
+ curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+ log_list);
+
+ while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+ u64 first_dir_index = curr->index;
+ u64 last_dir_index;
+ const struct btrfs_delayed_item *next;
+ int ret;
+
+ /*
+ * Find a range of consecutive dir index items to delete. Like
+ * this we log a single dir range item spanning several contiguous
+ * dir items instead of logging one range item per dir index item.
+ */
+ next = list_next_entry(curr, log_list);
+ while (!list_entry_is_head(next, delayed_del_list, log_list)) {
+ if (next->index != curr->index + 1)
+ break;
+ curr = next;
+ next = list_next_entry(next, log_list);
+ }
+
+ last_dir_index = curr->index;
+ ASSERT(last_dir_index >= first_dir_index);
+
+ ret = insert_dir_log_key(trans, inode->root->log_root, path,
+ ino, first_dir_index, last_dir_index);
+ if (ret)
+ return ret;
+ curr = list_next_entry(curr, log_list);
+ }
+
+ return 0;
+}
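The inner loop above coalesces runs of consecutive index numbers so that one
BTRFS_DIR_LOG_INDEX_KEY range item covers many deletions. The same coalescing,
extracted into a runnable C sketch over a sorted array:

    #include <stdint.h>
    #include <stdio.h>

    /* Emit one [first, last] range per run of consecutive index numbers.
     * The input must be sorted, as the delayed items are (sorted by index). */
    static void emit_ranges(const uint64_t *idx, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            uint64_t first = idx[i];
            uint64_t last = first;

            while (i + 1 < n && idx[i + 1] == last + 1)
                last = idx[++i];
            printf("range [%llu, %llu]\n",
                   (unsigned long long)first, (unsigned long long)last);
        }
    }

    int main(void)
    {
        const uint64_t idx[] = { 4, 5, 6, 9, 10, 15 };

        /* Prints three ranges: [4, 6], [9, 10] and [15, 15]. */
        emit_ranges(idx, sizeof(idx) / sizeof(idx[0]));
        return 0;
    }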
+
+static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_log_ctx *ctx,
+ const struct list_head *delayed_del_list,
+ const struct btrfs_delayed_item *first,
+ const struct btrfs_delayed_item **last_ret)
+{
+ const struct btrfs_delayed_item *next;
+ struct extent_buffer *leaf = path->nodes[0];
+ const int last_slot = btrfs_header_nritems(leaf) - 1;
+ int slot = path->slots[0] + 1;
+ const u64 ino = btrfs_ino(inode);
+
+ next = list_next_entry(first, log_list);
+
+ while (slot < last_slot &&
+ !list_entry_is_head(next, delayed_del_list, log_list)) {
+ struct btrfs_key key;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino ||
+ key.type != BTRFS_DIR_INDEX_KEY ||
+ key.offset != next->index)
+ break;
+
+ slot++;
+ *last_ret = next;
+ next = list_next_entry(next, log_list);
+ }
+
+ return btrfs_del_items(trans, inode->root->log_root, path,
+ path->slots[0], slot - path->slots[0]);
+}
+
+static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ const struct btrfs_delayed_item *curr;
+ u64 last_range_start;
+ u64 last_range_end = 0;
+ struct btrfs_key key;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_DIR_INDEX_KEY;
+ curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+ log_list);
+
+ while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+ const struct btrfs_delayed_item *last = curr;
+ u64 first_dir_index = curr->index;
+ u64 last_dir_index;
+ bool deleted_items = false;
+ int ret;
+
+ key.offset = curr->index;
+ ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
+ ret = batch_delete_dir_index_items(trans, inode, path, ctx,
+ delayed_del_list, curr,
+ &last);
+ if (ret)
+ return ret;
+ deleted_items = true;
+ }
+
+ btrfs_release_path(path);
+
+ /*
+ * If we deleted items from the leaf, it means we have a range
+ * item logging their range, so no need to add one or update an
+ * existing one. Otherwise we have to log a dir range item.
+ */
+ if (deleted_items)
+ goto next_batch;
+
+ last_dir_index = last->index;
+ ASSERT(last_dir_index >= first_dir_index);
+ /*
+ * If this range starts right after where the previous one ends,
+ * then we want to reuse the previous range item and change its
+ * end offset to the end of this range. This is just to minimize
+ * leaf space usage, by avoiding adding a new range item.
+ */
+ if (last_range_end != 0 && first_dir_index == last_range_end + 1)
+ first_dir_index = last_range_start;
+
+ ret = insert_dir_log_key(trans, log, path, key.objectid,
+ first_dir_index, last_dir_index);
+ if (ret)
+ return ret;
+
+ last_range_start = first_dir_index;
+ last_range_end = last_dir_index;
+next_batch:
+ curr = list_next_entry(last, log_list);
+ }
+
+ return 0;
+}
+
+static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ const struct list_head *delayed_del_list,
+ struct btrfs_log_ctx *ctx)
+{
+ /*
+ * We are deleting dir index items from the log tree or adding range
+ * items to it.
+ */
+ lockdep_assert_held(&inode->log_mutex);
+
+ if (list_empty(delayed_del_list))
+ return 0;
+
+ if (ctx->logged_before)
+ return log_delayed_deletions_incremental(trans, inode, path,
+ delayed_del_list, ctx);
+
+ return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
+ ctx);
+}
+
+/*
+ * Similar logic to log_new_dir_dentries(), but it iterates over the delayed
+ * items instead of the subvolume tree.
+ */
+static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ const struct list_head *delayed_ins_list,
+ struct btrfs_log_ctx *ctx)
+{
+ const bool orig_log_new_dentries = ctx->log_new_dentries;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_item *item;
+ int ret = 0;
+
+ /*
+ * We don't need the log mutex here, and not taking it also avoids
+ * potential deadlocks or lockdep annotations due to nesting of delayed
+ * inode mutexes and log mutexes.
+ */
+ lockdep_assert_not_held(&inode->log_mutex);
+
+ ASSERT(!ctx->logging_new_delayed_dentries);
+ ctx->logging_new_delayed_dentries = true;
+
+ list_for_each_entry(item, delayed_ins_list, log_list) {
+ struct btrfs_dir_item *dir_item;
+ struct inode *di_inode;
+ struct btrfs_key key;
+ int log_mode = LOG_INODE_EXISTS;
+
+ dir_item = (struct btrfs_dir_item *)item->data;
+ btrfs_disk_key_to_cpu(&key, &dir_item->location);
+
+ if (key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ break;
+ }
+
+ if (!need_log_inode(trans, BTRFS_I(di_inode))) {
+ btrfs_add_delayed_iput(di_inode);
+ continue;
+ }
+
+ if (btrfs_stack_dir_type(dir_item) == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
+
+ if (!ret && ctx->log_new_dentries)
+ ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
+
+ btrfs_add_delayed_iput(di_inode);
+
+ if (ret)
+ break;
+ }
+
+ ctx->log_new_dentries = orig_log_new_dentries;
+ ctx->logging_new_delayed_dentries = false;
+
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -5751,9 +6314,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
u64 logged_isize = 0;
bool need_log_inode_item = true;
bool xattrs_logged = false;
- bool recursive_logging = false;
bool inode_item_dropped = true;
- const bool orig_logged_before = ctx->logged_before;
+ bool full_dir_logging = false;
+ LIST_HEAD(delayed_ins_list);
+ LIST_HEAD(delayed_del_list);
path = btrfs_alloc_path();
if (!path)
@@ -5781,27 +6345,58 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
+ if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
+ full_dir_logging = true;
+
/*
- * Only run delayed items if we are a directory. We want to make sure
- * all directory indexes hit the fs/subvolume tree so we can find them
- * and figure out which index ranges have to be logged.
+ * If we are logging a directory while we are logging dentries of the
+ * delayed items of some other inode, then we need to flush the delayed
+ * items of this directory and not log the delayed items directly. This
+ * is to prevent more than one level of recursion into btrfs_log_inode()
+ * by having something like this:
+ *
+ * $ mkdir -p a/b/c/d/e/f/g/h/...
+ * $ xfs_io -c "fsync" a
+ *
+ * Where all directories in the path did not exist before and are
+ * created in the current transaction.
+ * So in such a case we directly log the delayed items of the main
+ * directory ("a") without flushing them first, while for each of its
+ * subdirectories we flush their delayed items before logging them.
+ * This prevents a potential unbounded recursion like this:
+ *
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * btrfs_log_inode()
+ * log_new_delayed_dentries()
+ * (...)
+ *
+ * We have thresholds for the maximum number of delayed items to have in
+ * memory, and once they are hit, the items are flushed asynchronously.
+ * However the limit is quite high, so let's prevent deep levels of
+ * recursion by limiting the maximum depth to 1.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode)) {
+ if (full_dir_logging && ctx->logging_new_delayed_dentries) {
ret = btrfs_commit_inode_delayed_items(trans, inode);
if (ret)
goto out;
}
- if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
- recursive_logging = true;
- if (inode_only == LOG_OTHER_INODE)
- inode_only = LOG_INODE_EXISTS;
- else
- inode_only = LOG_INODE_ALL;
- mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
- } else {
- mutex_lock(&inode->log_mutex);
- }
+ mutex_lock(&inode->log_mutex);
+
+ /*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+ * log replay, which is invalid on Linux (symlink(2) returns -ENOENT if
+ * one attempts to create an empty symlink).
+ * We don't need to worry about flushing delalloc, because we create the
+ * inline extent when the symlink is created (we never have delalloc for
+ * symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
/*
* Before logging the inode item, cache the value returned by
@@ -5821,11 +6416,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* to known the file was moved from A to B, so logging just A would
* result in losing the file after a log replay.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode) &&
- inode_only == LOG_INODE_ALL &&
- inode->last_unlink_trans >= trans->transid) {
+ if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
btrfs_set_log_full_commit(trans);
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto out_unlock;
}
@@ -5834,14 +6427,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* copies of everything.
*/
if (S_ISDIR(inode->vfs_inode.i_mode)) {
- int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
-
clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
- if (inode_only == LOG_INODE_EXISTS)
- max_key_type = BTRFS_XATTR_ITEM_KEY;
if (ctx->logged_before)
ret = drop_inode_items(trans, log, path, inode,
- max_key_type);
+ BTRFS_XATTR_ITEM_KEY);
} else {
if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
/*
@@ -5897,9 +6486,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (ret)
goto out_unlock;
+ /*
+ * If we are logging a directory in full mode, collect the delayed items
+ * before iterating the subvolume tree, so that we don't miss any new
+ * dir index items in case they get flushed while or right after we are
+ * iterating the subvolume tree.
+ */
+ if (full_dir_logging && !ctx->logging_new_delayed_dentries)
+ btrfs_log_get_delayed_items(inode, &delayed_ins_list,
+ &delayed_del_list);
+
ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
- recursive_logging, inode_only, ctx,
+ inode_only, ctx,
&need_log_inode_item);
if (ret)
goto out_unlock;
@@ -5952,10 +6551,18 @@ log_extents:
write_unlock(&em_tree->lock);
}
- if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
+ if (full_dir_logging) {
ret = log_directory_changes(trans, inode, path, dst_path, ctx);
if (ret)
goto out_unlock;
+ ret = log_delayed_insertion_items(trans, inode, path,
+ &delayed_ins_list, ctx);
+ if (ret)
+ goto out_unlock;
+ ret = log_delayed_deletion_items(trans, inode, path,
+ &delayed_del_list, ctx);
+ if (ret)
+ goto out_unlock;
}
spin_lock(&inode->lock);
@@ -6008,208 +6615,20 @@ out:
btrfs_free_path(path);
btrfs_free_path(dst_path);
- if (recursive_logging)
- ctx->logged_before = orig_logged_before;
-
- return ret;
-}
-
-/*
- * Check if we need to log an inode. This is used in contexts where while
- * logging an inode we need to log another inode (either that it exists or in
- * full mode). This is used instead of btrfs_inode_in_log() because the later
- * requires the inode to be in the log and have the log transaction committed,
- * while here we do not care if the log transaction was already committed - our
- * caller will commit the log later - and we want to avoid logging an inode
- * multiple times when multiple tasks have joined the same log transaction.
- */
-static bool need_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
-{
- /*
- * If a directory was not modified, no dentries added or removed, we can
- * and should avoid logging it.
- */
- if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
- return false;
-
- /*
- * If this inode does not have new/updated/deleted xattrs since the last
- * time it was logged and is flagged as logged in the current transaction,
- * we can skip logging it. As for new/deleted names, those are updated in
- * the log by link/unlink/rename operations.
- * In case the inode was logged and then evicted and reloaded, its
- * logged_trans will be 0, in which case we have to fully log it since
- * logged_trans is a transient field, not persisted.
- */
- if (inode->logged_trans == trans->transid &&
- !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
- return false;
-
- return true;
-}
-
-struct btrfs_dir_list {
- u64 ino;
- struct list_head list;
-};
-
-/*
- * Log the inodes of the new dentries of a directory. See log_dir_items() for
- * details about the why it is needed.
- * This is a recursive operation - if an existing dentry corresponds to a
- * directory, that directory's new entries are logged too (same behaviour as
- * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
- * the dentries point to we do not lock their i_mutex, otherwise lockdep
- * complains about the following circular lock dependency / possible deadlock:
- *
- * CPU0 CPU1
- * ---- ----
- * lock(&type->i_mutex_dir_key#3/2);
- * lock(sb_internal#2);
- * lock(&type->i_mutex_dir_key#3/2);
- * lock(&sb->s_type->i_mutex_key#14);
- *
- * Where sb_internal is the lock (a counter that works as a lock) acquired by
- * sb_start_intwrite() in btrfs_start_transaction().
- * Not locking i_mutex of the inodes is still safe because:
- *
- * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
- * that while logging the inode new references (names) are added or removed
- * from the inode, leaving the logged inode item with a link count that does
- * not match the number of logged inode reference items. This is fine because
- * at log replay time we compute the real number of links and correct the
- * link count in the inode item (see replay_one_buffer() and
- * link_to_fixup_dir());
- *
- * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
- * while logging the inode's items new index items (key type
- * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
- * has a size that doesn't match the sum of the lengths of all the logged
- * names - this is ok, not a problem, because at log replay time we set the
- * directory's i_size to the correct value (see replay_one_name() and
- * do_overwrite_item()).
- */
-static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *start_inode,
- struct btrfs_log_ctx *ctx)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
- LIST_HEAD(dir_list);
- struct btrfs_dir_list *dir_elem;
- int ret = 0;
-
- /*
- * If we are logging a new name, as part of a link or rename operation,
- * don't bother logging new dentries, as we just want to log the names
- * of an inode and that any new parents exist.
- */
- if (ctx->logging_new_name)
- return 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
- if (!dir_elem) {
- btrfs_free_path(path);
- return -ENOMEM;
- }
- dir_elem->ino = btrfs_ino(start_inode);
- list_add_tail(&dir_elem->list, &dir_list);
-
- while (!list_empty(&dir_list)) {
- struct extent_buffer *leaf;
- struct btrfs_key min_key;
- int nritems;
- int i;
-
- dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
- list);
- if (ret)
- goto next_dir_inode;
-
- min_key.objectid = dir_elem->ino;
- min_key.type = BTRFS_DIR_INDEX_KEY;
- min_key.offset = 0;
-again:
- btrfs_release_path(path);
- ret = btrfs_search_forward(root, &min_key, path, trans->transid);
- if (ret < 0) {
- goto next_dir_inode;
- } else if (ret > 0) {
- ret = 0;
- goto next_dir_inode;
- }
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- for (i = path->slots[0]; i < nritems; i++) {
- struct btrfs_dir_item *di;
- struct btrfs_key di_key;
- struct inode *di_inode;
- struct btrfs_dir_list *new_dir_elem;
- int log_mode = LOG_INODE_EXISTS;
- int type;
-
- btrfs_item_key_to_cpu(leaf, &min_key, i);
- if (min_key.objectid != dir_elem->ino ||
- min_key.type != BTRFS_DIR_INDEX_KEY)
- goto next_dir_inode;
-
- di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
- type = btrfs_dir_type(leaf, di);
- if (btrfs_dir_transid(leaf, di) < trans->transid)
- continue;
- btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
- if (di_key.type == BTRFS_ROOT_ITEM_KEY)
- continue;
-
- btrfs_release_path(path);
- di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
- if (IS_ERR(di_inode)) {
- ret = PTR_ERR(di_inode);
- goto next_dir_inode;
- }
+ if (ret)
+ free_conflicting_inodes(ctx);
+ else
+ ret = log_conflicting_inodes(trans, inode->root, ctx);
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(di_inode);
- break;
- }
+ if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
+ if (!ret)
+ ret = log_new_delayed_dentries(trans, inode,
+ &delayed_ins_list, ctx);
- ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
- log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
- log_mode, ctx);
- btrfs_add_delayed_iput(di_inode);
- if (ret)
- goto next_dir_inode;
- if (ctx->log_new_dentries) {
- new_dir_elem = kmalloc(sizeof(*new_dir_elem),
- GFP_NOFS);
- if (!new_dir_elem) {
- ret = -ENOMEM;
- goto next_dir_inode;
- }
- new_dir_elem->ino = di_key.objectid;
- list_add_tail(&new_dir_elem->list, &dir_list);
- }
- break;
- }
- if (min_key.offset < (u64)-1) {
- min_key.offset++;
- goto again;
- }
-next_dir_inode:
- list_del(&dir_elem->list);
- kfree(dir_elem);
+ btrfs_log_put_delayed_items(inode, &delayed_ins_list,
+ &delayed_del_list);
}
- btrfs_free_path(path);
return ret;
}
@@ -6321,7 +6740,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
LOG_INODE_ALL, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans, root,
+ ret = log_new_dir_dentries(trans,
BTRFS_I(dir_inode), ctx);
btrfs_add_delayed_iput(dir_inode);
if (ret)
@@ -6539,12 +6958,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
bool log_dentries = false;
if (btrfs_test_opt(fs_info, NOTREELOG)) {
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto end_no_trans;
}
if (btrfs_root_refs(&root->root_item) == 0) {
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
goto end_no_trans;
}
@@ -6636,13 +7055,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
if (log_dentries)
- ret = log_new_dir_dentries(trans, root, inode, ctx);
+ ret = log_new_dir_dentries(trans, inode, ctx);
else
ret = 0;
end_trans:
if (ret < 0) {
btrfs_set_log_full_commit(trans);
- ret = 1;
+ ret = BTRFS_LOG_FORCE_COMMIT;
}
if (ret)
@@ -7006,8 +7425,15 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* anyone from syncing the log until we have updated both inodes
* in the log.
*/
+ ret = join_running_log_trans(root);
+ /*
+ * At least one of the inodes was logged before, so this should
+ * not fail, but if it does it's not serious: just bail out and
+ * mark the log for a full commit.
+ */
+ if (WARN_ON_ONCE(ret < 0))
+ goto out;
log_pinned = true;
- btrfs_pin_log_trans(root);
path = btrfs_alloc_path();
if (!path) {
@@ -7018,12 +7444,12 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
/*
* Other concurrent task might be logging the old directory,
* as it can be triggered when logging other inode that had or
- * still has a dentry in the old directory. So take the old
- * directory's log_mutex to prevent getting an -EEXIST when
- * logging a key to record the deletion, or having that other
- * task logging the old directory get an -EEXIST if it attempts
- * to log the same key after we just did it. In both cases that
- * would result in falling back to a transaction commit.
+ * still has a dentry in the old directory. We lock the old
+ * directory's log_mutex to ensure the deletion of the old
+ * name is persisted, because during directory logging we
+ * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
+ * the old name's dir index item is in the delayed items, so
+ * it could be missed by an in-progress directory logging.
*/
mutex_lock(&old_dir->log_mutex);
ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
@@ -7056,6 +7482,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+ ASSERT(list_empty(&ctx.conflict_inodes));
out:
/*
* If an error happened mark the log for a full commit because it's not
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1620f8170629..aed1e05e9879 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -12,11 +12,15 @@
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
+/* We can't use the tree log for whatever reason, force a transaction commit */
+#define BTRFS_LOG_FORCE_COMMIT (1)
+
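This makes the tree-log return convention tri-state: negative values are
errors, 0 means the log can be synced, and the positive
BTRFS_LOG_FORCE_COMMIT means the log is unusable and the caller must commit
the whole transaction. A hedged userspace sketch of the caller side (the
helper below is a stand-in, not a btrfs function):

    #include <stdio.h>

    #define LOG_FORCE_COMMIT 1      /* mirrors BTRFS_LOG_FORCE_COMMIT */

    /* Stand-in for a logging attempt. */
    static int try_tree_log(void)
    {
        return LOG_FORCE_COMMIT;
    }

    int main(void)
    {
        int ret = try_tree_log();

        if (ret == 0)
            puts("fsync: sync only the tree log (fast path)");
        else if (ret == LOG_FORCE_COMMIT)
            puts("fsync: fall back to a full transaction commit");
        else
            printf("fsync: error %d\n", ret);
        return 0;
    }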
struct btrfs_log_ctx {
int log_ret;
int log_transid;
bool log_new_dentries;
bool logging_new_name;
+ bool logging_new_delayed_dentries;
/* Indicate if the inode being logged was logged before. */
bool logged_before;
/* Tracks the last logged dir item/index key offset. */
@@ -25,6 +29,9 @@ struct btrfs_log_ctx {
struct list_head list;
/* Only used for fast fsyncs. */
struct list_head ordered_extents;
+ struct list_head conflict_inodes;
+ int num_conflict_inodes;
+ bool logging_conflict_inodes;
};
static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
@@ -34,10 +41,14 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->log_transid = 0;
ctx->log_new_dentries = false;
ctx->logging_new_name = false;
+ ctx->logging_new_delayed_dentries = false;
ctx->logged_before = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
INIT_LIST_HEAD(&ctx->ordered_extents);
+ INIT_LIST_HEAD(&ctx->conflict_inodes);
+ ctx->num_conflict_inodes = 0;
+ ctx->logging_conflict_inodes = false;
}
static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 90eb5c2830a9..ee00e33c309e 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -659,8 +659,7 @@ rollback:
*
* Returns the size on success or a negative error code on failure.
*/
-static int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
- size_t buf_size)
+int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
{
u64 true_size;
int ret = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1be7cb2f955f..94ba46d57920 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -34,6 +34,8 @@
#include "discard.h"
#include "zoned.h"
+static struct bio_set btrfs_bioset;
+
#define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID10 | \
BTRFS_BLOCK_GROUP_RAID56_MASK)
@@ -164,24 +166,12 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
*/
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
- if (flags & BTRFS_BLOCK_GROUP_RAID10)
- return BTRFS_RAID_RAID10;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1)
- return BTRFS_RAID_RAID1;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
- return BTRFS_RAID_RAID1C3;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
- return BTRFS_RAID_RAID1C4;
- else if (flags & BTRFS_BLOCK_GROUP_DUP)
- return BTRFS_RAID_DUP;
- else if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return BTRFS_RAID_RAID0;
- else if (flags & BTRFS_BLOCK_GROUP_RAID5)
- return BTRFS_RAID_RAID5;
- else if (flags & BTRFS_BLOCK_GROUP_RAID6)
- return BTRFS_RAID_RAID6;
-
- return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+ const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+
+ if (!profile)
+ return BTRFS_RAID_SINGLE;
+
+ return BTRFS_BG_FLAG_TO_INDEX(profile);
}
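The rewrite works because each RAID profile is a single bit in the flags word,
so the bit position maps directly to an array index (which is what the
BTRFS_BG_FLAG_TO_INDEX() macro computes). A userspace sketch of the
bit-to-index trick; the bit positions and index order below are illustrative,
not the kernel's actual values:

    #include <stdint.h>
    #include <stdio.h>

    enum { IDX_RAID0, IDX_RAID1, IDX_DUP, IDX_SINGLE };

    #define BG_RAID0 (1ULL << 3)    /* illustrative bit positions */
    #define BG_RAID1 (1ULL << 4)
    #define BG_DUP   (1ULL << 5)
    #define BG_PROFILE_MASK (BG_RAID0 | BG_RAID1 | BG_DUP)
    #define BG_PROFILE_SHIFT 3      /* bit number of the first profile flag */

    static int flags_to_index(uint64_t flags)
    {
        uint64_t profile = flags & BG_PROFILE_MASK;

        if (!profile)               /* no profile bit set means SINGLE */
            return IDX_SINGLE;
        /* At most one profile bit is set, so its bit number minus the
         * base bit number is the array index. */
        return (int)(__builtin_ctzll(profile) - BG_PROFILE_SHIFT);
    }

    int main(void)
    {
        printf("%d\n", flags_to_index(BG_RAID1));  /* 1 == IDX_RAID1 */
        printf("%d\n", flags_to_index(0));         /* 3 == IDX_SINGLE */
        return 0;
    }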
const char *btrfs_bg_type_to_raid_name(u64 flags)
@@ -194,6 +184,13 @@ const char *btrfs_bg_type_to_raid_name(u64 flags)
return btrfs_raid_array[index].raid_name;
}
+int btrfs_nr_parity_stripes(u64 type)
+{
+ enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type);
+
+ return btrfs_raid_array[index].nparity;
+}
+
/*
* Fill @buf with textual description of @bg_flags, no more than @size_buf
* bytes including terminating null byte.
@@ -250,13 +247,12 @@ out_overflow:;
static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
-static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
- enum btrfs_map_op op,
- u64 logical, u64 *length,
+ enum btrfs_map_op op, u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret,
- int mirror_num, int need_raid_map);
+ struct btrfs_io_stripe *smap,
+ int *mirror_num_ret, int need_raid_map);
/*
* Device locking
@@ -405,7 +401,6 @@ void btrfs_free_device(struct btrfs_device *device)
WARN_ON(!list_empty(&device->post_commit_list));
rcu_string_free(device->name);
extent_io_tree_release(&device->alloc_state);
- bio_put(device->flush_bio);
btrfs_destroy_dev_zone_info(device);
kfree(device);
}
@@ -643,7 +638,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+ if (!bdev_nonrot(bdev))
fs_devices->rotating = true;
device->bdev = bdev;
@@ -1409,12 +1404,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
switch (device->fs_devices->chunk_alloc_policy) {
case BTRFS_CHUNK_ALLOC_REGULAR:
- /*
- * We don't want to overwrite the superblock on the drive nor
- * any area used by the boot loader (grub for example), so we
- * make sure to start at an offset of at least 1MB.
- */
- return max_t(u64, start, SZ_1M);
+ return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
case BTRFS_CHUNK_ALLOC_ZONED:
/*
* We don't care about the starting region like regular
@@ -1896,23 +1886,18 @@ static void update_dev_time(const char *device_path)
path_put(&path);
}
-static int btrfs_rm_dev_item(struct btrfs_device *device)
+static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
{
struct btrfs_root *root = device->fs_info->chunk_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
- struct btrfs_trans_handle *trans;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- btrfs_free_path(path);
- return PTR_ERR(trans);
- }
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
@@ -1923,21 +1908,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device)
if (ret) {
if (ret > 0)
ret = -ENOENT;
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
goto out;
}
ret = btrfs_del_item(trans, root, path);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- }
-
out:
btrfs_free_path(path);
- if (!ret)
- ret = btrfs_commit_transaction(trans);
return ret;
}
@@ -2043,7 +2019,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
struct page *page;
int ret;
- disk_super = btrfs_read_dev_one_super(bdev, copy_num);
+ disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
if (IS_ERR(disk_super))
continue;
@@ -2078,6 +2054,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
struct block_device **bdev, fmode_t *mode)
{
+ struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
@@ -2098,7 +2075,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
- goto out;
+ return ret;
device = btrfs_find_device(fs_info->fs_devices, args);
if (!device) {
@@ -2106,27 +2083,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
ret = -ENOENT;
- goto out;
+ return ret;
}
if (btrfs_pinned_by_swapfile(fs_info, device)) {
btrfs_warn_in_rcu(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
rcu_str_deref(device->name), device->devid);
- ret = -ETXTBSY;
- goto out;
+ return -ETXTBSY;
}
- if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
- ret = BTRFS_ERROR_DEV_TGT_REPLACE;
- goto out;
- }
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
+ return BTRFS_ERROR_DEV_TGT_REPLACE;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
- fs_info->fs_devices->rw_devices == 1) {
- ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
- goto out;
- }
+ fs_info->fs_devices->rw_devices == 1)
+ return BTRFS_ERROR_DEV_ONLY_WRITABLE;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
@@ -2139,14 +2111,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
if (ret)
goto error_undo;
- /*
- * TODO: the superblock still includes this device in its num_devices
- * counter although write_all_supers() is not locked out. This
- * could give a filesystem state which requires a degraded mount.
- */
- ret = btrfs_rm_dev_item(device);
- if (ret)
+ trans = btrfs_start_transaction(fs_info->chunk_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
goto error_undo;
+ }
+
+ ret = btrfs_rm_dev_item(trans, device);
+ if (ret) {
+ /* Any error in dev item removal is critical */
+ btrfs_crit(fs_info,
+ "failed to remove device item for devid %llu: %d",
+ device->devid, ret);
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
btrfs_scrub_cancel_dev(device);
@@ -2229,7 +2209,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
free_fs_devices(cur_devices);
}
-out:
+ ret = btrfs_commit_transaction(trans);
+
return ret;
error_undo:
@@ -2240,7 +2221,7 @@ error_undo:
device->fs_devices->rw_devices++;
mutex_unlock(&fs_info->chunk_mutex);
}
- goto out;
+ return ret;
}
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
@@ -2366,8 +2347,11 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
&bdev, &disk_super);
- if (ret)
+ if (ret) {
+ btrfs_put_dev_args_from_path(args);
return ret;
+ }
+
args->devid = btrfs_stack_device_id(&disk_super->dev_item);
memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
@@ -2715,7 +2699,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
- if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+ if (!bdev_nonrot(bdev))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@ -4072,13 +4056,6 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return true;
- if (fs_info->sectorsize < PAGE_SIZE &&
- bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- btrfs_err(fs_info,
- "RAID56 is not yet supported for sectorsize %u with page size %lu",
- fs_info->sectorsize, PAGE_SIZE);
- return false;
- }
/* Profile is valid and does not have bits outside of the allowed set */
if (alloc_profile_is_valid(bargs->target, 1) &&
(bargs->target & ~allowed) == 0)
@@ -4439,10 +4416,12 @@ static int balance_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
+ sb_start_write(fs_info->sb);
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
+ sb_end_write(fs_info->sb);
return ret;
}
@@ -5098,26 +5077,16 @@ static void init_alloc_chunk_ctl_policy_regular(
struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl)
{
- u64 type = ctl->type;
+ struct btrfs_space_info *space_info;
- if (type & BTRFS_BLOCK_GROUP_DATA) {
- ctl->max_stripe_size = SZ_1G;
- ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
- } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- /* For larger filesystems, use larger metadata chunks */
- if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
- ctl->max_stripe_size = SZ_1G;
- else
- ctl->max_stripe_size = SZ_256M;
- ctl->max_chunk_size = ctl->max_stripe_size;
- } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ctl->max_stripe_size = SZ_32M;
- ctl->max_chunk_size = 2 * ctl->max_stripe_size;
- ctl->devs_max = min_t(int, ctl->devs_max,
- BTRFS_MAX_DEVS_SYS_CHUNK);
- } else {
- BUG();
- }
+ space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
+ ASSERT(space_info);
+
+ ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
+ ctl->max_stripe_size = ctl->max_chunk_size;
+
+ if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
+ ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
/* We don't want a chunk larger than 10% of writable space */
ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
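The per-type branches deleted above move behind space_info->chunk_size. A hedged sketch of a default-size helper reconstructed purely from the deleted branches (the name and placement are assumptions, and the companion space-info change may pick different values, notably for SYSTEM):

	/* Hypothetical helper: keeps the defaults the deleted branches used. */
	static u64 default_chunk_size(const struct btrfs_fs_devices *fs_devices,
				      u64 type)
	{
		if (type & BTRFS_BLOCK_GROUP_DATA)
			return BTRFS_MAX_DATA_CHUNK_SIZE;
		if (type & BTRFS_BLOCK_GROUP_SYSTEM)
			return 2 * SZ_32M;
		/* Metadata: larger chunks for filesystems over 50GiB */
		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
			return SZ_1G;
		return SZ_256M;
	}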
@@ -5300,6 +5269,9 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
ctl->stripe_size);
}
+ /* Stripe size should not go beyond 1G. */
+ ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
+
/* Align to BTRFS_STRIPE_LEN */
ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
ctl->chunk_size = ctl->stripe_size * data_stripes;
@@ -5625,7 +5597,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- bg->chunk_item_inserted = 1;
+ set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
@@ -5747,7 +5719,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
struct extent_map *em;
struct map_lookup *map;
- int ret;
+ enum btrfs_raid_types index;
+ int ret = 1;
em = btrfs_get_chunk_map(fs_info, logical, len);
if (IS_ERR(em))
@@ -5760,10 +5733,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return 1;
map = em->map_lookup;
- if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
- ret = map->num_stripes;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
- ret = map->sub_stripes;
+ index = btrfs_bg_flags_to_raid_index(map->type);
+
+ /* For non-RAID56 profiles, use ncopies from btrfs_raid_array. */
+ if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK))
+ ret = btrfs_raid_array[index].ncopies;
else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
ret = 2;
else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
@@ -5775,8 +5749,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
* stripe under reconstruction.
*/
ret = map->num_stripes;
- else
- ret = 1;
free_extent_map(em);
down_read(&fs_info->dev_replace.rwsem);
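For reference, the non-RAID56 ncopies values being looked up are fixed per profile. An illustrative recap, treated as a reference sketch rather than the authoritative btrfs_raid_array table:

	/* Illustrative only: copies per non-RAID56 profile. */
	static const u8 profile_ncopies[BTRFS_NR_RAID_TYPES] = {
		[BTRFS_RAID_SINGLE]  = 1,
		[BTRFS_RAID_RAID0]   = 1,
		[BTRFS_RAID_DUP]     = 2,
		[BTRFS_RAID_RAID1]   = 2,
		[BTRFS_RAID_RAID10]  = 2,
		[BTRFS_RAID_RAID1C3] = 3,
		[BTRFS_RAID_RAID1C4] = 4,
		/* RAID5/6 stay special-cased above: 2 and 3 copies for reads. */
	};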
@@ -5795,6 +5767,9 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
struct map_lookup *map;
unsigned long len = fs_info->sectorsize;
+ if (!btrfs_fs_incompat(fs_info, RAID56))
+ return len;
+
em = btrfs_get_chunk_map(fs_info, logical, len);
if (!WARN_ON(IS_ERR(em))) {
@@ -5812,6 +5787,9 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
struct map_lookup *map;
int ret = 0;
+ if (!btrfs_fs_incompat(fs_info, RAID56))
+ return 0;
+
em = btrfs_get_chunk_map(fs_info, logical, len);
if (!WARN_ON(IS_ERR(em))) {
@@ -5920,7 +5898,6 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
sizeof(u64) * (total_stripes),
GFP_NOFS|__GFP_NOFAIL);
- atomic_set(&bioc->error, 0);
refcount_set(&bioc->refs, 1);
bioc->fs_info = fs_info;
@@ -5944,18 +5921,17 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc)
kfree(bioc);
}
-/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
/*
* Please note that discard won't be sent to the target device of a
* device replace.
*/
-static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
- u64 logical, u64 *length_ret,
- struct btrfs_io_context **bioc_ret)
+struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length_ret,
+ u32 *num_stripes)
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_io_context *bioc;
+ struct btrfs_discard_stripe *stripes;
u64 length = *length_ret;
u64 offset;
u64 stripe_nr;
@@ -5964,29 +5940,26 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
u64 stripe_cnt;
u64 stripe_len;
u64 stripe_offset;
- u64 num_stripes;
u32 stripe_index;
u32 factor = 0;
u32 sub_stripes = 0;
u64 stripes_per_dev = 0;
u32 remaining_stripes = 0;
u32 last_stripe = 0;
- int ret = 0;
+ int ret;
int i;
- /* Discard always returns a bioc. */
- ASSERT(bioc_ret);
-
em = btrfs_get_chunk_map(fs_info, logical, length);
if (IS_ERR(em))
- return PTR_ERR(em);
+ return ERR_CAST(em);
map = em->map_lookup;
+
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
- goto out;
- }
+ goto out_free_map;
+ }
offset = logical - em->start;
length = min_t(u64, em->start + em->len - logical, length);
@@ -6012,7 +5985,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
- num_stripes = 1;
+ *num_stripes = 1;
stripe_index = 0;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
@@ -6022,7 +5995,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
sub_stripes = map->sub_stripes;
factor = map->num_stripes / sub_stripes;
- num_stripes = min_t(u64, map->num_stripes,
+ *num_stripes = min_t(u64, map->num_stripes,
sub_stripes * stripe_cnt);
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= sub_stripes;
@@ -6032,31 +6005,30 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
last_stripe *= sub_stripes;
} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_DUP)) {
- num_stripes = map->num_stripes;
+ *num_stripes = map->num_stripes;
} else {
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
}
- bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0);
- if (!bioc) {
+ stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
+ if (!stripes) {
ret = -ENOMEM;
- goto out;
+ goto out_free_map;
}
- for (i = 0; i < num_stripes; i++) {
- bioc->stripes[i].physical =
+ for (i = 0; i < *num_stripes; i++) {
+ stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- bioc->stripes[i].dev = map->stripes[stripe_index].dev;
+ stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
- bioc->stripes[i].length = stripes_per_dev *
- map->stripe_len;
+ stripes[i].length = stripes_per_dev * map->stripe_len;
if (i / sub_stripes < remaining_stripes)
- bioc->stripes[i].length += map->stripe_len;
+ stripes[i].length += map->stripe_len;
/*
* Special for the first stripe and
@@ -6067,17 +6039,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
* off end_off
*/
if (i < sub_stripes)
- bioc->stripes[i].length -= stripe_offset;
+ stripes[i].length -= stripe_offset;
if (stripe_index >= last_stripe &&
stripe_index <= (last_stripe +
sub_stripes - 1))
- bioc->stripes[i].length -= stripe_end_offset;
+ stripes[i].length -= stripe_end_offset;
if (i == sub_stripes - 1)
stripe_offset = 0;
} else {
- bioc->stripes[i].length = length;
+ stripes[i].length = length;
}
stripe_index++;
@@ -6087,12 +6059,11 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
}
}
- *bioc_ret = bioc;
- bioc->map_type = map->type;
- bioc->num_stripes = num_stripes;
-out:
free_extent_map(em);
- return ret;
+ return stripes;
+out_free_map:
+ free_extent_map(em);
+ return ERR_PTR(ret);
}
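A hedged sketch of the consumer side of the new interface (the discard submit loop is illustrative and error handling is elided):

	u32 num_stripes = 0;
	struct btrfs_discard_stripe *stripes;
	int i;

	stripes = btrfs_map_discard(fs_info, logical, &length, &num_stripes);
	if (IS_ERR(stripes))
		return PTR_ERR(stripes);

	for (i = 0; i < num_stripes; i++) {
		struct btrfs_discard_stripe *stripe = &stripes[i];

		if (!stripe->dev->bdev)
			continue;
		/* Issue REQ_OP_DISCARD for stripe->physical, stripe->length. */
	}
	kfree(stripes);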
/*
@@ -6122,7 +6093,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
int ret = 0;
ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &length, &bioc, 0, 0);
+ logical, &length, &bioc, NULL, NULL, 0);
if (ret) {
ASSERT(bioc == NULL);
return ret;
@@ -6183,9 +6154,7 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
cache = btrfs_lookup_block_group(fs_info, logical);
- spin_lock(&cache->lock);
- ret = cache->to_copy;
- spin_unlock(&cache->lock);
+ ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
btrfs_put_block_group(cache);
return ret;
@@ -6235,7 +6204,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
bioc->stripes + i;
new->physical = old->physical;
- new->length = old->length;
new->dev = dev_replace->tgtdev;
bioc->tgtdev_map[i] = index_where_to_add;
index_where_to_add++;
@@ -6276,8 +6244,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
bioc->stripes + num_stripes;
tgtdev_stripe->physical = physical_of_found;
- tgtdev_stripe->length =
- bioc->stripes[index_srcdev].length;
tgtdev_stripe->dev = dev_replace->tgtdev;
bioc->tgtdev_map[index_srcdev] = num_stripes;
@@ -6320,7 +6286,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
u64 offset;
u64 stripe_offset;
u64 stripe_nr;
- u64 stripe_len;
+ u32 stripe_len;
u64 raid56_full_stripe_start = (u64)-1;
int data_stripes;
@@ -6331,19 +6297,13 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
offset = logical - em->start;
/* Len of a stripe in a chunk */
stripe_len = map->stripe_len;
- /* Stripe where this block falls in */
- stripe_nr = div64_u64(offset, stripe_len);
- /* Offset of stripe in the chunk */
- stripe_offset = stripe_nr * stripe_len;
- if (offset < stripe_offset) {
- btrfs_crit(fs_info,
-"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
- stripe_offset, offset, em->start, logical, stripe_len);
- return -EINVAL;
- }
+ /*
+ * stripe_nr is the stripe this block falls in, and stripe_offset is
+ * the offset of this block within that stripe.
+ */
+ stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);
+ ASSERT(stripe_offset < U32_MAX);
- /* stripe_offset is the offset of this block in its stripe */
- stripe_offset = offset - stripe_offset;
data_stripes = nr_data_stripes(map);
/* Only stripe based profiles needs to check against stripe length. */
@@ -6390,11 +6350,19 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
return 0;
}
+static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
+ u32 stripe_index, u64 stripe_offset, u64 stripe_nr)
+{
+ dst->dev = map->stripes[stripe_index].dev;
+ dst->physical = map->stripes[stripe_index].physical +
+ stripe_offset + stripe_nr * map->stripe_len;
+}
+
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
- enum btrfs_map_op op,
- u64 logical, u64 *length,
+ enum btrfs_map_op op, u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret,
- int mirror_num, int need_raid_map)
+ struct btrfs_io_stripe *smap,
+ int *mirror_num_ret, int need_raid_map)
{
struct extent_map *em;
struct map_lookup *map;
@@ -6405,6 +6373,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
int data_stripes;
int i;
int ret = 0;
+ int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
int num_stripes;
int max_errors = 0;
int tgtdev_indexes = 0;
@@ -6505,6 +6474,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ASSERT(map->stripe_len == BTRFS_STRIPE_LEN);
if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
stripe_nr = div64_u64(raid56_full_stripe_start,
@@ -6512,9 +6482,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
/* RAID[56] write or recovery. Return all stripes */
num_stripes = map->num_stripes;
- max_errors = nr_parity_stripes(map);
+ max_errors = btrfs_chunk_max_errors(map);
- *length = map->stripe_len;
+ /* Return the length to the full stripe end */
+ *length = min(logical + *length,
+ raid56_full_stripe_start + em->start +
+ data_stripes * stripe_len) - logical;
stripe_index = 0;
stripe_offset = 0;
} else {
@@ -6561,6 +6534,29 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
tgtdev_indexes = num_stripes;
}
+ /*
+ * If this I/O maps to a single device, try to return the device and
+ * physical block information on the stack instead of allocating an
+ * I/O context structure.
+ */
+ if (smap && num_alloc_stripes == 1 &&
+ !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
+ (!need_full_stripe(op) || !dev_replace_is_ongoing ||
+ !dev_replace->tgtdev)) {
+ if (patch_the_first_stripe_for_dev_replace) {
+ smap->dev = dev_replace->tgtdev;
+ smap->physical = physical_to_patch_in_first_stripe;
+ *mirror_num_ret = map->num_stripes + 1;
+ } else {
+ set_io_stripe(smap, map, stripe_index, stripe_offset,
+ stripe_nr);
+ *mirror_num_ret = mirror_num;
+ }
+ *bioc_ret = NULL;
+ ret = 0;
+ goto out;
+ }
+
bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
if (!bioc) {
ret = -ENOMEM;
@@ -6568,9 +6564,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
for (i = 0; i < num_stripes; i++) {
- bioc->stripes[i].physical = map->stripes[stripe_index].physical +
- stripe_offset + stripe_nr * map->stripe_len;
- bioc->stripes[i].dev = map->stripes[stripe_index].dev;
+ set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset,
+ stripe_nr);
stripe_index++;
}
@@ -6637,12 +6632,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret, int mirror_num)
{
- if (op == BTRFS_MAP_DISCARD)
- return __btrfs_map_block_for_discard(fs_info, logical,
- length, bioc_ret);
-
return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
- mirror_num, 0);
+ NULL, &mirror_num, 0);
}
/* For Scrub/replace */
@@ -6650,89 +6641,179 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret)
{
- return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
+ NULL, NULL, 1);
}
-static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
+/*
+ * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
+ * is already initialized by the block layer.
+ */
+static inline void btrfs_bio_init(struct btrfs_bio *bbio,
+ btrfs_bio_end_io_t end_io, void *private)
{
- bio->bi_private = bioc->private;
- bio->bi_end_io = bioc->end_io;
- bio_endio(bio);
+ memset(bbio, 0, offsetof(struct btrfs_bio, bio));
+ bbio->end_io = end_io;
+ bbio->private = private;
+}
- btrfs_put_bioc(bioc);
+/*
+ * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
+ * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
+ *
+ * Just like the underlying bio_alloc_bioset, it will not fail, as it is
+ * backed by a mempool.
+ */
+struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
+ btrfs_bio_init(btrfs_bio(bio), end_io, private);
+ return bio;
+}
+
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
+ btrfs_bio_end_io_t end_io, void *private)
+{
+ struct bio *bio;
+ struct btrfs_bio *bbio;
+
+ ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+
+ bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio, end_io, private);
+
+ bio_trim(bio, offset >> 9, size >> 9);
+ bbio->iter = bio->bi_iter;
+ return bio;
+}
+
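A hedged example of using btrfs_bio_clone_partial() to split an I/O, e.g. resubmitting the second 64KiB of a 128KiB bio on its own (offsets and names are illustrative):

	struct bio *split;

	split = btrfs_bio_clone_partial(orig, SZ_64K, SZ_64K, end_io, priv);
	/* btrfs_bio_init() zeroes the clone's btrfs fields, so file_offset
	 * must be set up again by the caller. */
	btrfs_bio(split)->file_offset = btrfs_bio(orig)->file_offset + SZ_64K;
	btrfs_submit_bio(fs_info, split, 0);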
+static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
+{
+ if (!dev || !dev->bdev)
+ return;
+ if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
+ return;
+
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+ else if (!(bio->bi_opf & REQ_RAHEAD))
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+ if (bio->bi_opf & REQ_PREFLUSH)
+ btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}
-static void btrfs_end_bio(struct bio *bio)
+static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
+ struct bio *bio)
+{
+ if (bio->bi_opf & REQ_META)
+ return fs_info->endio_meta_workers;
+ return fs_info->endio_workers;
+}
+
+static void btrfs_end_bio_work(struct work_struct *work)
+{
+ struct btrfs_bio *bbio =
+ container_of(work, struct btrfs_bio, end_io_work);
+
+ bbio->end_io(bbio);
+}
+
+static void btrfs_simple_end_io(struct bio *bio)
+{
+ struct btrfs_fs_info *fs_info = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(fs_info);
+
+ if (bio->bi_status)
+ btrfs_log_dev_io_error(bio, bbio->device);
+
+ if (bio_op(bio) == REQ_OP_READ) {
+ INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
+ queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
+ } else {
+ bbio->end_io(bbio);
+ }
+}
+
+static void btrfs_raid56_end_io(struct bio *bio)
{
struct btrfs_io_context *bioc = bio->bi_private;
- int is_orig_bio = 0;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(bioc->fs_info);
+ bbio->mirror_num = bioc->mirror_num;
+ bbio->end_io(bbio);
+
+ btrfs_put_bioc(bioc);
+}
+
+static void btrfs_orig_write_end_io(struct bio *bio)
+{
+ struct btrfs_io_stripe *stripe = bio->bi_private;
+ struct btrfs_io_context *bioc = stripe->bioc;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+
+ btrfs_bio_counter_dec(bioc->fs_info);
if (bio->bi_status) {
atomic_inc(&bioc->error);
- if (bio->bi_status == BLK_STS_IOERR ||
- bio->bi_status == BLK_STS_TARGET) {
- struct btrfs_device *dev = btrfs_bio(bio)->device;
-
- ASSERT(dev->bdev);
- if (btrfs_op(bio) == BTRFS_MAP_WRITE)
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_WRITE_ERRS);
- else if (!(bio->bi_opf & REQ_RAHEAD))
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_READ_ERRS);
- if (bio->bi_opf & REQ_PREFLUSH)
- btrfs_dev_stat_inc_and_print(dev,
- BTRFS_DEV_STAT_FLUSH_ERRS);
- }
+ btrfs_log_dev_io_error(bio, stripe->dev);
}
- if (bio == bioc->orig_bio)
- is_orig_bio = 1;
-
- btrfs_bio_counter_dec(bioc->fs_info);
+ /*
+ * Only send an error to the higher layers if it is beyond the tolerance
+ * threshold.
+ */
+ if (atomic_read(&bioc->error) > bioc->max_errors)
+ bio->bi_status = BLK_STS_IOERR;
+ else
+ bio->bi_status = BLK_STS_OK;
- if (atomic_dec_and_test(&bioc->stripes_pending)) {
- if (!is_orig_bio) {
- bio_put(bio);
- bio = bioc->orig_bio;
- }
+ bbio->end_io(bbio);
+ btrfs_put_bioc(bioc);
+}
- btrfs_bio(bio)->mirror_num = bioc->mirror_num;
- /* only send an error to the higher layers if it is
- * beyond the tolerance of the btrfs bio
- */
- if (atomic_read(&bioc->error) > bioc->max_errors) {
- bio->bi_status = BLK_STS_IOERR;
- } else {
- /*
- * this bio is actually up to date, we didn't
- * go over the max number of errors
- */
- bio->bi_status = BLK_STS_OK;
- }
+static void btrfs_clone_write_end_io(struct bio *bio)
+{
+ struct btrfs_io_stripe *stripe = bio->bi_private;
- btrfs_end_bioc(bioc, bio);
- } else if (!is_orig_bio) {
- bio_put(bio);
+ if (bio->bi_status) {
+ atomic_inc(&stripe->bioc->error);
+ btrfs_log_dev_io_error(bio, stripe->dev);
}
+
+ /* Pass on control to the original bio this one was cloned from */
+ bio_endio(stripe->bioc->orig_bio);
+ bio_put(bio);
}
-static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
- u64 physical, struct btrfs_device *dev)
+static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
- struct btrfs_fs_info *fs_info = bioc->fs_info;
+ if (!dev || !dev->bdev ||
+ test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
+ (btrfs_op(bio) == BTRFS_MAP_WRITE &&
+ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
+ bio_io_error(bio);
+ return;
+ }
+
+ bio_set_dev(bio, dev->bdev);
- bio->bi_private = bioc;
- btrfs_bio(bio)->device = dev;
- bio->bi_end_io = btrfs_end_bio;
- bio->bi_iter.bi_sector = physical >> 9;
/*
* For zone append writing, bi_sector must point to the beginning of the
* zone
*/
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
if (btrfs_dev_is_sequential(dev, physical)) {
- u64 zone_start = round_down(physical, fs_info->zone_size);
+ u64 zone_start = round_down(physical,
+ dev->fs_info->zone_size);
bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
} else {
@@ -6740,78 +6821,54 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
bio->bi_opf |= REQ_OP_WRITE;
}
}
- btrfs_debug_in_rcu(fs_info,
- "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
- bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
+ btrfs_debug_in_rcu(dev->fs_info,
+ "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
+ __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
dev->devid, bio->bi_iter.bi_size);
- bio_set_dev(bio, dev->bdev);
- btrfs_bio_counter_inc_noblocked(fs_info);
-
- btrfsic_submit_bio(bio);
+ btrfsic_check_bio(bio);
+ submit_bio(bio);
}
-static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
+static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
- atomic_inc(&bioc->error);
- if (atomic_dec_and_test(&bioc->stripes_pending)) {
- /* Should be the original bio. */
- WARN_ON(bio != bioc->orig_bio);
+ struct bio *orig_bio = bioc->orig_bio, *bio;
- btrfs_bio(bio)->mirror_num = bioc->mirror_num;
- bio->bi_iter.bi_sector = logical >> 9;
- if (atomic_read(&bioc->error) > bioc->max_errors)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_status = BLK_STS_OK;
- btrfs_end_bioc(bioc, bio);
+ ASSERT(bio_op(orig_bio) != REQ_OP_READ);
+
+ /* Reuse the bio embedded into the btrfs_bio for the last mirror */
+ if (dev_nr == bioc->num_stripes - 1) {
+ bio = orig_bio;
+ bio->bi_end_io = btrfs_orig_write_end_io;
+ } else {
+ bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
+ bio_inc_remaining(orig_bio);
+ bio->bi_end_io = btrfs_clone_write_end_io;
}
+
+ bio->bi_private = &bioc->stripes[dev_nr];
+ bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
+ bioc->stripes[dev_nr].bioc = bioc;
+ btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
-blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num)
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
{
- struct btrfs_device *dev;
- struct bio *first_bio = bio;
u64 logical = bio->bi_iter.bi_sector << 9;
- u64 length = 0;
- u64 map_length;
- int ret;
- int dev_nr;
- int total_devs;
+ u64 length = bio->bi_iter.bi_size;
+ u64 map_length = length;
struct btrfs_io_context *bioc = NULL;
-
- length = bio->bi_iter.bi_size;
- map_length = length;
+ struct btrfs_io_stripe smap;
+ int ret;
btrfs_bio_counter_inc_blocked(fs_info);
- ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
- &map_length, &bioc, mirror_num, 1);
+ ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+ &bioc, &smap, &mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(fs_info);
- return errno_to_blk_status(ret);
- }
-
- total_devs = bioc->num_stripes;
- bioc->orig_bio = first_bio;
- bioc->private = first_bio->bi_private;
- bioc->end_io = first_bio->bi_end_io;
- atomic_set(&bioc->stripes_pending, bioc->num_stripes);
-
- if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
- ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
- /* In this case, map_length has been set to the length of
- a single stripe; not the whole write */
- if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- ret = raid56_parity_write(bio, bioc, map_length);
- } else {
- ret = raid56_parity_recover(bio, bioc, map_length,
- mirror_num, 1);
- }
-
- btrfs_bio_counter_dec(fs_info);
- return errno_to_blk_status(ret);
+ btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+ return;
}
if (map_length < length) {
@@ -6821,25 +6878,31 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
BUG();
}
- for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
- dev = bioc->stripes[dev_nr].dev;
- if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
- &dev->dev_state) ||
- (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- bioc_error(bioc, first_bio, logical);
- continue;
- }
-
- if (dev_nr < total_devs - 1)
- bio = btrfs_bio_clone(first_bio);
+ if (!bioc) {
+ /* Single mirror read/write fast path */
+ btrfs_bio(bio)->mirror_num = mirror_num;
+ btrfs_bio(bio)->device = smap.dev;
+ bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
+ bio->bi_private = fs_info;
+ bio->bi_end_io = btrfs_simple_end_io;
+ btrfs_submit_dev_bio(smap.dev, bio);
+ } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ /* Parity RAID write or read recovery */
+ bio->bi_private = bioc;
+ bio->bi_end_io = btrfs_raid56_end_io;
+ if (bio_op(bio) == REQ_OP_READ)
+ raid56_parity_recover(bio, bioc, mirror_num);
else
- bio = first_bio;
+ raid56_parity_write(bio, bioc);
+ } else {
+ /* Write to multiple mirrors */
+ int total_devs = bioc->num_stripes;
+ int dev_nr;
- submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
+ bioc->orig_bio = bio;
+ for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
+ btrfs_submit_mirrored_bio(bioc, dev_nr);
}
- btrfs_bio_counter_dec(fs_info);
- return BLK_STS_OK;
}
static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
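Callers of the void btrfs_submit_bio() now learn about failures only through the end_io callback. A hedged sketch of a synchronous single-page read built on the new interface (the callback and completion wiring are illustrative):

	static void sync_read_end_io(struct btrfs_bio *bbio)
	{
		struct completion *done = bbio->private;

		/* The caller reads the result from bbio->bio.bi_status. */
		complete(done);
	}

	/* ... in the submitting context ... */
	DECLARE_COMPLETION_ONSTACK(done);
	struct bio *bio;

	bio = btrfs_bio_alloc(1, REQ_OP_READ, sync_read_end_io, &done);
	bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
	__bio_add_page(bio, page, PAGE_SIZE, 0);
	btrfs_submit_bio(fs_info, bio, 0);
	wait_for_completion(&done);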
@@ -6956,16 +7019,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
if (!dev)
return ERR_PTR(-ENOMEM);
- /*
- * Preallocate a bio that's always going to be used for flushing device
- * barriers and matches the device lifespan
- */
- dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
- if (!dev->flush_bio) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
- }
-
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->post_commit_list);
@@ -7007,11 +7060,12 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
devid, uuid);
}
-static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
+u64 btrfs_calc_stripe_length(const struct extent_map *em)
{
- const int data_stripes = calc_data_stripes(type, num_stripes);
+ const struct map_lookup *map = em->map_lookup;
+ const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
- return div_u64(chunk_len, data_stripes);
+ return div_u64(em->len, data_stripes);
}
#if BITS_PER_LONG == 32
@@ -7150,8 +7204,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->type = type;
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
map->verified_stripes = 0;
- em->orig_block_len = calc_stripe_length(type, em->len,
- map->num_stripes);
+ em->orig_block_len = btrfs_calc_stripe_length(em);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -7277,7 +7330,8 @@ static int read_one_dev(struct extent_buffer *leaf,
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
- devid = args.devid = btrfs_device_id(leaf, dev_item);
+ devid = btrfs_device_id(leaf, dev_item);
+ args.devid = devid;
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
@@ -7377,7 +7431,6 @@ static int read_one_dev(struct extent_buffer *leaf,
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *root = fs_info->tree_root;
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
@@ -7393,30 +7446,16 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
+
/*
- * This will create extent buffer of nodesize, superblock size is
- * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
- * overallocate but we can keep it as-is, only the first page is used.
+ * We allocate a dummy extent buffer just to use the extent buffer
+ * accessors. There will be unused space after BTRFS_SUPER_INFO_SIZE,
+ * but that's fine, we will not go beyond the system chunk array anyway.
*/
- sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
- root->root_key.objectid, 0);
- if (IS_ERR(sb))
- return PTR_ERR(sb);
+ sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
+ if (!sb)
+ return -ENOMEM;
set_extent_buffer_uptodate(sb);
- /*
- * The sb extent buffer is artificial and just used to read the system array.
- * set_extent_buffer_uptodate() call does not properly mark all it's
- * pages up-to-date when the page is larger: extent does not cover the
- * whole page and consequently check_page_uptodate does not find all
- * the page's extents up-to-date (the hole beyond sb),
- * write_extent_buffer then triggers a WARN_ON.
- *
- * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
- * but sb spans only this function. Add an explicit SetPageUptodate call
- * to silence the warning eg. on PowerPC 64.
- */
- if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
- SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
@@ -7579,6 +7618,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
struct btrfs_key found_key;
int ret;
int slot;
+ int iter_ret = 0;
u64 total_dev = 0;
u64 last_ra_node = 0;
@@ -7622,30 +7662,18 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.offset = 0;
key.type = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto error;
- while (1) {
- struct extent_buffer *node;
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+ struct extent_buffer *node = path->nodes[1];
leaf = path->nodes[0];
slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto error;
- break;
- }
- node = path->nodes[1];
+
if (node) {
if (last_ra_node != node->start) {
readahead_tree_node_children(node);
last_ra_node = node->start;
}
}
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_DEV_ITEM_KEY) {
struct btrfs_dev_item *dev_item;
dev_item = btrfs_item_ptr(leaf, slot,
@@ -7670,7 +7698,11 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
if (ret)
goto error;
}
- path->slots[0]++;
+ }
+ /* Catch errors found during iteration */
+ if (iter_ret < 0) {
+ ret = iter_ret;
+ goto error;
}
/*
@@ -7678,12 +7710,12 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* do another round of validation checks.
*/
if (total_dev != fs_info->fs_devices->total_devices) {
- btrfs_err(fs_info,
- "super_num_devices %llu mismatch with num_devices %llu found here",
+ btrfs_warn(fs_info,
+"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
btrfs_super_num_devices(fs_info->super_copy),
total_dev);
- ret = -EINVAL;
- goto error;
+ fs_info->fs_devices->total_devices = total_dev;
+ btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
}
if (btrfs_super_total_bytes(fs_info->super_copy) <
fs_info->fs_devices->total_rw_bytes) {
@@ -7928,11 +7960,7 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
btrfs_dev_stat_inc(dev, index);
- btrfs_dev_stat_print_on_error(dev);
-}
-static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
-{
if (!dev->dev_stats_valid)
return;
btrfs_err_rl_in_rcu(dev->fs_info,
@@ -8074,7 +8102,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
map = em->map_lookup;
- stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
+ stripe_len = btrfs_calc_stripe_length(em);
if (physical_len != stripe_len) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
@@ -8084,6 +8112,16 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
goto out;
}
+ /*
+ * Very old mkfs.btrfs (before v4.1) will not respect the reserved
+ * space. Although the kernel can handle it without problems, it is
+ * better to warn the user.
+ */
+ if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
+ btrfs_warn(fs_info,
+ "devid %llu physical %llu len %llu inside the reserved space",
+ devid, physical_offset, physical_len);
+
for (i = 0; i < map->num_stripes; i++) {
if (map->stripes[i].dev->devid == devid &&
map->stripes[i].physical == physical_offset) {
@@ -8295,7 +8333,7 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
static int relocating_repair_kthread(void *data)
{
- struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
+ struct btrfs_block_group *cache = data;
struct btrfs_fs_info *fs_info = cache->fs_info;
u64 target;
int ret = 0;
@@ -8319,7 +8357,7 @@ static int relocating_repair_kthread(void *data)
if (!cache)
goto out;
- if (!cache->relocating_repair)
+ if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
goto out;
ret = btrfs_may_alloc_data_chunk(fs_info, target);
@@ -8356,17 +8394,27 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
if (!cache)
return true;
- spin_lock(&cache->lock);
- if (cache->relocating_repair) {
- spin_unlock(&cache->lock);
+ if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
btrfs_put_block_group(cache);
return true;
}
- cache->relocating_repair = 1;
- spin_unlock(&cache->lock);
kthread_run(relocating_repair_kthread, cache,
"btrfs-relocating-repair");
return true;
}
+
+int __init btrfs_bioset_init(void)
+{
+ if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_bio, bio),
+ BIOSET_NEED_BVECS))
+ return -ENOMEM;
+ return 0;
+}
+
+void __cold btrfs_bioset_exit(void)
+{
+ bioset_exit(&btrfs_bioset);
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bd297f23d19e..599b9d5af349 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -17,17 +17,51 @@ extern struct mutex uuid_mutex;
#define BTRFS_STRIPE_LEN SZ_64K
+/* Used by sanity check for btrfs_raid_types. */
+#define const_ffs(n) (__builtin_ctzll(n) + 1)
+
+/*
+ * The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires
+ * RAID0 always to be the lowest profile bit.
+ * Although it's part of on-disk format and should never change, do extra
+ * compile-time sanity checks.
+ */
+static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
+ const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
+static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) >
+ ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
+
+/* ilog2() can handle both constants and variables */
+#define BTRFS_BG_FLAG_TO_INDEX(profile) \
+ ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1))
+
+enum btrfs_raid_types {
+ /* SINGLE is the special one as it doesn't have an on-disk bit. */
+ BTRFS_RAID_SINGLE = 0,
+
+ BTRFS_RAID_RAID0 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0),
+ BTRFS_RAID_RAID1 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1),
+ BTRFS_RAID_DUP = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP),
+ BTRFS_RAID_RAID10 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10),
+ BTRFS_RAID_RAID5 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5),
+ BTRFS_RAID_RAID6 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6),
+ BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3),
+ BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4),
+
+ BTRFS_NR_RAID_TYPES
+};
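A worked example of the mapping, using the on-disk profile bits (RAID0 is 1ULL << 3, so the macro shifts every profile right by ilog2(RAID0) - 1 = 2 before taking ilog2):

	/*
	 * RAID0:   (1ULL << 3)  >> 2 == 1 << 1  -> index 1
	 * RAID1:   (1ULL << 4)  >> 2 == 1 << 2  -> index 2
	 * DUP:     (1ULL << 5)  >> 2 == 1 << 3  -> index 3
	 * ...
	 * RAID1C4: (1ULL << 10) >> 2 == 1 << 8  -> index 8
	 *
	 * Index 0 stays free for SINGLE, so BTRFS_NR_RAID_TYPES == 9.
	 */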
+
struct btrfs_io_geometry {
/* remaining bytes before crossing a stripe */
u64 len;
/* offset of logical address in chunk */
u64 offset;
/* length of single IO stripe */
- u64 stripe_len;
+ u32 stripe_len;
+ /* offset of address in stripe */
+ u32 stripe_offset;
/* number of stripe where address falls */
u64 stripe_nr;
- /* offset of address in stripe */
- u64 stripe_offset;
/* offset of raid56 stripe into the chunk */
u64 raid56_stripe_offset;
};
@@ -121,8 +155,8 @@ struct btrfs_device {
/* bytes used on the current transaction */
u64 commit_bytes_used;
- /* for sending down flush barriers */
- struct bio *flush_bio;
+ /* Bio used for flushing device barriers */
+ struct bio flush_bio;
struct completion flush_wait;
/* per-device scrub information */
@@ -147,6 +181,31 @@ struct btrfs_device {
};
/*
+ * Block group or device which contains an active swapfile. Used for preventing
+ * unsafe operations while a swapfile is active.
+ *
+ * These are sorted on (ptr, inode) (note that a block group or device can
+ * contain more than one swapfile). We compare the pointer values because we
+ * don't actually care what the object is, we just need a quick check whether
+ * the object exists in the rbtree.
+ */
+struct btrfs_swapfile_pin {
+ struct rb_node node;
+ void *ptr;
+ struct inode *inode;
+ /*
+ * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
+ * points to a struct btrfs_device.
+ */
+ bool is_block_group;
+ /*
+ * Only used when 'is_block_group' is true and it is the number of
+ * extents used by a swapfile for this block group ('ptr' field).
+ */
+ int bg_extent_count;
+};
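A hedged sketch of the (ptr, inode) ordering the comment describes, written as a standalone comparator (the actual rbtree insertion presumably open-codes this):

	static int swapfile_pin_cmp(const struct btrfs_swapfile_pin *a,
				    const struct btrfs_swapfile_pin *b)
	{
		if (a->ptr != b->ptr)
			return a->ptr < b->ptr ? -1 : 1;
		if (a->inode != b->inode)
			return a->inode < b->inode ? -1 : 1;
		return 0;
	}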
+
+/*
* If we read those variants at the context of their own lock, we needn't
* use the following helpers, reading them directly is safe.
*/
@@ -321,6 +380,15 @@ struct btrfs_fs_devices {
/ sizeof(struct btrfs_stripe) + 1)
/*
+ * Maximum number of sectors for a single bio to limit the size of the
+ * checksum array. This matches the number of bio_vecs per bio and thus the
+ * I/O size for buffered I/O.
+ */
+#define BTRFS_MAX_BIO_SECTORS (256)
+
+typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
+
+/*
* Additional info to pass along bio.
*
* Mostly for btrfs specific features like csum and mirror_num.
@@ -328,12 +396,22 @@ struct btrfs_fs_devices {
struct btrfs_bio {
unsigned int mirror_num;
+ /* for direct I/O */
+ u64 file_offset;
+
/* @device is for stripe IO submission. */
struct btrfs_device *device;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
struct bvec_iter iter;
+ /* End I/O information supplied to btrfs_bio_alloc */
+ btrfs_bio_end_io_t end_io;
+ void *private;
+
+ /* For read end I/O handling */
+ struct work_struct end_io_work;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
@@ -346,6 +424,20 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
return container_of(bio, struct btrfs_bio, bio);
}
+int __init btrfs_bioset_init(void);
+void __cold btrfs_bioset_exit(void);
+
+struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+ btrfs_bio_end_io_t end_io, void *private);
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
+ btrfs_bio_end_io_t end_io, void *private);
+
+static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
+{
+ bbio->bio.bi_status = status;
+ bbio->end_io(bbio);
+}
+
static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
{
if (bbio->csum != bbio->csum_inline) {
@@ -354,10 +446,36 @@ static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
}
}
+/*
+ * Iterate through a btrfs_bio (@bbio) on a per-sector basis.
+ *
+ * bvl - struct bio_vec
+ * bbio - struct btrfs_bio
+ * iter - struct bvec_iter
+ * bio_offset - unsigned int
+ */
+#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \
+ for ((iter) = (bbio)->iter, (bio_offset) = 0; \
+ (iter).bi_size && \
+ (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \
+ (bio_offset) += fs_info->sectorsize, \
+ bio_advance_iter_single(&(bbio)->bio, &(iter), \
+ (fs_info)->sectorsize))
+
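A hedged usage sketch of the per-sector iterator (the verification callee is hypothetical):

	struct bio_vec bvl;
	struct bvec_iter iter;
	u32 bio_offset;

	btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) {
		/* bvl spans exactly one sector; bio_offset is its byte
		 * offset from the start of the btrfs_bio's own iter. */
		check_sector_csum(bbio, &bvl, bio_offset);	/* hypothetical */
	}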
struct btrfs_io_stripe {
struct btrfs_device *dev;
+ union {
+ /* Block mapping */
+ u64 physical;
+ /* For the endio handler */
+ struct btrfs_io_context *bioc;
+ };
+};
+
+struct btrfs_discard_stripe {
+ struct btrfs_device *dev;
u64 physical;
- u64 length; /* only used for discard mappings */
+ u64 length;
};
/*
@@ -378,12 +496,9 @@ struct btrfs_io_stripe {
*/
struct btrfs_io_context {
refcount_t refs;
- atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
u64 map_type; /* get from map_lookup->type */
- bio_end_io_t *end_io;
struct bio *orig_bio;
- void *private;
atomic_t error;
int max_errors;
int num_stripes;
@@ -427,7 +542,7 @@ struct map_lookup {
u64 type;
int io_align;
int io_width;
- u64 stripe_len;
+ u32 stripe_len;
int num_stripes;
int sub_stripes;
int verified_stripes; /* For mount time dev extent verification */
@@ -496,6 +611,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_io_context **bioc_ret);
+struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 *length_ret,
+ u32 *num_stripes);
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
enum btrfs_map_op op, u64 logical,
struct btrfs_io_geometry *io_geom);
@@ -504,8 +622,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
-blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num);
+void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
@@ -564,6 +681,8 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
+u64 btrfs_calc_stripe_length(const struct extent_map *em);
+int btrfs_nr_parity_stripes(u64 type);
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
@@ -637,4 +756,6 @@ const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
+
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 99abf41b89b9..5bb8d8c86311 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -262,7 +262,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
out:
if (start_trans)
btrfs_end_transaction(trans);
@@ -271,10 +272,12 @@ out:
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
+ struct btrfs_key found_key;
struct btrfs_key key;
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
+ int iter_ret = 0;
int ret = 0;
size_t total_size = 0, size_left = size;
@@ -293,44 +296,23 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
path->reada = READA_FORWARD;
/* search for our xattrs */
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto err;
-
- while (1) {
+ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *leaf;
int slot;
struct btrfs_dir_item *di;
- struct btrfs_key found_key;
u32 item_size;
u32 cur;
leaf = path->nodes[0];
slot = path->slots[0];
- /* this is where we start walking through the path */
- if (slot >= btrfs_header_nritems(leaf)) {
- /*
- * if we've reached the last slot in this leaf we need
- * to go to the next leaf and reset everything
- */
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto err;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
/* check to make sure this item is what we want */
if (found_key.objectid != key.objectid)
break;
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
break;
if (found_key.type < BTRFS_XATTR_ITEM_KEY)
- goto next_item;
+ continue;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
item_size = btrfs_item_size(leaf, slot);
@@ -350,8 +332,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
goto next;
if (!buffer || (name_len + 1) > size_left) {
- ret = -ERANGE;
- goto err;
+ iter_ret = -ERANGE;
+ break;
}
read_extent_buffer(leaf, buffer, name_ptr, name_len);
@@ -363,12 +345,13 @@ next:
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
-next_item:
- path->slots[0]++;
}
- ret = total_size;
-err:
+ if (iter_ret < 0)
+ ret = iter_ret;
+ else
+ ret = total_size;
+
btrfs_free_path(path);
return ret;
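btrfs_for_each_slot() absorbs the search/next-leaf/key-decode boilerplate deleted above. Conceptually (a paraphrase of the iterator's behavior, not the actual macro body), each pass behaves like:

	/* Paraphrased per-iteration logic of btrfs_for_each_slot(): */
	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		iter_ret = btrfs_next_leaf(root, path);
		if (iter_ret)		/* < 0: error, > 0: no more items */
			break;
	}
	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
	/* ... loop body runs, then the slot advances ... */
	path->slots[0]++;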
@@ -388,6 +371,9 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
const char *name, const void *buffer,
size_t size, int flags)
{
+ if (btrfs_root_readonly(BTRFS_I(inode)->root))
+ return -EROFS;
+
name = xattr_full_name(handler, name);
return btrfs_setxattr_trans(inode, name, buffer, size, flags);
}
@@ -403,10 +389,13 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
struct btrfs_root *root = BTRFS_I(inode)->root;
name = xattr_full_name(handler, name);
- ret = btrfs_validate_prop(name, value, size);
+ ret = btrfs_validate_prop(BTRFS_I(inode), name, value, size);
if (ret)
return ret;
+ if (btrfs_ignore_prop(BTRFS_I(inode), name))
+ return 0;
+
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -416,7 +405,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
- BUG_ON(ret);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
btrfs_end_transaction(trans);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 767a0c6c9694..b4f44662cda7 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,7 +97,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret;
- char *data_in;
+ char *data_in = NULL;
char *cpage_out;
int nr_pages = 0;
struct page *in_page = NULL;
@@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[0] = out_page;
nr_pages = 1;
@@ -148,26 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
int i;
for (i = 0; i < in_buf_pages; i++) {
- if (in_page) {
- kunmap(in_page);
+ if (data_in) {
+ kunmap_local(data_in);
put_page(in_page);
}
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = kmap(in_page);
+ data_in = kmap_local_page(in_page);
memcpy(workspace->buf + i * PAGE_SIZE,
data_in, PAGE_SIZE);
start += PAGE_SIZE;
}
workspace->strm.next_in = workspace->buf;
} else {
- if (in_page) {
- kunmap(in_page);
+ if (data_in) {
+ kunmap_local(data_in);
put_page(in_page);
}
in_page = find_get_page(mapping,
start >> PAGE_SHIFT);
- data_in = kmap(in_page);
+ data_in = kmap_local_page(in_page);
start += PAGE_SIZE;
workspace->strm.next_in = data_in;
}
@@ -196,9 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
* the stream end if required
*/
if (workspace->strm.avail_out == 0) {
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
- out_page = NULL;
ret = -E2BIG;
goto out;
}
@@ -207,7 +205,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -234,9 +232,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
} else if (workspace->strm.avail_out == 0) {
/* get another page for the stream end */
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
- out_page = NULL;
ret = -E2BIG;
goto out;
}
@@ -245,7 +241,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
- cpage_out = kmap(out_page);
+ cpage_out = page_address(out_page);
pages[nr_pages] = out_page;
nr_pages++;
workspace->strm.avail_out = PAGE_SIZE;
@@ -264,13 +260,11 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_in = workspace->strm.total_in;
out:
*out_pages = nr_pages;
- if (out_page)
- kunmap(out_page);
-
- if (in_page) {
- kunmap(in_page);
+ if (data_in) {
+ kunmap_local(data_in);
put_page(in_page);
}
+
return ret;
}
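The conversion keys the cleanup off data_in rather than in_page because kunmap_local() takes the mapped address, and local mappings must be released in LIFO order on the same task. A minimal sketch of the pairing rule:

	char *addr = kmap_local_page(page);

	memcpy(buf, addr, PAGE_SIZE);
	kunmap_local(addr);	/* address, not the page, and strictly LIFO */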
@@ -287,7 +281,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
unsigned long buf_start;
struct page **pages_in = cb->compressed_pages;
- data_in = kmap(pages_in[page_in_index]);
+ data_in = kmap_local_page(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
workspace->strm.total_in = 0;
@@ -309,7 +303,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
pr_warn("BTRFS: inflateInit failed\n");
- kunmap(pages_in[page_in_index]);
+ kunmap_local(data_in);
return -EIO;
}
while (workspace->strm.total_in < srclen) {
@@ -336,13 +330,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
- kunmap(pages_in[page_in_index]);
+ kunmap_local(data_in);
page_in_index++;
if (page_in_index >= total_pages_in) {
data_in = NULL;
break;
}
- data_in = kmap(pages_in[page_in_index]);
+ data_in = kmap_local_page(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
workspace->strm.avail_in = min(tmp, PAGE_SIZE);
@@ -355,7 +349,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
done:
zlib_inflateEnd(&workspace->strm);
if (data_in)
- kunmap(pages_in[page_in_index]);
+ kunmap_local(data_in);
if (!ret)
zero_fill_bio(cb->orig_bio);
return ret;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index b7b5fac1c779..e2d073b08a7d 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -51,11 +51,13 @@
#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5)
/*
- * Maximum supported zone size. Currently, SMR disks have a zone size of
- * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
- * expect the zone size to become larger than 8GiB in the near future.
+ * Minimum / maximum supported zone size. Currently, SMR disks have a zone
+ * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
+ * We do not expect the zone size to become larger than 8GiB or smaller than
+ * 4MiB in the near future.
*/
#define BTRFS_MAX_ZONE_SIZE SZ_8G
+#define BTRFS_MIN_ZONE_SIZE SZ_4M
#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
@@ -92,9 +94,9 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
* Possible states of log buffer zones
*
* Empty[0] In use[0] Full[0]
- * Empty[1] * x 0
- * In use[1] 0 x 0
- * Full[1] 1 1 C
+ * Empty[1] * 0 1
+ * In use[1] x x 1
+ * Full[1] 0 0 C
*
* Log position:
* *: Special case, no superblock is written
@@ -350,7 +352,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_zoned_device_info *zone_info = NULL;
struct block_device *bdev = device->bdev;
- struct request_queue *queue = bdev_get_queue(bdev);
unsigned int max_active_zones;
unsigned int nactive;
sector_t nr_sectors;
@@ -402,15 +403,41 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
ret = -EINVAL;
goto out;
+ } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: %s: zone size %llu smaller than supported minimum %u",
+ rcu_str_deref(device->name),
+ zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
+ ret = -EINVAL;
+ goto out;
}
nr_sectors = bdev_nr_sectors(bdev);
zone_info->zone_size_shift = ilog2(zone_info->zone_size);
zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
+ /*
+ * We limit max_zone_append_size also by max_segments *
+ * PAGE_SIZE. Technically, we can have multiple pages per segment. But,
+ * since btrfs adds the pages one by one to a bio, and btrfs cannot
+ * increase the metadata reservation even if it increases the number of
+ * extents, it is safe to stick with the limit.
+ *
+ * With zoned emulation, we can have a non-zoned device in zoned
+ * mode. In this case, we don't have a valid max zone append size. So,
+ * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size.
+ */
+ if (bdev_is_zoned(bdev)) {
+ zone_info->max_zone_append_size = min_t(u64,
+ (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT,
+ (u64)bdev_max_segments(bdev) << PAGE_SHIFT);
+ } else {
+ zone_info->max_zone_append_size =
+ (u64)bdev_max_segments(bdev) << PAGE_SHIFT;
+ }
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
- max_active_zones = queue_max_active_zones(queue);
+ max_active_zones = bdev_max_active_zones(bdev);
if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
btrfs_err_in_rcu(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
@@ -625,74 +652,55 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
return 0;
}
+static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_device *device;
+
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ if (device->bdev &&
+ bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
+ btrfs_err(fs_info,
+ "zoned: mode not enabled but zoned device found: %pg",
+ device->bdev);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
- u64 zoned_devices = 0;
- u64 nr_devices = 0;
u64 zone_size = 0;
- const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
- int ret = 0;
+ u64 max_zone_append_size = 0;
+ int ret;
- /* Count zoned devices */
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- enum blk_zoned_model model;
+ /*
+ * Host-Managed devices can't be used without the ZONED flag. With the
+ * ZONED flag, all devices can be used, using zone emulation if required.
+ */
+ if (!btrfs_fs_incompat(fs_info, ZONED))
+ return btrfs_check_for_zoned_device(fs_info);
+
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
if (!device->bdev)
continue;
- model = bdev_zoned_model(device->bdev);
- /*
- * A Host-Managed zoned device must be used as a zoned device.
- * A Host-Aware zoned device and a non-zoned devices can be
- * treated as a zoned device, if ZONED flag is enabled in the
- * superblock.
- */
- if (model == BLK_ZONED_HM ||
- (model == BLK_ZONED_HA && incompat_zoned) ||
- (model == BLK_ZONED_NONE && incompat_zoned)) {
- struct btrfs_zoned_device_info *zone_info;
-
- zone_info = device->zone_info;
- zoned_devices++;
- if (!zone_size) {
- zone_size = zone_info->zone_size;
- } else if (zone_info->zone_size != zone_size) {
- btrfs_err(fs_info,
+ if (!zone_size) {
+ zone_size = zone_info->zone_size;
+ } else if (zone_info->zone_size != zone_size) {
+ btrfs_err(fs_info,
"zoned: unequal block device zone sizes: have %llu found %llu",
- device->zone_info->zone_size,
- zone_size);
- ret = -EINVAL;
- goto out;
- }
+ zone_info->zone_size, zone_size);
+ return -EINVAL;
}
- nr_devices++;
- }
-
- if (!zoned_devices && !incompat_zoned)
- goto out;
-
- if (!zoned_devices && incompat_zoned) {
- /* No zoned block device found on ZONED filesystem */
- btrfs_err(fs_info,
- "zoned: no zoned devices found on a zoned filesystem");
- ret = -EINVAL;
- goto out;
- }
-
- if (zoned_devices && !incompat_zoned) {
- btrfs_err(fs_info,
- "zoned: mode not enabled but zoned device found");
- ret = -EINVAL;
- goto out;
- }
-
- if (zoned_devices != nr_devices) {
- btrfs_err(fs_info,
- "zoned: cannot mix zoned and regular devices");
- ret = -EINVAL;
- goto out;
+ if (!max_zone_append_size ||
+ (zone_info->max_zone_append_size &&
+ zone_info->max_zone_append_size < max_zone_append_size))
+ max_zone_append_size = zone_info->max_zone_append_size;
}
/*
@@ -704,18 +712,20 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info,
"zoned: zone size %llu not aligned to stripe %u",
zone_size, BTRFS_STRIPE_LEN);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
btrfs_err(fs_info, "zoned: mixed block groups not supported");
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
fs_info->zone_size = zone_size;
+ fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size,
+ fs_info->sectorsize);
fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
+ if (fs_info->max_zone_append_size < fs_info->max_extent_size)
+ fs_info->max_extent_size = fs_info->max_zone_append_size;
/*
* Check mount options here, because we might change fs_info->zoned
@@ -723,11 +733,10 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
*/
ret = btrfs_check_mountopts_zoned(fs_info);
if (ret)
- goto out;
+ return ret;
btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
-out:
- return ret;
+ return 0;
}
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
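
Taken together, the checks above pin down the supported zone geometry: between
BTRFS_MIN_ZONE_SIZE and BTRFS_MAX_ZONE_SIZE, aligned to the stripe length, and
identical across all devices. A hypothetical checker collecting those rules in
one place (BTRFS_STRIPE_LEN is 64KiB in this kernel):

	#include <stdbool.h>
	#include <stdint.h>

	#define BTRFS_MIN_ZONE_SIZE	(4ULL << 20)	/* SZ_4M */
	#define BTRFS_MAX_ZONE_SIZE	(8ULL << 30)	/* SZ_8G */
	#define BTRFS_STRIPE_LEN	(64ULL << 10)	/* SZ_64K */

	/* first_zone_size: zone size of the first device seen, or 0 */
	static bool zone_geometry_ok(uint64_t zone_size,
				     uint64_t first_zone_size)
	{
		if (zone_size < BTRFS_MIN_ZONE_SIZE ||
		    zone_size > BTRFS_MAX_ZONE_SIZE)
			return false;	/* outside the supported range */
		if (zone_size % BTRFS_STRIPE_LEN)
			return false;	/* not aligned to the stripe length */
		return !first_zone_size || zone_size == first_zone_size;
	}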
@@ -1150,7 +1159,7 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
* offset.
*/
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
- u64 *offset_ret)
+ u64 *offset_ret, bool new)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_root *root;
@@ -1160,6 +1169,21 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
int ret;
u64 length;
+ /*
+ * Avoid tree lookups for a new block group; there's no use for it.
+ * Its allocation offset must always be 0.
+ *
+ * Also, we have a lock chain of extent buffer lock -> chunk mutex.
+ * For a new block group, this function is called from
+ * btrfs_make_block_group() which is already taking the chunk mutex.
+ * Thus, we cannot call calculate_alloc_pointer() which takes extent
+ * buffer locks to avoid deadlock.
+ */
+ if (new) {
+ *offset_ret = 0;
+ return 0;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1295,6 +1319,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
else
num_conventional++;
+ /*
+ * Consider a zone as active if we can allow any number of
+ * active zones.
+ */
+ if (!device->zone_info->max_active_zones)
+ __set_bit(i, active);
+
if (!is_sequential) {
alloc_offsets[i] = WP_CONVENTIONAL;
continue;
@@ -1361,45 +1392,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
__set_bit(i, active);
break;
}
-
- /*
- * Consider a zone as active if we can allow any number of
- * active zones.
- */
- if (!device->zone_info->max_active_zones)
- __set_bit(i, active);
}
if (num_sequential > 0)
cache->seq_zone = true;
if (num_conventional > 0) {
- /*
- * Avoid calling calculate_alloc_pointer() for new BG. It
- * is no use for new BG. It must be always 0.
- *
- * Also, we have a lock chain of extent buffer lock ->
- * chunk mutex. For new BG, this function is called from
- * btrfs_make_block_group() which is already taking the
- * chunk mutex. Thus, we cannot call
- * calculate_alloc_pointer() which takes extent buffer
- * locks to avoid deadlock.
- */
-
/* Zone capacity is always zone size in emulation */
cache->zone_capacity = cache->length;
- if (new) {
- cache->alloc_offset = 0;
- goto out;
- }
- ret = calculate_alloc_pointer(cache, &last_alloc);
- if (ret || map->num_stripes == num_conventional) {
- if (!ret)
- cache->alloc_offset = last_alloc;
- else
- btrfs_err(fs_info,
+ ret = calculate_alloc_pointer(cache, &last_alloc, new);
+ if (ret) {
+ btrfs_err(fs_info,
"zoned: failed to determine allocation offset of bg %llu",
- cache->start);
+ cache->start);
+ goto out;
+ } else if (map->num_stripes == num_conventional) {
+ cache->alloc_offset = last_alloc;
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
goto out;
}
}
@@ -1415,7 +1424,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
cache->alloc_offset = alloc_offsets[0];
cache->zone_capacity = caps[0];
- cache->zone_is_active = test_bit(0, active);
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
break;
case BTRFS_BLOCK_GROUP_DUP:
if (map->type & BTRFS_BLOCK_GROUP_DATA) {
@@ -1449,7 +1459,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
} else {
- cache->zone_is_active = test_bit(0, active);
+ if (test_bit(0, active))
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &cache->runtime_flags);
}
cache->alloc_offset = alloc_offsets[0];
cache->zone_capacity = min(caps[0], caps[1]);
@@ -1467,13 +1479,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
- if (cache->zone_is_active) {
- btrfs_get_block_group(cache);
- spin_lock(&fs_info->zone_active_bgs_lock);
- list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
- spin_unlock(&fs_info->zone_active_bgs_lock);
- }
-
out:
if (cache->alloc_offset > fs_info->zone_size) {
btrfs_err(fs_info,
@@ -1498,10 +1503,16 @@ out:
ret = -EIO;
}
- if (!ret)
+ if (!ret) {
cache->meta_write_pointer = cache->alloc_offset + cache->start;
-
- if (ret) {
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
+ btrfs_get_block_group(cache);
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&cache->active_bg_list,
+ &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+ } else {
kfree(cache->physical_map);
cache->physical_map = NULL;
}
@@ -1527,7 +1538,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
free = cache->zone_capacity - cache->alloc_offset;
/* We only need ->free_space in ALLOC_SEQ block groups */
- cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
cache->free_space_ctl->free_space = free;
cache->zone_unusable = unusable;
@@ -1727,12 +1737,14 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
&mapped_length, &bioc);
if (ret || !bioc || mapped_length < PAGE_SIZE) {
- btrfs_put_bioc(bioc);
- return -EIO;
+ ret = -EIO;
+ goto out_put_bioc;
}
- if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- return -EINVAL;
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = -EINVAL;
+ goto out_put_bioc;
+ }
nofs_flag = memalloc_nofs_save();
nmirrors = (int)bioc->num_stripes;
@@ -1751,7 +1763,8 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
break;
}
memalloc_nofs_restore(nofs_flag);
-
+out_put_bioc:
+ btrfs_put_bioc(bioc);
return ret;
}
@@ -1801,7 +1814,6 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
map = em->map_lookup;
/* We only support single profile for now */
- ASSERT(map->num_stripes == 1);
device = map->stripes[0].dev;
free_extent_map(em);
@@ -1819,6 +1831,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_space_info *space_info = block_group->space_info;
struct map_lookup *map;
struct btrfs_device *device;
u64 physical;
@@ -1830,12 +1843,19 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
map = block_group->physical_map;
+ spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (block_group->zone_is_active) {
+ if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
ret = true;
goto out_unlock;
}
+ /* No space left */
+ if (btrfs_zoned_bg_is_full(block_group)) {
+ ret = false;
+ goto out_unlock;
+ }
+
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
physical = map->stripes[i].physical;
@@ -1843,110 +1863,143 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
if (device->zone_info->max_active_zones == 0)
continue;
- /* No space left */
- if (block_group->alloc_offset == block_group->zone_capacity) {
- ret = false;
- goto out_unlock;
- }
-
if (!btrfs_dev_set_active_zone(device, physical)) {
/* Cannot activate the zone */
ret = false;
goto out_unlock;
}
-
- /* Successfully activated all the zones */
- if (i == map->num_stripes - 1)
- block_group->zone_is_active = 1;
-
-
}
+
+ /* Successfully activated all the zones */
+ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
+ space_info->active_total_bytes += block_group->length;
spin_unlock(&block_group->lock);
+ btrfs_try_granting_tickets(fs_info, space_info);
+ spin_unlock(&space_info->lock);
- if (block_group->zone_is_active) {
- /* For the active block group list */
- btrfs_get_block_group(block_group);
+ /* For the active block group list */
+ btrfs_get_block_group(block_group);
- spin_lock(&fs_info->zone_active_bgs_lock);
- list_add_tail(&block_group->active_bg_list,
- &fs_info->zone_active_bgs);
- spin_unlock(&fs_info->zone_active_bgs_lock);
- }
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
return true;
out_unlock:
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
return ret;
}
-int btrfs_zone_finish(struct btrfs_block_group *block_group)
+static void wait_eb_writebacks(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ const u64 end = block_group->start + block_group->length;
+ struct radix_tree_iter iter;
+ struct extent_buffer *eb;
+ void __rcu **slot;
+
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
+ block_group->start >> fs_info->sectorsize_bits) {
+ eb = radix_tree_deref_slot(slot);
+ if (!eb)
+ continue;
+ if (radix_tree_deref_retry(eb)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+
+ if (eb->start < block_group->start)
+ continue;
+ if (eb->start >= end)
+ break;
+
+ slot = radix_tree_iter_resume(slot, &iter);
+ rcu_read_unlock();
+ wait_on_extent_buffer_writeback(eb);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+}
+
+static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct map_lookup *map;
- struct btrfs_device *device;
- u64 physical;
+ const bool is_metadata = (block_group->flags &
+ (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
int ret = 0;
int i;
- if (!btrfs_is_zoned(fs_info))
- return 0;
-
- map = block_group->physical_map;
-
spin_lock(&block_group->lock);
- if (!block_group->zone_is_active) {
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
return 0;
}
/* Check if we have unwritten allocated space */
- if ((block_group->flags &
- (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
- block_group->alloc_offset > block_group->meta_write_pointer) {
+ if (is_metadata &&
+ block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
spin_unlock(&block_group->lock);
return -EAGAIN;
}
- spin_unlock(&block_group->lock);
-
- ret = btrfs_inc_block_group_ro(block_group, false);
- if (ret)
- return ret;
-
- /* Ensure all writes in this block group finish */
- btrfs_wait_block_group_reservations(block_group);
- /* No need to wait for NOCOW writers. Zoned mode does not allow that. */
- btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
- block_group->length);
-
- spin_lock(&block_group->lock);
/*
- * Bail out if someone already deactivated the block group, or
- * allocated space is left in the block group.
+ * If we are sure that the block group is full (= no more room left for
+ * new allocation) and the IO for the last usable block has completed, we
+ * don't need to wait for the other IOs. This holds because we ensure
+ * sequential IO submission using the ZONE_APPEND command for data and
+ * block_group->meta_write_pointer for metadata.
*/
- if (!block_group->zone_is_active) {
+ if (!fully_written) {
spin_unlock(&block_group->lock);
- btrfs_dec_block_group_ro(block_group);
- return 0;
- }
- if (block_group->reserved) {
- spin_unlock(&block_group->lock);
- btrfs_dec_block_group_ro(block_group);
- return -EAGAIN;
+ ret = btrfs_inc_block_group_ro(block_group, false);
+ if (ret)
+ return ret;
+
+ /* Ensure all writes in this block group finish */
+ btrfs_wait_block_group_reservations(block_group);
+ /* No need to wait for NOCOW writers. Zoned mode does not allow that */
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
+ block_group->length);
+ /* Wait for extent buffers to be written. */
+ if (is_metadata)
+ wait_eb_writebacks(block_group);
+
+ spin_lock(&block_group->lock);
+
+ /*
+ * Bail out if someone already deactivated the block group, or
+ * allocated space is left in the block group.
+ */
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &block_group->runtime_flags)) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return 0;
+ }
+
+ if (block_group->reserved) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return -EAGAIN;
+ }
}
- block_group->zone_is_active = 0;
+ clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
block_group->alloc_offset = block_group->zone_capacity;
block_group->free_space_ctl->free_space = 0;
btrfs_clear_treelog_bg(block_group);
btrfs_clear_data_reloc_bg(block_group);
spin_unlock(&block_group->lock);
+ map = block_group->physical_map;
for (i = 0; i < map->num_stripes; i++) {
- device = map->stripes[i].dev;
- physical = map->stripes[i].physical;
+ struct btrfs_device *device = map->stripes[i].dev;
+ const u64 physical = map->stripes[i].physical;
if (device->zone_info->max_active_zones == 0)
continue;
@@ -1961,7 +2014,9 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
btrfs_dev_clear_active_zone(device, physical);
}
- btrfs_dec_block_group_ro(block_group);
+
+ if (!fully_written)
+ btrfs_dec_block_group_ro(block_group);
spin_lock(&fs_info->zone_active_bgs_lock);
ASSERT(!list_empty(&block_group->active_bg_list));
@@ -1971,23 +2026,31 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
/* For active_bg_list */
btrfs_put_block_group(block_group);
+ clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
+
return 0;
}
+int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return 0;
+
+ return do_zone_finish(block_group, false);
+}
+
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
{
+ struct btrfs_fs_info *fs_info = fs_devices->fs_info;
struct btrfs_device *device;
bool ret = false;
- if (!btrfs_is_zoned(fs_devices->fs_info))
+ if (!btrfs_is_zoned(fs_info))
return true;
- /* Non-single profiles are not supported yet */
- ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0);
-
/* Check if there is a device with active zones left */
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ mutex_lock(&fs_info->chunk_mutex);
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
struct btrfs_zoned_device_info *zinfo = device->zone_info;
if (!device->bdev)
@@ -1999,7 +2062,10 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
break;
}
}
- mutex_unlock(&fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ if (!ret)
+ set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
return ret;
}
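
A conversion that recurs through these hunks: the zone_is_active bool in
struct btrfs_block_group becomes one bit of runtime_flags, flipped with the
kernel's atomic bitops. A sketch of the idiom; the wrapper helpers here are
hypothetical:

	#include <linux/bitops.h>

	static inline void bg_mark_zone_active(struct btrfs_block_group *bg,
					       bool active)
	{
		if (active)
			set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
				&bg->runtime_flags);
		else
			clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
				  &bg->runtime_flags);
	}

	static inline bool bg_zone_is_active(struct btrfs_block_group *bg)
	{
		return test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
				&bg->runtime_flags);
	}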
@@ -2007,9 +2073,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
struct btrfs_block_group *block_group;
- struct map_lookup *map;
- struct btrfs_device *device;
- u64 physical;
+ u64 min_alloc_bytes;
if (!btrfs_is_zoned(fs_info))
return;
@@ -2017,42 +2081,52 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
block_group = btrfs_lookup_block_group(fs_info, logical);
ASSERT(block_group);
- if (logical + length < block_group->start + block_group->zone_capacity)
- goto out;
-
- spin_lock(&block_group->lock);
+ /* No MIXED_BG on zoned btrfs. */
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+ min_alloc_bytes = fs_info->sectorsize;
+ else
+ min_alloc_bytes = fs_info->nodesize;
- if (!block_group->zone_is_active) {
- spin_unlock(&block_group->lock);
+ /* Bail out if we can allocate more data from this block group. */
+ if (logical + length + min_alloc_bytes <=
+ block_group->start + block_group->zone_capacity)
goto out;
- }
- block_group->zone_is_active = 0;
- /* We should have consumed all the free space */
- ASSERT(block_group->alloc_offset == block_group->zone_capacity);
- ASSERT(block_group->free_space_ctl->free_space == 0);
- btrfs_clear_treelog_bg(block_group);
- btrfs_clear_data_reloc_bg(block_group);
- spin_unlock(&block_group->lock);
+ do_zone_finish(block_group, true);
- map = block_group->physical_map;
- device = map->stripes[0].dev;
- physical = map->stripes[0].physical;
+out:
+ btrfs_put_block_group(block_group);
+}
- if (!device->zone_info->max_active_zones)
- goto out;
+static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
+{
+ struct btrfs_block_group *bg =
+ container_of(work, struct btrfs_block_group, zone_finish_work);
- btrfs_dev_clear_active_zone(device, physical);
+ wait_on_extent_buffer_writeback(bg->last_eb);
+ free_extent_buffer(bg->last_eb);
+ btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
+ btrfs_put_block_group(bg);
+}
- spin_lock(&fs_info->zone_active_bgs_lock);
- ASSERT(!list_empty(&block_group->active_bg_list));
- list_del_init(&block_group->active_bg_list);
- spin_unlock(&fs_info->zone_active_bgs_lock);
+void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb)
+{
+ if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
+ return;
- btrfs_put_block_group(block_group);
+ if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
+ btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
+ bg->start);
+ return;
+ }
-out:
- btrfs_put_block_group(block_group);
+ /* For the work */
+ btrfs_get_block_group(bg);
+ atomic_inc(&eb->refs);
+ bg->last_eb = eb;
+ INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
+ queue_work(system_unbound_wq, &bg->zone_finish_work);
}
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
@@ -2082,3 +2156,153 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
}
mutex_unlock(&fs_devices->device_list_mutex);
}
+
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 used = 0;
+ u64 total = 0;
+ u64 factor;
+
+ ASSERT(btrfs_is_zoned(fs_info));
+
+ if (fs_info->bg_reclaim_threshold == 0)
+ return false;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->bdev)
+ continue;
+
+ total += device->disk_total_bytes;
+ used += device->bytes_used;
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ factor = div64_u64(used * 100, total);
+ return factor >= fs_info->bg_reclaim_threshold;
+}
+
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length)
+{
+ struct btrfs_block_group *block_group;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ block_group = btrfs_lookup_block_group(fs_info, logical);
+ /* It should be called on a previous data relocation block group. */
+ ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
+
+ spin_lock(&block_group->lock);
+ if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
+ goto out;
+
+ /* All relocation extents are written. */
+ if (block_group->start + block_group->alloc_offset == logical + length) {
+ /* Now, release this block group for further allocations. */
+ clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+ &block_group->runtime_flags);
+ }
+
+out:
+ spin_unlock(&block_group->lock);
+ btrfs_put_block_group(block_group);
+}
+
+int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group *block_group;
+ struct btrfs_block_group *min_bg = NULL;
+ u64 min_avail = U64_MAX;
+ int ret;
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_for_each_entry(block_group, &fs_info->zone_active_bgs,
+ active_bg_list) {
+ u64 avail;
+
+ spin_lock(&block_group->lock);
+ if (block_group->reserved ||
+ (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ spin_unlock(&block_group->lock);
+ continue;
+ }
+
+ avail = block_group->zone_capacity - block_group->alloc_offset;
+ if (min_avail > avail) {
+ if (min_bg)
+ btrfs_put_block_group(min_bg);
+ min_bg = block_group;
+ min_avail = avail;
+ btrfs_get_block_group(min_bg);
+ }
+ spin_unlock(&block_group->lock);
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ if (!min_bg)
+ return 0;
+
+ ret = btrfs_zone_finish(min_bg);
+ btrfs_put_block_group(min_bg);
+
+ return ret < 0 ? ret : 1;
+}
+
+int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ bool do_finish)
+{
+ struct btrfs_block_group *bg;
+ int index;
+
+ if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
+ return 0;
+
+ /* No more block groups to activate */
+ if (space_info->active_total_bytes == space_info->total_bytes)
+ return 0;
+
+ for (;;) {
+ int ret;
+ bool need_finish = false;
+
+ down_read(&space_info->groups_sem);
+ for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
+ list_for_each_entry(bg, &space_info->block_groups[index],
+ list) {
+ if (!spin_trylock(&bg->lock))
+ continue;
+ if (btrfs_zoned_bg_is_full(bg) ||
+ test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+ &bg->runtime_flags)) {
+ spin_unlock(&bg->lock);
+ continue;
+ }
+ spin_unlock(&bg->lock);
+
+ if (btrfs_zone_activate(bg)) {
+ up_read(&space_info->groups_sem);
+ return 1;
+ }
+
+ need_finish = true;
+ }
+ }
+ up_read(&space_info->groups_sem);
+
+ if (!do_finish || !need_finish)
+ break;
+
+ ret = btrfs_zone_finish_one_bg(fs_info);
+ if (ret == 0)
+ break;
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
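
btrfs_zoned_should_reclaim() above sums usage across all devices and compares
the used percentage with bg_reclaim_threshold. A standalone restatement with a
worked case; the helper name is hypothetical and the zero-total guard is an
extra safety assumption, not part of the patch:

	#include <stdbool.h>
	#include <stdint.h>

	static bool should_reclaim(uint64_t used, uint64_t total,
				   uint64_t threshold)
	{
		if (threshold == 0 || total == 0)
			return false;
		/* the kernel uses div64_u64() for the division */
		return used * 100 / total >= threshold;
	}

	/*
	 * Example: 80GiB used of 100GiB total against the default
	 * threshold of 75 gives 80 >= 75, so reclaim should run.
	 */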
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index cbf016a7bb5d..e17462db3a84 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -10,11 +10,7 @@
#include "block-group.h"
#include "btrfs_inode.h"
-/*
- * Block groups with more than this value (percents) of unusable space will be
- * scheduled for background reclaim.
- */
-#define BTRFS_DEFAULT_RECLAIM_THRESH 75
+#define BTRFS_DEFAULT_RECLAIM_THRESH (75)
struct btrfs_zoned_device_info {
/*
@@ -23,6 +19,7 @@ struct btrfs_zoned_device_info {
*/
u64 zone_size;
u8 zone_size_shift;
+ u64 max_zone_append_size;
u32 nr_zones;
unsigned int max_active_zones;
atomic_t active_zones_left;
@@ -76,8 +73,16 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group);
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
+void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb);
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
+int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
+int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, bool do_finish);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -233,9 +238,34 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
u64 logical, u64 length) { }
+static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+ struct extent_buffer *eb) { }
+
static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
+
+static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ return false;
+}
+
+static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length) { }
+
+static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
+{
+ return 1;
+}
+
+static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ bool do_finish)
+{
+ /* Consider all the block groups to be active */
+ return 0;
+}
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
@@ -359,7 +389,7 @@ static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
struct btrfs_root *root = inode->root;
if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
- btrfs_inode_lock(&inode->vfs_inode, 0);
+ mutex_lock(&root->fs_info->zoned_data_reloc_io_lock);
}
static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
@@ -367,7 +397,13 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
struct btrfs_root *root = inode->root;
if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
- btrfs_inode_unlock(&inode->vfs_inode, 0);
+ mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
+}
+
+static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg)
+{
+ ASSERT(btrfs_is_zoned(bg->fs_info));
+ return (bg->alloc_offset == bg->zone_capacity);
}
#endif
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index fc42dd0badd7..35a0224d4eb7 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -93,22 +93,26 @@ static inline struct workspace *list_to_workspace(struct list_head *list)
void zstd_free_workspace(struct list_head *ws);
struct list_head *zstd_alloc_workspace(unsigned int level);
-/*
- * zstd_reclaim_timer_fn - reclaim timer
+
+/**
+ * Timer callback to free unused workspaces.
+ *
* @timer: timer
*
* This scans the lru_list and attempts to reclaim any workspace that hasn't
* been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
+ *
+ * The context is softirq and does not need the _bh locking primitives.
*/
static void zstd_reclaim_timer_fn(struct timer_list *timer)
{
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next;
- spin_lock_bh(&wsm.lock);
+ spin_lock(&wsm.lock);
if (list_empty(&wsm.lru_list)) {
- spin_unlock_bh(&wsm.lock);
+ spin_unlock(&wsm.lock);
return;
}
@@ -137,7 +141,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
if (!list_empty(&wsm.lru_list))
mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
- spin_unlock_bh(&wsm.lock);
+ spin_unlock(&wsm.lock);
}
/*
@@ -399,7 +403,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* map in the first page of input data */
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.src = kmap_local_page(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
@@ -411,7 +415,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
@@ -446,9 +450,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
if (workspace->out_buf.pos == workspace->out_buf.size) {
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
- out_page = NULL;
ret = -E2BIG;
goto out;
}
@@ -458,7 +460,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out,
PAGE_SIZE);
@@ -473,13 +475,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
/* Check if we need more input */
if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE;
- kunmap(in_page);
+ kunmap_local(workspace->in_buf.src);
put_page(in_page);
-
start += PAGE_SIZE;
len -= PAGE_SIZE;
in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- workspace->in_buf.src = kmap(in_page);
+ workspace->in_buf.src = kmap_local_page(in_page);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
}
@@ -506,9 +507,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
tot_out += PAGE_SIZE;
max_out -= PAGE_SIZE;
- kunmap(out_page);
if (nr_pages == nr_dest_pages) {
- out_page = NULL;
ret = -E2BIG;
goto out;
}
@@ -518,7 +517,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
goto out;
}
pages[nr_pages++] = out_page;
- workspace->out_buf.dst = kmap(out_page);
+ workspace->out_buf.dst = page_address(out_page);
workspace->out_buf.pos = 0;
workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
}
@@ -533,13 +532,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
*total_out = tot_out;
out:
*out_pages = nr_pages;
- /* Cleanup */
- if (in_page) {
- kunmap(in_page);
+ if (workspace->in_buf.src) {
+ kunmap_local(workspace->in_buf.src);
put_page(in_page);
}
- if (out_page)
- kunmap(out_page);
return ret;
}
@@ -563,7 +559,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
goto done;
}
- workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
@@ -599,14 +595,15 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
break;
if (workspace->in_buf.pos == workspace->in_buf.size) {
- kunmap(pages_in[page_in_index++]);
+ kunmap_local(workspace->in_buf.src);
+ page_in_index++;
if (page_in_index >= total_pages_in) {
workspace->in_buf.src = NULL;
ret = -EIO;
goto done;
}
srclen -= PAGE_SIZE;
- workspace->in_buf.src = kmap(pages_in[page_in_index]);
+ workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]);
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
}
@@ -615,7 +612,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
zero_fill_bio(cb->orig_bio);
done:
if (workspace->in_buf.src)
- kunmap(pages_in[page_in_index]);
+ kunmap_local(workspace->in_buf.src);
return ret;
}
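
The zstd hunks above (like the zlib hunk earlier) move from kmap()/kunmap() to
kmap_local_page()/kunmap_local(). A local mapping is released by the address it
returned rather than by page, which is why the code now unmaps
workspace->in_buf.src directly; the output pages are not highmem, so a plain
page_address() replaces their map/unmap pair entirely. A hypothetical helper
showing the shape of the conversion:

	static void copy_in_page(struct page *in_page, void *dst, size_t len)
	{
		void *src = kmap_local_page(in_page);	/* was: kmap(in_page) */

		memcpy(dst, src, len);
		kunmap_local(src);	/* unmap by address, not by page */
	}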
diff --git a/fs/buffer.c b/fs/buffer.c
index 2b5561ae5d0b..0a7ba84c1905 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -52,8 +52,8 @@
#include "internal.h"
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
-static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- struct writeback_control *wbc);
+static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+ struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -79,26 +79,26 @@ void unlock_buffer(struct buffer_head *bh)
EXPORT_SYMBOL(unlock_buffer);
/*
- * Returns if the page has dirty or writeback buffers. If all the buffers
- * are unlocked and clean then the PageDirty information is stale. If
- * any of the pages are locked, it is assumed they are locked for IO.
+ * Returns whether the folio has dirty or writeback buffers. If all the buffers
+ * are unlocked and clean then the folio_test_dirty information is stale. If
+ * any of the buffers are locked, it is assumed they are locked for IO.
*/
-void buffer_check_dirty_writeback(struct page *page,
+void buffer_check_dirty_writeback(struct folio *folio,
bool *dirty, bool *writeback)
{
struct buffer_head *head, *bh;
*dirty = false;
*writeback = false;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
- if (!page_has_buffers(page))
+ head = folio_buffers(folio);
+ if (!head)
return;
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
*writeback = true;
- head = page_buffers(page);
bh = head;
do {
if (buffer_locked(bh))
@@ -282,10 +282,10 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
/*
- * If none of the buffers had errors and they are all
- * uptodate then we can set the page uptodate.
+ * If all of the buffers are uptodate then we can set the page
+ * uptodate.
*/
- if (page_uptodate && !PageError(page))
+ if (page_uptodate)
SetPageUptodate(page);
unlock_page(page);
return;
@@ -314,7 +314,7 @@ static void decrypt_bh(struct work_struct *work)
}
/*
- * I/O completion handler for block_read_full_page() - pages
+ * I/O completion handler for block_read_full_folio() - pages
* which come unlocked at the end of I/O.
*/
static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
@@ -562,7 +562,7 @@ void write_boundary_block(struct block_device *bdev,
struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
if (bh) {
if (buffer_dirty(bh))
- ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
+ ll_rw_block(REQ_OP_WRITE, 1, &bh);
put_bh(bh);
}
}
@@ -955,7 +955,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
size);
goto done;
}
- if (!try_to_free_buffers(page))
+ if (!try_to_free_buffers(page_folio(page)))
goto failed;
}
@@ -1060,8 +1060,8 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* Also. When blockdev buffers are explicitly read with bread(), they
* individually become uptodate. But their backing page remains not
* uptodate - even if all of its buffers are uptodate. A subsequent
- * block_read_full_page() against that page will discover all the uptodate
- * buffers, will set the page uptodate and will perform no I/O.
+ * block_read_full_folio() against that folio will discover all the uptodate
+ * buffers, will set the folio uptodate and will perform no I/O.
*/
/**
@@ -1174,7 +1174,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh)
} else {
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, 0, bh);
+ submit_bh(REQ_OP_READ, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
@@ -1342,7 +1342,7 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
struct buffer_head *bh = __getblk(bdev, block, size);
if (likely(bh)) {
- ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
+ ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh);
brelse(bh);
}
}
@@ -1353,7 +1353,7 @@ void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
{
struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
if (likely(bh)) {
- ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
+ ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh);
brelse(bh);
}
}
@@ -1604,7 +1604,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
struct inode *bd_inode = bdev->bd_inode;
struct address_space *bd_mapping = bd_inode->i_mapping;
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
pgoff_t end;
int i, count;
@@ -1612,24 +1612,24 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
struct buffer_head *head;
end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
- pagevec_init(&pvec);
- while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
- count = pagevec_count(&pvec);
+ folio_batch_init(&fbatch);
+ while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
+ count = folio_batch_count(&fbatch);
for (i = 0; i < count; i++) {
- struct page *page = pvec.pages[i];
+ struct folio *folio = fbatch.folios[i];
- if (!page_has_buffers(page))
+ if (!folio_buffers(folio))
continue;
/*
- * We use page lock instead of bd_mapping->private_lock
+ * We use folio lock instead of bd_mapping->private_lock
* to pin buffers here since we can afford to sleep and
* it scales better than a global spinlock.
*/
- lock_page(page);
- /* Recheck when the page is locked which pins bhs */
- if (!page_has_buffers(page))
+ folio_lock(folio);
+ /* Recheck when the folio is locked which pins bhs */
+ head = folio_buffers(folio);
+ if (!head)
goto unlock_page;
- head = page_buffers(page);
bh = head;
do {
if (!buffer_mapped(bh) || (bh->b_blocknr < block))
@@ -1643,9 +1643,9 @@ next:
bh = bh->b_this_page;
} while (bh != head);
unlock_page:
- unlock_page(page);
+ folio_unlock(folio);
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
/* End of range already reached? */
if (index > end || !index)
@@ -1716,7 +1716,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
struct buffer_head *bh, *head;
unsigned int blocksize, bbits;
int nr_underway = 0;
- int write_flags = wbc_to_write_flags(wbc);
+ blk_opf_t write_flags = wbc_to_write_flags(wbc);
head = create_page_buffers(page, inode,
(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1804,7 +1804,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
+ submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
nr_underway++;
}
bh = next;
@@ -1858,7 +1858,7 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
+ submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
nr_underway++;
}
bh = next;
@@ -2033,7 +2033,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 1, &bh);
*wait_bh++=bh;
}
}
@@ -2088,7 +2088,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
/*
* If this is a partial write which happened to make all buffers
- * uptodate then we can optimize away a bogus readpage() for
+ * uptodate then we can optimize away a bogus read_folio() for
* the next read(). Here we 'discover' whether the page went
* uptodate as a result of this (potentially partial) write.
*/
@@ -2104,13 +2104,13 @@ static int __block_commit_write(struct inode *inode, struct page *page,
* The filesystem needs to handle block truncation upon failure.
*/
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
- unsigned flags, struct page **pagep, get_block_t *get_block)
+ struct page **pagep, get_block_t *get_block)
{
pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
int status;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
@@ -2137,12 +2137,12 @@ int block_write_end(struct file *file, struct address_space *mapping,
if (unlikely(copied < len)) {
/*
- * The buffers that were written will now be uptodate, so we
- * don't have to worry about a readpage reading them and
- * overwriting a partial write. However if we have encountered
- * a short write and only partially written into a buffer, it
- * will not be marked uptodate, so a readpage might come in and
- * destroy our partial write.
+ * The buffers that were written will now be uptodate, so
+ * we don't have to worry about a read_folio reading them
+ * and overwriting a partial write. However if we have
+ * encountered a short write and only partially written
+ * into a buffer, it will not be marked uptodate, so a
+ * read_folio might come in and destroy our partial write.
*
* Do the simplest thing, and just treat any short write to a
* non uptodate page as a zero-length write, and force the
@@ -2245,26 +2245,29 @@ bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
EXPORT_SYMBOL(block_is_partially_uptodate);
/*
- * Generic "read page" function for block devices that have the normal
+ * Generic "read_folio" function for block devices that have the normal
* get_block functionality. This is most of the block device filesystems.
- * Reads the page asynchronously --- the unlock_buffer() and
+ * Reads the folio asynchronously --- the unlock_buffer() and
* set/clear_buffer_uptodate() functions propagate buffer state into the
- * page struct once IO has completed.
+ * folio once IO has completed.
*/
-int block_read_full_page(struct page *page, get_block_t *get_block)
+int block_read_full_folio(struct folio *folio, get_block_t *get_block)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
sector_t iblock, lblock;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
unsigned int blocksize, bbits;
int nr, i;
int fully_mapped = 1;
+ bool page_error = false;
- head = create_page_buffers(page, inode, 0);
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
+
+ head = create_page_buffers(&folio->page, inode, 0);
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
- iblock = (sector_t)page->index << (PAGE_SHIFT - bbits);
+ iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits);
lblock = (i_size_read(inode)+blocksize-1) >> bbits;
bh = head;
nr = 0;
@@ -2281,11 +2284,14 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
if (iblock < lblock) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, iblock, bh, 0);
- if (err)
- SetPageError(page);
+ if (err) {
+ folio_set_error(folio);
+ page_error = true;
+ }
}
if (!buffer_mapped(bh)) {
- zero_user(page, i * blocksize, blocksize);
+ folio_zero_range(folio, i * blocksize,
+ blocksize);
if (!err)
set_buffer_uptodate(bh);
continue;
@@ -2301,16 +2307,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
} while (i++, iblock++, (bh = bh->b_this_page) != head);
if (fully_mapped)
- SetPageMappedToDisk(page);
+ folio_set_mappedtodisk(folio);
if (!nr) {
/*
- * All buffers are uptodate - we can set the page uptodate
+ * All buffers are uptodate - we can set the folio uptodate
* as well. But not if get_block() returned an error.
*/
- if (!PageError(page))
- SetPageUptodate(page);
- unlock_page(page);
+ if (!page_error)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
}
@@ -2331,11 +2337,11 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
if (buffer_uptodate(bh))
end_buffer_async_read(bh, 1);
else
- submit_bh(REQ_OP_READ, 0, bh);
+ submit_bh(REQ_OP_READ, bh);
}
return 0;
}
-EXPORT_SYMBOL(block_read_full_page);
+EXPORT_SYMBOL(block_read_full_folio);
/* utility function for filesystems that need to do work on expanding
* truncates. Uses filesystem pagecache writes to allow the filesystem to
@@ -2344,6 +2350,7 @@ EXPORT_SYMBOL(block_read_full_page);
int generic_cont_expand_simple(struct inode *inode, loff_t size)
{
struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
struct page *page;
void *fsdata;
int err;
@@ -2352,11 +2359,11 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
if (err)
goto out;
- err = pagecache_write_begin(NULL, mapping, size, 0, 0, &page, &fsdata);
+ err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata);
if (err)
goto out;
- err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
+ err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
BUG_ON(err > 0);
out:
@@ -2368,6 +2375,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
loff_t pos, loff_t *bytes)
{
struct inode *inode = mapping->host;
+ const struct address_space_operations *aops = mapping->a_ops;
unsigned int blocksize = i_blocksize(inode);
struct page *page;
void *fsdata;
@@ -2387,12 +2395,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = PAGE_SIZE - zerofrom;
- err = pagecache_write_begin(file, mapping, curpos, len, 0,
+ err = aops->write_begin(file, mapping, curpos, len,
&page, &fsdata);
if (err)
goto out;
zero_user(page, zerofrom, len);
- err = pagecache_write_end(file, mapping, curpos, len, len,
+ err = aops->write_end(file, mapping, curpos, len, len,
page, fsdata);
if (err < 0)
goto out;
@@ -2420,12 +2428,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = offset - zerofrom;
- err = pagecache_write_begin(file, mapping, curpos, len, 0,
+ err = aops->write_begin(file, mapping, curpos, len,
&page, &fsdata);
if (err)
goto out;
zero_user(page, zerofrom, len);
- err = pagecache_write_end(file, mapping, curpos, len, len,
+ err = aops->write_end(file, mapping, curpos, len, len,
page, fsdata);
if (err < 0)
goto out;
@@ -2441,7 +2449,7 @@ out:
* We may have to extend the file.
*/
int cont_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata,
get_block_t *get_block, loff_t *bytes)
{
@@ -2460,7 +2468,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
(*bytes)++;
}
- return block_write_begin(mapping, pos, len, flags, pagep, get_block);
+ return block_write_begin(mapping, pos, len, pagep, get_block);
}
EXPORT_SYMBOL(cont_write_begin);
@@ -2529,341 +2537,6 @@ out_unlock:
}
EXPORT_SYMBOL(block_page_mkwrite);
-/*
- * nobh_write_begin()'s prereads are special: the buffer_heads are freed
- * immediately, while under the page lock. So it needs a special end_io
- * handler which does not touch the bh after unlocking it.
- */
-static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
-{
- __end_buffer_read_notouch(bh, uptodate);
-}
-
-/*
- * Attach the singly-linked list of buffers created by nobh_write_begin, to
- * the page (converting it to circular linked list and taking care of page
- * dirty races).
- */
-static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
-{
- struct buffer_head *bh;
-
- BUG_ON(!PageLocked(page));
-
- spin_lock(&page->mapping->private_lock);
- bh = head;
- do {
- if (PageDirty(page))
- set_buffer_dirty(bh);
- if (!bh->b_this_page)
- bh->b_this_page = head;
- bh = bh->b_this_page;
- } while (bh != head);
- attach_page_private(page, head);
- spin_unlock(&page->mapping->private_lock);
-}
-
-/*
- * On entry, the page is fully not uptodate.
- * On exit the page is fully uptodate in the areas outside (from,to)
- * The filesystem needs to handle block truncation upon failure.
- */
-int nobh_write_begin(struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata,
- get_block_t *get_block)
-{
- struct inode *inode = mapping->host;
- const unsigned blkbits = inode->i_blkbits;
- const unsigned blocksize = 1 << blkbits;
- struct buffer_head *head, *bh;
- struct page *page;
- pgoff_t index;
- unsigned from, to;
- unsigned block_in_page;
- unsigned block_start, block_end;
- sector_t block_in_file;
- int nr_reads = 0;
- int ret = 0;
- int is_mapped_to_disk = 1;
-
- index = pos >> PAGE_SHIFT;
- from = pos & (PAGE_SIZE - 1);
- to = from + len;
-
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- return -ENOMEM;
- *pagep = page;
- *fsdata = NULL;
-
- if (page_has_buffers(page)) {
- ret = __block_write_begin(page, pos, len, get_block);
- if (unlikely(ret))
- goto out_release;
- return ret;
- }
-
- if (PageMappedToDisk(page))
- return 0;
-
- /*
- * Allocate buffers so that we can keep track of state, and potentially
- * attach them to the page if an error occurs. In the common case of
- * no error, they will just be freed again without ever being attached
- * to the page (which is all OK, because we're under the page lock).
- *
- * Be careful: the buffer linked list is a NULL terminated one, rather
- * than the circular one we're used to.
- */
- head = alloc_page_buffers(page, blocksize, false);
- if (!head) {
- ret = -ENOMEM;
- goto out_release;
- }
-
- block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
-
- /*
- * We loop across all blocks in the page, whether or not they are
- * part of the affected region. This is so we can discover if the
- * page is fully mapped-to-disk.
- */
- for (block_start = 0, block_in_page = 0, bh = head;
- block_start < PAGE_SIZE;
- block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
- int create;
-
- block_end = block_start + blocksize;
- bh->b_state = 0;
- create = 1;
- if (block_start >= to)
- create = 0;
- ret = get_block(inode, block_in_file + block_in_page,
- bh, create);
- if (ret)
- goto failed;
- if (!buffer_mapped(bh))
- is_mapped_to_disk = 0;
- if (buffer_new(bh))
- clean_bdev_bh_alias(bh);
- if (PageUptodate(page)) {
- set_buffer_uptodate(bh);
- continue;
- }
- if (buffer_new(bh) || !buffer_mapped(bh)) {
- zero_user_segments(page, block_start, from,
- to, block_end);
- continue;
- }
- if (buffer_uptodate(bh))
- continue; /* reiserfs does this */
- if (block_start < from || block_end > to) {
- lock_buffer(bh);
- bh->b_end_io = end_buffer_read_nobh;
- submit_bh(REQ_OP_READ, 0, bh);
- nr_reads++;
- }
- }
-
- if (nr_reads) {
- /*
- * The page is locked, so these buffers are protected from
- * any VM or truncate activity. Hence we don't need to care
- * for the buffer_head refcounts.
- */
- for (bh = head; bh; bh = bh->b_this_page) {
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
- ret = -EIO;
- }
- if (ret)
- goto failed;
- }
-
- if (is_mapped_to_disk)
- SetPageMappedToDisk(page);
-
- *fsdata = head; /* to be released by nobh_write_end */
-
- return 0;
-
-failed:
- BUG_ON(!ret);
- /*
- * Error recovery is a bit difficult. We need to zero out blocks that
- * were newly allocated, and dirty them to ensure they get written out.
- * Buffers need to be attached to the page at this point, otherwise
- * the handling of potential IO errors during writeout would be hard
- * (could try doing synchronous writeout, but what if that fails too?)
- */
- attach_nobh_buffers(page, head);
- page_zero_new_buffers(page, from, to);
-
-out_release:
- unlock_page(page);
- put_page(page);
- *pagep = NULL;
-
- return ret;
-}
-EXPORT_SYMBOL(nobh_write_begin);
-
-int nobh_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = page->mapping->host;
- struct buffer_head *head = fsdata;
- struct buffer_head *bh;
- BUG_ON(fsdata != NULL && page_has_buffers(page));
-
- if (unlikely(copied < len) && head)
- attach_nobh_buffers(page, head);
- if (page_has_buffers(page))
- return generic_write_end(file, mapping, pos, len,
- copied, page, fsdata);
-
- SetPageUptodate(page);
- set_page_dirty(page);
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
- }
-
- unlock_page(page);
- put_page(page);
-
- while (head) {
- bh = head;
- head = head->b_this_page;
- free_buffer_head(bh);
- }
-
- return copied;
-}
-EXPORT_SYMBOL(nobh_write_end);
-
-/*
- * nobh_writepage() - based on block_full_write_page() except
- * that it tries to operate without attaching bufferheads to
- * the page.
- */
-int nobh_writepage(struct page *page, get_block_t *get_block,
- struct writeback_control *wbc)
-{
- struct inode * const inode = page->mapping->host;
- loff_t i_size = i_size_read(inode);
- const pgoff_t end_index = i_size >> PAGE_SHIFT;
- unsigned offset;
- int ret;
-
- /* Is the page fully inside i_size? */
- if (page->index < end_index)
- goto out;
-
- /* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_SIZE-1);
- if (page->index >= end_index+1 || !offset) {
- unlock_page(page);
- return 0; /* don't care */
- }
-
- /*
- * The page straddles i_size. It must be zeroed out on each and every
- * writepage invocation because it may be mmapped. "A file is mapped
- * in multiples of the page size. For a file that is not a multiple of
- * the page size, the remaining memory is zeroed when mapped, and
- * writes to that region are not written out to the file."
- */
- zero_user_segment(page, offset, PAGE_SIZE);
-out:
- ret = mpage_writepage(page, get_block, wbc);
- if (ret == -EAGAIN)
- ret = __block_write_full_page(inode, page, get_block, wbc,
- end_buffer_async_write);
- return ret;
-}
-EXPORT_SYMBOL(nobh_writepage);
-
-int nobh_truncate_page(struct address_space *mapping,
- loff_t from, get_block_t *get_block)
-{
- pgoff_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (PAGE_SIZE-1);
- unsigned blocksize;
- sector_t iblock;
- unsigned length, pos;
- struct inode *inode = mapping->host;
- struct page *page;
- struct buffer_head map_bh;
- int err;
-
- blocksize = i_blocksize(inode);
- length = offset & (blocksize - 1);
-
- /* Block boundary? Nothing to do */
- if (!length)
- return 0;
-
- length = blocksize - length;
- iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
-
- page = grab_cache_page(mapping, index);
- err = -ENOMEM;
- if (!page)
- goto out;
-
- if (page_has_buffers(page)) {
-has_buffers:
- unlock_page(page);
- put_page(page);
- return block_truncate_page(mapping, from, get_block);
- }
-
- /* Find the buffer that contains "offset" */
- pos = blocksize;
- while (offset >= pos) {
- iblock++;
- pos += blocksize;
- }
-
- map_bh.b_size = blocksize;
- map_bh.b_state = 0;
- err = get_block(inode, iblock, &map_bh, 0);
- if (err)
- goto unlock;
- /* unmapped? It's a hole - nothing to do */
- if (!buffer_mapped(&map_bh))
- goto unlock;
-
- /* Ok, it's mapped. Make sure it's up-to-date */
- if (!PageUptodate(page)) {
- err = mapping->a_ops->readpage(NULL, page);
- if (err) {
- put_page(page);
- goto out;
- }
- lock_page(page);
- if (!PageUptodate(page)) {
- err = -EIO;
- goto unlock;
- }
- if (page_has_buffers(page))
- goto has_buffers;
- }
- zero_user(page, offset, length);
- set_page_dirty(page);
- err = 0;
-
-unlock:
- unlock_page(page);
- put_page(page);
-out:
- return err;
-}
-EXPORT_SYMBOL(nobh_truncate_page);
-
int block_truncate_page(struct address_space *mapping,
loff_t from, get_block_t *get_block)
{
@@ -2921,7 +2594,7 @@ int block_truncate_page(struct address_space *mapping,
if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
err = -EIO;
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 1, &bh);
wait_on_buffer(bh);
/* Uhhuh. Read error. Complain and punt. */
if (!buffer_uptodate(bh))
@@ -3000,9 +2673,10 @@ static void end_bio_bh_io_sync(struct bio *bio)
bio_put(bio);
}
-static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- struct writeback_control *wbc)
+static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+ struct writeback_control *wbc)
{
+ const enum req_op op = opf & REQ_OP_MASK;
struct bio *bio;
BUG_ON(!buffer_locked(bh));
@@ -3018,11 +2692,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
clear_buffer_write_io_error(bh);
if (buffer_meta(bh))
- op_flags |= REQ_META;
+ opf |= REQ_META;
if (buffer_prio(bh))
- op_flags |= REQ_PRIO;
+ opf |= REQ_PRIO;
- bio = bio_alloc(bh->b_bdev, 1, op | op_flags, GFP_NOIO);
+ bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
@@ -3043,25 +2717,23 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
}
submit_bio(bio);
- return 0;
}
-int submit_bh(int op, int op_flags, struct buffer_head *bh)
+void submit_bh(blk_opf_t opf, struct buffer_head *bh)
{
- return submit_bh_wbc(op, op_flags, bh, NULL);
+ submit_bh_wbc(opf, bh, NULL);
}
EXPORT_SYMBOL(submit_bh);
/**
* ll_rw_block: low-level access to block devices (DEPRECATED)
- * @op: whether to %READ or %WRITE
- * @op_flags: req_flag_bits
+ * @opf: block layer request operation and flags.
* @nr: number of &struct buffer_heads in the array
* @bhs: array of pointers to &struct buffer_head
*
* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
* requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
- * @op_flags contains flags modifying the detailed I/O behavior, most notably
+ * @opf contains flags modifying the detailed I/O behavior, most notably
* %REQ_RAHEAD.
*
* This function drops any buffer that it cannot get a lock on (with the
@@ -3078,8 +2750,9 @@ EXPORT_SYMBOL(submit_bh);
* All of the buffers must be for the same device, and must also be a
* multiple of the current approved size for the device.
*/
-void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[])
+void ll_rw_block(const blk_opf_t opf, int nr, struct buffer_head *bhs[])
{
+ const enum req_op op = opf & REQ_OP_MASK;
int i;
for (i = 0; i < nr; i++) {
@@ -3087,18 +2760,18 @@ void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[])
if (!trylock_buffer(bh))
continue;
- if (op == WRITE) {
+ if (op == REQ_OP_WRITE) {
if (test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(op, op_flags, bh);
+ submit_bh(opf, bh);
continue;
}
} else {
if (!buffer_uptodate(bh)) {
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
- submit_bh(op, op_flags, bh);
+ submit_bh(opf, bh);
continue;
}
}
@@ -3107,7 +2780,7 @@ void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[])
}
EXPORT_SYMBOL(ll_rw_block);
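For callers, the net effect of this refactor is that the request operation and its modifier flags are merged into a single blk_opf_t argument, and submit_bh() no longer returns a value. A hedged before/after sketch of typical call sites (illustrative only; bh stands for any locked buffer_head the caller holds a reference on):

        /* Before: operation and flags were separate int arguments. */
        submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
        ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);

        /* After: one blk_opf_t carries both; callees recover the bare
         * operation with REQ_OP_MASK, as submit_bh_wbc() and
         * ll_rw_block() now do internally. */
        submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
        ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, &bh);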
-void write_dirty_buffer(struct buffer_head *bh, int op_flags)
+void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
{
lock_buffer(bh);
if (!test_clear_buffer_dirty(bh)) {
@@ -3116,7 +2789,7 @@ void write_dirty_buffer(struct buffer_head *bh, int op_flags)
}
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(REQ_OP_WRITE, op_flags, bh);
+ submit_bh(REQ_OP_WRITE | op_flags, bh);
}
EXPORT_SYMBOL(write_dirty_buffer);
@@ -3125,10 +2798,8 @@ EXPORT_SYMBOL(write_dirty_buffer);
* and then start new I/O and then wait upon it. The caller must have a ref on
* the buffer_head.
*/
-int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
+int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
{
- int ret = 0;
-
WARN_ON(atomic_read(&bh->b_count) < 1);
lock_buffer(bh);
if (test_clear_buffer_dirty(bh)) {
@@ -3143,14 +2814,14 @@ int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
+ submit_bh(REQ_OP_WRITE | op_flags, bh);
wait_on_buffer(bh);
- if (!ret && !buffer_uptodate(bh))
- ret = -EIO;
+ if (!buffer_uptodate(bh))
+ return -EIO;
} else {
unlock_buffer(bh);
}
- return ret;
+ return 0;
}
EXPORT_SYMBOL(__sync_dirty_buffer);
@@ -3161,20 +2832,20 @@ int sync_dirty_buffer(struct buffer_head *bh)
EXPORT_SYMBOL(sync_dirty_buffer);
/*
- * try_to_free_buffers() checks if all the buffers on this particular page
+ * try_to_free_buffers() checks if all the buffers on this particular folio
* are unused, and releases them if so.
*
* Exclusion against try_to_free_buffers may be obtained by either
- * locking the page or by holding its mapping's private_lock.
+ * locking the folio or by holding its mapping's private_lock.
*
- * If the page is dirty but all the buffers are clean then we need to
- * be sure to mark the page clean as well. This is because the page
+ * If the folio is dirty but all the buffers are clean then we need to
+ * be sure to mark the folio clean as well. This is because the folio
* may be against a block device, and a later reattachment of buffers
- * to a dirty page will set *all* buffers dirty. Which would corrupt
+ * to a dirty folio will set *all* buffers dirty. Which would corrupt
* filesystem data on the same device.
*
- * The same applies to regular filesystem pages: if all the buffers are
- * clean then we set the page clean and proceed. To do that, we require
+ * The same applies to regular filesystem folios: if all the buffers are
+ * clean then we set the folio clean and proceed. To do that, we require
* total exclusion from block_dirty_folio(). That is obtained with
* private_lock.
*
@@ -3186,10 +2857,10 @@ static inline int buffer_busy(struct buffer_head *bh)
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
-static int
-drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
+static bool
+drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
{
- struct buffer_head *head = page_buffers(page);
+ struct buffer_head *head = folio_buffers(folio);
struct buffer_head *bh;
bh = head;
@@ -3207,46 +2878,46 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
bh = next;
} while (bh != head);
*buffers_to_free = head;
- detach_page_private(page);
- return 1;
+ folio_detach_private(folio);
+ return true;
failed:
- return 0;
+ return false;
}
-int try_to_free_buffers(struct page *page)
+bool try_to_free_buffers(struct folio *folio)
{
- struct address_space * const mapping = page->mapping;
+ struct address_space * const mapping = folio->mapping;
struct buffer_head *buffers_to_free = NULL;
- int ret = 0;
+ bool ret = false;
- BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
- return 0;
+ BUG_ON(!folio_test_locked(folio));
+ if (folio_test_writeback(folio))
+ return false;
if (mapping == NULL) { /* can this still happen? */
- ret = drop_buffers(page, &buffers_to_free);
+ ret = drop_buffers(folio, &buffers_to_free);
goto out;
}
spin_lock(&mapping->private_lock);
- ret = drop_buffers(page, &buffers_to_free);
+ ret = drop_buffers(folio, &buffers_to_free);
/*
* If the filesystem writes its buffers by hand (eg ext3)
- * then we can have clean buffers against a dirty page. We
- * clean the page here; otherwise the VM will never notice
+ * then we can have clean buffers against a dirty folio. We
+ * clean the folio here; otherwise the VM will never notice
* that the filesystem did any IO at all.
*
* Also, during truncate, discard_buffer will have marked all
- * the page's buffers clean. We discover that here and clean
- * the page also.
+ * the folio's buffers clean. We discover that here and clean
+ * the folio also.
*
* private_lock must be held over this entire operation in order
* to synchronise against block_dirty_folio and prevent the
* dirty bit from being lost.
*/
if (ret)
- cancel_dirty_page(page);
+ folio_cancel_dirty(folio);
spin_unlock(&mapping->private_lock);
out:
if (buffers_to_free) {
@@ -3371,7 +3042,7 @@ int bh_submit_read(struct buffer_head *bh)
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, 0, bh);
+ submit_bh(REQ_OP_READ, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return 0;
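The try_to_free_buffers() conversion above changes only the types (folio in, bool out), not the locking contract spelled out in its comment. A minimal, hypothetical caller sketch (example_release() is not a real kernel function):

        static bool example_release(struct folio *folio)
        {
                bool freed;

                folio_lock(folio);      /* required: BUG_ON(!folio_test_locked()) */
                /* try_to_free_buffers() itself returns false if the folio
                 * is under writeback or any buffer is dirty or locked. */
                freed = try_to_free_buffers(folio);
                folio_unlock(folio);
                return freed;
        }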
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
index 719faeeda168..8df715640a48 100644
--- a/fs/cachefiles/Kconfig
+++ b/fs/cachefiles/Kconfig
@@ -26,3 +26,15 @@ config CACHEFILES_ERROR_INJECTION
help
This permits error injection to be enabled in cachefiles whilst a
cache is in service.
+
+config CACHEFILES_ONDEMAND
+ bool "Support for on-demand read"
+ depends on CACHEFILES
+ default n
+ help
+ This permits userspace to enable the cachefiles on-demand read mode.
+ In this mode, when a cache miss occurs, responsibility for fetching
+ the data lies with the cachefiles backend instead of with the netfs
+ and is delegated to userspace.
+
+ If unsure, say N.
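For context, the daemon opts a cache instance into this mode with the extended "bind" command handled in the daemon.c hunk further below. A hedged user-space sketch, assuming the conventional /dev/cachefiles control device; the directory and tag values are illustrative:

        #include <fcntl.h>
        #include <unistd.h>

        /* Sketch only: error handling elided. */
        static int bind_cache_ondemand(void)
        {
                int fd = open("/dev/cachefiles", O_RDWR);

                if (fd < 0)
                        return -1;
                write(fd, "dir /var/cache/fscache", 22);
                write(fd, "tag mycache", 11);
                write(fd, "bind ondemand", 13);  /* sets CACHEFILES_ONDEMAND_MODE */
                return fd;  /* keep it open; releasing it unbinds the cache */
        }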
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
index 16d811f1a2fa..c37a7a9af10b 100644
--- a/fs/cachefiles/Makefile
+++ b/fs/cachefiles/Makefile
@@ -16,5 +16,6 @@ cachefiles-y := \
xattr.o
cachefiles-$(CONFIG_CACHEFILES_ERROR_INJECTION) += error_inject.o
+cachefiles-$(CONFIG_CACHEFILES_ONDEMAND) += ondemand.o
obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 7ac04ee2c0a0..aa4efcabb5e3 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -75,6 +75,9 @@ static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
{ "inuse", cachefiles_daemon_inuse },
{ "secctx", cachefiles_daemon_secctx },
{ "tag", cachefiles_daemon_tag },
+#ifdef CONFIG_CACHEFILES_ONDEMAND
+ { "copen", cachefiles_ondemand_copen },
+#endif
{ "", NULL }
};
@@ -108,6 +111,9 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file)
INIT_LIST_HEAD(&cache->volumes);
INIT_LIST_HEAD(&cache->object_list);
spin_lock_init(&cache->object_list_lock);
+ refcount_set(&cache->unbind_pincount, 1);
+ xa_init_flags(&cache->reqs, XA_FLAGS_ALLOC);
+ xa_init_flags(&cache->ondemand_ids, XA_FLAGS_ALLOC1);
/* set default caching limits
* - limit at 1% free space and/or free files
@@ -126,6 +132,53 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file)
return 0;
}
+static void cachefiles_flush_reqs(struct cachefiles_cache *cache)
+{
+ struct xarray *xa = &cache->reqs;
+ struct cachefiles_req *req;
+ unsigned long index;
+
+ /*
+ * Make sure the following two operations won't be reordered.
+ * 1) set CACHEFILES_DEAD bit
+ * 2) flush requests in the xarray
+ * Otherwise a request may be enqueued after the xarray has been
+ * flushed, leaving an orphan request that is never completed.
+ *
+ * CPU 1 CPU 2
+ * ===== =====
+ * flush requests in the xarray
+ * test CACHEFILES_DEAD bit
+ * enqueue the request
+ * set CACHEFILES_DEAD bit
+ */
+ smp_mb();
+
+ xa_lock(xa);
+ xa_for_each(xa, index, req) {
+ req->error = -EIO;
+ complete(&req->done);
+ }
+ xa_unlock(xa);
+
+ xa_destroy(&cache->reqs);
+ xa_destroy(&cache->ondemand_ids);
+}
+
+void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache)
+{
+ if (refcount_dec_and_test(&cache->unbind_pincount)) {
+ cachefiles_daemon_unbind(cache);
+ cachefiles_open = 0;
+ kfree(cache);
+ }
+}
+
+void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache)
+{
+ refcount_inc(&cache->unbind_pincount);
+}
+
/*
* Release a cache.
*/
@@ -139,36 +192,27 @@ static int cachefiles_daemon_release(struct inode *inode, struct file *file)
set_bit(CACHEFILES_DEAD, &cache->flags);
- cachefiles_daemon_unbind(cache);
+ if (cachefiles_in_ondemand_mode(cache))
+ cachefiles_flush_reqs(cache);
/* clean up the control file interface */
cache->cachefilesd = NULL;
file->private_data = NULL;
- cachefiles_open = 0;
- kfree(cache);
+ cachefiles_put_unbind_pincount(cache);
_leave("");
return 0;
}
-/*
- * Read the cache state.
- */
-static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
- size_t buflen, loff_t *pos)
+static ssize_t cachefiles_do_daemon_read(struct cachefiles_cache *cache,
+ char __user *_buffer, size_t buflen)
{
- struct cachefiles_cache *cache = file->private_data;
unsigned long long b_released;
unsigned f_released;
char buffer[256];
int n;
- //_enter(",,%zu,", buflen);
-
- if (!test_bit(CACHEFILES_READY, &cache->flags))
- return 0;
-
/* check how much space the cache has */
cachefiles_has_space(cache, 0, 0, cachefiles_has_space_check);
@@ -207,6 +251,25 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
}
/*
+ * Read the cache state.
+ */
+static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
+ size_t buflen, loff_t *pos)
+{
+ struct cachefiles_cache *cache = file->private_data;
+
+ //_enter(",,%zu,", buflen);
+
+ if (!test_bit(CACHEFILES_READY, &cache->flags))
+ return 0;
+
+ if (cachefiles_in_ondemand_mode(cache))
+ return cachefiles_ondemand_daemon_read(cache, _buffer, buflen);
+ else
+ return cachefiles_do_daemon_read(cache, _buffer, buflen);
+}
+
+/*
* Take a command from cachefilesd, parse it and act on it.
*/
static ssize_t cachefiles_daemon_write(struct file *file,
@@ -297,8 +360,13 @@ static __poll_t cachefiles_daemon_poll(struct file *file,
poll_wait(file, &cache->daemon_pollwq, poll);
mask = 0;
- if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
- mask |= EPOLLIN;
+ if (cachefiles_in_ondemand_mode(cache)) {
+ if (!xa_empty(&cache->reqs))
+ mask |= EPOLLIN;
+ } else {
+ if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
+ mask |= EPOLLIN;
+ }
if (test_bit(CACHEFILES_CULLING, &cache->flags))
mask |= EPOLLOUT;
@@ -687,11 +755,6 @@ static int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
cache->brun_percent >= 100)
return -ERANGE;
- if (*args) {
- pr_err("'bind' command doesn't take an argument\n");
- return -EINVAL;
- }
-
if (!cache->rootdirname) {
pr_err("No cache directory specified\n");
return -EINVAL;
@@ -703,6 +766,18 @@ static int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
return -EBUSY;
}
+ if (IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND)) {
+ if (!strcmp(args, "ondemand")) {
+ set_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags);
+ } else if (*args) {
+ pr_err("Invalid argument to the 'bind' command\n");
+ return -EINVAL;
+ }
+ } else if (*args) {
+ pr_err("'bind' command doesn't take an argument\n");
+ return -EINVAL;
+ }
+
/* Make sure we have copies of the tag string */
if (!cache->tag) {
/*
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index ae93cee9d25d..a69073a1d3f0 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -362,6 +362,8 @@ static void cachefiles_withdraw_cookie(struct fscache_cookie *cookie)
spin_unlock(&cache->object_list_lock);
}
+ cachefiles_ondemand_clean_object(object);
+
if (object->file) {
cachefiles_begin_secure(cache, &saved_cred);
cachefiles_clean_up_object(object, cache);
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index c793d33b0224..2ad58c465208 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -15,6 +15,8 @@
#include <linux/fscache-cache.h>
#include <linux/cred.h>
#include <linux/security.h>
+#include <linux/xarray.h>
+#include <linux/cachefiles.h>
#define CACHEFILES_DIO_BLOCK_SIZE 4096
@@ -58,8 +60,13 @@ struct cachefiles_object {
enum cachefiles_content content_info:8; /* Info about content presence */
unsigned long flags;
#define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */
+#ifdef CONFIG_CACHEFILES_ONDEMAND
+ int ondemand_id;
+#endif
};
+#define CACHEFILES_ONDEMAND_ID_CLOSED -1
+
/*
* Cache files cache definition
*/
@@ -98,11 +105,32 @@ struct cachefiles_cache {
#define CACHEFILES_DEAD 1 /* T if cache dead */
#define CACHEFILES_CULLING 2 /* T if cull engaged */
#define CACHEFILES_STATE_CHANGED 3 /* T if state changed (poll trigger) */
+#define CACHEFILES_ONDEMAND_MODE 4 /* T if in on-demand read mode */
char *rootdirname; /* name of cache root directory */
char *secctx; /* LSM security context */
char *tag; /* cache binding tag */
+ refcount_t unbind_pincount;/* refcount to do daemon unbind */
+ struct xarray reqs; /* xarray of pending on-demand requests */
+ unsigned long req_id_next;
+ struct xarray ondemand_ids; /* xarray for ondemand_id allocation */
+ u32 ondemand_id_next;
+};
+
+static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache)
+{
+ return IS_ENABLED(CONFIG_CACHEFILES_ONDEMAND) &&
+ test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags);
+}
+
+struct cachefiles_req {
+ struct cachefiles_object *object;
+ struct completion done;
+ int error;
+ struct cachefiles_msg msg;
};
+#define CACHEFILES_REQ_NEW XA_MARK_1
+
#include <trace/events/cachefiles.h>
static inline
@@ -145,6 +173,8 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache,
* daemon.c
*/
extern const struct file_operations cachefiles_daemon_fops;
+extern void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache);
+extern void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache);
/*
* error_inject.c
@@ -201,6 +231,16 @@ extern void cachefiles_put_object(struct cachefiles_object *object,
*/
extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres,
enum fscache_want_state want_state);
+extern int __cachefiles_prepare_write(struct cachefiles_object *object,
+ struct file *file,
+ loff_t *_start, size_t *_len,
+ bool no_space_allocated_yet);
+extern int __cachefiles_write(struct cachefiles_object *object,
+ struct file *file,
+ loff_t start_pos,
+ struct iov_iter *iter,
+ netfs_io_terminated_t term_func,
+ void *term_func_priv);
/*
* key.c
@@ -241,6 +281,45 @@ extern bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
struct cachefiles_object *object);
/*
+ * ondemand.c
+ */
+#ifdef CONFIG_CACHEFILES_ONDEMAND
+extern ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
+ char __user *_buffer, size_t buflen);
+
+extern int cachefiles_ondemand_copen(struct cachefiles_cache *cache,
+ char *args);
+
+extern int cachefiles_ondemand_init_object(struct cachefiles_object *object);
+extern void cachefiles_ondemand_clean_object(struct cachefiles_object *object);
+
+extern int cachefiles_ondemand_read(struct cachefiles_object *object,
+ loff_t pos, size_t len);
+
+#else
+static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
+ char __user *_buffer, size_t buflen)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int cachefiles_ondemand_init_object(struct cachefiles_object *object)
+{
+ return 0;
+}
+
+static inline void cachefiles_ondemand_clean_object(struct cachefiles_object *object)
+{
+}
+
+static inline int cachefiles_ondemand_read(struct cachefiles_object *object,
+ loff_t pos, size_t len)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+/*
* security.c
*/
extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 9dc81e781f2b..000a28f46e59 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -277,36 +277,33 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret)
/*
* Initiate a write to the cache.
*/
-static int cachefiles_write(struct netfs_cache_resources *cres,
- loff_t start_pos,
- struct iov_iter *iter,
- netfs_io_terminated_t term_func,
- void *term_func_priv)
+int __cachefiles_write(struct cachefiles_object *object,
+ struct file *file,
+ loff_t start_pos,
+ struct iov_iter *iter,
+ netfs_io_terminated_t term_func,
+ void *term_func_priv)
{
- struct cachefiles_object *object;
struct cachefiles_cache *cache;
struct cachefiles_kiocb *ki;
struct inode *inode;
- struct file *file;
unsigned int old_nofs;
- ssize_t ret = -ENOBUFS;
+ ssize_t ret;
size_t len = iov_iter_count(iter);
- if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))
- goto presubmission_error;
fscache_count_write();
- object = cachefiles_cres_object(cres);
cache = object->volume->cache;
- file = cachefiles_cres_file(cres);
_enter("%pD,%li,%llx,%zx/%llx",
file, file_inode(file)->i_ino, start_pos, len,
i_size_read(file_inode(file)));
- ret = -ENOMEM;
ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
- if (!ki)
- goto presubmission_error;
+ if (!ki) {
+ if (term_func)
+ term_func(term_func_priv, -ENOMEM, false);
+ return -ENOMEM;
+ }
refcount_set(&ki->ki_refcnt, 2);
ki->iocb.ki_filp = file;
@@ -314,7 +311,6 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE;
ki->iocb.ki_ioprio = get_current_ioprio();
ki->object = object;
- ki->inval_counter = cres->inval_counter;
ki->start = start_pos;
ki->len = len;
ki->term_func = term_func;
@@ -369,11 +365,24 @@ in_progress:
cachefiles_put_kiocb(ki);
_leave(" = %zd", ret);
return ret;
+}
-presubmission_error:
- if (term_func)
- term_func(term_func_priv, ret, false);
- return ret;
+static int cachefiles_write(struct netfs_cache_resources *cres,
+ loff_t start_pos,
+ struct iov_iter *iter,
+ netfs_io_terminated_t term_func,
+ void *term_func_priv)
+{
+ if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) {
+ if (term_func)
+ term_func(term_func_priv, -ENOBUFS, false);
+ return -ENOBUFS;
+ }
+
+ return __cachefiles_write(cachefiles_cres_object(cres),
+ cachefiles_cres_file(cres),
+ start_pos, iter,
+ term_func, term_func_priv);
}
/*
@@ -394,6 +403,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER;
loff_t off, to;
ino_t ino = file ? file_inode(file)->i_ino : 0;
+ int rc;
_enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size);
@@ -406,7 +416,8 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) {
__set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
why = cachefiles_trace_read_no_data;
- goto out_no_object;
+ if (!test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags))
+ goto out_no_object;
}
/* The object and the file may be being created in the background. */
@@ -423,7 +434,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
object = cachefiles_cres_object(cres);
cache = object->volume->cache;
cachefiles_begin_secure(cache, &saved_cred);
-
+retry:
off = cachefiles_inject_read_error();
if (off == 0)
off = vfs_llseek(file, subreq->start, SEEK_DATA);
@@ -474,6 +485,15 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
download_and_store:
__set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+ if (test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) {
+ rc = cachefiles_ondemand_read(object, subreq->start,
+ subreq->len);
+ if (!rc) {
+ __clear_bit(NETFS_SREQ_ONDEMAND, &subreq->flags);
+ goto retry;
+ }
+ ret = NETFS_INVALID_READ;
+ }
out:
cachefiles_end_secure(cache, saved_cred);
out_no_object:
@@ -484,13 +504,12 @@ out_no_object:
/*
* Prepare for a write to occur.
*/
-static int __cachefiles_prepare_write(struct netfs_cache_resources *cres,
- loff_t *_start, size_t *_len, loff_t i_size,
- bool no_space_allocated_yet)
+int __cachefiles_prepare_write(struct cachefiles_object *object,
+ struct file *file,
+ loff_t *_start, size_t *_len,
+ bool no_space_allocated_yet)
{
- struct cachefiles_object *object = cachefiles_cres_object(cres);
struct cachefiles_cache *cache = object->volume->cache;
- struct file *file = cachefiles_cres_file(cres);
loff_t start = *_start, pos;
size_t len = *_len, down;
int ret;
@@ -577,7 +596,8 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
}
cachefiles_begin_secure(cache, &saved_cred);
- ret = __cachefiles_prepare_write(cres, _start, _len, i_size,
+ ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres),
+ _start, _len,
no_space_allocated_yet);
cachefiles_end_secure(cache, saved_cred);
return ret;
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index f256c8aff7bb..facf2ebe464b 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -57,6 +57,16 @@ static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object,
trace_cachefiles_mark_inactive(object, inode);
}
+static void cachefiles_do_unmark_inode_in_use(struct cachefiles_object *object,
+ struct dentry *dentry)
+{
+ struct inode *inode = d_backing_inode(dentry);
+
+ inode_lock(inode);
+ __cachefiles_unmark_inode_in_use(object, dentry);
+ inode_unlock(inode);
+}
+
/*
* Unmark a backing inode and tell cachefilesd that there's something that can
* be culled.
@@ -68,9 +78,7 @@ void cachefiles_unmark_inode_in_use(struct cachefiles_object *object,
struct inode *inode = file_inode(file);
if (inode) {
- inode_lock(inode);
- __cachefiles_unmark_inode_in_use(object, file->f_path.dentry);
- inode_unlock(inode);
+ cachefiles_do_unmark_inode_in_use(object, file->f_path.dentry);
if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) {
atomic_long_add(inode->i_blocks, &cache->b_released);
@@ -444,10 +452,9 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
struct dentry *fan = volume->fanout[(u8)object->cookie->key_hash];
struct file *file;
struct path path;
- uint64_t ni_size = object->cookie->object_size;
+ uint64_t ni_size;
long ret;
- ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE);
cachefiles_begin_secure(cache, &saved_cred);
@@ -473,6 +480,15 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
goto out_dput;
}
+ ret = cachefiles_ondemand_init_object(object);
+ if (ret < 0) {
+ file = ERR_PTR(ret);
+ goto out_unuse;
+ }
+
+ ni_size = object->cookie->object_size;
+ ni_size = round_up(ni_size, CACHEFILES_DIO_BLOCK_SIZE);
+
if (ni_size > 0) {
trace_cachefiles_trunc(object, d_backing_inode(path.dentry), 0, ni_size,
cachefiles_trunc_expand_tmpfile);
@@ -484,7 +500,7 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
object, d_backing_inode(path.dentry), ret,
cachefiles_trace_trunc_error);
file = ERR_PTR(ret);
- goto out_dput;
+ goto out_unuse;
}
}
@@ -494,15 +510,20 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry),
PTR_ERR(file),
cachefiles_trace_open_error);
- goto out_dput;
+ goto out_unuse;
}
if (unlikely(!file->f_op->read_iter) ||
unlikely(!file->f_op->write_iter)) {
fput(file);
pr_notice("Cache does not support read_iter and write_iter\n");
file = ERR_PTR(-EINVAL);
+ goto out_unuse;
}
+ goto out_dput;
+
+out_unuse:
+ cachefiles_do_unmark_inode_in_use(object, path.dentry);
out_dput:
dput(path.dentry);
out:
@@ -573,6 +594,10 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
}
_debug("file -> %pd positive", dentry);
+ ret = cachefiles_ondemand_init_object(object);
+ if (ret < 0)
+ goto error_fput;
+
ret = cachefiles_check_auxdata(object, file);
if (ret < 0)
goto check_failed;
@@ -590,14 +615,16 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
check_failed:
fscache_cookie_lookup_negative(object->cookie);
cachefiles_unmark_inode_in_use(object, file);
- if (ret == -ESTALE) {
- fput(file);
- dput(dentry);
+ fput(file);
+ dput(dentry);
+ if (ret == -ESTALE)
return cachefiles_create_file(object);
- }
+ return false;
+
error_fput:
fput(file);
error:
+ cachefiles_do_unmark_inode_in_use(object, dentry);
dput(dentry);
return false;
}
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
new file mode 100644
index 000000000000..0254ed39f68c
--- /dev/null
+++ b/fs/cachefiles/ondemand.c
@@ -0,0 +1,514 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/fdtable.h>
+#include <linux/anon_inodes.h>
+#include <linux/uio.h>
+#include "internal.h"
+
+static int cachefiles_ondemand_fd_release(struct inode *inode,
+ struct file *file)
+{
+ struct cachefiles_object *object = file->private_data;
+ struct cachefiles_cache *cache = object->volume->cache;
+ int object_id = object->ondemand_id;
+ struct cachefiles_req *req;
+ XA_STATE(xas, &cache->reqs, 0);
+
+ xa_lock(&cache->reqs);
+ object->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED;
+
+ /*
+ * Flush all pending READ requests since their completion depends on
+ * anon_fd.
+ */
+ xas_for_each(&xas, req, ULONG_MAX) {
+ if (req->msg.object_id == object_id &&
+ req->msg.opcode == CACHEFILES_OP_READ) {
+ req->error = -EIO;
+ complete(&req->done);
+ xas_store(&xas, NULL);
+ }
+ }
+ xa_unlock(&cache->reqs);
+
+ xa_erase(&cache->ondemand_ids, object_id);
+ trace_cachefiles_ondemand_fd_release(object, object_id);
+ cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd);
+ cachefiles_put_unbind_pincount(cache);
+ return 0;
+}
+
+static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
+ struct iov_iter *iter)
+{
+ struct cachefiles_object *object = kiocb->ki_filp->private_data;
+ struct cachefiles_cache *cache = object->volume->cache;
+ struct file *file = object->file;
+ size_t len = iter->count;
+ loff_t pos = kiocb->ki_pos;
+ const struct cred *saved_cred;
+ int ret;
+
+ if (!file)
+ return -ENOBUFS;
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = __cachefiles_prepare_write(object, file, &pos, &len, true);
+ cachefiles_end_secure(cache, saved_cred);
+ if (ret < 0)
+ return ret;
+
+ trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len);
+ ret = __cachefiles_write(object, file, pos, iter, NULL, NULL);
+ if (!ret)
+ ret = len;
+
+ return ret;
+}
+
+static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos,
+ int whence)
+{
+ struct cachefiles_object *object = filp->private_data;
+ struct file *file = object->file;
+
+ if (!file)
+ return -ENOBUFS;
+
+ return vfs_llseek(file, pos, whence);
+}
+
+static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct cachefiles_object *object = filp->private_data;
+ struct cachefiles_cache *cache = object->volume->cache;
+ struct cachefiles_req *req;
+ unsigned long id;
+
+ if (ioctl != CACHEFILES_IOC_READ_COMPLETE)
+ return -EINVAL;
+
+ if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
+ return -EOPNOTSUPP;
+
+ id = arg;
+ req = xa_erase(&cache->reqs, id);
+ if (!req)
+ return -EINVAL;
+
+ trace_cachefiles_ondemand_cread(object, id);
+ complete(&req->done);
+ return 0;
+}
+
+static const struct file_operations cachefiles_ondemand_fd_fops = {
+ .owner = THIS_MODULE,
+ .release = cachefiles_ondemand_fd_release,
+ .write_iter = cachefiles_ondemand_fd_write_iter,
+ .llseek = cachefiles_ondemand_fd_llseek,
+ .unlocked_ioctl = cachefiles_ondemand_fd_ioctl,
+};
+
+/*
+ * OPEN request completion (copen)
+ * - command: "copen <id>,<cache_size>"
+ * <cache_size> indicates the object size if >= 0, or an error code if negative
+ */
+int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
+{
+ struct cachefiles_req *req;
+ struct fscache_cookie *cookie;
+ char *pid, *psize;
+ unsigned long id;
+ long size;
+ int ret;
+
+ if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
+ return -EOPNOTSUPP;
+
+ if (!*args) {
+ pr_err("Empty id specified\n");
+ return -EINVAL;
+ }
+
+ pid = args;
+ psize = strchr(args, ',');
+ if (!psize) {
+ pr_err("Cache size is not specified\n");
+ return -EINVAL;
+ }
+
+ *psize = 0;
+ psize++;
+
+ ret = kstrtoul(pid, 0, &id);
+ if (ret)
+ return ret;
+
+ req = xa_erase(&cache->reqs, id);
+ if (!req)
+ return -EINVAL;
+
+ /* fail OPEN request if copen format is invalid */
+ ret = kstrtol(psize, 0, &size);
+ if (ret) {
+ req->error = ret;
+ goto out;
+ }
+
+ /* fail OPEN request if daemon reports an error */
+ if (size < 0) {
+ if (!IS_ERR_VALUE(size)) {
+ req->error = -EINVAL;
+ ret = -EINVAL;
+ } else {
+ req->error = size;
+ ret = 0;
+ }
+ goto out;
+ }
+
+ cookie = req->object->cookie;
+ cookie->object_size = size;
+ if (size)
+ clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
+ else
+ set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
+ trace_cachefiles_ondemand_copen(req->object, id, size);
+
+out:
+ complete(&req->done);
+ return ret;
+}
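A hedged sketch of the daemon's side of this exchange, replying to an OPEN request whose msg_id it read earlier (devfd is the /dev/cachefiles fd; the values are illustrative):

        #include <errno.h>
        #include <stdio.h>
        #include <unistd.h>

        /* Reply to an OPEN request: a size >= 0 succeeds, a negative
         * errno fails the open, exactly as parsed above by kstrtol(). */
        static void copen_reply(int devfd, unsigned int msg_id, long size_or_err)
        {
                char buf[64];
                int n = snprintf(buf, sizeof(buf), "copen %u,%ld",
                                 msg_id, size_or_err);

                write(devfd, buf, n);  /* e.g. "copen 3,1048576" or "copen 3,-2" */
        }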
+
+static int cachefiles_ondemand_get_fd(struct cachefiles_req *req)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ struct cachefiles_open *load;
+ struct file *file;
+ u32 object_id;
+ int ret, fd;
+
+ object = cachefiles_grab_object(req->object,
+ cachefiles_obj_get_ondemand_fd);
+ cache = object->volume->cache;
+
+ ret = xa_alloc_cyclic(&cache->ondemand_ids, &object_id, NULL,
+ XA_LIMIT(1, INT_MAX),
+ &cache->ondemand_id_next, GFP_KERNEL);
+ if (ret < 0)
+ goto err;
+
+ fd = get_unused_fd_flags(O_WRONLY);
+ if (fd < 0) {
+ ret = fd;
+ goto err_free_id;
+ }
+
+ file = anon_inode_getfile("[cachefiles]", &cachefiles_ondemand_fd_fops,
+ object, O_WRONLY);
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ goto err_put_fd;
+ }
+
+ file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
+ fd_install(fd, file);
+
+ load = (void *)req->msg.data;
+ load->fd = fd;
+ req->msg.object_id = object_id;
+ object->ondemand_id = object_id;
+
+ cachefiles_get_unbind_pincount(cache);
+ trace_cachefiles_ondemand_open(object, &req->msg, load);
+ return 0;
+
+err_put_fd:
+ put_unused_fd(fd);
+err_free_id:
+ xa_erase(&cache->ondemand_ids, object_id);
+err:
+ cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd);
+ return ret;
+}
+
+ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
+ char __user *_buffer, size_t buflen)
+{
+ struct cachefiles_req *req;
+ struct cachefiles_msg *msg;
+ unsigned long id = 0;
+ size_t n;
+ int ret = 0;
+ XA_STATE(xas, &cache->reqs, cache->req_id_next);
+
+ /*
+ * Cyclically search for a request that has never been processed, to
+ * prevent requests from being processed repeatedly and to make
+ * request distribution fair.
+ */
+ xa_lock(&cache->reqs);
+ req = xas_find_marked(&xas, UINT_MAX, CACHEFILES_REQ_NEW);
+ if (!req && cache->req_id_next > 0) {
+ xas_set(&xas, 0);
+ req = xas_find_marked(&xas, cache->req_id_next - 1, CACHEFILES_REQ_NEW);
+ }
+ if (!req) {
+ xa_unlock(&cache->reqs);
+ return 0;
+ }
+
+ msg = &req->msg;
+ n = msg->len;
+
+ if (n > buflen) {
+ xa_unlock(&cache->reqs);
+ return -EMSGSIZE;
+ }
+
+ xas_clear_mark(&xas, CACHEFILES_REQ_NEW);
+ cache->req_id_next = xas.xa_index + 1;
+ xa_unlock(&cache->reqs);
+
+ id = xas.xa_index;
+ msg->msg_id = id;
+
+ if (msg->opcode == CACHEFILES_OP_OPEN) {
+ ret = cachefiles_ondemand_get_fd(req);
+ if (ret)
+ goto error;
+ }
+
+ if (copy_to_user(_buffer, msg, n) != 0) {
+ ret = -EFAULT;
+ goto err_put_fd;
+ }
+
+ /* CLOSE request has no reply */
+ if (msg->opcode == CACHEFILES_OP_CLOSE) {
+ xa_erase(&cache->reqs, id);
+ complete(&req->done);
+ }
+
+ return n;
+
+err_put_fd:
+ if (msg->opcode == CACHEFILES_OP_OPEN)
+ close_fd(((struct cachefiles_open *)msg->data)->fd);
+error:
+ xa_erase(&cache->reqs, id);
+ req->error = ret;
+ complete(&req->done);
+ return ret;
+}
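Taken together with the poll hook above, the daemon's request loop might look like the following hedged sketch (linux/cachefiles.h is the UAPI header this series introduces; the 4 KiB buffer is an arbitrary choice that must be at least msg->len, or the read fails with -EMSGSIZE as coded above):

        #include <poll.h>
        #include <unistd.h>
        #include <linux/cachefiles.h>

        static void request_loop(int devfd)
        {
                union {
                        struct cachefiles_msg msg;
                        char buf[4096];
                } u;
                struct pollfd p = { .fd = devfd, .events = POLLIN };

                while (poll(&p, 1, -1) >= 0) {
                        if (read(devfd, &u, sizeof(u)) <= 0)
                                continue;
                        switch (u.msg.opcode) {
                        case CACHEFILES_OP_OPEN:   /* reply "copen <msg_id>,<size>" */
                                break;
                        case CACHEFILES_OP_READ:   /* pwrite() + READ_COMPLETE ioctl */
                                break;
                        case CACHEFILES_OP_CLOSE:  /* close the anon fd; no reply */
                                break;
                        }
                }
        }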
+
+typedef int (*init_req_fn)(struct cachefiles_req *req, void *private);
+
+static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
+ enum cachefiles_opcode opcode,
+ size_t data_len,
+ init_req_fn init_req,
+ void *private)
+{
+ struct cachefiles_cache *cache = object->volume->cache;
+ struct cachefiles_req *req;
+ XA_STATE(xas, &cache->reqs, 0);
+ int ret;
+
+ if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
+ return 0;
+
+ if (test_bit(CACHEFILES_DEAD, &cache->flags))
+ return -EIO;
+
+ req = kzalloc(sizeof(*req) + data_len, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ req->object = object;
+ init_completion(&req->done);
+ req->msg.opcode = opcode;
+ req->msg.len = sizeof(struct cachefiles_msg) + data_len;
+
+ ret = init_req(req, private);
+ if (ret)
+ goto out;
+
+ do {
+ /*
+ * Stop enqueuing the request when the daemon is dying. The
+ * following two operations need to be atomic as a whole.
+ * 1) check cache state, and
+ * 2) enqueue request if cache is alive.
+ * Otherwise the request may be enqueued after the xarray has been
+ * flushed, leaving an orphan request that is never completed.
+ *
+ * CPU 1 CPU 2
+ * ===== =====
+ * test CACHEFILES_DEAD bit
+ * set CACHEFILES_DEAD bit
+ * flush requests in the xarray
+ * enqueue the request
+ */
+ xas_lock(&xas);
+
+ if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+ xas_unlock(&xas);
+ ret = -EIO;
+ goto out;
+ }
+
+ /* coupled with the barrier in cachefiles_flush_reqs() */
+ smp_mb();
+
+ if (opcode != CACHEFILES_OP_OPEN && object->ondemand_id <= 0) {
+ WARN_ON_ONCE(object->ondemand_id == 0);
+ xas_unlock(&xas);
+ ret = -EIO;
+ goto out;
+ }
+
+ xas.xa_index = 0;
+ xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK);
+ if (xas.xa_node == XAS_RESTART)
+ xas_set_err(&xas, -EBUSY);
+ xas_store(&xas, req);
+ xas_clear_mark(&xas, XA_FREE_MARK);
+ xas_set_mark(&xas, CACHEFILES_REQ_NEW);
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+ ret = xas_error(&xas);
+ if (ret)
+ goto out;
+
+ wake_up_all(&cache->daemon_pollwq);
+ wait_for_completion(&req->done);
+ ret = req->error;
+out:
+ kfree(req);
+ return ret;
+}
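The DEAD check here pairs with the smp_mb() in cachefiles_flush_reqs() shown earlier: one side sets DEAD and then flushes under the xarray lock, the other checks DEAD inside the same lock before enqueuing. A minimal user-space analogue of that pattern, with a pthread mutex standing in for the xarray lock (a sketch of the idea, not the kernel code):

        #include <pthread.h>
        #include <stdatomic.h>

        static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        static atomic_bool dead;
        static int pending;

        static int enqueue(void)                /* cf. cachefiles_ondemand_send_req() */
        {
                int ret = 0;

                pthread_mutex_lock(&lock);
                if (atomic_load(&dead))         /* test DEAD inside the lock */
                        ret = -1;
                else
                        pending++;              /* enqueue only while alive */
                pthread_mutex_unlock(&lock);
                return ret;
        }

        static void flush(void)                 /* cf. cachefiles_flush_reqs() */
        {
                atomic_store(&dead, true);      /* 1) set DEAD first */
                pthread_mutex_lock(&lock);
                pending = 0;                    /* 2) complete everything queued */
                pthread_mutex_unlock(&lock);
        }

Either an enqueue completes before the flush takes the lock (so the flush sees and completes it), or it acquires the lock afterwards and is guaranteed to observe DEAD; no orphan request can slip through.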
+
+static int cachefiles_ondemand_init_open_req(struct cachefiles_req *req,
+ void *private)
+{
+ struct cachefiles_object *object = req->object;
+ struct fscache_cookie *cookie = object->cookie;
+ struct fscache_volume *volume = object->volume->vcookie;
+ struct cachefiles_open *load = (void *)req->msg.data;
+ size_t volume_key_size, cookie_key_size;
+ void *volume_key, *cookie_key;
+
+ /*
+ * Volume key is a NUL-terminated string. key[0] stores strlen() of the
+ * string, followed by the content of the string (excluding '\0').
+ */
+ volume_key_size = volume->key[0] + 1;
+ volume_key = volume->key + 1;
+
+ /* Cookie key is binary data, which is netfs specific. */
+ cookie_key_size = cookie->key_len;
+ cookie_key = fscache_get_key(cookie);
+
+ if (!(object->cookie->advice & FSCACHE_ADV_WANT_CACHE_SIZE)) {
+ pr_err("WANT_CACHE_SIZE is needed for on-demand mode\n");
+ return -EINVAL;
+ }
+
+ load->volume_key_size = volume_key_size;
+ load->cookie_key_size = cookie_key_size;
+ memcpy(load->data, volume_key, volume_key_size);
+ memcpy(load->data + volume_key_size, cookie_key, cookie_key_size);
+
+ return 0;
+}
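As a hedged illustration of the layout that comment describes (the value is made up): fscache allocates the key buffer zero-filled, so for a volume key "example" it holds

        static const unsigned char key[] = {
                7,                              /* key[0] = strlen("example") */
                'e', 'x', 'a', 'm', 'p', 'l', 'e',
                '\0',                           /* from the zero-filled allocation */
        };

and volume_key_size = key[0] + 1 = 8, so the memcpy() above ships the string plus its terminating NUL, starting at key + 1.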
+
+static int cachefiles_ondemand_init_close_req(struct cachefiles_req *req,
+ void *private)
+{
+ struct cachefiles_object *object = req->object;
+ int object_id = object->ondemand_id;
+
+ /*
+ * It's possible that the object ID is still 0 if the cookie lookup
+ * phase failed before an OPEN request was ever sent. Also avoid
+ * sending a CLOSE request for CACHEFILES_ONDEMAND_ID_CLOSED, which
+ * means the anon_fd has already been closed.
+ */
+ if (object_id <= 0)
+ return -ENOENT;
+
+ req->msg.object_id = object_id;
+ trace_cachefiles_ondemand_close(object, &req->msg);
+ return 0;
+}
+
+struct cachefiles_read_ctx {
+ loff_t off;
+ size_t len;
+};
+
+static int cachefiles_ondemand_init_read_req(struct cachefiles_req *req,
+ void *private)
+{
+ struct cachefiles_object *object = req->object;
+ struct cachefiles_read *load = (void *)req->msg.data;
+ struct cachefiles_read_ctx *read_ctx = private;
+ int object_id = object->ondemand_id;
+
+ /* Stop enqueuing requests when daemon has closed anon_fd. */
+ if (object_id <= 0) {
+ WARN_ON_ONCE(object_id == 0);
+ pr_info_once("READ: anonymous fd closed prematurely.\n");
+ return -EIO;
+ }
+
+ req->msg.object_id = object_id;
+ load->off = read_ctx->off;
+ load->len = read_ctx->len;
+ trace_cachefiles_ondemand_read(object, &req->msg, load);
+ return 0;
+}
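On the daemon side, a READ request is serviced through the anonymous fd handed out at OPEN time: write the data into the cache file at the requested range, then complete the request with the CACHEFILES_IOC_READ_COMPLETE ioctl handled by cachefiles_ondemand_fd_ioctl() above. A hedged sketch:

        #include <sys/ioctl.h>
        #include <unistd.h>
        #include <linux/cachefiles.h>   /* struct cachefiles_read, ioctl number */

        static void serve_read(int anon_fd, unsigned int msg_id,
                               const struct cachefiles_read *r, const void *data)
        {
                pwrite(anon_fd, data, r->len, r->off);  /* fill [off, off + len) */
                ioctl(anon_fd, CACHEFILES_IOC_READ_COMPLETE, msg_id);  /* wake waiter */
        }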
+
+int cachefiles_ondemand_init_object(struct cachefiles_object *object)
+{
+ struct fscache_cookie *cookie = object->cookie;
+ struct fscache_volume *volume = object->volume->vcookie;
+ size_t volume_key_size, cookie_key_size, data_len;
+
+ /*
+ * CacheFiles will first check the cache file under the root cache
+ * directory. If the coherency check fails, it will fall back to
+ * creating a new tmpfile as the cache file. Reuse the previously
+ * allocated object ID, if any.
+ */
+ if (object->ondemand_id > 0)
+ return 0;
+
+ volume_key_size = volume->key[0] + 1;
+ cookie_key_size = cookie->key_len;
+ data_len = sizeof(struct cachefiles_open) +
+ volume_key_size + cookie_key_size;
+
+ return cachefiles_ondemand_send_req(object, CACHEFILES_OP_OPEN,
+ data_len, cachefiles_ondemand_init_open_req, NULL);
+}
+
+void cachefiles_ondemand_clean_object(struct cachefiles_object *object)
+{
+ cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0,
+ cachefiles_ondemand_init_close_req, NULL);
+}
+
+int cachefiles_ondemand_read(struct cachefiles_object *object,
+ loff_t pos, size_t len)
+{
+ struct cachefiles_read_ctx read_ctx = {pos, len};
+
+ return cachefiles_ondemand_send_req(object, CACHEFILES_OP_READ,
+ sizeof(struct cachefiles_read),
+ cachefiles_ondemand_init_read_req, &read_ctx);
+}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 35465109d9c4..00b087c14995 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -203,7 +203,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
if (!buf)
return false;
buf->reserved = cpu_to_be32(0);
- memcpy(buf->data, p, len);
+ memcpy(buf->data, p, volume->vcookie->coherency_len);
ret = cachefiles_inject_write_error();
if (ret == 0)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index aa25bffd4823..dcf701b05cc1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -63,7 +63,7 @@
(CONGESTION_ON_THRESH(congestion_kb) >> 2))
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
- struct folio *folio, void **_fsdata);
+ struct folio **foliop, void **_fsdata);
static inline struct ceph_snap_context *page_snap_context(struct page *page)
{
@@ -85,7 +85,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
if (folio_test_dirty(folio)) {
dout("%p dirty_folio %p idx %lu -- already dirty\n",
mapping->host, folio, folio->index);
- BUG_ON(!folio_get_private(folio));
+ VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
return false;
}
@@ -122,7 +122,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
* Reference snap context in folio->private. Also set
* PagePrivate so that we get invalidate_folio callback.
*/
- BUG_ON(folio_get_private(folio));
+ VM_WARN_ON_FOLIO(folio->private, folio);
folio_attach_private(folio, snapc);
return ceph_fscache_dirty_folio(mapping, folio);
@@ -150,7 +150,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset,
}
WARN_ON(!folio_test_locked(folio));
- if (folio_get_private(folio)) {
+ if (folio_test_private(folio)) {
dout("%p invalidate_folio idx %lu full dirty page\n",
inode, folio->index);
@@ -162,24 +162,24 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset,
folio_wait_fscache(folio);
}
-static int ceph_releasepage(struct page *page, gfp_t gfp)
+static bool ceph_release_folio(struct folio *folio, gfp_t gfp)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
- dout("%llx:%llx releasepage %p idx %lu (%sdirty)\n",
- ceph_vinop(inode), page,
- page->index, PageDirty(page) ? "" : "not ");
+ dout("%llx:%llx release_folio idx %lu (%sdirty)\n",
+ ceph_vinop(inode),
+ folio->index, folio_test_dirty(folio) ? "" : "not ");
- if (PagePrivate(page))
- return 0;
+ if (folio_test_private(folio))
+ return false;
- if (PageFsCache(page)) {
+ if (folio_test_fscache(folio)) {
if (current_is_kswapd() || !(gfp & __GFP_FS))
- return 0;
- wait_on_page_fscache(page);
+ return false;
+ folio_wait_fscache(folio);
}
ceph_fscache_note_page_release(inode);
- return 1;
+ return true;
}
static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
@@ -237,7 +237,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)
if (err >= 0 && err < subreq->len)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
- netfs_subreq_terminated(subreq, err, true);
+ netfs_subreq_terminated(subreq, err, false);
num_pages = calc_pages_for(osd_data->alignment, osd_data->length);
ceph_put_page_vector(osd_data->pages, num_pages, false);
@@ -256,6 +256,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
struct iov_iter iter;
ssize_t err = 0;
size_t len;
+ int mode;
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
@@ -264,7 +265,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
goto out;
/* We need to fetch the inline data. */
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+ mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
@@ -311,8 +313,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
int err = 0;
u64 len = subreq->len;
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
- ceph_netfs_issue_op_inline(subreq))
+ if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
return;
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
@@ -327,7 +328,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
- err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off);
+ err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
if (err < 0) {
dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err);
goto out;
@@ -336,6 +337,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
/* should always give us a page-aligned read */
WARN_ON_ONCE(page_off);
len = err;
+ err = 0;
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
req->r_callback = finish_netfs_read;
@@ -343,9 +345,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
req->r_inode = inode;
ihold(inode);
- err = ceph_osdc_start_request(req->r_osdc, req, false);
- if (err)
- iput(inode);
+ ceph_osdc_start_request(req->r_osdc, req);
out:
ceph_osdc_put_request(req);
if (err)
@@ -392,11 +392,10 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
return 0;
}
-static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
+static void ceph_netfs_free_request(struct netfs_io_request *rreq)
{
- struct inode *inode = mapping->host;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int got = (uintptr_t)priv;
+ struct ceph_inode_info *ci = ceph_inode(rreq->inode);
+ int got = (uintptr_t)rreq->netfs_priv;
if (got)
ceph_put_cap_refs(ci, got);
@@ -404,12 +403,12 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
const struct netfs_request_ops ceph_netfs_ops = {
.init_request = ceph_init_request,
+ .free_request = ceph_netfs_free_request,
.begin_cache_operation = ceph_begin_cache_operation,
.issue_read = ceph_netfs_issue_read,
.expand_readahead = ceph_netfs_expand_readahead,
.clamp_length = ceph_netfs_clamp_length,
.check_write_begin = ceph_netfs_check_write_begin,
- .cleanup = ceph_readahead_cleanup,
};
#ifdef CONFIG_CEPH_FSCACHE
@@ -604,8 +603,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
true);
- if (IS_ERR(req))
+ if (IS_ERR(req)) {
+ redirty_page_for_writepage(wbc, page);
return PTR_ERR(req);
+ }
set_page_writeback(page);
if (caching)
@@ -618,9 +619,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
req->r_mtime = inode->i_mtime;
- err = ceph_osdc_start_request(osdc, req, true);
- if (!err)
- err = ceph_osdc_wait_request(osdc, req);
+ ceph_osdc_start_request(osdc, req);
+ err = ceph_osdc_wait_request(osdc, req);
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);
@@ -729,8 +729,11 @@ static void writepages_finish(struct ceph_osd_request *req)
/* clean all pages */
for (i = 0; i < req->r_num_ops; i++) {
- if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
+ if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
+ pr_warn("%s incorrect op %d req %p index %d tid %llu\n",
+ __func__, req->r_ops[i].op, req, i, req->r_tid);
break;
+ }
osd_data = osd_req_op_extent_osd_data(req, i);
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
@@ -1145,8 +1148,7 @@ new_request:
}
req->r_mtime = inode->i_mtime;
- rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
- BUG_ON(rc);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
req = NULL;
wbc->nr_to_write -= i;
@@ -1282,18 +1284,19 @@ ceph_find_incompatible(struct page *page)
}
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
- struct folio *folio, void **_fsdata)
+ struct folio **foliop, void **_fsdata)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc;
- snapc = ceph_find_incompatible(folio_page(folio, 0));
+ snapc = ceph_find_incompatible(folio_page(*foliop, 0));
if (snapc) {
int r;
- folio_unlock(folio);
- folio_put(folio);
+ folio_unlock(*foliop);
+ folio_put(*foliop);
+ *foliop = NULL;
if (IS_ERR(snapc))
return PTR_ERR(snapc);
@@ -1311,24 +1314,22 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned
* clean, or already dirty within the same snap context.
*/
static int ceph_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned aop_flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct folio *folio = NULL;
int r;
- r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL);
- if (r == 0)
- folio_wait_fscache(folio);
- if (r < 0) {
- if (folio)
- folio_put(folio);
- } else {
- WARN_ON_ONCE(!folio_test_locked(folio));
- *pagep = &folio->page;
- }
- return r;
+ r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL);
+ if (r < 0)
+ return r;
+
+ folio_wait_fscache(folio);
+ WARN_ON_ONCE(!folio_test_locked(folio));
+ *pagep = &folio->page;
+ return 0;
}
/*
@@ -1372,7 +1373,7 @@ out:
}
const struct address_space_operations ceph_aops = {
- .readpage = netfs_readpage,
+ .read_folio = netfs_read_folio,
.readahead = netfs_readahead,
.writepage = ceph_writepage,
.writepages = ceph_writepages_start,
@@ -1380,7 +1381,7 @@ const struct address_space_operations ceph_aops = {
.write_end = ceph_write_end,
.dirty_folio = ceph_dirty_folio,
.invalidate_folio = ceph_invalidate_folio,
- .releasepage = ceph_releasepage,
+ .release_folio = ceph_release_folio,
.direct_IO = noop_direct_IO,
};
@@ -1431,7 +1432,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
inode, off, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
- ci->i_inline_version == CEPH_INLINE_NONE) {
+ !ceph_has_inline_data(ci)) {
CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
ceph_add_rw_context(fi, &rw_ctx);
ret = filemap_fault(vmf);
@@ -1641,7 +1642,7 @@ int ceph_uninline_data(struct file *file)
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_osd_request *req;
+ struct ceph_osd_request *req = NULL;
struct ceph_cap_flush *prealloc_cf;
struct folio *folio = NULL;
u64 inline_version = CEPH_INLINE_NONE;
@@ -1649,10 +1650,23 @@ int ceph_uninline_data(struct file *file)
int err = 0;
u64 len;
+ spin_lock(&ci->i_ceph_lock);
+ inline_version = ci->i_inline_version;
+ spin_unlock(&ci->i_ceph_lock);
+
+ dout("uninline_data %p %llx.%llx inline_version %llu\n",
+ inode, ceph_vinop(inode), inline_version);
+
+ if (inline_version == CEPH_INLINE_NONE)
+ return 0;
+
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
+ if (inline_version == 1) /* initial version, no data */
+ goto out_uninline;
+
folio = read_mapping_folio(inode->i_mapping, 0, file);
if (IS_ERR(folio)) {
err = PTR_ERR(folio);
@@ -1661,17 +1675,6 @@ int ceph_uninline_data(struct file *file)
folio_lock(folio);
- spin_lock(&ci->i_ceph_lock);
- inline_version = ci->i_inline_version;
- spin_unlock(&ci->i_ceph_lock);
-
- dout("uninline_data %p %llx.%llx inline_version %llu\n",
- inode, ceph_vinop(inode), inline_version);
-
- if (inline_version == 1 || /* initial version, no data */
- inline_version == CEPH_INLINE_NONE)
- goto out_unlock;
-
len = i_size_read(inode);
if (len > folio_size(folio))
len = folio_size(folio);
@@ -1686,9 +1689,8 @@ int ceph_uninline_data(struct file *file)
}
req->r_mtime = inode->i_mtime;
- err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!err)
- err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_osdc_put_request(req);
if (err < 0)
goto out_unlock;
@@ -1729,13 +1731,13 @@ int ceph_uninline_data(struct file *file)
}
req->r_mtime = inode->i_mtime;
- err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!err)
- err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);
+out_uninline:
if (!err) {
int dirty;
@@ -1754,8 +1756,10 @@ out_put_req:
if (err == -ECANCELED)
err = 0;
out_unlock:
- folio_unlock(folio);
- folio_put(folio);
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
out:
ceph_free_cap_flush(prealloc_cf);
dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
@@ -1772,9 +1776,8 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
{
struct address_space *mapping = file->f_mapping;
- if (!mapping->a_ops->readpage)
+ if (!mapping->a_ops->read_folio)
return -ENOEXEC;
- file_accessed(file);
vma->vm_ops = &ceph_vmops;
return 0;
}
@@ -1787,7 +1790,7 @@ enum {
static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
s64 pool, struct ceph_string *pool_ns)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->netfs.inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
struct rb_node **p, *parent;
@@ -1900,15 +1903,13 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
0, false, true);
- err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
+ ceph_osdc_start_request(&fsc->client->osdc, rd_req);
- wr_req->r_mtime = ci->vfs_inode.i_mtime;
- err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
+ wr_req->r_mtime = ci->netfs.inode.i_mtime;
+ ceph_osdc_start_request(&fsc->client->osdc, wr_req);
- if (!err)
- err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
- if (!err2)
- err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
+ err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
+ err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
if (err >= 0 || err == -ENOENT)
have |= POOL_READ;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index ddea99922073..177d8e8d73fe 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -29,9 +29,9 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
if (!(inode->i_state & I_NEW))
return;
- WARN_ON_ONCE(ci->netfs_ctx.cache);
+ WARN_ON_ONCE(ci->netfs.cache);
- ci->netfs_ctx.cache =
+ ci->netfs.cache =
fscache_acquire_cookie(fsc->fscache, 0,
&ci->i_vino, sizeof(ci->i_vino),
&ci->i_version, sizeof(ci->i_version),
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 7255b790a4c1..dc502daac49a 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -28,7 +28,7 @@ void ceph_fscache_invalidate(struct inode *inode, bool dio_write);
static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
{
- return netfs_i_cookie(&ci->vfs_inode);
+ return netfs_i_cookie(&ci->netfs);
}
static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index f1ad6884d4da..53cfe026b3ea 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -492,7 +492,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
ci->i_hold_caps_max = round_jiffies(jiffies +
opt->caps_wanted_delay_max * HZ);
- dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
+ dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode,
ci->i_hold_caps_max - jiffies);
}
@@ -507,7 +507,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
+ dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode,
ci->i_ceph_flags, ci->i_hold_caps_max);
if (!mdsc->stopping) {
spin_lock(&mdsc->cap_delay_lock);
@@ -531,7 +531,7 @@ no_change:
static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+ dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode);
spin_lock(&mdsc->cap_delay_lock);
ci->i_ceph_flags |= CEPH_I_FLUSH;
if (!list_empty(&ci->i_cap_delay_list))
@@ -548,7 +548,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+ dout("__cap_delay_cancel %p\n", &ci->netfs.inode);
if (list_empty(&ci->i_cap_delay_list))
return;
spin_lock(&mdsc->cap_delay_lock);
@@ -568,7 +568,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
* Each time we receive FILE_CACHE anew, we increment
* i_rdcache_gen.
*/
- if (S_ISREG(ci->vfs_inode.i_mode) &&
+ if (S_ISREG(ci->netfs.inode.i_mode) &&
(issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
(had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
ci->i_rdcache_gen++;
@@ -583,14 +583,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
if (issued & CEPH_CAP_FILE_SHARED)
atomic_inc(&ci->i_shared_gen);
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
- dout(" marking %p NOT complete\n", &ci->vfs_inode);
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
+ dout(" marking %p NOT complete\n", &ci->netfs.inode);
__ceph_dir_clear_complete(ci);
}
}
/* Wipe saved layout if we're losing DIR_CREATE caps */
- if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
+ if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
!(issued & CEPH_CAP_DIR_CREATE)) {
ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
@@ -602,8 +602,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
* @ci: inode to be moved
* @session: new auth caps session
*/
-static void change_auth_cap_ses(struct ceph_inode_info *ci,
- struct ceph_mds_session *session)
+void change_auth_cap_ses(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session)
{
lockdep_assert_held(&ci->i_ceph_lock);
@@ -771,7 +771,7 @@ static int __cap_is_valid(struct ceph_cap *cap)
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
dout("__cap_is_valid %p cap %p issued %s "
- "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
+ "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode,
cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
return 0;
}
@@ -797,7 +797,7 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
if (!__cap_is_valid(cap))
continue;
dout("__ceph_caps_issued %p cap %p issued %s\n",
- &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+ &ci->netfs.inode, cap, ceph_cap_string(cap->issued));
have |= cap->issued;
if (implemented)
*implemented |= cap->implemented;
@@ -844,12 +844,12 @@ static void __touch_cap(struct ceph_cap *cap)
spin_lock(&s->s_cap_lock);
if (!s->s_cap_iterator) {
- dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+ dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap,
s->s_mds);
list_move_tail(&cap->session_caps, &s->s_caps);
} else {
dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
- &cap->ci->vfs_inode, cap, s->s_mds);
+ &cap->ci->netfs.inode, cap, s->s_mds);
}
spin_unlock(&s->s_cap_lock);
}
@@ -867,7 +867,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if ((have & mask) == mask) {
dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
- " (mask %s)\n", ceph_ino(&ci->vfs_inode),
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode),
ceph_cap_string(have),
ceph_cap_string(mask));
return 1;
@@ -879,7 +879,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
continue;
if ((cap->issued & mask) == mask) {
dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
- " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap,
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch)
@@ -891,7 +891,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
have |= cap->issued;
if ((have & mask) == mask) {
dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
- " (mask %s)\n", ceph_ino(&ci->vfs_inode),
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode),
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch) {
@@ -919,7 +919,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
int touch)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
int r;
r = __ceph_caps_issued_mask(ci, mask, touch);
@@ -950,7 +950,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
int ret;
spin_lock(&ci->i_ceph_lock);
@@ -969,8 +969,8 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
if (ci->i_rd_ref)
used |= CEPH_CAP_FILE_RD;
if (ci->i_rdcache_ref ||
- (S_ISREG(ci->vfs_inode.i_mode) &&
- ci->vfs_inode.i_data.nrpages))
+ (S_ISREG(ci->netfs.inode.i_mode) &&
+ ci->netfs.inode.i_data.nrpages))
used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR;
@@ -993,11 +993,11 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
struct ceph_mount_options *opt =
- ceph_inode_to_client(&ci->vfs_inode)->mount_options;
+ ceph_inode_to_client(&ci->netfs.inode)->mount_options;
unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
int want = 0;
/* use used_cutoff here, to keep dir's wanted caps longer */
@@ -1050,7 +1050,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
int __ceph_caps_wanted(struct ceph_inode_info *ci)
{
int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
/* we want EXCL if holding caps of dir ops */
if (w & CEPH_CAP_ANY_DIR_OPS)
w |= CEPH_CAP_FILE_EXCL;
@@ -1116,9 +1116,9 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
lockdep_assert_held(&ci->i_ceph_lock);
- dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+ dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode);
- mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc;
+ mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc;
/* remove from inode's cap rbtree, and clear auth cap */
rb_erase(&cap->ci_node, &ci->i_caps);
@@ -1169,7 +1169,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
* keep i_snap_realm.
*/
if (ci->i_wr_ref == 0 && ci->i_snap_realm)
- ceph_change_snap_realm(&ci->vfs_inode, NULL);
+ ceph_change_snap_realm(&ci->netfs.inode, NULL);
__cap_delay_cancel(mdsc, ci);
}
@@ -1188,11 +1188,11 @@ void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
lockdep_assert_held(&ci->i_ceph_lock);
- fsc = ceph_inode_to_client(&ci->vfs_inode);
+ fsc = ceph_inode_to_client(&ci->netfs.inode);
WARN_ON_ONCE(ci->i_auth_cap == cap &&
!list_empty(&ci->i_dirty_item) &&
!fsc->blocklisted &&
- !ceph_inode_is_shutdown(&ci->vfs_inode));
+ !ceph_inode_is_shutdown(&ci->netfs.inode));
__ceph_remove_cap(cap, queue_release);
}
@@ -1343,7 +1343,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
int flushing, u64 flush_tid, u64 oldest_flush_tid)
{
struct ceph_inode_info *ci = cap->ci;
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
int held, revoking;
lockdep_assert_held(&ci->i_ceph_lock);
@@ -1440,7 +1440,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
{
struct ceph_msg *msg;
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
if (!msg) {
@@ -1528,7 +1528,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_cap_snap *capsnap;
u64 oldest_flush_tid = 0;
@@ -1577,7 +1577,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
while (first_tid <= last_tid) {
struct ceph_cap *cap = ci->i_auth_cap;
- struct ceph_cap_flush *cf;
+ struct ceph_cap_flush *cf = NULL, *iter;
int ret;
if (!(cap && cap->session == session)) {
@@ -1587,8 +1587,9 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
}
ret = -ENOENT;
- list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
- if (cf->tid >= first_tid) {
+ list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) {
+ if (iter->tid >= first_tid) {
+ cf = iter;
ret = 0;
break;
}
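This hunk follows the kernel-wide pattern of never reusing the list_for_each_entry() cursor after the loop ends: a dedicated iterator walks the list, and the result pointer stays NULL unless a match was captured. A condensed sketch of the idiom as used here:

    struct ceph_cap_flush *cf = NULL, *iter;

    /* 'iter' is only meaningful inside the loop body; the match is
     * captured into 'cf' before breaking out. */
    list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) {
        if (iter->tid >= first_tid) {
            cf = iter;
            break;
        }
    }
    if (!cf)
        return -ENOENT; /* nothing at or beyond first_tid */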
@@ -1621,7 +1622,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_session *session = NULL;
int mds;
@@ -1681,8 +1682,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
struct ceph_cap_flush **pcf)
{
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
- struct inode *inode = &ci->vfs_inode;
+ ceph_sb_to_client(ci->netfs.inode.i_sb)->mdsc;
+ struct inode *inode = &ci->netfs.inode;
int was = ci->i_dirty_caps;
int dirty = 0;
@@ -1695,7 +1696,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
return 0;
}
- dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+ dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode,
ceph_cap_string(mask), ceph_cap_string(was),
ceph_cap_string(was | mask));
ci->i_dirty_caps |= mask;
@@ -1711,7 +1712,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
ci->i_snap_realm->cached_context);
}
dout(" inode %p now dirty snapc %p auth cap %p\n",
- &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
+ &ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
list_add(&ci->i_dirty_item, &session->s_cap_dirty);
@@ -1874,7 +1875,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
bool __ceph_should_report_size(struct ceph_inode_info *ci)
{
- loff_t size = i_size_read(&ci->vfs_inode);
+ loff_t size = i_size_read(&ci->netfs.inode);
/* mds will adjust max size according to the reported size */
if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
return false;
@@ -1899,7 +1900,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_cap *cap;
u64 flush_tid, oldest_flush_tid;
@@ -1910,6 +1911,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct rb_node *p;
bool queue_invalidate = false;
bool tried_invalidate = false;
+ bool queue_writeback = false;
if (session)
ceph_get_mds_session(session);
@@ -1976,14 +1978,15 @@ retry:
}
dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s"
- " issued %s revoking %s retain %s %s%s\n", ceph_vinop(inode),
+ " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode),
ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(issued), ceph_cap_string(revoking),
ceph_cap_string(retain),
(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
- (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
+ (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "",
+ (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "");
/*
* If we no longer need to hold onto our old caps, and we may
@@ -2062,10 +2065,27 @@ retry:
}
/* completed revocation? going down and there are no caps? */
- if (revoking && (revoking & cap_used) == 0) {
- dout("completed revocation of %s\n",
- ceph_cap_string(cap->implemented & ~cap->issued));
- goto ack;
+ if (revoking) {
+ if ((revoking & cap_used) == 0) {
+ dout("completed revocation of %s\n",
+ ceph_cap_string(cap->implemented & ~cap->issued));
+ goto ack;
+ }
+
+ /*
+ * If the "i_wrbuffer_ref" was increased by mmap or generic
+ * cache write just before the ceph_check_caps() is called,
+ * the Fb capability revoking will fail this time. Then we
+ * must wait for the BDI's delayed work to flush the dirty
+ * pages and to release the "i_wrbuffer_ref", which will cost
+ * at most 5 seconds. That means the MDS needs to wait at
+ * most 5 seconds to finished the Fb capability's revocation.
+ *
+ * Let's queue a writeback for it.
+ */
+ if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
+ (revoking & CEPH_CAP_FILE_BUFFER))
+ queue_writeback = true;
}
/* want more caps from mds? */
@@ -2135,6 +2155,8 @@ ack:
spin_unlock(&ci->i_ceph_lock);
ceph_put_mds_session(session);
+ if (queue_writeback)
+ ceph_queue_writeback(inode);
if (queue_invalidate)
ceph_queue_invalidate(inode);
}
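The new queue_writeback flag uses the same deferred-action shape as queue_invalidate: the decision is made while i_ceph_lock is held, but the potentially sleeping work is only kicked off after the lock is dropped. A minimal sketch, reduced from the hunk above:

    bool queue_writeback = false;

    spin_lock(&ci->i_ceph_lock);
    /* Fb is being revoked but dirty pages still pin i_wrbuffer_ref:
     * ask for writeback rather than waiting out the BDI delay. */
    if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
        (revoking & CEPH_CAP_FILE_BUFFER))
        queue_writeback = true;
    spin_unlock(&ci->i_ceph_lock);

    if (queue_writeback)
        ceph_queue_writeback(inode);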
@@ -2218,9 +2240,9 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
}
/*
- * wait for any unsafe requests to complete.
+ * flush the mdlog and wait for any unsafe requests to complete.
*/
-static int unsafe_request_wait(struct inode *inode)
+static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -2274,6 +2296,8 @@ retry:
list_for_each_entry(req, &ci->i_unsafe_dirops,
r_unsafe_dir_item) {
s = req->r_session;
+ if (!s)
+ continue;
if (unlikely(s->s_mds >= max_sessions)) {
spin_unlock(&ci->i_unsafe_lock);
for (i = 0; i < max_sessions; i++) {
@@ -2294,6 +2318,8 @@ retry:
list_for_each_entry(req, &ci->i_unsafe_iops,
r_unsafe_target_item) {
s = req->r_session;
+ if (!s)
+ continue;
if (unlikely(s->s_mds >= max_sessions)) {
spin_unlock(&ci->i_unsafe_lock);
for (i = 0; i < max_sessions; i++) {
@@ -2332,7 +2358,7 @@ retry:
kfree(sessions);
}
- dout("unsafe_request_wait %p wait on tid %llu %llu\n",
+ dout("%s %p wait on tid %llu %llu\n", __func__,
inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
if (req1) {
ret = !wait_for_completion_timeout(&req1->r_safe_completion,
@@ -2376,7 +2402,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
- err = unsafe_request_wait(inode);
+ err = flush_mdlog_and_wait_inode_unsafe_requests(inode);
/*
* only wait on non-file metadata writeback (the mds
@@ -2442,7 +2468,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_cap *cap;
struct ceph_cap_flush *cf;
int ret;
@@ -2535,7 +2561,7 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
pr_err("%p auth cap %p not mds%d ???\n",
- &ci->vfs_inode, cap, session->s_mds);
+ &ci->netfs.inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2585,7 +2611,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
pr_err("%p auth cap %p not mds%d ???\n",
- &ci->vfs_inode, cap, session->s_mds);
+ &ci->netfs.inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2605,7 +2631,7 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
lockdep_assert_held(&ci->i_ceph_lock);
- dout("%s %p flushing %s\n", __func__, &ci->vfs_inode,
+ dout("%s %p flushing %s\n", __func__, &ci->netfs.inode,
ceph_cap_string(ci->i_flushing_caps));
if (!list_empty(&ci->i_cap_flush_list)) {
@@ -2648,10 +2674,10 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
}
if (got & CEPH_CAP_FILE_BUFFER) {
if (ci->i_wb_ref == 0)
- ihold(&ci->vfs_inode);
+ ihold(&ci->netfs.inode);
ci->i_wb_ref++;
dout("%s %p wb %d -> %d (?)\n", __func__,
- &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
+ &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref);
}
}
@@ -2979,8 +3005,8 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
return ret;
}
- if (S_ISREG(ci->vfs_inode.i_mode) &&
- ci->i_inline_version != CEPH_INLINE_NONE &&
+ if (S_ISREG(ci->netfs.inode.i_mode) &&
+ ceph_has_inline_data(ci) &&
(_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
i_size_read(inode) > 0) {
struct page *page =
@@ -3069,7 +3095,7 @@ enum put_cap_refs_mode {
static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
enum put_cap_refs_mode mode)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
int last = 0, put = 0, flushsnaps = 0, wake = 0;
bool check_flushsnaps = false;
@@ -3177,11 +3203,10 @@ void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)
void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc)
{
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap_snap *capsnap = NULL;
+ struct inode *inode = &ci->netfs.inode;
+ struct ceph_cap_snap *capsnap = NULL, *iter;
int put = 0;
bool last = false;
- bool found = false;
bool flush_snaps = false;
bool complete_capsnap = false;
@@ -3208,14 +3233,14 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
last ? " LAST" : "");
} else {
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- if (capsnap->context == snapc) {
- found = true;
+ list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
+ if (iter->context == snapc) {
+ capsnap = iter;
break;
}
}
- if (!found) {
+ if (!capsnap) {
/*
* The capsnap should already be removed when removing
* auth cap in the case of a forced unmount.
@@ -3554,24 +3579,23 @@ static void handle_cap_grant(struct inode *inode,
fill_inline = true;
}
- if (ci->i_auth_cap == cap &&
- le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
- if (newcaps & ~extra_info->issued)
- wake = true;
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
+ if (ci->i_auth_cap == cap) {
+ if (newcaps & ~extra_info->issued)
+ wake = true;
- if (ci->i_requested_max_size > max_size ||
- !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
- /* re-request max_size if necessary */
- ci->i_requested_max_size = 0;
- wake = true;
- }
+ if (ci->i_requested_max_size > max_size ||
+ !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
+ /* re-request max_size if necessary */
+ ci->i_requested_max_size = 0;
+ wake = true;
+ }
- ceph_kick_flushing_inode_caps(session, ci);
- spin_unlock(&ci->i_ceph_lock);
+ ceph_kick_flushing_inode_caps(session, ci);
+ }
up_read(&session->s_mdsc->snap_rwsem);
- } else {
- spin_unlock(&ci->i_ceph_lock);
}
+ spin_unlock(&ci->i_ceph_lock);
if (fill_inline)
ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
@@ -3674,7 +3698,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
session->s_mds,
&list_first_entry(&session->s_cap_flushing,
struct ceph_inode_info,
- i_flushing_item)->vfs_inode);
+ i_flushing_item)->netfs.inode);
}
}
mdsc->num_cap_flushing--;
@@ -3765,8 +3789,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
u64 follows = le64_to_cpu(m->snap_follows);
- struct ceph_cap_snap *capsnap;
- bool flushed = false;
+ struct ceph_cap_snap *capsnap = NULL, *iter;
bool wake_ci = false;
bool wake_mdsc = false;
@@ -3774,26 +3797,26 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
inode, ci, session->s_mds, follows);
spin_lock(&ci->i_ceph_lock);
- list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- if (capsnap->follows == follows) {
- if (capsnap->cap_flush.tid != flush_tid) {
+ list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
+ if (iter->follows == follows) {
+ if (iter->cap_flush.tid != flush_tid) {
dout(" cap_snap %p follows %lld tid %lld !="
- " %lld\n", capsnap, follows,
- flush_tid, capsnap->cap_flush.tid);
+ " %lld\n", iter, follows,
+ flush_tid, iter->cap_flush.tid);
break;
}
- flushed = true;
+ capsnap = iter;
break;
} else {
dout(" skipping cap_snap %p follows %lld\n",
- capsnap, capsnap->follows);
+ iter, iter->follows);
}
}
- if (flushed)
+ if (capsnap)
ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
spin_unlock(&ci->i_ceph_lock);
- if (flushed) {
+ if (capsnap) {
ceph_put_snap_context(capsnap->context);
ceph_put_cap_snap(capsnap);
if (wake_ci)
@@ -3870,6 +3893,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
inode, ci, mds, mseq, target);
retry:
+ down_read(&mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
@@ -3933,6 +3957,7 @@ retry:
}
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
/* open target session */
@@ -3958,6 +3983,7 @@ retry:
out_unlock:
spin_unlock(&ci->i_ceph_lock);
+ up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
if (tsession) {
mutex_unlock(&tsession->s_mutex);
@@ -4319,7 +4345,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
break;
list_del_init(&ci->i_cap_delay_list);
- inode = igrab(&ci->vfs_inode);
+ inode = igrab(&ci->netfs.inode);
if (inode) {
spin_unlock(&mdsc->cap_delay_lock);
dout("check_delayed_caps on %p\n", inode);
@@ -4347,10 +4373,11 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
while (!list_empty(&s->s_cap_dirty)) {
ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
i_dirty_item);
- inode = &ci->vfs_inode;
+ inode = &ci->netfs.inode;
ihold(inode);
dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
spin_unlock(&mdsc->cap_dirty_lock);
+ ceph_wait_on_async_create(inode);
ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
iput(inode);
spin_lock(&mdsc->cap_dirty_lock);
@@ -4381,7 +4408,7 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
int bits = (fmode << 1) | 1;
bool already_opened = false;
int i;
@@ -4415,7 +4442,7 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
*/
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
int bits = (fmode << 1) | 1;
bool is_closed = true;
int i;
@@ -4630,7 +4657,7 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali
lockdep_assert_held(&ci->i_ceph_lock);
dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->vfs_inode);
+ cap, ci, &ci->netfs.inode);
is_auth = (cap == ci->i_auth_cap);
__ceph_remove_cap(cap, false);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index eae417d71136..e7e2ebac330d 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -856,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_quota_is_max_files_exceeded(dir)) {
err = -EDQUOT;
goto out;
@@ -918,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_quota_is_max_files_exceeded(dir)) {
err = -EDQUOT;
goto out;
@@ -968,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
- int err = -EROFS;
+ int err;
int op;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* mkdir .snap/foo is a MKSNAP */
op = CEPH_MDS_OP_MKSNAP;
@@ -980,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
op = CEPH_MDS_OP_MKDIR;
} else {
+ err = -EROFS;
goto out;
}
@@ -1037,6 +1050,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
struct ceph_mds_request *req;
int err;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
@@ -1071,9 +1088,27 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct dentry *dentry = req->r_dentry;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
int result = req->r_err ? req->r_err :
le32_to_cpu(req->r_reply_info.head->result);
+ if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
+ pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
+ __func__, dentry, dentry);
+
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_del_rcu(&di->hnode);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
+ spin_lock(&dentry->d_lock);
+ di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
+ wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
+ spin_unlock(&dentry->d_lock);
+
+ synchronize_rcu();
+
if (result == -EJUKEBOX)
goto out;
@@ -1081,7 +1116,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
if (result) {
int pathlen = 0;
u64 base = 0;
- char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ char *path = ceph_mdsc_build_path(dentry, &pathlen,
&base, 0);
/* mark error on parent + clear complete */
@@ -1089,13 +1124,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
ceph_dir_clear_complete(req->r_parent);
/* drop the dentry -- we don't know its status */
- if (!d_unhashed(req->r_dentry))
- d_drop(req->r_dentry);
+ if (!d_unhashed(dentry))
+ d_drop(dentry);
/* mark inode itself for an error (since metadata is bogus) */
mapping_set_error(req->r_old_inode->i_mapping, result);
- pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
+ pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
}
@@ -1180,6 +1215,8 @@ retry:
if (try_async && op == CEPH_MDS_OP_UNLINK &&
(req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
dentry->d_name.len, dentry->d_name.name,
ceph_cap_string(req->r_dir_caps));
@@ -1187,6 +1224,16 @@ retry:
req->r_callback = ceph_async_unlink_cb;
req->r_old_inode = d_inode(dentry);
ihold(req->r_old_inode);
+
+ spin_lock(&dentry->d_lock);
+ di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
+ spin_unlock(&dentry->d_lock);
+
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
+ dentry->d_name.hash);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err) {
/*
@@ -1195,10 +1242,20 @@ retry:
*/
drop_nlink(inode);
d_delete(dentry);
- } else if (err == -EJUKEBOX) {
- try_async = false;
- ceph_mdsc_put_request(req);
- goto retry;
+ } else {
+ spin_lock(&fsc->async_unlink_conflict_lock);
+ hash_del_rcu(&di->hnode);
+ spin_unlock(&fsc->async_unlink_conflict_lock);
+
+ spin_lock(&dentry->d_lock);
+ di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
+ spin_unlock(&dentry->d_lock);
+
+ if (err == -EJUKEBOX) {
+ try_async = false;
+ ceph_mdsc_put_request(req);
+ goto retry;
+ }
}
} else {
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
@@ -1237,6 +1294,10 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
(!ceph_quota_is_same_realm(old_dir, new_dir)))
return -EXDEV;
+ err = ceph_wait_on_conflict_unlink(new_dentry);
+ if (err)
+ return err;
+
dout("rename dir %p dentry %p to dir %p dentry %p\n",
old_dir, old_dentry, new_dir, new_dentry);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6c9e837aa1d3..04fd34557de8 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -95,12 +95,11 @@ static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
size_t start;
int idx = 0;
- bytes = iov_iter_get_pages(iter, pages, maxsize - size,
+ bytes = iov_iter_get_pages2(iter, pages, maxsize - size,
ITER_GET_BVECS_PAGES, &start);
if (bytes < 0)
return size ?: bytes;
- iov_iter_advance(iter, bytes);
size += bytes;
for ( ; bytes; idx++, bvec_idx++) {
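iov_iter_get_pages2() differs from the old iov_iter_get_pages() in that it advances the iterator by the number of bytes it pinned, which is why the explicit iov_iter_advance() call is dropped above. Roughly, assuming the same arguments:

    /* New single-step form: pin pages and advance the iterator. */
    bytes = iov_iter_get_pages2(iter, pages, maxsize - size,
                                ITER_GET_BVECS_PAGES, &start);

    /* ...behaves like the old two-step sequence: */
    bytes = iov_iter_get_pages(iter, pages, maxsize - size,
                               ITER_GET_BVECS_PAGES, &start);
    if (bytes > 0)
        iov_iter_advance(iter, bytes);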
@@ -205,7 +204,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mount_options *opt =
- ceph_inode_to_client(&ci->vfs_inode)->mount_options;
+ ceph_inode_to_client(&ci->netfs.inode)->mount_options;
struct ceph_file_info *fi;
int ret;
@@ -241,8 +240,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
INIT_LIST_HEAD(&fi->rw_contexts);
fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
- if ((file->f_mode & FMODE_WRITE) &&
- ci->i_inline_version != CEPH_INLINE_NONE) {
+ if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) {
ret = ceph_uninline_data(file);
if (ret < 0)
goto error;
@@ -569,7 +567,7 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
&base, 0);
- pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
+ pr_warn("async create failure path=(%llx)%s result=%d!\n",
base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
@@ -612,6 +610,7 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
struct ceph_mds_reply_inode in = { };
struct ceph_mds_reply_info_in iinfo = { .in = &in };
struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
struct inode *inode;
struct timespec64 now;
struct ceph_string *pool_ns;
@@ -629,9 +628,15 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
iinfo.change_attr = 1;
ceph_encode_timespec64(&iinfo.btime, &now);
- iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
- iinfo.xattr_data = xattr_buf;
- memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+ if (req->r_pagelist) {
+ iinfo.xattr_len = req->r_pagelist->length;
+ iinfo.xattr_data = req->r_pagelist->mapped_tail;
+ } else {
+ /* fake it */
+ iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
+ iinfo.xattr_data = xattr_buf;
+ memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+ }
in.ino = cpu_to_le64(vino.ino);
in.snapid = cpu_to_le64(CEPH_NOSNAP);
@@ -651,10 +656,6 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
/* Directories always inherit the setgid bit. */
if (S_ISDIR(mode))
mode |= S_ISGID;
- else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
- !in_group_p(dir->i_gid) &&
- !capable_wrt_inode_uidgid(&init_user_ns, dir, CAP_FSETID))
- mode &= ~S_ISGID;
} else {
in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid()));
}
@@ -708,6 +709,12 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
file->f_mode |= FMODE_CREATED;
ret = finish_open(file, dentry, ceph_open);
}
+
+ spin_lock(&dentry->d_lock);
+ di->flags &= ~CEPH_DENTRY_ASYNC_CREATE;
+ wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT);
+ spin_unlock(&dentry->d_lock);
+
return ret;
}
@@ -734,6 +741,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (dentry->d_name.len > NAME_MAX)
return -ENAMETOOLONG;
+ err = ceph_wait_on_conflict_unlink(dentry);
+ if (err)
+ return err;
+ /*
+ * Do not truncate the file, since atomic_open is called before the
+ * permission check. The caller will do the truncation afterward.
+ */
+ flags &= ~O_TRUNC;
+
if (flags & O_CREAT) {
if (ceph_quota_is_max_files_exceeded(dir))
return -EDQUOT;
@@ -743,6 +759,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_security_init_secctx(dentry, mode, &as_ctx);
if (err < 0)
goto out_ctx;
+ /* Async create can't handle more than a page of xattrs */
+ if (as_ctx.pagelist &&
+ !list_is_singular(&as_ctx.pagelist->head))
+ try_async = false;
} else if (!d_in_lookup(dentry)) {
/* If it's not being looked up, it's negative */
return -ENOENT;
@@ -776,9 +796,16 @@ retry:
(req->r_dir_caps =
try_prep_async_create(dir, dentry, &lo,
&req->r_deleg_ino))) {
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL);
req->r_callback = ceph_async_create_cb;
+
+ spin_lock(&dentry->d_lock);
+ di->flags |= CEPH_DENTRY_ASYNC_CREATE;
+ spin_unlock(&dentry->d_lock);
+
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err) {
err = ceph_finish_async_create(dir, dentry,
@@ -797,9 +824,7 @@ retry:
}
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
- err = ceph_mdsc_do_request(mdsc,
- (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
- req);
+ err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req);
if (err == -ENOENT) {
dentry = ceph_handle_snapdir(req, dentry);
if (IS_ERR(dentry)) {
@@ -955,9 +980,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
false, false);
- ret = ceph_osdc_start_request(osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(osdc, req);
+ ceph_osdc_start_request(osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
ceph_update_read_metrics(&fsc->mdsc->metric,
req->r_start_latency,
@@ -1220,7 +1244,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
req->r_inode = inode;
req->r_priv = aio_req;
- ret = ceph_osdc_start_request(req->r_osdc, req, false);
+ ceph_osdc_start_request(req->r_osdc, req);
out:
if (ret < 0) {
req->r_result = ret;
@@ -1252,7 +1276,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
size_t count = iov_iter_count(iter);
loff_t pos = iocb->ki_pos;
bool write = iov_iter_rw(iter) == WRITE;
- bool should_dirty = !write && iter_is_iovec(iter);
+ bool should_dirty = !write && user_backed_iter(iter);
if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
@@ -1357,9 +1381,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
continue;
}
- ret = ceph_osdc_start_request(req->r_osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(req->r_osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
if (write)
ceph_update_write_metrics(metric, req->r_start_latency,
@@ -1422,8 +1445,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
r_private_item);
list_del_init(&req->r_private_item);
if (ret >= 0)
- ret = ceph_osdc_start_request(req->r_osdc,
- req, false);
+ ceph_osdc_start_request(req->r_osdc, req);
if (ret < 0) {
req->r_result = ret;
ceph_aio_complete_req(req);
@@ -1536,9 +1558,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
false, true);
req->r_mtime = mtime;
- ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, ret);
@@ -1622,7 +1643,7 @@ again:
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
ceph_cap_string(got));
- if (ci->i_inline_version == CEPH_INLINE_NONE) {
+ if (!ceph_has_inline_data(ci)) {
if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
ret = ceph_direct_read_write(iocb, to,
NULL, NULL);
@@ -1885,7 +1906,7 @@ retry_snap:
if (dirty)
__mark_inode_dirty(inode, dirty);
if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
@@ -1925,57 +1946,15 @@ out_unlocked:
*/
static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
{
- struct inode *inode = file->f_mapping->host;
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- loff_t i_size;
- loff_t ret;
-
- inode_lock(inode);
-
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
+ struct inode *inode = file_inode(file);
+ int ret;
+
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
if (ret < 0)
- goto out;
- }
-
- i_size = i_size_read(inode);
- switch (whence) {
- case SEEK_END:
- offset += i_size;
- break;
- case SEEK_CUR:
- /*
- * Here we special-case the lseek(fd, 0, SEEK_CUR)
- * position-querying operation. Avoid rewriting the "same"
- * f_pos value back to the file because a concurrent read(),
- * write() or lseek() might have altered it
- */
- if (offset == 0) {
- ret = file->f_pos;
- goto out;
- }
- offset += file->f_pos;
- break;
- case SEEK_DATA:
- if (offset < 0 || offset >= i_size) {
- ret = -ENXIO;
- goto out;
- }
- break;
- case SEEK_HOLE:
- if (offset < 0 || offset >= i_size) {
- ret = -ENXIO;
- goto out;
- }
- offset = i_size;
- break;
+ return ret;
}
-
- ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size));
-
-out:
- inode_unlock(inode);
- return ret;
+ return generic_file_llseek(file, offset, whence);
}
static inline void ceph_zero_partial_page(
@@ -2044,12 +2023,10 @@ static int ceph_zero_partial_object(struct inode *inode,
}
req->r_mtime = inode->i_mtime;
- ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!ret) {
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
- if (ret == -ENOENT)
- ret = 0;
- }
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ if (ret == -ENOENT)
+ ret = 0;
ceph_osdc_put_request(req);
out:
@@ -2351,7 +2328,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
if (IS_ERR(req))
ret = PTR_ERR(req);
else {
- ceph_osdc_start_request(osdc, req, false);
+ ceph_osdc_start_request(osdc, req);
ret = ceph_osdc_wait_request(osdc, req);
ceph_update_copyfrom_metrics(&fsc->mdsc->metric,
req->r_start_latency,
@@ -2544,7 +2521,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
/* Let the MDS know about dst file size change */
if (ceph_inode_set_size(dst_inode, dst_off) ||
ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
- ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH,
+ NULL);
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 63113e2a4890..42351d7a0dd6 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -176,7 +176,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
rb_insert_color(&frag->node, &ci->i_fragtree);
dout("get_or_create_frag added %llx.%llx frag %x\n",
- ceph_vinop(&ci->vfs_inode), f);
+ ceph_vinop(&ci->netfs.inode), f);
return frag;
}
@@ -457,10 +457,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
if (!ci)
return NULL;
- dout("alloc_inode %p\n", &ci->vfs_inode);
+ dout("alloc_inode %p\n", &ci->netfs.inode);
/* Set parameters for the netfs library */
- netfs_i_context_init(&ci->vfs_inode, &ceph_netfs_ops);
+ netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
spin_lock_init(&ci->i_ceph_lock);
@@ -547,7 +547,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_WORK(&ci->i_work, ceph_inode_work);
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
- return &ci->vfs_inode;
+ return &ci->netfs.inode;
}
void ceph_free_inode(struct inode *inode)
@@ -578,7 +578,7 @@ void ceph_evict_inode(struct inode *inode)
__ceph_remove_caps(ci);
- if (__ceph_has_any_quota(ci))
+ if (__ceph_has_quota(ci, QUOTA_GET_ANY))
ceph_adjust_quota_realms_count(inode, false);
/*
@@ -1049,7 +1049,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
iinfo->inline_version >= ci->i_inline_version) {
int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
ci->i_inline_version = iinfo->inline_version;
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ if (ceph_has_inline_data(ci) &&
(locked_page || (info_caps & cache_caps)))
fill_inline = true;
}
@@ -1466,10 +1466,12 @@ retry_lookup:
} else if (have_lease) {
if (d_unhashed(dn))
d_add(dn, NULL);
+ }
+
+ if (!d_unhashed(dn) && have_lease)
update_dentry_lease(dir, dn,
rinfo->dlease, session,
req->r_request_started);
- }
goto done;
}
@@ -1884,7 +1886,6 @@ static void ceph_do_invalidate_pages(struct inode *inode)
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
- ceph_fscache_invalidate(inode, false);
if (invalidate_inode_pages2(inode->i_mapping) < 0) {
pr_err("invalidate_inode_pages2 %llx.%llx failed\n",
ceph_vinop(inode));
@@ -1977,7 +1978,7 @@ static void ceph_inode_work(struct work_struct *work)
{
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
i_work);
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
dout("writeback %p\n", inode);
@@ -2258,6 +2259,36 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err;
}
+int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
+{
+ int issued = ceph_caps_issued(ceph_inode(inode));
+
+ /*
+ * If any 'x' caps are issued we can just choose the auth MDS
+ * instead of a random replica MDS, because only when the
+ * Locker is in the LOCK_EXEC state can the loner client
+ * get the 'x' caps. If we send a getattr request to
+ * any replica MDS it must auth pin and try to rdlock from
+ * the auth MDS, and the auth MDS then needs to do the Locker
+ * state transition to LOCK_SYNC; after that the lock state
+ * changes back.
+ *
+ * This Locker state transition costs a lot and usually
+ * requires revoking caps from clients.
+ *
+ * For the 'Xs' caps needed by getxattr we also choose the
+ * auth MDS, because the MDS-side code is buggy: setxattr
+ * won't notify the replica MDSes when the values change, so
+ * a replica MDS will return the old values. Though we will
+ * fix this in the MDS code, it still makes sense for old ceph.
+ */
+ if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
+ || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR)))
+ return USE_AUTH_MDS;
+ else
+ return USE_ANY_MDS;
+}
+
/*
* Verify that we have a lease on the given mask. If not,
* do a getattr against an mds.
@@ -2281,7 +2312,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
return 0;
- mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
+ mode = ceph_try_to_choose_auth_mds(inode, mask);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -2296,7 +2327,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
if (inline_version == 0) {
/* the reply is supposed to contain inline data */
err = -EINVAL;
- } else if (inline_version == CEPH_INLINE_NONE) {
+ } else if (inline_version == CEPH_INLINE_NONE ||
+ inline_version == 1) {
err = -ENODATA;
} else {
err = req->r_reply_info.targeti.inline_len;
@@ -2423,7 +2455,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
return -ESTALE;
/* Skip the getattr altogether if we're asked not to sync */
- if (!(flags & AT_STATX_DONT_SYNC)) {
+ if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
err = ceph_do_getattr(inode,
statx_to_caps(request_mask, inode->i_mode),
flags & AT_STATX_FORCE_SYNC);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fa38c013126d..80f8b9ec1a31 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -437,7 +437,7 @@ static int ceph_parse_deleg_inos(void **p, void *end,
ceph_decode_32_safe(p, end, sets, bad);
dout("got %u sets of delegated inodes\n", sets);
while (sets--) {
- u64 start, len, ino;
+ u64 start, len;
ceph_decode_64_safe(p, end, start, bad);
ceph_decode_64_safe(p, end, len, bad);
@@ -449,14 +449,14 @@ static int ceph_parse_deleg_inos(void **p, void *end,
continue;
}
while (len--) {
- int err = xa_insert(&s->s_delegated_inos, ino = start++,
+ int err = xa_insert(&s->s_delegated_inos, start++,
DELEGATED_INO_AVAILABLE,
GFP_KERNEL);
if (!err) {
dout("added delegated inode 0x%llx\n",
start - 1);
} else if (err == -EBUSY) {
- pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
+ pr_warn("MDS delegated inode 0x%llx more than once.\n",
start - 1);
} else {
return err;
@@ -655,6 +655,79 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
}
+/*
+ * In the async unlink case the kclient won't wait for the first reply
+ * from the MDS; it just drops all the links, unhashes the dentry and
+ * then succeeds immediately.
+ *
+ * For any new create/link/rename, etc. requests that reuse the same
+ * file name we must wait for the first reply of the inflight unlink
+ * request, or the MDS may fail these subsequent requests with -EEXIST
+ * if the inflight async unlink request was delayed for some reason.
+ *
+ * The worst case is that for a non-async openc request the MDS will
+ * successfully open the file if the CDentry hasn't been unlinked yet,
+ * but later the previously delayed async unlink request will remove
+ * the CDentry. That means the just-created file may be deleted later
+ * by accident.
+ *
+ * We need to wait for the inflight async unlink requests to finish
+ * when creating new files/directories that reuse the same file names.
+ */
+int ceph_wait_on_conflict_unlink(struct dentry *dentry)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct dentry *pdentry = dentry->d_parent;
+ struct dentry *udentry, *found = NULL;
+ struct ceph_dentry_info *di;
+ struct qstr dname;
+ u32 hash = dentry->d_name.hash;
+ int err;
+
+ dname.name = dentry->d_name.name;
+ dname.len = dentry->d_name.len;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,
+ hnode, hash) {
+ udentry = di->dentry;
+
+ spin_lock(&udentry->d_lock);
+ if (udentry->d_name.hash != hash)
+ goto next;
+ if (unlikely(udentry->d_parent != pdentry))
+ goto next;
+ if (!hash_hashed(&di->hnode))
+ goto next;
+
+ if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
+ pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
+ __func__, dentry, dentry);
+
+ if (!d_same_name(udentry, pdentry, &dname))
+ goto next;
+
+ spin_unlock(&udentry->d_lock);
+ found = dget(udentry);
+ break;
+next:
+ spin_unlock(&udentry->d_lock);
+ }
+ rcu_read_unlock();
+
+ if (likely(!found))
+ return 0;
+
+ dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__,
+ dentry, dentry, found, found);
+
+ err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
+ TASK_KILLABLE);
+ dput(found);
+ return err;
+}
+
/*
* sessions
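ceph_wait_on_conflict_unlink() above is the read side of a small RCU hash table keyed on the dentry name hash; the write side brackets each async unlink, as the fs/ceph/dir.c hunks earlier in this diff show. A condensed sketch of the lifecycle (fields as introduced by this series):

    /* Submitting an async unlink: publish the dentry so that a later
     * create/link/rename on the same name can find and wait on it. */
    spin_lock(&dentry->d_lock);
    di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
    spin_unlock(&dentry->d_lock);

    spin_lock(&fsc->async_unlink_conflict_lock);
    hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
                 dentry->d_name.hash);
    spin_unlock(&fsc->async_unlink_conflict_lock);

    /* Reply callback: unpublish, clear the flag and wake any waiters
     * blocked in ceph_wait_on_conflict_unlink(). */
    spin_lock(&fsc->async_unlink_conflict_lock);
    hash_del_rcu(&di->hnode);
    spin_unlock(&fsc->async_unlink_conflict_lock);

    spin_lock(&dentry->d_lock);
    di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
    wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
    spin_unlock(&dentry->d_lock);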
@@ -1220,14 +1293,17 @@ static int encode_supported_features(void **p, void *end)
if (count > 0) {
size_t i;
size_t size = FEATURE_BYTES(count);
+ unsigned long bit;
if (WARN_ON_ONCE(*p + 4 + size > end))
return -ERANGE;
ceph_encode_32(p, size);
memset(*p, 0, size);
- for (i = 0; i < count; i++)
- ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8);
+ for (i = 0; i < count; i++) {
+ bit = feature_bits[i];
+ ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8);
+ }
*p += size;
} else {
if (WARN_ON_ONCE(*p + 4 > end))
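The encode_supported_features() fix indexes the output buffer by the feature bit's value rather than by the loop counter, so sparse feature numbers land in the right byte. A standalone sketch of the packing (the helper name is illustrative, not from the tree):

    /* Pack arbitrary bit numbers into a bitmap: bit N goes into
     * byte N / 8 at position N % 8. */
    static void pack_feature_bits(unsigned char *buf,
                                  const unsigned long *bits, size_t count)
    {
        size_t i;

        for (i = 0; i < count; i++)
            buf[bits[i] / 8] |= 1U << (bits[i] % 8);
    }

    /* e.g. bits = { 1, 11 } sets buf[0] bit 1 and buf[1] bit 3; the
     * old code chose the byte with the loop index i / 8, corrupting
     * the map whenever i differed from the bit number. */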
@@ -1564,7 +1640,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
p = session->s_caps.next;
while (p != &session->s_caps) {
cap = list_entry(p, struct ceph_cap, session_caps);
- inode = igrab(&cap->ci->vfs_inode);
+ inode = igrab(&cap->ci->netfs.inode);
if (!inode) {
p = p->next;
continue;
@@ -1622,7 +1698,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
int iputs;
dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->vfs_inode);
+ cap, ci, &ci->netfs.inode);
spin_lock(&ci->i_ceph_lock);
iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
spin_unlock(&ci->i_ceph_lock);
@@ -2651,7 +2727,28 @@ static int __prepare_send_request(struct ceph_mds_session *session,
struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_mds_request_head_old *rhead;
struct ceph_msg *msg;
- int flags = 0;
+ int flags = 0, max_retry;
+
+ /*
+ * The type of 'r_attempts' in kernel 'ceph_mds_request'
+ * is 'int', while in 'ceph_mds_request_head' the type of
+ * 'num_retry' is '__u8'. So if a request is retried more
+ * than 256 times, the MDS will receive an incorrect
+ * retry seq.
+ *
+ * In this case it's usually a bug in the MDS, and continuing
+ * to retry the request makes no sense.
+ *
+ * In the future this could be fixed in the ceph code, so avoid
+ * hardcoding the limit here.
+ */
+ max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
+ max_retry = 1 << (max_retry * BITS_PER_BYTE);
+ if (req->r_attempts >= max_retry) {
+ pr_warn_ratelimited("%s request tid %llu seq overflow\n",
+ __func__, req->r_tid);
+ return -EMULTIHOP;
+ }
req->r_attempts++;
if (req->r_inode) {
@@ -2663,7 +2760,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
else
req->r_sent_on_mseq = -1;
}
- dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
+ dout("%s %p tid %lld %s (attempt %d)\n", __func__, req,
req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
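The retry cap above is derived from the width of the on-wire num_retry field instead of hardcoding 256, so it automatically tracks any future widening of the field. The arithmetic, restated as a hypothetical helper (FIELD_MAX_COUNT is not in the tree):

    /* Hypothetical helper: how many distinct values an integer field
     * can carry, from its byte width. For a __u8 num_retry this is
     * 1 << (1 * 8) == 256. */
    #define FIELD_MAX_COUNT(type, member) \
        (1ULL << (sizeof_field(type, member) * BITS_PER_BYTE))

    if (req->r_attempts >= FIELD_MAX_COUNT(struct ceph_mds_request_head,
                                           num_retry))
        return -EMULTIHOP; /* the seq would wrap on the wire */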
@@ -2863,6 +2960,64 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (req->r_request_started == 0) /* note request start time */
req->r_request_started = jiffies;
+ /*
+ * For async create we will choose the auth MDS of the frag in the
+ * parent directory to send the request, and usually this works fine,
+ * but if the directory is migrated to another MDS before it could
+ * handle the request, the request will be forwarded.
+ *
+ * And then the auth cap will be changed.
+ */
+ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) {
+ struct ceph_dentry_info *di = ceph_dentry(req->r_dentry);
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+
+ /*
+ * The request may be handled very quickly while the new inode
+ * hasn't been linked to the dentry yet. We need to wait
+ * for ceph_finish_async_create(), which in theory shouldn't
+ * get stuck for long or fail, to finish before forwarding
+ * the request.
+ */
+ if (!d_inode(req->r_dentry)) {
+ err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT,
+ TASK_KILLABLE);
+ if (err) {
+ mutex_lock(&req->r_fill_mutex);
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+ goto out_session;
+ }
+ }
+
+ ci = ceph_inode(d_inode(req->r_dentry));
+
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) {
+ dout("do_request session changed for auth cap %d -> %d\n",
+ cap->session->s_mds, session->s_mds);
+
+ /* Remove the auth cap from old session */
+ spin_lock(&cap->session->s_cap_lock);
+ cap->session->s_nr_caps--;
+ list_del_init(&cap->session_caps);
+ spin_unlock(&cap->session->s_cap_lock);
+
+ /* Add the auth cap to the new session */
+ cap->mds = mds;
+ cap->session = session;
+ spin_lock(&session->s_cap_lock);
+ session->s_nr_caps++;
+ list_add_tail(&cap->session_caps, &session->s_caps);
+ spin_unlock(&session->s_cap_lock);
+
+ change_auth_cap_ses(ci, session);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ }
+
err = __send_request(session, req, false);
out_session:
@@ -3265,6 +3420,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
int err = -EINVAL;
void *p = msg->front.iov_base;
void *end = p + msg->front.iov_len;
+ bool aborted = false;
ceph_decode_need(&p, end, 2*sizeof(u32), bad);
next_mds = ceph_decode_32(&p);
@@ -3273,16 +3429,41 @@ static void handle_forward(struct ceph_mds_client *mdsc,
mutex_lock(&mdsc->mutex);
req = lookup_get_request(mdsc, tid);
if (!req) {
+ mutex_unlock(&mdsc->mutex);
dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
- goto out; /* dup reply? */
+ return; /* dup reply? */
}
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
dout("forward tid %llu aborted, unregistering\n", tid);
__unregister_request(mdsc, req);
} else if (fwd_seq <= req->r_num_fwd) {
- dout("forward tid %llu to mds%d - old seq %d <= %d\n",
- tid, next_mds, req->r_num_fwd, fwd_seq);
+ /*
+ * The type of 'num_fwd' in ceph 'MClientRequestForward'
+ * is 'int32_t', while in 'ceph_mds_request_head' the
+ * type is '__u8'. So if the request bounces between
+ * MDSes more than 256 times, the client will get stuck.
+ *
+ * In this case it's usually a bug in the MDS, and continuing
+ * to bounce the request makes no sense.
+ *
+ * In the future this could be fixed in the ceph code, so avoid
+ * hardcoding the limit here.
+ */
+ int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
+ max = 1 << (max * BITS_PER_BYTE);
+ if (req->r_num_fwd >= max) {
+ mutex_lock(&req->r_fill_mutex);
+ req->r_err = -EMULTIHOP;
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+ aborted = true;
+ pr_warn_ratelimited("forward tid %llu seq overflow\n",
+ tid);
+ } else {
+ dout("forward tid %llu to mds%d - old seq %d <= %d\n",
+ tid, next_mds, req->r_num_fwd, fwd_seq);
+ }
} else {
/* resend. forward race not possible; mds would drop */
dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
@@ -3294,9 +3475,12 @@ static void handle_forward(struct ceph_mds_client *mdsc,
put_request_session(req);
__do_request(mdsc, req);
}
- ceph_mdsc_put_request(req);
-out:
mutex_unlock(&mdsc->mutex);
+
+ /* kick calling process */
+ if (aborted)
+ complete_request(mdsc, req);
+ ceph_mdsc_put_request(req);
return;
bad:
@@ -3375,13 +3559,17 @@ static void handle_session(struct ceph_mds_session *session,
}
if (msg_version >= 5) {
- u32 flags;
- /* version >= 4, struct_v, struct_cv, len, metric_spec */
- ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 2, bad);
+ u32 flags, len;
+
+ /* version >= 4 */
+ ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */
+ ceph_decode_32_safe(&p, end, len, bad); /* len */
+ ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */
+
/* version >= 5, flags */
- ceph_decode_32_safe(&p, end, flags, bad);
+ ceph_decode_32_safe(&p, end, flags, bad);
if (flags & CEPH_SESSION_BLOCKLISTED) {
- pr_warn("mds%d session blocklisted\n", session->s_mds);
+ pr_warn("mds%d session blocklisted\n", session->s_mds);
blocklisted = true;
}
}
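The session-message fix stops assuming metric_spec is empty: instead of skipping a fixed byte count it reads the length prefix and skips exactly that many bytes, so a non-empty metric_spec no longer desynchronizes the decoder. The pattern, reduced from the hunk:

    u32 len;

    ceph_decode_skip_16(&p, end, bad);      /* struct_v, struct_cv */
    ceph_decode_32_safe(&p, end, len, bad); /* metric_spec length */
    ceph_decode_skip_n(&p, end, len, bad);  /* metric_spec payload */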
@@ -3410,11 +3598,26 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_OPEN:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
pr_info("mds%d reconnect success\n", session->s_mds);
- session->s_state = CEPH_MDS_SESSION_OPEN;
- session->s_features = features;
- renewed_caps(mdsc, session, 0);
- if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features))
- metric_schedule_delayed(&mdsc->metric);
+
+ if (session->s_state == CEPH_MDS_SESSION_OPEN) {
+ pr_notice("mds%d is already opened\n", session->s_mds);
+ } else {
+ session->s_state = CEPH_MDS_SESSION_OPEN;
+ session->s_features = features;
+ renewed_caps(mdsc, session, 0);
+ if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
+ &session->s_features))
+ metric_schedule_delayed(&mdsc->metric);
+ }
+
+ /*
+ * The connection may be broken and the session on the
+ * client side may have been reinitialized, so update
+ * the seq anyway.
+ */
+ if (!session->s_seq && seq)
+ session->s_seq = seq;
+
wake = 1;
if (mdsc->stopping)
__close_session(mdsc, session);
@@ -4396,12 +4599,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
memcpy((void *)(lease + 1) + 4,
dentry->d_name.name, dentry->d_name.len);
spin_unlock(&dentry->d_lock);
- /*
- * if this is a preemptive lease RELEASE, no need to
- * flush request stream, since the actual request will
- * soon follow.
- */
- msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
ceph_con_send(&session->s_con, msg);
}
@@ -4434,8 +4631,6 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
bool check_session_state(struct ceph_mds_session *s)
{
- struct ceph_fs_client *fsc = s->s_mdsc->fsc;
-
switch (s->s_state) {
case CEPH_MDS_SESSION_OPEN:
if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
@@ -4444,10 +4639,6 @@ bool check_session_state(struct ceph_mds_session *s)
}
break;
case CEPH_MDS_SESSION_CLOSING:
- /* Should never reach this when not force unmounting */
- WARN_ON_ONCE(s->s_ttl &&
- READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);
- fallthrough;
case CEPH_MDS_SESSION_NEW:
case CEPH_MDS_SESSION_RESTARTING:
case CEPH_MDS_SESSION_CLOSED:
@@ -4702,15 +4893,17 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
}
/*
- * wait for all write mds requests to flush.
+ * flush the mdlog and wait for all write mds requests to flush.
*/
-static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
+static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
+ u64 want_tid)
{
struct ceph_mds_request *req = NULL, *nextreq;
+ struct ceph_mds_session *last_session = NULL;
struct rb_node *n;
mutex_lock(&mdsc->mutex);
- dout("wait_unsafe_requests want %lld\n", want_tid);
+ dout("%s want %lld\n", __func__, want_tid);
restart:
req = __get_oldest_req(mdsc);
while (req && req->r_tid <= want_tid) {
@@ -4722,14 +4915,32 @@ restart:
nextreq = NULL;
if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
(req->r_op & CEPH_MDS_OP_WRITE)) {
+ struct ceph_mds_session *s = req->r_session;
+
+ if (!s) {
+ req = nextreq;
+ continue;
+ }
+
/* write op */
ceph_mdsc_get_request(req);
if (nextreq)
ceph_mdsc_get_request(nextreq);
+ s = ceph_get_mds_session(s);
mutex_unlock(&mdsc->mutex);
- dout("wait_unsafe_requests wait on %llu (want %llu)\n",
+
+ /* send flush mdlog request to MDS */
+ if (last_session != s) {
+ send_flush_mdlog(s);
+ ceph_put_mds_session(last_session);
+ last_session = s;
+ } else {
+ ceph_put_mds_session(s);
+ }
+ dout("%s wait on %llu (want %llu)\n", __func__,
req->r_tid, want_tid);
wait_for_completion(&req->r_safe_completion);
+
mutex_lock(&mdsc->mutex);
ceph_mdsc_put_request(req);
if (!nextreq)
@@ -4744,7 +4955,8 @@ restart:
req = nextreq;
}
mutex_unlock(&mdsc->mutex);
- dout("wait_unsafe_requests done\n");
+ ceph_put_mds_session(last_session);
+ dout("%s done\n", __func__);
}
void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
@@ -4773,7 +4985,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
dout("sync want tid %lld flush_seq %lld\n",
want_tid, want_flush);
- wait_unsafe_requests(mdsc, want_tid);
+ flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
wait_caps_flush(mdsc, want_flush);
}
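The renamed helper above carries one behavioural change worth noting: while walking the unsafe write requests in tid order, it sends a flush-mdlog to each request's session, but only when that session differs from the previous one, so a run of requests on the same MDS triggers a single flush. A standalone sketch of that dedup, with session ids standing in for ceph_mds_session pointers:

#include <stdio.h>

static void send_flush_mdlog(int session)
{
	printf("flush mdlog to mds%d\n", session);
}

int main(void)
{
	int sessions[] = { 0, 0, 1, 1, 1, 2 };	/* per-request sessions */
	int last_session = -1;			/* nothing flushed yet */
	unsigned i;

	for (i = 0; i < sizeof(sessions) / sizeof(sessions[0]); i++) {
		if (sessions[i] != last_session) {
			/* new MDS encountered: flush its mdlog once */
			send_flush_mdlog(sessions[i]);
			last_session = sessions[i];
		}
		/* ... then wait on r_safe_completion for request i ... */
	}
	return 0;
}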
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 33497846e47e..256e3eada6c1 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -29,14 +29,12 @@ enum ceph_feature_type {
CEPHFS_FEATURE_MULTI_RECONNECT,
CEPHFS_FEATURE_DELEG_INO,
CEPHFS_FEATURE_METRIC_COLLECT,
+ CEPHFS_FEATURE_ALTERNATE_NAME,
+ CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
};
-/*
- * This will always have the highest feature bit value
- * as the last element of the array.
- */
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
0, 1, 2, 3, 4, 5, 6, 7, \
CEPHFS_FEATURE_MIMIC, \
@@ -45,10 +43,8 @@ enum ceph_feature_type {
CEPHFS_FEATURE_MULTI_RECONNECT, \
CEPHFS_FEATURE_DELEG_INO, \
CEPHFS_FEATURE_METRIC_COLLECT, \
- \
- CEPHFS_FEATURE_MAX, \
+ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
}
-#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
/*
* Some lock dependencies:
@@ -579,9 +575,10 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
struct ceph_inode_info *ci = ceph_inode(inode);
return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
- TASK_INTERRUPTIBLE);
+ TASK_KILLABLE);
}
+extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 30387733765d..8d0a6d2c2da4 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -352,12 +352,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
__decode_and_drop_type(p, end, u8, bad_ext);
}
if (mdsmap_ev >= 8) {
- u32 name_len;
/* enabled */
ceph_decode_8_safe(p, end, m->m_enabled, bad_ext);
- ceph_decode_32_safe(p, end, name_len, bad_ext);
- ceph_decode_need(p, end, name_len, bad_ext);
- *p += name_len;
+ /* fs_name */
+ ceph_decode_skip_string(p, end, bad_ext);
}
/* damaged */
if (mdsmap_ev >= 9) {
@@ -370,6 +368,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
} else {
m->m_damaged = false;
}
+ if (mdsmap_ev >= 17) {
+ /* balancer */
+ ceph_decode_skip_string(p, end, bad_ext);
+ /* standby_count_wanted */
+ ceph_decode_skip_32(p, end, bad_ext);
+ /* old_max_mds */
+ ceph_decode_skip_32(p, end, bad_ext);
+ /* min_compat_client */
+ ceph_decode_skip_8(p, end, bad_ext);
+ /* required_client_features */
+ ceph_decode_skip_set(p, end, 64, bad_ext);
+ ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext);
+ } else {
+ /* This forces the usage of the (sync) SETXATTR Op */
+ m->m_max_xattr_size = 0;
+ }
bad_ext:
dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
!!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
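The mdsmap hunk above follows the usual versioned-decode convention: fields appended in newer encodings are only decoded when mdsmap_ev is high enough, and older servers get a conservative default (m_max_xattr_size = 0, which forces the synchronous SETXATTR path). A small sketch of that gating, with simplified stand-in names:

#include <stdio.h>
#include <stdint.h>

static uint64_t decode_max_xattr_size(int map_version, uint64_t wire_value)
{
	if (map_version >= 17)
		return wire_value;	/* field present in the encoding */
	return 0;			/* old server: force sync SETXATTR */
}

int main(void)
{
	printf("v17 server: %llu\n",
	       (unsigned long long)decode_max_xattr_size(17, 65536));
	printf("v16 server: %llu\n",
	       (unsigned long long)decode_max_xattr_size(16, 65536));
	return 0;
}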
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index a338a3ec0dc4..64592adfe48f 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -195,9 +195,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
/*
* This function walks through the snaprealm for an inode and returns the
- * ceph_snap_realm for the first snaprealm that has quotas set (either max_files
- * or max_bytes). If the root is reached, return the root ceph_snap_realm
- * instead.
+ * ceph_snap_realm for the first snaprealm that has quotas set (max_files,
+ * max_bytes, or any, depending on the 'which_quota' argument). If the root is
+ * reached, return the root ceph_snap_realm instead.
*
* Note that the caller is responsible for calling ceph_put_snap_realm() on the
* returned realm.
@@ -209,7 +209,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
* will be restarted.
*/
static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
- struct inode *inode, bool retry)
+ struct inode *inode,
+ enum quota_get_realm which_quota,
+ bool retry)
{
struct ceph_inode_info *ci = NULL;
struct ceph_snap_realm *realm, *next;
@@ -248,7 +250,7 @@ restart:
}
ci = ceph_inode(in);
- has_quota = __ceph_has_any_quota(ci);
+ has_quota = __ceph_has_quota(ci, which_quota);
iput(in);
next = realm->parent;
@@ -279,8 +281,8 @@ restart:
* dropped and we can then restart the whole operation.
*/
down_read(&mdsc->snap_rwsem);
- old_realm = get_quota_realm(mdsc, old, true);
- new_realm = get_quota_realm(mdsc, new, false);
+ old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true);
+ new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false);
if (PTR_ERR(new_realm) == -EAGAIN) {
up_read(&mdsc->snap_rwsem);
if (old_realm)
@@ -483,7 +485,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
bool is_updated = false;
down_read(&mdsc->snap_rwsem);
- realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true);
+ realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root),
+ QUOTA_GET_MAX_BYTES, true);
up_read(&mdsc->snap_rwsem);
if (!realm)
return false;
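The quota.c changes above thread a selector through get_quota_realm() so callers can ask for a realm with max_files, max_bytes, or either quota set. A userspace sketch of the selector logic, with a simplified stand-in for ceph_inode_info:

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

enum quota_get_realm { QUOTA_GET_MAX_FILES, QUOTA_GET_MAX_BYTES, QUOTA_GET_ANY };

struct inode_sketch { uint64_t max_bytes, max_files; };

static bool has_quota(const struct inode_sketch *ci, enum quota_get_realm which)
{
	switch (which) {
	case QUOTA_GET_MAX_BYTES:
		return ci->max_bytes != 0;
	case QUOTA_GET_MAX_FILES:
		return ci->max_files != 0;
	default:
		return ci->max_bytes || ci->max_files;
	}
}

int main(void)
{
	struct inode_sketch ci = { .max_bytes = 1 << 20 };	/* bytes only */

	printf("bytes=%d files=%d any=%d\n",
	       has_quota(&ci, QUOTA_GET_MAX_BYTES),
	       has_quota(&ci, QUOTA_GET_MAX_FILES),
	       has_quota(&ci, QUOTA_GET_ANY));
	return 0;
}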
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 322ee5add942..864cdaa0d2bd 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -521,7 +521,7 @@ static bool has_new_snaps(struct ceph_snap_context *o,
static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap **pcapsnap)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_snap_context *old_snapc, *new_snapc;
struct ceph_cap_snap *capsnap = *pcapsnap;
struct ceph_buffer *old_blob = NULL;
@@ -652,7 +652,7 @@ update_snapc:
int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
BUG_ON(capsnap->writing);
@@ -712,7 +712,7 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
spin_lock(&realm->inodes_with_caps_lock);
list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
- struct inode *inode = igrab(&ci->vfs_inode);
+ struct inode *inode = igrab(&ci->netfs.inode);
if (!inode)
continue;
spin_unlock(&realm->inodes_with_caps_lock);
@@ -904,7 +904,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
while (!list_empty(&mdsc->snap_flush_list)) {
ci = list_first_entry(&mdsc->snap_flush_list,
struct ceph_inode_info, i_snap_flush_item);
- inode = &ci->vfs_inode;
+ inode = &ci->netfs.inode;
ihold(inode);
spin_unlock(&mdsc->snap_flush_lock);
ceph_flush_snaps(ci, &session);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index e6987d295079..3fc48b43cab0 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -72,15 +72,9 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
/*
- * express utilization in terms of large blocks to avoid
+ * Express utilization in terms of large blocks to avoid
* overflow on 32-bit machines.
- *
- * NOTE: for the time being, we make bsize == frsize to humor
- * not-yet-ancient versions of glibc that are broken.
- * Someday, we will probably want to report a real block
- * size... whatever that may mean for a network file system!
*/
- buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
/*
@@ -95,6 +89,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
}
+ /*
+ * NOTE: for the time being, we make bsize == frsize to humor
+ * not-yet-ancient versions of glibc that are broken.
+ * Someday, we will probably want to report a real block
+ * size... whatever that may mean for a network file system!
+ */
+ buf->f_bsize = buf->f_frsize;
+
buf->f_files = le64_to_cpu(st.num_objects);
buf->f_ffree = -1;
buf->f_namelen = NAME_MAX;
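A quick arithmetic sketch of the statfs scaling above: the MDS reports sizes in KiB, and the client exposes them in CEPH_BLOCK-sized fragments (CEPH_BLOCK_SHIFT is assumed here to be 22, i.e. 4 MiB, as in the ceph tree), so a KiB count is shifted right by (CEPH_BLOCK_SHIFT - 10). The patch also moves the bsize assignment after frsize so bsize simply mirrors it for old glibc's benefit.

#include <stdio.h>
#include <stdint.h>

#define CEPH_BLOCK_SHIFT 22	/* assumed: 4 MiB blocks */

int main(void)
{
	uint64_t kb_total = 8ULL << 30;			/* 8 TiB, in KiB */
	uint64_t f_frsize = 1ULL << CEPH_BLOCK_SHIFT;
	uint64_t f_blocks = kb_total >> (CEPH_BLOCK_SHIFT - 10);
	uint64_t f_bsize  = f_frsize;			/* humor old glibc */

	printf("frsize=%llu bsize=%llu blocks=%llu\n",
	       (unsigned long long)f_frsize,
	       (unsigned long long)f_bsize,
	       (unsigned long long)f_blocks);
	return 0;
}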
@@ -816,6 +818,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
if (!fsc->cap_wq)
goto fail_inode_wq;
+ hash_init(fsc->async_unlink_conflict);
+ spin_lock_init(&fsc->async_unlink_conflict_lock);
+
spin_lock(&ceph_fsc_lock);
list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
spin_unlock(&ceph_fsc_lock);
@@ -876,7 +881,7 @@ mempool_t *ceph_wb_pagevec_pool;
static void ceph_inode_init_once(void *foo)
{
struct ceph_inode_info *ci = foo;
- inode_init_once(&ci->vfs_inode);
+ inode_init_once(&ci->netfs.inode);
}
static int __init init_caches(void)
@@ -1119,6 +1124,7 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc)
s->s_time_gran = 1;
s->s_time_min = 0;
s->s_time_max = U32_MAX;
+ s->s_flags |= SB_NODIRATIME | SB_NOATIME;
ret = set_anon_super_fc(s, fc);
if (ret != 0)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 20ceab74e871..40630e6f691c 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -19,6 +19,7 @@
#include <linux/security.h>
#include <linux/netfs.h>
#include <linux/fscache.h>
+#include <linux/hashtable.h>
#include <linux/ceph/libceph.h>
@@ -99,6 +100,8 @@ struct ceph_mount_options {
char *mon_addr;
};
+#define CEPH_ASYNC_CREATE_CONFLICT_BITS 8
+
struct ceph_fs_client {
struct super_block *sb;
@@ -124,6 +127,9 @@ struct ceph_fs_client {
struct workqueue_struct *inode_wq;
struct workqueue_struct *cap_wq;
+ DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS);
+ spinlock_t async_unlink_conflict_lock;
+
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dentry_lru, *debugfs_caps;
struct dentry *debugfs_congestion_kb;
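The two fields added above introduce a fixed-size conflict table: dentries with an async unlink in flight are hashed into 2^CEPH_ASYNC_CREATE_CONFLICT_BITS = 256 buckets so that a later create on the same name can find and wait on the conflicting unlink. The kernel uses <linux/hashtable.h> under the spinlock; the userspace stand-in below uses a trivial chained table (and a different hash) just to show the shape of insert and lookup.

#include <stdio.h>
#include <string.h>

#define CONFLICT_BITS	8
#define NBUCKETS	(1u << CONFLICT_BITS)

struct conflict {
	const char *name;
	struct conflict *next;
};

static struct conflict *table[NBUCKETS];

static unsigned bucket(const char *name)
{
	unsigned h = 5381;	/* djb2; the kernel hashes differently */

	while (*name)
		h = h * 33 + (unsigned char)*name++;
	return h & (NBUCKETS - 1);
}

static void add_conflict(struct conflict *c)
{
	unsigned b = bucket(c->name);

	c->next = table[b];	/* kernel: hash_add() under the spinlock */
	table[b] = c;
}

static int has_conflict(const char *name)
{
	struct conflict *c;

	for (c = table[bucket(name)]; c; c = c->next)
		if (!strcmp(c->name, name))
			return 1;
	return 0;
}

int main(void)
{
	struct conflict unlink_in_flight = { .name = "foo.tmp" };

	add_conflict(&unlink_in_flight);
	printf("conflict on foo.tmp: %d\n", has_conflict("foo.tmp"));
	return 0;
}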
@@ -280,7 +286,8 @@ struct ceph_dentry_info {
struct dentry *dentry;
struct ceph_mds_session *lease_session;
struct list_head lease_list;
- unsigned flags;
+ struct hlist_node hnode;
+ unsigned long flags;
int lease_shared_gen;
u32 lease_gen;
u32 lease_seq;
@@ -289,10 +296,14 @@ struct ceph_dentry_info {
u64 offset;
};
-#define CEPH_DENTRY_REFERENCED 1
-#define CEPH_DENTRY_LEASE_LIST 2
-#define CEPH_DENTRY_SHRINK_LIST 4
-#define CEPH_DENTRY_PRIMARY_LINK 8
+#define CEPH_DENTRY_REFERENCED (1 << 0)
+#define CEPH_DENTRY_LEASE_LIST (1 << 1)
+#define CEPH_DENTRY_SHRINK_LIST (1 << 2)
+#define CEPH_DENTRY_PRIMARY_LINK (1 << 3)
+#define CEPH_DENTRY_ASYNC_UNLINK_BIT (4)
+#define CEPH_DENTRY_ASYNC_UNLINK (1 << CEPH_DENTRY_ASYNC_UNLINK_BIT)
+#define CEPH_DENTRY_ASYNC_CREATE_BIT (5)
+#define CEPH_DENTRY_ASYNC_CREATE (1 << CEPH_DENTRY_ASYNC_CREATE_BIT)
struct ceph_inode_xattrs_info {
/*
@@ -316,11 +327,7 @@ struct ceph_inode_xattrs_info {
* Ceph inode.
*/
struct ceph_inode_info {
- struct {
- /* These must be contiguous */
- struct inode vfs_inode;
- struct netfs_i_context netfs_ctx; /* Netfslib context */
- };
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct ceph_vino i_vino; /* ceph ino + snap */
spinlock_t i_ceph_lock;
@@ -436,7 +443,7 @@ struct ceph_inode_info {
static inline struct ceph_inode_info *
ceph_inode(const struct inode *inode)
{
- return container_of(inode, struct ceph_inode_info, vfs_inode);
+ return container_of(inode, struct ceph_inode_info, netfs.inode);
}
static inline struct ceph_fs_client *
@@ -762,6 +769,8 @@ extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
extern void ceph_reservation_status(struct ceph_fs_client *client,
int *total, int *avail, int *used,
int *reserved, int *min);
+extern void change_auth_cap_ses(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session);
@@ -1022,6 +1031,7 @@ static inline void ceph_queue_flush_snaps(struct inode *inode)
ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS);
}
+extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask);
extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
int mask, bool force);
static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
@@ -1221,6 +1231,14 @@ extern int ceph_pool_perm_check(struct inode *inode, int need);
extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate);
+static inline bool ceph_has_inline_data(struct ceph_inode_info *ci)
+{
+ if (ci->i_inline_version == CEPH_INLINE_NONE ||
+ ci->i_inline_version == 1) /* initial version, no data */
+ return false;
+ return true;
+}
+
/* file.c */
extern const struct file_operations ceph_file_fops;
@@ -1278,9 +1296,29 @@ extern void ceph_fs_debugfs_init(struct ceph_fs_client *client);
extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
/* quota.c */
-static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci)
+
+enum quota_get_realm {
+ QUOTA_GET_MAX_FILES,
+ QUOTA_GET_MAX_BYTES,
+ QUOTA_GET_ANY
+};
+
+static inline bool __ceph_has_quota(struct ceph_inode_info *ci,
+ enum quota_get_realm which)
{
- return ci->i_max_files || ci->i_max_bytes;
+ bool has_quota = false;
+
+ switch (which) {
+ case QUOTA_GET_MAX_BYTES:
+ has_quota = !!ci->i_max_bytes;
+ break;
+ case QUOTA_GET_MAX_FILES:
+ has_quota = !!ci->i_max_files;
+ break;
+ default:
+ has_quota = !!(ci->i_max_files || ci->i_max_bytes);
+ }
+ return has_quota;
}
extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc);
@@ -1289,13 +1327,13 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci,
u64 max_bytes, u64 max_files)
{
bool had_quota, has_quota;
- had_quota = __ceph_has_any_quota(ci);
+ had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
ci->i_max_bytes = max_bytes;
ci->i_max_files = max_files;
- has_quota = __ceph_has_any_quota(ci);
+ has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
if (had_quota != has_quota)
- ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota);
+ ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota);
}
extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
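One detail from the dentry-flag rework earlier in this header: flags was widened to unsigned long and the new async flags gained *_BIT companions because the atomic bitops (test_bit(), set_bit() and friends) take a bit number and an unsigned long *, while plain mask tests keep the shifted form. A small sketch of the two forms side by side, with a userspace stand-in for the kernel bitops:

#include <stdio.h>

#define CEPH_DENTRY_ASYNC_UNLINK_BIT	(4)
#define CEPH_DENTRY_ASYNC_UNLINK	(1 << CEPH_DENTRY_ASYNC_UNLINK_BIT)

int main(void)
{
	unsigned long flags = 0;

	flags |= CEPH_DENTRY_ASYNC_UNLINK;	/* mask form */

	/* bit-number form, as test_bit()/clear_bit() would use it */
	printf("async unlink pending: %d\n",
	       !!(flags & (1UL << CEPH_DENTRY_ASYNC_UNLINK_BIT)));
	return 0;
}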
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index afec84088471..f31350cda960 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -57,7 +57,7 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_string *pool_ns;
s64 pool = ci->i_layout.pool_id;
@@ -69,7 +69,7 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
- dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
+ dout("ceph_vxattrcb_layout %p\n", &ci->netfs.inode);
down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) {
@@ -161,7 +161,7 @@ static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
char *val, size_t size)
{
ssize_t ret;
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
s64 pool = ci->i_layout.pool_id;
const char *pool_name;
@@ -313,7 +313,7 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
char *val, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid);
}
@@ -321,7 +321,7 @@ static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci,
char *val, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
return ceph_fmt_xattr(val, size, "client%lld",
ceph_client_gid(fsc->client));
@@ -366,6 +366,14 @@ static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci,
}
#define XATTR_RSTAT_FIELD(_type, _name) \
XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT)
+#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .exists_cb = NULL, \
+ .flags = VXATTR_FLAG_RSTAT, \
+ }
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
.name = CEPH_XATTR_NAME2(_type, _name, _field), \
@@ -404,7 +412,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_RSTAT_FIELD(dir, rsubdirs),
XATTR_RSTAT_FIELD(dir, rsnaps),
XATTR_RSTAT_FIELD(dir, rbytes),
- XATTR_RSTAT_FIELD(dir, rctime),
+ XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime),
{
.name = "ceph.dir.pin",
.name_size = sizeof("ceph.dir.pin"),
@@ -621,7 +629,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
}
dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n",
- ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val);
+ ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val);
return 0;
}
@@ -863,7 +871,7 @@ struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci)
struct ceph_buffer *old_blob = NULL;
void *dest;
- dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
+ dout("__build_xattrs_blob %p\n", &ci->netfs.inode);
if (ci->i_xattrs.dirty) {
int need = __get_required_blob_size(ci, 0, 0);
@@ -1078,7 +1086,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
flags |= CEPH_XATTR_REMOVE;
}
- dout("setxattr value=%.*s\n", (int)size, value);
+ dout("setxattr value size: %zu\n", size);
/* do request */
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -1176,8 +1184,14 @@ int __ceph_setxattr(struct inode *inode, const char *name,
spin_lock(&ci->i_ceph_lock);
retry:
issued = __ceph_caps_issued(ci, NULL);
- if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
+ required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+ if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) ||
+ (required_blob_size > mdsc->mdsmap->m_max_xattr_size)) {
+ dout("%s do sync setxattr: version: %llu size: %d max: %llu\n",
+ __func__, ci->i_xattrs.version, required_blob_size,
+ mdsc->mdsmap->m_max_xattr_size);
goto do_sync;
+ }
if (!lock_snap_rwsem && !ci->i_head_snapc) {
lock_snap_rwsem = true;
@@ -1193,8 +1207,6 @@ retry:
ceph_cap_string(issued));
__build_xattrs(inode);
- required_blob_size = __get_required_blob_size(ci, name_len, val_len);
-
if (!ci->i_xattrs.prealloc_blob ||
required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
struct ceph_buffer *blob;
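The __ceph_setxattr() change above moves the blob-size computation ahead of the local/sync decision: the xattr is applied locally only when the client has a valid xattr version, holds XATTR_EXCL caps, and the resulting blob fits under the server's advertised m_max_xattr_size; otherwise it falls back to a synchronous MDS request. A standalone sketch of that predicate, with simplified parameters:

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

static bool need_sync_setxattr(uint64_t xattr_version, bool have_xattr_excl,
			       uint64_t required_blob_size, uint64_t server_max)
{
	return xattr_version == 0 || !have_xattr_excl ||
	       required_blob_size > server_max;
}

int main(void)
{
	/* big blob vs. a 64 KiB server limit: must go synchronous */
	printf("sync: %d\n", need_sync_setxattr(2, true, 128 * 1024, 65536));
	/* small blob with caps held: can be applied locally */
	printf("sync: %d\n", need_sync_setxattr(2, true, 512, 65536));
	return 0;
}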
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index cc8fdcb35b71..7c9785973f49 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,10 +5,10 @@
ccflags-y += -I$(src) # needed for trace events
obj-$(CONFIG_CIFS) += cifs.o
-cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \
+cifs-y := trace.o cifsfs.o cifs_debug.o connect.o dir.o file.o \
inode.o link.o misc.o netmisc.o smbencrypt.o transport.o \
- cifs_unicode.o nterr.o cifsencrypt.o \
- readdir.o ioctl.o sess.o export.o smb1ops.o unc.o winucase.o \
+ cached_dir.o cifs_unicode.o nterr.o cifsencrypt.o \
+ readdir.o ioctl.o sess.o export.o unc.o winucase.o \
smb2ops.o smb2maperror.o smb2transport.o \
smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \
dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o
@@ -30,3 +30,5 @@ cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o
cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o
cifs-$(CONFIG_CIFS_ROOT) += cifsroot.o
+
+cifs-$(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) += smb1ops.o cifssmb.o
diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c
new file mode 100644
index 000000000000..b401339f6e73
--- /dev/null
+++ b/fs/cifs/cached_dir.c
@@ -0,0 +1,388 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Functions to handle the cached directory entries
+ *
+ * Copyright (c) 2022, Ronnie Sahlberg <lsahlber@redhat.com>
+ */
+
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "smb2proto.h"
+#include "cached_dir.h"
+
+/*
+ * Open and cache a directory handle.
+ * If error then *cfid is not initialized.
+ */
+int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
+ const char *path,
+ struct cifs_sb_info *cifs_sb,
+ bool lookup_only, struct cached_fid **ret_cfid)
+{
+ struct cifs_ses *ses;
+ struct TCP_Server_Info *server;
+ struct cifs_open_parms oparms;
+ struct smb2_create_rsp *o_rsp = NULL;
+ struct smb2_query_info_rsp *qi_rsp = NULL;
+ int resp_buftype[2];
+ struct smb_rqst rqst[2];
+ struct kvec rsp_iov[2];
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ struct kvec qi_iov[1];
+ int rc, flags = 0;
+ __le16 utf16_path = 0; /* Null - since an open of top of share */
+ u8 oplock = SMB2_OPLOCK_LEVEL_II;
+ struct cifs_fid *pfid;
+ struct dentry *dentry;
+ struct cached_fid *cfid;
+
+ if (tcon == NULL || tcon->nohandlecache ||
+ is_smb1_server(tcon->ses->server))
+ return -EOPNOTSUPP;
+
+ ses = tcon->ses;
+ server = ses->server;
+
+ if (cifs_sb->root == NULL)
+ return -ENOENT;
+
+ if (strlen(path))
+ return -ENOENT;
+
+ dentry = cifs_sb->root;
+
+ cfid = tcon->cfid;
+ mutex_lock(&cfid->fid_mutex);
+ if (cfid->is_valid) {
+ cifs_dbg(FYI, "found a cached root file handle\n");
+ *ret_cfid = cfid;
+ kref_get(&cfid->refcount);
+ mutex_unlock(&cfid->fid_mutex);
+ return 0;
+ }
+
+ /*
+ * We do not hold the lock for the open because in case
+ * SMB2_open needs to reconnect, it will end up calling
+ * cifs_mark_open_files_invalid(), which takes the lock again,
+ * thus causing a deadlock.
+ */
+ mutex_unlock(&cfid->fid_mutex);
+
+ if (lookup_only)
+ return -ENOENT;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ if (!server->ops->new_lease_key)
+ return -EIO;
+
+ pfid = &cfid->fid;
+ server->ops->new_lease_key(pfid);
+
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ /* Open */
+ memset(&open_iov, 0, sizeof(open_iov));
+ rqst[0].rq_iov = open_iov;
+ rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
+ oparms.tcon = tcon;
+ oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE);
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.fid = pfid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open_init(tcon, server,
+ &rqst[0], &oplock, &oparms, &utf16_path);
+ if (rc)
+ goto oshr_free;
+ smb2_set_next_command(tcon, &rqst[0]);
+
+ memset(&qi_iov, 0, sizeof(qi_iov));
+ rqst[1].rq_iov = qi_iov;
+ rqst[1].rq_nvec = 1;
+
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[1], COMPOUND_FID,
+ COMPOUND_FID, FILE_ALL_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_all_info) +
+ PATH_MAX * 2, 0, NULL);
+ if (rc)
+ goto oshr_free;
+
+ smb2_set_related(&rqst[1]);
+
+ rc = compound_send_recv(xid, ses, server,
+ flags, 2, rqst,
+ resp_buftype, rsp_iov);
+ mutex_lock(&cfid->fid_mutex);
+
+ /*
+ * Now we need to check again as the cached root might have
+ * been successfully re-opened from a concurrent process
+ */
+
+ if (cfid->is_valid) {
+ /* work was already done */
+
+ /* stash fids for close() later */
+ struct cifs_fid fid = {
+ .persistent_fid = pfid->persistent_fid,
+ .volatile_fid = pfid->volatile_fid,
+ };
+
+ /*
+ * The caller expects this function to set cfid to a valid
+ * cached root fid, so increment the refcount.
+ */
+ kref_get(&cfid->refcount);
+
+ mutex_unlock(&cfid->fid_mutex);
+
+ if (rc == 0) {
+ /* close extra handle outside of crit sec */
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ }
+ rc = 0;
+ goto oshr_free;
+ }
+
+ /* Cached root is still invalid, continue normally */
+
+ if (rc) {
+ if (rc == -EREMCHG) {
+ tcon->need_reconnect = true;
+ pr_warn_once("server share %s deleted\n",
+ tcon->treeName);
+ }
+ goto oshr_exit;
+ }
+
+ atomic_inc(&tcon->num_remote_opens);
+
+ o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
+ oparms.fid->persistent_fid = o_rsp->PersistentFileId;
+ oparms.fid->volatile_fid = o_rsp->VolatileFileId;
+#ifdef CONFIG_CIFS_DEBUG2
+ oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId);
+#endif /* CIFS_DEBUG2 */
+
+ cfid->tcon = tcon;
+ cfid->is_valid = true;
+ cfid->dentry = dentry;
+ dget(dentry);
+ kref_init(&cfid->refcount);
+
+ /* BB TBD check to see if oplock level check can be removed below */
+ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) {
+ /*
+ * See commit 2f94a3125b87. Increment the refcount when we
+ * get a lease for root, release it if lease break occurs
+ */
+ kref_get(&cfid->refcount);
+ cfid->has_lease = true;
+ smb2_parse_contexts(server, o_rsp,
+ &oparms.fid->epoch,
+ oparms.fid->lease_key, &oplock,
+ NULL, NULL);
+ } else
+ goto oshr_exit;
+
+ qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
+ goto oshr_exit;
+ if (!smb2_validate_and_copy_iov(
+ le16_to_cpu(qi_rsp->OutputBufferOffset),
+ sizeof(struct smb2_file_all_info),
+ &rsp_iov[1], sizeof(struct smb2_file_all_info),
+ (char *)&cfid->file_all_info))
+ cfid->file_all_info_is_valid = true;
+
+ cfid->time = jiffies;
+
+oshr_exit:
+ mutex_unlock(&cfid->fid_mutex);
+oshr_free:
+ SMB2_open_free(&rqst[0]);
+ SMB2_query_info_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ if (rc == 0)
+ *ret_cfid = cfid;
+
+ return rc;
+}
+
+int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
+ struct dentry *dentry,
+ struct cached_fid **ret_cfid)
+{
+ struct cached_fid *cfid;
+
+ cfid = tcon->cfid;
+
+ mutex_lock(&cfid->fid_mutex);
+ if (cfid->dentry == dentry) {
+ cifs_dbg(FYI, "found a cached root file handle by dentry\n");
+ *ret_cfid = cfid;
+ kref_get(&cfid->refcount);
+ mutex_unlock(&cfid->fid_mutex);
+ return 0;
+ }
+ mutex_unlock(&cfid->fid_mutex);
+ return -ENOENT;
+}
+
+static void
+smb2_close_cached_fid(struct kref *ref)
+{
+ struct cached_fid *cfid = container_of(ref, struct cached_fid,
+ refcount);
+ struct cached_dirent *dirent, *q;
+
+ if (cfid->is_valid) {
+ cifs_dbg(FYI, "clear cached root file handle\n");
+ SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid);
+ }
+
+ /*
+ * We only check validity above to send SMB2_close,
+ * but we still need to invalidate these entries
+ * when this function is called
+ */
+ cfid->is_valid = false;
+ cfid->file_all_info_is_valid = false;
+ cfid->has_lease = false;
+ if (cfid->dentry) {
+ dput(cfid->dentry);
+ cfid->dentry = NULL;
+ }
+ /*
+ * Delete all cached dirent names
+ */
+ mutex_lock(&cfid->dirents.de_mutex);
+ list_for_each_entry_safe(dirent, q, &cfid->dirents.entries, entry) {
+ list_del(&dirent->entry);
+ kfree(dirent->name);
+ kfree(dirent);
+ }
+ cfid->dirents.is_valid = 0;
+ cfid->dirents.is_failed = 0;
+ cfid->dirents.ctx = NULL;
+ cfid->dirents.pos = 0;
+ mutex_unlock(&cfid->dirents.de_mutex);
+
+}
+
+void close_cached_dir(struct cached_fid *cfid)
+{
+ mutex_lock(&cfid->fid_mutex);
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
+ mutex_unlock(&cfid->fid_mutex);
+}
+
+void close_cached_dir_lease_locked(struct cached_fid *cfid)
+{
+ if (cfid->has_lease) {
+ cfid->has_lease = false;
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
+ }
+}
+
+void close_cached_dir_lease(struct cached_fid *cfid)
+{
+ mutex_lock(&cfid->fid_mutex);
+ close_cached_dir_lease_locked(cfid);
+ mutex_unlock(&cfid->fid_mutex);
+}
+
+/*
+ * Called from cifs_kill_sb when we unmount a share
+ */
+void close_all_cached_dirs(struct cifs_sb_info *cifs_sb)
+{
+ struct rb_root *root = &cifs_sb->tlink_tree;
+ struct rb_node *node;
+ struct cached_fid *cfid;
+ struct cifs_tcon *tcon;
+ struct tcon_link *tlink;
+
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+ tcon = tlink_tcon(tlink);
+ if (IS_ERR(tcon))
+ continue;
+ cfid = tcon->cfid;
+ mutex_lock(&cfid->fid_mutex);
+ if (cfid->dentry) {
+ dput(cfid->dentry);
+ cfid->dentry = NULL;
+ }
+ mutex_unlock(&cfid->fid_mutex);
+ }
+}
+
+/*
+ * Invalidate and close all cached dirs when a TCON has been reset
+ * due to a session loss.
+ */
+void invalidate_all_cached_dirs(struct cifs_tcon *tcon)
+{
+ mutex_lock(&tcon->cfid->fid_mutex);
+ tcon->cfid->is_valid = false;
+ /* cached handle is not valid, so SMB2_CLOSE won't be sent below */
+ close_cached_dir_lease_locked(tcon->cfid);
+ memset(&tcon->cfid->fid, 0, sizeof(struct cifs_fid));
+ mutex_unlock(&tcon->cfid->fid_mutex);
+}
+
+static void
+smb2_cached_lease_break(struct work_struct *work)
+{
+ struct cached_fid *cfid = container_of(work,
+ struct cached_fid, lease_break);
+
+ close_cached_dir_lease(cfid);
+}
+
+int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16])
+{
+ if (tcon->cfid->is_valid &&
+ !memcmp(lease_key,
+ tcon->cfid->fid.lease_key,
+ SMB2_LEASE_KEY_SIZE)) {
+ tcon->cfid->time = 0;
+ INIT_WORK(&tcon->cfid->lease_break,
+ smb2_cached_lease_break);
+ queue_work(cifsiod_wq,
+ &tcon->cfid->lease_break);
+ return true;
+ }
+ return false;
+}
+
+struct cached_fid *init_cached_dir(void)
+{
+ struct cached_fid *cfid;
+
+ cfid = kzalloc(sizeof(*cfid), GFP_KERNEL);
+ if (!cfid)
+ return NULL;
+ INIT_LIST_HEAD(&cfid->dirents.entries);
+ mutex_init(&cfid->dirents.de_mutex);
+ mutex_init(&cfid->fid_mutex);
+ return cfid;
+}
+
+void free_cached_dir(struct cifs_tcon *tcon)
+{
+ kfree(tcon->cfid);
+}
diff --git a/fs/cifs/cached_dir.h b/fs/cifs/cached_dir.h
new file mode 100644
index 000000000000..bd262dc8b179
--- /dev/null
+++ b/fs/cifs/cached_dir.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Functions to handle the cached directory entries
+ *
+ * Copyright (c) 2022, Ronnie Sahlberg <lsahlber@redhat.com>
+ */
+
+#ifndef _CACHED_DIR_H
+#define _CACHED_DIR_H
+
+
+struct cached_dirent {
+ struct list_head entry;
+ char *name;
+ int namelen;
+ loff_t pos;
+
+ struct cifs_fattr fattr;
+};
+
+struct cached_dirents {
+ bool is_valid:1;
+ bool is_failed:1;
+ struct dir_context *ctx; /*
+ * Only used to make sure we only take entries
+ * from a single context. Never dereferenced.
+ */
+ struct mutex de_mutex;
+ int pos; /* Expected ctx->pos */
+ struct list_head entries;
+};
+
+struct cached_fid {
+ bool is_valid:1; /* Do we have a usable root fid */
+ bool file_all_info_is_valid:1;
+ bool has_lease:1;
+ unsigned long time; /* jiffies of when lease was taken */
+ struct kref refcount;
+ struct cifs_fid fid;
+ struct mutex fid_mutex;
+ struct cifs_tcon *tcon;
+ struct dentry *dentry;
+ struct work_struct lease_break;
+ struct smb2_file_all_info file_all_info;
+ struct cached_dirents dirents;
+};
+
+extern struct cached_fid *init_cached_dir(void);
+extern void free_cached_dir(struct cifs_tcon *tcon);
+extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
+ const char *path,
+ struct cifs_sb_info *cifs_sb,
+ bool lookup_only, struct cached_fid **cfid);
+extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
+ struct dentry *dentry,
+ struct cached_fid **cfid);
+extern void close_cached_dir(struct cached_fid *cfid);
+extern void close_cached_dir_lease(struct cached_fid *cfid);
+extern void close_cached_dir_lease_locked(struct cached_fid *cfid);
+extern void close_all_cached_dirs(struct cifs_sb_info *cifs_sb);
+extern void invalidate_all_cached_dirs(struct cifs_tcon *tcon);
+extern int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]);
+
+#endif /* _CACHED_DIR_H */
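A usage sketch for the API declared above, inferred from open_cached_dir()'s kref handling rather than taken from this patch: a successful open returns a referenced cached_fid, and the caller pairs it with close_cached_dir() to drop that reference. The caller below is hypothetical and error handling is trimmed.

/* Hypothetical caller of the cached-dir API. */
int use_cached_root(unsigned int xid, struct cifs_tcon *tcon,
		    struct cifs_sb_info *cifs_sb)
{
	struct cached_fid *cfid;
	int rc;

	rc = open_cached_dir(xid, tcon, "", cifs_sb, false, &cfid);
	if (rc)
		return rc;		/* on error, cfid is untouched */

	/* ... use cfid->fid.persistent_fid for compounded requests ... */

	close_cached_dir(cfid);		/* drops the ref taken by open */
	return 0;
}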
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 9d334816eac0..c05477e28cff 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -36,13 +36,13 @@ cifs_dump_mem(char *label, void *data, int length)
void cifs_dump_detail(void *buf, struct TCP_Server_Info *server)
{
#ifdef CONFIG_CIFS_DEBUG2
- struct smb_hdr *smb = (struct smb_hdr *)buf;
+ struct smb_hdr *smb = buf;
cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n",
smb->Command, smb->Status.CifsError,
smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
cifs_dbg(VFS, "smb buf %p len %u\n", smb,
- server->ops->calc_smb_size(smb, server));
+ server->ops->calc_smb_size(smb));
#endif /* CONFIG_CIFS_DEBUG2 */
}
@@ -55,7 +55,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
return;
cifs_dbg(VFS, "Dump pending requests:\n");
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n",
mid_entry->mid_state,
@@ -78,7 +78,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
mid_entry->resp_buf, 62);
}
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
#endif /* CONFIG_CIFS_DEBUG2 */
}
@@ -116,7 +116,8 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
tcon->ses->server->ops->dump_share_caps(m, tcon);
if (tcon->use_witness)
seq_puts(m, " Witness");
-
+ if (tcon->broken_sparse_sup)
+ seq_puts(m, " nosparse");
if (tcon->need_reconnect)
seq_puts(m, "\tDISCONNECTED ");
seq_putc(m, '\n');
@@ -161,11 +162,12 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface)
seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr);
else if (iface->sockaddr.ss_family == AF_INET6)
seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr);
+ if (!iface->is_active)
+ seq_puts(m, "\t\t[for-cleanup]\n");
}
static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
{
- struct list_head *tmp, *tmp1, *tmp2;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -181,14 +183,10 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
#endif /* CIFS_DEBUG2 */
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
- list_for_each(tmp, &server->smb_ses_list) {
- ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
- list_for_each(tmp1, &ses->tcon_list) {
- tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
spin_lock(&tcon->open_file_lock);
- list_for_each(tmp2, &tcon->openFileList) {
- cfile = list_entry(tmp2, struct cifsFileInfo,
- tlist);
+ list_for_each_entry(cfile, &tcon->openFileList, tlist) {
seq_printf(m,
"0x%x 0x%llx 0x%x %d %d %d %pd",
tcon->tid,
@@ -215,11 +213,11 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
{
- struct list_head *tmp2, *tmp3;
struct mid_q_entry *mid_entry;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
+ struct cifs_server_iface *iface;
int c, i, j;
seq_puts(m,
@@ -377,16 +375,14 @@ skip_rdma:
seq_printf(m, "\n\n\tSessions: ");
i = 0;
- list_for_each(tmp2, &server->smb_ses_list) {
- ses = list_entry(tmp2, struct cifs_ses,
- smb_ses_list);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
i++;
if ((ses->serverDomain == NULL) ||
(ses->serverOS == NULL) ||
(ses->serverNOS == NULL)) {
seq_printf(m, "\n\t%d) Address: %s Uses: %d Capability: 0x%x\tSession Status: %d ",
i, ses->ip_addr, ses->ses_count,
- ses->capabilities, ses->status);
+ ses->capabilities, ses->ses_status);
if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST)
seq_printf(m, "Guest ");
else if (ses->session_flags & SMB2_SESSION_FLAG_IS_NULL)
@@ -398,7 +394,7 @@ skip_rdma:
"\n\tSMB session status: %d ",
i, ses->ip_addr, ses->serverDomain,
ses->ses_count, ses->serverOS, ses->serverNOS,
- ses->capabilities, ses->status);
+ ses->capabilities, ses->ses_status);
}
seq_printf(m, "\n\tSecurity type: %s ",
@@ -418,6 +414,8 @@ skip_rdma:
spin_lock(&ses->chan_lock);
if (CIFS_CHAN_NEEDS_RECONNECT(ses, 0))
seq_puts(m, "\tPrimary channel: DISCONNECTED ");
+ if (CIFS_CHAN_IN_RECONNECT(ses, 0))
+ seq_puts(m, "\t[RECONNECTING] ");
if (ses->chan_count > 1) {
seq_printf(m, "\n\n\tExtra Channels: %zu ",
@@ -426,6 +424,8 @@ skip_rdma:
cifs_dump_channel(m, j, &ses->chans[j]);
if (CIFS_CHAN_NEEDS_RECONNECT(ses, j))
seq_puts(m, "\tDISCONNECTED ");
+ if (CIFS_CHAN_IN_RECONNECT(ses, j))
+ seq_puts(m, "\t[RECONNECTING] ");
}
}
spin_unlock(&ses->chan_lock);
@@ -439,9 +439,7 @@ skip_rdma:
else
seq_puts(m, "none\n");
- list_for_each(tmp3, &ses->tcon_list) {
- tcon = list_entry(tmp3, struct cifs_tcon,
- tcon_list);
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
++j;
seq_printf(m, "\n\t%d) ", j);
cifs_debug_tcon(m, tcon);
@@ -451,11 +449,10 @@ skip_rdma:
if (ses->iface_count)
seq_printf(m, "\n\n\tServer interfaces: %zu",
ses->iface_count);
- for (j = 0; j < ses->iface_count; j++) {
- struct cifs_server_iface *iface;
-
- iface = &ses->iface_list[j];
- seq_printf(m, "\n\t%d)", j+1);
+ j = 0;
+ list_for_each_entry(iface, &ses->iface_list,
+ iface_head) {
+ seq_printf(m, "\n\t%d)", ++j);
cifs_dump_iface(m, iface);
if (is_ses_using_iface(ses, iface))
seq_puts(m, "\t\t[CONNECTED]\n");
@@ -466,10 +463,8 @@ skip_rdma:
seq_printf(m, "\n\t\t[NONE]");
seq_puts(m, "\n\n\tMIDs: ");
- spin_lock(&GlobalMid_Lock);
- list_for_each(tmp3, &server->pending_mid_q) {
- mid_entry = list_entry(tmp3, struct mid_q_entry,
- qhead);
+ spin_lock(&server->mid_lock);
+ list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
seq_printf(m, "\n\tState: %d com: %d pid:"
" %d cbdata: %p mid %llu\n",
mid_entry->mid_state,
@@ -478,7 +473,7 @@ skip_rdma:
mid_entry->callback_data,
mid_entry->mid);
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
seq_printf(m, "\n--\n");
}
if (c == 0)
@@ -497,7 +492,6 @@ static ssize_t cifs_stats_proc_write(struct file *file,
{
bool bv;
int rc;
- struct list_head *tmp1, *tmp2, *tmp3;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -507,8 +501,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
#ifdef CONFIG_CIFS_STATS2
int i;
- atomic_set(&totBufAllocCount, 0);
- atomic_set(&totSmBufAllocCount, 0);
+ atomic_set(&total_buf_alloc_count, 0);
+ atomic_set(&total_small_buf_alloc_count, 0);
#endif /* CONFIG_CIFS_STATS2 */
atomic_set(&tcpSesReconnectCount, 0);
atomic_set(&tconInfoReconnectCount, 0);
@@ -518,9 +512,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
GlobalCurrentXid = 0;
spin_unlock(&GlobalMid_Lock);
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp1, &cifs_tcp_ses_list) {
- server = list_entry(tmp1, struct TCP_Server_Info,
- tcp_ses_list);
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
server->max_in_flight = 0;
#ifdef CONFIG_CIFS_STATS2
for (i = 0; i < NUMBER_OF_SMB2_COMMANDS; i++) {
@@ -531,13 +523,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
server->fastest_cmd[0] = 0;
}
#endif /* CONFIG_CIFS_STATS2 */
- list_for_each(tmp2, &server->smb_ses_list) {
- ses = list_entry(tmp2, struct cifs_ses,
- smb_ses_list);
- list_for_each(tmp3, &ses->tcon_list) {
- tcon = list_entry(tmp3,
- struct cifs_tcon,
- tcon_list);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
atomic_set(&tcon->num_smbs_sent, 0);
spin_lock(&tcon->stat_lock);
tcon->bytes_read = 0;
@@ -562,7 +549,6 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_CIFS_STATS2
int j;
#endif /* STATS2 */
- struct list_head *tmp2, *tmp3;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -572,17 +558,17 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
seq_printf(m, "Share (unique mount targets): %d\n",
tconInfoAllocCount.counter);
seq_printf(m, "SMB Request/Response Buffer: %d Pool size: %d\n",
- bufAllocCount.counter,
+ buf_alloc_count.counter,
cifs_min_rcv + tcpSesAllocCount.counter);
seq_printf(m, "SMB Small Req/Resp Buffer: %d Pool size: %d\n",
- smBufAllocCount.counter, cifs_min_small);
+ small_buf_alloc_count.counter, cifs_min_small);
#ifdef CONFIG_CIFS_STATS2
seq_printf(m, "Total Large %d Small %d Allocations\n",
- atomic_read(&totBufAllocCount),
- atomic_read(&totSmBufAllocCount));
+ atomic_read(&total_buf_alloc_count),
+ atomic_read(&total_small_buf_alloc_count));
#endif /* CONFIG_CIFS_STATS2 */
- seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount));
+ seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&mid_count));
seq_printf(m,
"\n%d session %d share reconnects\n",
tcpSesReconnectCount.counter, tconInfoReconnectCount.counter);
@@ -612,13 +598,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
atomic_read(&server->smb2slowcmd[j]),
server->hostname, j);
#endif /* STATS2 */
- list_for_each(tmp2, &server->smb_ses_list) {
- ses = list_entry(tmp2, struct cifs_ses,
- smb_ses_list);
- list_for_each(tmp3, &ses->tcon_list) {
- tcon = list_entry(tmp3,
- struct cifs_tcon,
- tcon_list);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
i++;
seq_printf(m, "\n%d) %s", i, tcon->treeName);
if (tcon->need_reconnect)
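Most of this file's churn is mechanical: open-coded list_for_each() + list_entry() pairs become list_for_each_entry(), which walks typed entries directly. A self-contained userspace sketch of the idiom, using a minimal stand-in for <linux/list.h> (the kernel macro is equivalent; typeof is GNU C, as in the kernel):

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

#define list_for_each_entry(pos, head, member)				\
	for (pos = container_of((head)->next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = container_of(pos->member.next, typeof(*pos), member))

struct session { int id; struct list_head node; };

int main(void)
{
	struct session a = { .id = 1 }, b = { .id = 2 };
	struct list_head head = { .next = &a.node, .prev = &b.node };
	struct session *ses;

	a.node = (struct list_head){ .next = &b.node, .prev = &head };
	b.node = (struct list_head){ .next = &head, .prev = &a.node };

	list_for_each_entry(ses, &head, node)
		printf("session %d\n", ses->id);
	return 0;
}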
diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c
index 180c234c2f46..1e4c7cc5287f 100644
--- a/fs/cifs/cifs_swn.c
+++ b/fs/cifs/cifs_swn.c
@@ -465,7 +465,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a
int ret = 0;
/* Store the reconnect address */
- mutex_lock(&tcon->ses->server->srv_mutex);
+ cifs_server_lock(tcon->ses->server);
if (cifs_sockaddr_equal(&tcon->ses->server->dstaddr, addr))
goto unlock;
@@ -501,7 +501,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a
cifs_signal_cifsd_for_reconnect(tcon->ses->server, false);
unlock:
- mutex_unlock(&tcon->ses->server->srv_mutex);
+ cifs_server_unlock(tcon->ses->server);
return ret;
}
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index bf861fef2f0c..fa480d62f313 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -1379,6 +1379,7 @@ chown_chgrp_exit:
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
const struct cifs_fid *cifsfid, u32 *pacllen,
u32 __maybe_unused unused)
@@ -1512,6 +1513,7 @@ out:
cifs_put_tlink(tlink);
return rc;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */
int
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 0912d8bbbac1..46f5718754f9 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -32,10 +32,9 @@ int __cifs_calc_signature(struct smb_rqst *rqst,
int rc;
struct kvec *iov = rqst->rq_iov;
int n_vec = rqst->rq_nvec;
- int is_smb2 = server->vals->header_preamble_size == 0;
/* iov[0] is actual data and not the rfc1002 length for SMB2+ */
- if (is_smb2) {
+ if (!is_smb1(server)) {
if (iov[0].iov_len <= 4)
return -EIO;
i = 0;
@@ -141,13 +140,13 @@ int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
if ((cifs_pdu == NULL) || (server == NULL))
return -EINVAL;
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
server->tcpStatus == CifsNeedNegotiate) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return rc;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
if (!server->session_estab) {
memcpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
@@ -236,9 +235,9 @@ int cifs_verify_signature(struct smb_rqst *rqst,
cpu_to_le32(expected_sequence_number);
cifs_pdu->Signature.Sequence.Reserved = 0;
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
rc = cifs_calc_signature(rqst, server, what_we_think_sig_should_be);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
if (rc)
return rc;
@@ -626,7 +625,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
memcpy(ses->auth_key.response + baselen, tiblob, tilen);
- mutex_lock(&ses->server->srv_mutex);
+ cifs_server_lock(ses->server);
rc = cifs_alloc_hash("hmac(md5)",
&ses->server->secmech.hmacmd5,
@@ -678,7 +677,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
unlock:
- mutex_unlock(&ses->server->srv_mutex);
+ cifs_server_unlock(ses->server);
setup_ntlmv2_rsp_ret:
kfree(tiblob);
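The cifs_server_lock()/cifs_server_unlock() pair used above replaces direct srv_mutex locking; as defined later in cifsglob.h, it also enters a memalloc NOFS scope so that allocations made while holding the server mutex cannot recurse back into the filesystem and deadlock. A sketch of the intended calling pattern; the function below is illustrative, not from this patch.

/* Hypothetical critical section under the renamed server mutex. */
static void example_under_server_lock(struct TCP_Server_Info *server)
{
	cifs_server_lock(server);	/* takes _srv_mutex + NOFS scope */

	/*
	 * Allocations here implicitly behave as GFP_NOFS, so direct
	 * reclaim cannot re-enter cifs while the mutex is held.
	 */

	cifs_server_unlock(server);	/* restores the allocation scope */
}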
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index a47fa44b6d52..8042d7280dec 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -46,6 +46,7 @@
#include "netlink.h"
#endif
#include "fs_context.h"
+#include "cached_dir.h"
/*
* DOS dates from 1980/1/1 through 2107/12/31
@@ -68,6 +69,34 @@ bool enable_negotiate_signing; /* false by default */
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
+
+/*
+ * Global transaction id (XID) information
+ */
+unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */
+unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
+unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
+spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */
+
+/*
+ * Global counters, updated atomically
+ */
+atomic_t sesInfoAllocCount;
+atomic_t tconInfoAllocCount;
+atomic_t tcpSesNextId;
+atomic_t tcpSesAllocCount;
+atomic_t tcpSesReconnectCount;
+atomic_t tconInfoReconnectCount;
+
+atomic_t mid_count;
+atomic_t buf_alloc_count;
+atomic_t small_buf_alloc_count;
+#ifdef CONFIG_CIFS_STATS2
+atomic_t total_buf_alloc_count;
+atomic_t total_small_buf_alloc_count;
+#endif/* STATS2 */
+struct list_head cifs_tcp_ses_list;
+spinlock_t cifs_tcp_ses_lock;
static const struct super_operations cifs_super_ops;
unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
module_param(CIFSMaxBufSize, uint, 0444);
@@ -255,33 +284,18 @@ out_no_root:
static void cifs_kill_sb(struct super_block *sb)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- struct cifs_tcon *tcon;
- struct cached_fid *cfid;
- struct rb_root *root = &cifs_sb->tlink_tree;
- struct rb_node *node;
- struct tcon_link *tlink;
/*
 * We need to release all dentries for the cached directories
* before we kill the sb.
*/
if (cifs_sb->root) {
+ close_all_cached_dirs(cifs_sb);
+
+ /* finally release root dentry */
dput(cifs_sb->root);
cifs_sb->root = NULL;
}
- node = rb_first(root);
- while (node != NULL) {
- tlink = rb_entry(node, struct tcon_link, tl_rbnode);
- tcon = tlink_tcon(tlink);
- cfid = &tcon->crfid;
- mutex_lock(&cfid->fid_mutex);
- if (cfid->dentry) {
- dput(cfid->dentry);
- cfid->dentry = NULL;
- }
- mutex_unlock(&cfid->fid_mutex);
- node = rb_next(node);
- }
kill_anon_super(sb);
cifs_umount(cifs_sb);
@@ -375,7 +389,7 @@ cifs_alloc_inode(struct super_block *sb)
cifs_inode->flags = 0;
spin_lock_init(&cifs_inode->writers_lock);
cifs_inode->writers = 0;
- cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
+ cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
cifs_inode->server_eof = 0;
cifs_inode->uniqueid = 0;
cifs_inode->createtime = 0;
@@ -387,12 +401,12 @@ cifs_alloc_inode(struct super_block *sb)
* Can not set i_flags here - they get immediately overwritten to zero
* by the VFS.
*/
- /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */
+ /* cifs_inode->netfs.inode.i_flags = S_NOATIME | S_NOCMTIME; */
INIT_LIST_HEAD(&cifs_inode->openFileList);
INIT_LIST_HEAD(&cifs_inode->llist);
INIT_LIST_HEAD(&cifs_inode->deferred_closes);
spin_lock_init(&cifs_inode->deferred_lock);
- return &cifs_inode->vfs_inode;
+ return &cifs_inode->netfs.inode;
}
static void
@@ -580,6 +594,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",nocase");
if (tcon->nodelete)
seq_puts(s, ",nodelete");
+ if (cifs_sb->ctx->no_sparse)
+ seq_puts(s, ",nosparse");
if (tcon->local_lease)
seq_puts(s, ",locallease");
if (tcon->retry)
@@ -677,6 +693,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",acdirmax=%lu", cifs_sb->ctx->acdirmax / HZ);
seq_printf(s, ",acregmax=%lu", cifs_sb->ctx->acregmax / HZ);
}
+ seq_printf(s, ",closetimeo=%lu", cifs_sb->ctx->closetimeo / HZ);
if (tcon->ses->chan_max > 1)
seq_printf(s, ",multichannel,max_channels=%zu",
@@ -699,14 +716,17 @@ static void cifs_umount_begin(struct super_block *sb)
tcon = cifs_sb_master_tcon(cifs_sb);
spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if ((tcon->tc_count > 1) || (tcon->status == TID_EXITING)) {
/* we have other mounts to same share or we have
already tried to force umount this and woken up
all waiting network requests, nothing to do */
+ spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
return;
} else if (tcon->tc_count == 1)
tcon->status = TID_EXITING;
+ spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
@@ -834,7 +854,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
int flags, struct smb3_fs_context *old_ctx)
{
int rc;
- struct super_block *sb;
+ struct super_block *sb = NULL;
struct cifs_sb_info *cifs_sb = NULL;
struct cifs_mnt_data mnt_data;
struct dentry *root;
@@ -930,9 +950,11 @@ out_super:
return root;
out:
if (cifs_sb) {
- kfree(cifs_sb->prepath);
- smb3_cleanup_fs_context(cifs_sb->ctx);
- kfree(cifs_sb);
+ if (!sb || IS_ERR(sb)) { /* otherwise kill_sb will handle */
+ kfree(cifs_sb->prepath);
+ smb3_cleanup_fs_context(cifs_sb->ctx);
+ kfree(cifs_sb);
+ }
}
return root;
}
@@ -944,7 +966,7 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ssize_t rc;
struct inode *inode = file_inode(iocb->ki_filp);
- if (iocb->ki_filp->f_flags & O_DIRECT)
+ if (iocb->ki_flags & IOCB_DIRECT)
return cifs_user_readv(iocb, iter);
rc = cifs_revalidate_mapping(inode);
@@ -1080,7 +1102,7 @@ struct file_system_type cifs_fs_type = {
};
MODULE_ALIAS_FS("cifs");
-static struct file_system_type smb3_fs_type = {
+struct file_system_type smb3_fs_type = {
.owner = THIS_MODULE,
.name = "smb3",
.init_fs_context = smb3_init_fs_context,
@@ -1226,6 +1248,12 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
lock_two_nondirectories(target_inode, src_inode);
cifs_dbg(FYI, "about to flush pages\n");
+
+ rc = filemap_write_and_wait_range(src_inode->i_mapping, off,
+ off + len - 1);
+ if (rc)
+ goto out;
+
/* should we flush first and last page first */
truncate_inode_pages(&target_inode->i_data, 0);
@@ -1412,7 +1440,7 @@ cifs_init_once(void *inode)
{
struct cifsInodeInfo *cifsi = inode;
- inode_init_once(&cifsi->vfs_inode);
+ inode_init_once(&cifsi->netfs.inode);
init_rwsem(&cifsi->lock_sem);
}
@@ -1531,8 +1559,7 @@ cifs_destroy_request_bufs(void)
kmem_cache_destroy(cifs_sm_req_cachep);
}
-static int
-cifs_init_mids(void)
+static int init_mids(void)
{
cifs_mid_cachep = kmem_cache_create("cifs_mpx_ids",
sizeof(struct mid_q_entry), 0,
@@ -1550,8 +1577,7 @@ cifs_init_mids(void)
return 0;
}
-static void
-cifs_destroy_mids(void)
+static void destroy_mids(void)
{
mempool_destroy(cifs_mid_poolp);
kmem_cache_destroy(cifs_mid_cachep);
@@ -1573,11 +1599,11 @@ init_cifs(void)
atomic_set(&tcpSesReconnectCount, 0);
atomic_set(&tconInfoReconnectCount, 0);
- atomic_set(&bufAllocCount, 0);
- atomic_set(&smBufAllocCount, 0);
+ atomic_set(&buf_alloc_count, 0);
+ atomic_set(&small_buf_alloc_count, 0);
#ifdef CONFIG_CIFS_STATS2
- atomic_set(&totBufAllocCount, 0);
- atomic_set(&totSmBufAllocCount, 0);
+ atomic_set(&total_buf_alloc_count, 0);
+ atomic_set(&total_small_buf_alloc_count, 0);
if (slow_rsp_threshold < 1)
cifs_dbg(FYI, "slow_response_threshold msgs disabled\n");
else if (slow_rsp_threshold > 32767)
@@ -1585,7 +1611,7 @@ init_cifs(void)
"slow response threshold set higher than recommended (0 to 32767)\n");
#endif /* CONFIG_CIFS_STATS2 */
- atomic_set(&midCount, 0);
+ atomic_set(&mid_count, 0);
GlobalCurrentXid = 0;
GlobalTotalActiveXid = 0;
GlobalMaxActiveXid = 0;
@@ -1648,7 +1674,7 @@ init_cifs(void)
if (rc)
goto out_destroy_deferredclose_wq;
- rc = cifs_init_mids();
+ rc = init_mids();
if (rc)
goto out_destroy_inodecache;
@@ -1705,7 +1731,7 @@ out_destroy_request_bufs:
#endif
cifs_destroy_request_bufs();
out_destroy_mids:
- cifs_destroy_mids();
+ destroy_mids();
out_destroy_inodecache:
cifs_destroy_inodecache();
out_destroy_deferredclose_wq:
@@ -1741,7 +1767,7 @@ exit_cifs(void)
dfs_cache_destroy();
#endif
cifs_destroy_request_bufs();
- cifs_destroy_mids();
+ destroy_mids();
cifs_destroy_inodecache();
destroy_workqueue(deferredclose_wq);
destroy_workqueue(cifsoplockd_wq);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 15a5c5db038b..5b4a7a32bdc5 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -38,7 +38,7 @@ static inline unsigned long cifs_get_time(struct dentry *dentry)
return (unsigned long) dentry->d_fsdata;
}
-extern struct file_system_type cifs_fs_type;
+extern struct file_system_type cifs_fs_type, smb3_fs_type;
extern const struct address_space_operations cifs_addr_ops;
extern const struct address_space_operations cifs_addr_ops_smallbuf;
@@ -152,6 +152,7 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type,
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define SMB3_PRODUCT_BUILD 35
-#define CIFS_VERSION "2.35"
+/* when changing internal version - update following two lines at same time */
+#define SMB3_PRODUCT_BUILD 39
+#define CIFS_VERSION "2.39"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 8de977c359b1..ae7f571a7dba 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -16,6 +16,7 @@
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/utsname.h>
+#include <linux/sched/mm.h>
#include <linux/netfs.h>
#include "cifs_fs_sb.h"
#include "cifsacl.h"
@@ -79,6 +80,9 @@
#define SMB_DNS_RESOLVE_INTERVAL_MIN 120
#define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600
+/* smb multichannel query server interfaces interval in seconds */
+#define SMB_INTERFACE_POLL_INTERVAL 600
+
/* maximum number of PDUs in one compound */
#define MAX_COMPOUND 5
@@ -106,7 +110,7 @@
* CIFS vfs client Status information (based on what we know.)
*/
-/* associated with each tcp and smb session */
+/* associated with each connection */
enum statusEnum {
CifsNew = 0,
CifsGood,
@@ -114,8 +118,15 @@ enum statusEnum {
CifsNeedReconnect,
CifsNeedNegotiate,
CifsInNegotiate,
- CifsNeedSessSetup,
- CifsInSessSetup,
+};
+
+/* associated with each smb session */
+enum ses_status_enum {
+ SES_NEW = 0,
+ SES_GOOD,
+ SES_EXITING,
+ SES_NEED_RECON,
+ SES_IN_SETUP
};
/* associated with each tree connection to the server */
@@ -406,7 +417,7 @@ struct smb_version_operations {
int (*close_dir)(const unsigned int, struct cifs_tcon *,
struct cifs_fid *);
/* calculate a size of SMB message */
- unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi);
+ unsigned int (*calc_smb_size)(void *buf);
/* check for STATUS_PENDING and process the response if yes */
bool (*is_status_pending)(char *buf, struct TCP_Server_Info *server);
/* check for STATUS_NETWORK_SESSION_EXPIRED */
@@ -546,6 +557,8 @@ struct smb_version_values {
#define HEADER_SIZE(server) (server->vals->header_size)
#define MAX_HEADER_SIZE(server) (server->vals->max_header_size)
+#define HEADER_PREAMBLE_SIZE(server) (server->vals->header_preamble_size)
+#define MID_HEADER_SIZE(server) (HEADER_SIZE(server) - 1 - HEADER_PREAMBLE_SIZE(server))
/**
* CIFS superblock mount flags (mnt_cifs_flags) to consider when
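A quick sanity check on the new macro (a sketch, restating what the patch itself does): pdu_length - MID_HEADER_SIZE(server) expands to pdu_length - HEADER_SIZE(server) + 1 + HEADER_PREAMBLE_SIZE(server), matching the open-coded arithmetic it replaces in standard_receive3() near the end of this patch:

	/* sketch: how standard_receive3() (later in this patch) consumes the
	 * macro; the first header byte and the preamble were already read
	 */
	length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1,
				       pdu_length - MID_HEADER_SIZE(server));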
@@ -594,6 +607,7 @@ inc_rfc1001_len(void *buf, int count)
struct TCP_Server_Info {
struct list_head tcp_ses_list;
struct list_head smb_ses_list;
+ spinlock_t srv_lock; /* protect anything here that is not protected by another lock */
__u64 conn_id; /* connection identifier (useful for debugging) */
int srv_count; /* reference counter */
/* 15 character server name + 0x20 16th byte indicating type = srv */
@@ -611,6 +625,7 @@ struct TCP_Server_Info {
#endif
wait_queue_head_t response_q;
wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
+ spinlock_t mid_lock; /* protect mid queue and its entries */
struct list_head pending_mid_q;
bool noblocksnd; /* use blocking sendmsg */
bool noautotune; /* do not autotune send buf sizes */
@@ -621,7 +636,8 @@ struct TCP_Server_Info {
unsigned int in_flight; /* number of requests on the wire to server */
unsigned int max_in_flight; /* max number of requests that were on wire */
spinlock_t req_lock; /* protect the two values above */
- struct mutex srv_mutex;
+ struct mutex _srv_mutex;
+ unsigned int nofs_flag;
struct task_struct *tsk;
char server_GUID[16];
__u16 sec_mode;
@@ -736,6 +752,27 @@ struct TCP_Server_Info {
#endif
};
+static inline bool is_smb1(struct TCP_Server_Info *server)
+{
+ return HEADER_PREAMBLE_SIZE(server) != 0;
+}
+
+static inline void cifs_server_lock(struct TCP_Server_Info *server)
+{
+ unsigned int nofs_flag = memalloc_nofs_save();
+
+ mutex_lock(&server->_srv_mutex);
+ server->nofs_flag = nofs_flag;
+}
+
+static inline void cifs_server_unlock(struct TCP_Server_Info *server)
+{
+ unsigned int nofs_flag = server->nofs_flag;
+
+ mutex_unlock(&server->_srv_mutex);
+ memalloc_nofs_restore(nofs_flag);
+}
+
struct cifs_credits {
unsigned int value;
unsigned int instance;
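A minimal usage sketch for the new wrappers (the callee shown is the reconnect helper this patch already converts; the surrounding context is assumed): taking the server mutex through cifs_server_lock() makes every allocation in the critical section implicitly GFP_NOFS, so memory reclaim cannot re-enter the filesystem while the mutex is held.

	/* sketch: replaces a bare mutex_lock(&server->srv_mutex) pair */
	cifs_server_lock(server);
	rc = reconnect_target_unlocked(server, &tl, &target_hint); /* may allocate */
	cifs_server_unlock(server);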
@@ -908,14 +945,67 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
#endif
struct cifs_server_iface {
+ struct list_head iface_head;
+ struct kref refcount;
size_t speed;
unsigned int rdma_capable : 1;
unsigned int rss_capable : 1;
+ unsigned int is_active : 1; /* unset if non-existent */
struct sockaddr_storage sockaddr;
};
+/* release iface when last ref is dropped */
+static inline void
+release_iface(struct kref *ref)
+{
+ struct cifs_server_iface *iface = container_of(ref,
+ struct cifs_server_iface,
+ refcount);
+ list_del_init(&iface->iface_head);
+ kfree(iface);
+}
+
+/*
+ * compare two interfaces a and b
+ * return 0 if everything matches.
+ * return 1 if a has a higher link speed, or is rdma capable, or is rss
+ * capable, or (when all of those match) its sockaddr compares greater.
+ * return -1 otherwise.
+ */
+static inline int
+iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b)
+{
+ int cmp_ret = 0;
+
+ WARN_ON(!a || !b);
+ if (a->speed == b->speed) {
+ if (a->rdma_capable == b->rdma_capable) {
+ if (a->rss_capable == b->rss_capable) {
+ cmp_ret = memcmp(&a->sockaddr, &b->sockaddr,
+ sizeof(a->sockaddr));
+ if (!cmp_ret)
+ return 0;
+ else if (cmp_ret > 0)
+ return 1;
+ else
+ return -1;
+ } else if (a->rss_capable > b->rss_capable)
+ return 1;
+ else
+ return -1;
+ } else if (a->rdma_capable > b->rdma_capable)
+ return 1;
+ else
+ return -1;
+ } else if (a->speed > b->speed)
+ return 1;
+ else
+ return -1;
+}
+
struct cifs_chan {
+ unsigned int in_reconnect : 1; /* if session setup in progress for this channel */
struct TCP_Server_Info *server;
+ struct cifs_server_iface *iface; /* interface in use */
__u8 signkey[SMB3_SIGN_KEY_SIZE];
};
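Two short sketches of how the refcounted interface list is meant to be used (the callers are hypothetical; iface_lock must be held around the list manipulation, per the locking table later in this header):

	/* sketch: drop a holder's reference; release_iface() unlinks and
	 * frees the entry once the last reference goes away
	 */
	kref_put(&iface->refcount, release_iface);

	/* sketch: pick the better of two interfaces by speed/rdma/rss */
	best = (iface_cmp(a, b) > 0) ? a : b;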
@@ -927,10 +1017,11 @@ struct cifs_ses {
struct list_head rlist; /* reconnect list */
struct list_head tcon_list;
struct cifs_tcon *tcon_ipc;
+ spinlock_t ses_lock; /* protect anything here that is not protected by another lock */
struct mutex session_mutex;
struct TCP_Server_Info *server; /* pointer to server info */
int ses_count; /* reference counter */
- enum statusEnum status; /* updates protected by cifs_tcp_ses_lock */
+ enum ses_status_enum ses_status; /* updates protected by cifs_tcp_ses_lock */
unsigned overrideSecFlg; /* if non-zero override global sec flags */
char *serverOS; /* name of operating system underlying server */
char *serverNOS; /* name of network operating system of server */
@@ -944,7 +1035,7 @@ struct cifs_ses {
and after mount option parsing we fill it */
char *domainName;
char *password;
- char *workstation_name;
+ char workstation_name[CIFS_MAX_WORKSTATION_LEN];
struct session_key auth_key;
struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
enum securityEnum sectype; /* what security flavor was specified? */
@@ -967,7 +1058,7 @@ struct cifs_ses {
*/
spinlock_t iface_lock;
/* ========= begin: protected by iface_lock ======== */
- struct cifs_server_iface *iface_list;
+ struct list_head iface_list;
size_t iface_count;
unsigned long iface_last_update; /* jiffies */
/* ========= end: protected by iface_lock ======== */
@@ -977,12 +1068,16 @@ struct cifs_ses {
#define CIFS_MAX_CHANNELS 16
#define CIFS_ALL_CHANNELS_SET(ses) \
((1UL << (ses)->chan_count) - 1)
+#define CIFS_ALL_CHANS_GOOD(ses) \
+ (!(ses)->chans_need_reconnect)
#define CIFS_ALL_CHANS_NEED_RECONNECT(ses) \
((ses)->chans_need_reconnect == CIFS_ALL_CHANNELS_SET(ses))
#define CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses) \
((ses)->chans_need_reconnect = CIFS_ALL_CHANNELS_SET(ses))
#define CIFS_CHAN_NEEDS_RECONNECT(ses, index) \
test_bit((index), &(ses)->chans_need_reconnect)
+#define CIFS_CHAN_IN_RECONNECT(ses, index) \
+ ((ses)->chans[(index)].in_reconnect)
struct cifs_chan chans[CIFS_MAX_CHANNELS];
size_t chan_count;
@@ -1009,18 +1104,35 @@ cap_unix(struct cifs_ses *ses)
return ses->server->vals->cap_unix & ses->capabilities;
}
-struct cached_fid {
- bool is_valid:1; /* Do we have a useable root fid */
- bool file_all_info_is_valid:1;
- bool has_lease:1;
- unsigned long time; /* jiffies of when lease was taken */
- struct kref refcount;
- struct cifs_fid *fid;
- struct mutex fid_mutex;
- struct cifs_tcon *tcon;
- struct dentry *dentry;
- struct work_struct lease_break;
- struct smb2_file_all_info file_all_info;
+/*
+ * common struct for holding inode info when searching for or updating an
+ * inode with new info
+ */
+
+#define CIFS_FATTR_DFS_REFERRAL 0x1
+#define CIFS_FATTR_DELETE_PENDING 0x2
+#define CIFS_FATTR_NEED_REVAL 0x4
+#define CIFS_FATTR_INO_COLLISION 0x8
+#define CIFS_FATTR_UNKNOWN_NLINK 0x10
+#define CIFS_FATTR_FAKE_ROOT_INO 0x20
+
+struct cifs_fattr {
+ u32 cf_flags;
+ u32 cf_cifsattrs;
+ u64 cf_uniqueid;
+ u64 cf_eof;
+ u64 cf_bytes;
+ u64 cf_createtime;
+ kuid_t cf_uid;
+ kgid_t cf_gid;
+ umode_t cf_mode;
+ dev_t cf_rdev;
+ unsigned int cf_nlink;
+ unsigned int cf_dtype;
+ struct timespec64 cf_atime;
+ struct timespec64 cf_mtime;
+ struct timespec64 cf_ctime;
+ u32 cf_cifstag;
};
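For illustration only (a hedged sketch; the values are made up), a caller builds a cifs_fattr from server metadata and sets cf_flags bits to steer later inode handling:

	struct cifs_fattr fattr = {
		.cf_uniqueid = 42,		/* illustrative unique id */
		.cf_mode     = S_IFREG | 0644,
		.cf_nlink    = 1,
	};
	fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;	/* force revalidation */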
/*
@@ -1031,6 +1143,7 @@ struct cifs_tcon {
struct list_head tcon_list;
int tc_count;
struct list_head rlist; /* reconnect list */
+ spinlock_t tc_lock; /* protect anything here that is not protected by another lock */
atomic_t num_local_opens; /* num of all opens including disconnected */
atomic_t num_remote_opens; /* num of all network opens on server */
struct list_head openFileList;
@@ -1115,11 +1228,12 @@ struct cifs_tcon {
struct fscache_volume *fscache; /* cookie for share */
#endif
struct list_head pending_opens; /* list of incomplete opens */
- struct cached_fid crfid; /* Cached root fid */
+ struct cached_fid *cfid; /* Cached root fid */
/* BB add field for back pointer to sb struct(s)? */
#ifdef CONFIG_CIFS_DFS_UPCALL
struct list_head ulist; /* cache update list */
#endif
+ struct delayed_work query_interfaces; /* query interfaces workqueue job */
};
/*
@@ -1396,20 +1510,16 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
#define CIFS_CACHE_RW_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG)
#define CIFS_CACHE_RHW_FLG (CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG)
-#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE))
+#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE))
#define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG)
-#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE))
+#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE))
/*
* One of these for each file inode
*/
struct cifsInodeInfo {
- struct {
- /* These must be contiguous */
- struct inode vfs_inode; /* the VFS's inode record */
- struct netfs_i_context netfs_ctx; /* Netfslib context */
- };
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
bool can_cache_brlcks;
struct list_head llist; /* locks held by this inode */
/*
@@ -1448,7 +1558,7 @@ struct cifsInodeInfo {
static inline struct cifsInodeInfo *
CIFS_I(struct inode *inode)
{
- return container_of(inode, struct cifsInodeInfo, vfs_inode);
+ return container_of(inode, struct cifsInodeInfo, netfs.inode);
}
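Since netfs_inode embeds the VFS inode, the two conversions are exact inverses; a sketch (no API beyond this hunk):

	struct cifsInodeInfo *cinode = CIFS_I(inode);	/* container_of(...) */
	struct inode *vfs = &cinode->netfs.inode;	/* same pointer as 'inode' */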
static inline struct cifs_sb_info *
@@ -1641,37 +1751,6 @@ struct file_list {
struct cifsFileInfo *cfile;
};
-/*
- * common struct for holding inode info when searching for or updating an
- * inode with new info
- */
-
-#define CIFS_FATTR_DFS_REFERRAL 0x1
-#define CIFS_FATTR_DELETE_PENDING 0x2
-#define CIFS_FATTR_NEED_REVAL 0x4
-#define CIFS_FATTR_INO_COLLISION 0x8
-#define CIFS_FATTR_UNKNOWN_NLINK 0x10
-#define CIFS_FATTR_FAKE_ROOT_INO 0x20
-
-struct cifs_fattr {
- u32 cf_flags;
- u32 cf_cifsattrs;
- u64 cf_uniqueid;
- u64 cf_eof;
- u64 cf_bytes;
- u64 cf_createtime;
- kuid_t cf_uid;
- kgid_t cf_gid;
- umode_t cf_mode;
- dev_t cf_rdev;
- unsigned int cf_nlink;
- unsigned int cf_dtype;
- struct timespec64 cf_atime;
- struct timespec64 cf_mtime;
- struct timespec64 cf_ctime;
- u32 cf_cifstag;
-};
-
static inline void free_dfs_info_param(struct dfs_info3_param *param)
{
if (param) {
@@ -1795,33 +1874,78 @@ require use of the stronger protocol */
*/
/****************************************************************************
- * Locking notes. All updates to global variables and lists should be
- * protected by spinlocks or semaphores.
+ * Here are all the locks (spinlock, mutex, semaphore) in cifs.ko, arranged
+ * in locking order: if two locks are to be held together, the lock that
+ * appears higher in this list must be taken before the other.
+ *
+ * If you hold a lock that is lower in this list and need to take a higher
+ * lock (or if you think that one of the functions you're calling may need
+ * to), first drop the lock you hold, pick up the higher lock, then the lower
+ * one. This ensures that locks are only ever taken in one direction in the
+ * table below (top to bottom).
+ *
+ * Also, if you expect a function to be called with a lock held, document
+ * this explicitly in the comment above your function definition.
*
- * Spinlocks
- * ---------
- * GlobalMid_Lock protects:
- * list operations on pending_mid_q and oplockQ
- * updates to XID counters, multiplex id and SMB sequence numbers
- * list operations on global DnotifyReqList
- * updates to ses->status and TCP_Server_Info->tcpStatus
- * updates to server->CurrentMid
- * tcp_ses_lock protects:
- * list operations on tcp and SMB session lists
- * tcon->open_file_lock protects the list of open files hanging off the tcon
- * inode->open_file_lock protects the openFileList hanging off the inode
- * cfile->file_info_lock protects counters and fields in cifs file struct
- * f_owner.lock protects certain per file struct operations
- * mapping->page_lock protects certain per page operations
+ * Also, keep critical sections (lock hold time) as short as possible.
+ * Blocking or calling other functions with a lock held always increases the
+ * risk of deadlock.
*
- * Note that the cifs_tcon.open_file_lock should be taken before
- * not after the cifsInodeInfo.open_file_lock
+ * Following these rules avoids unnecessary deadlocks, which can get really
+ * hard to debug. Also, please add any new lock that you introduce to this
+ * list in the correct order.
*
- * Semaphores
- * ----------
- * cifsInodeInfo->lock_sem protects:
- * the list of locks held by the inode
+ * Please update this list whenever you introduce new locks in your changes,
+ * or in case some existing locks are missing, and make sure each entry is
+ * placed according to the expected locking order.
*
+ * =====================================================================================
+ * Lock Protects Initialization fn
+ * =====================================================================================
+ * vol_list_lock
+ * vol_info->ctx_lock vol_info->ctx
+ * cifs_sb_info->tlink_tree_lock cifs_sb_info->tlink_tree cifs_setup_cifs_sb
+ * TCP_Server_Info-> TCP_Server_Info cifs_get_tcp_session
+ * reconnect_mutex
+ * TCP_Server_Info->srv_mutex TCP_Server_Info cifs_get_tcp_session
+ * cifs_ses->session_mutex cifs_ses sesInfoAlloc
+ * cifs_tcon
+ * cifs_tcon->open_file_lock cifs_tcon->openFileList tconInfoAlloc
+ * cifs_tcon->pending_opens
+ * cifs_tcon->stat_lock cifs_tcon->bytes_read tconInfoAlloc
+ * cifs_tcon->bytes_written
+ * cifs_tcp_ses_lock cifs_tcp_ses_list sesInfoAlloc
+ * GlobalMid_Lock GlobalMaxActiveXid init_cifs
+ * GlobalCurrentXid
+ * GlobalTotalActiveXid
+ * TCP_Server_Info->srv_lock (anything in struct not protected by another lock and can change)
+ * TCP_Server_Info->mid_lock TCP_Server_Info->pending_mid_q cifs_get_tcp_session
+ * ->CurrentMid
+ * (any changes in mid_q_entry fields)
+ * TCP_Server_Info->req_lock TCP_Server_Info->in_flight cifs_get_tcp_session
+ * ->credits
+ * ->echo_credits
+ * ->oplock_credits
+ * ->reconnect_instance
+ * cifs_ses->ses_lock (anything that is not protected by another lock and can change)
+ * cifs_ses->iface_lock cifs_ses->iface_list sesInfoAlloc
+ * ->iface_count
+ * ->iface_last_update
+ * cifs_ses->chan_lock cifs_ses->chans
+ * ->chans_need_reconnect
+ * ->chans_in_reconnect
+ * cifs_tcon->tc_lock (anything that is not protected by another lock and can change)
+ * cifsInodeInfo->open_file_lock cifsInodeInfo->openFileList cifs_alloc_inode
+ * cifsInodeInfo->writers_lock cifsInodeInfo->writers cifsInodeInfo_alloc
+ * cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once
+ * ->can_cache_brlcks
+ * cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc
+ * cached_fid->fid_mutex cifs_tcon->crfid tconInfoAlloc
+ * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo
+ * cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo
+ * ->invalidHandle initiate_cifs_search
+ * ->oplock_break_cancelled
+ * cifs_aio_ctx->aio_mutex cifs_aio_ctx cifs_aio_ctx_alloc
****************************************************************************/
#ifdef DECLARE_GLOBALS_HERE
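A sketch of the drop-and-reacquire rule from the table above (the surrounding function is hypothetical; ses_lock ranks above tc_lock):

	/* wrong order avoided: we hold tc_lock but need ses_lock */
	spin_unlock(&tcon->tc_lock);
	spin_lock(&ses->ses_lock);
	spin_lock(&tcon->tc_lock);
	/* ... re-check any state that may have changed while unlocked ... */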
@@ -1837,47 +1961,44 @@ require use of the stronger protocol */
* sessions (and from that the tree connections) can be found
* by iterating over cifs_tcp_ses_list
*/
-GLOBAL_EXTERN struct list_head cifs_tcp_ses_list;
+extern struct list_head cifs_tcp_ses_list;
/*
* This lock protects the cifs_tcp_ses_list, the list of smb sessions per
* tcp session, and the list of tcon's per smb session. It also protects
- * the reference counters for the server, smb session, and tcon. It also
- * protects some fields in the TCP_Server_Info struct such as dstaddr. Finally,
- * changes to the tcon->tidStatus should be done while holding this lock.
+ * the reference counters for the server, smb session, and tcon.
* generally the locks should be taken in order tcp_ses_lock before
* tcon->open_file_lock and that before file->file_info_lock since the
* structure order is cifs_socket-->cifs_ses-->cifs_tcon-->cifs_file
*/
-GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock;
+extern spinlock_t cifs_tcp_ses_lock;
/*
* Global transaction id (XID) information
*/
-GLOBAL_EXTERN unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */
-GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
-GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
-GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */
- /* on midQ entries */
+extern unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */
+extern unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
+extern unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
+extern spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */
+
/*
* Global counters, updated atomically
*/
-GLOBAL_EXTERN atomic_t sesInfoAllocCount;
-GLOBAL_EXTERN atomic_t tconInfoAllocCount;
-GLOBAL_EXTERN atomic_t tcpSesNextId;
-GLOBAL_EXTERN atomic_t tcpSesAllocCount;
-GLOBAL_EXTERN atomic_t tcpSesReconnectCount;
-GLOBAL_EXTERN atomic_t tconInfoReconnectCount;
+extern atomic_t sesInfoAllocCount;
+extern atomic_t tconInfoAllocCount;
+extern atomic_t tcpSesNextId;
+extern atomic_t tcpSesAllocCount;
+extern atomic_t tcpSesReconnectCount;
+extern atomic_t tconInfoReconnectCount;
/* Various Debug counters */
-GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */
+extern atomic_t buf_alloc_count; /* current number allocated */
+extern atomic_t small_buf_alloc_count;
#ifdef CONFIG_CIFS_STATS2
-GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */
-GLOBAL_EXTERN atomic_t totSmBufAllocCount;
+extern atomic_t total_buf_alloc_count; /* total allocated over all time */
+extern atomic_t total_small_buf_alloc_count;
extern unsigned int slow_rsp_threshold; /* number of secs before logging */
#endif
-GLOBAL_EXTERN atomic_t smBufAllocCount;
-GLOBAL_EXTERN atomic_t midCount;
/* Misc globals */
extern bool enable_oplocks; /* enable or disable oplocks */
@@ -1894,6 +2015,7 @@ extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
extern unsigned int cifs_min_small; /* min size of small buf pool */
extern unsigned int cifs_max_pending; /* MAX requests at once to server*/
extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */
+extern atomic_t mid_count;
void cifs_oplock_break(struct work_struct *work);
void cifs_queue_oplock_break(struct cifsFileInfo *cfile);
@@ -1911,11 +2033,13 @@ extern mempool_t *cifs_mid_poolp;
/* Operations for different SMB versions */
#define SMB1_VERSION_STRING "1.0"
+#define SMB20_VERSION_STRING "2.0"
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
extern struct smb_version_operations smb1_operations;
extern struct smb_version_values smb1_values;
-#define SMB20_VERSION_STRING "2.0"
extern struct smb_version_operations smb20_operations;
extern struct smb_version_values smb20_values;
+#endif /* CIFS_ALLOW_INSECURE_LEGACY */
#define SMB21_VERSION_STRING "2.1"
extern struct smb_version_operations smb21_operations;
extern struct smb_version_values smb21_values;
@@ -1979,4 +2103,22 @@ static inline bool cifs_is_referral_server(struct cifs_tcon *tcon,
return is_tcon_dfs(tcon) || (ref && (ref->flags & DFSREF_REFERRAL_SERVER));
}
+static inline u64 cifs_flock_len(const struct file_lock *fl)
+{
+ return (u64)fl->fl_end - fl->fl_start + 1;
+}
+
+static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses)
+{
+ if (WARN_ON_ONCE(!ses || !ses->server))
+ return 0;
+ /*
+ * Limit the workstation name to no more than 15 chars when using insecure
+ * dialects, as some legacy servers require it during NTLMSSP.
+ */
+ if (ses->server->dialect <= SMB20_PROT_ID)
+ return min_t(size_t, sizeof(ses->workstation_name), RFC1001_NAME_LEN_WITH_NULL);
+ return sizeof(ses->workstation_name);
+}
+
#endif /* _CIFS_GLOB_H */
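Usage sketches for the two new helpers (callers and buffers are hypothetical):

	u64 len = cifs_flock_len(fl);	/* inclusive range [0, 4095] yields 4096 */

	/* bound the NTLMSSP workstation-name copy for this session */
	strscpy(dst, ses->workstation_name, ntlmssp_workstation_name_size(ses));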
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 0df3b24a0bf4..3bc94bcc7177 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -78,12 +78,8 @@ extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
extern char *cifs_compose_mount_options(const char *sb_mountdata,
const char *fullpath, const struct dfs_info3_param *ref,
char **devname);
-/* extern void renew_parental_timestamps(struct dentry *direntry);*/
-extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
- struct TCP_Server_Info *server);
-extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
-extern void cifs_delete_mid(struct mid_q_entry *mid);
-extern void cifs_mid_q_entry_release(struct mid_q_entry *midEntry);
+extern void delete_mid(struct mid_q_entry *mid);
+extern void release_mid(struct mid_q_entry *mid);
extern void cifs_wake_up_task(struct mid_q_entry *mid);
extern int cifs_handle_standard(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
@@ -155,7 +151,7 @@ extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
struct cifsFileInfo **ret_file);
-extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server);
+extern unsigned int smbCalcSize(void *buf);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
struct TCP_Server_Info *server);
extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
@@ -521,6 +517,7 @@ extern int generate_smb30signingkey(struct cifs_ses *ses,
extern int generate_smb311signingkey(struct cifs_ses *ses,
struct TCP_Server_Info *server);
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
extern int CIFSSMBCopy(unsigned int xid,
struct cifs_tcon *source_tcon,
const char *fromName,
@@ -551,6 +548,7 @@ extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
const struct nls_table *nls_codepage, int remap_special_chars);
extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
+#endif /* CIFS_ALLOW_INSECURE_LEGACY */
extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
extern bool couldbe_mf_symlink(const struct cifs_fattr *fattr);
extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
@@ -599,7 +597,6 @@ enum securityEnum cifs_select_sectype(struct TCP_Server_Info *,
struct cifs_aio_ctx *cifs_aio_ctx_alloc(void);
void cifs_aio_ctx_release(struct kref *refcount);
int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw);
-void smb2_cached_lease_break(struct work_struct *work);
int cifs_alloc_hash(const char *name, struct crypto_shash **shash,
struct sdesc **sdesc);
@@ -619,6 +616,15 @@ unsigned int
cifs_ses_get_chan_index(struct cifs_ses *ses,
struct TCP_Server_Info *server);
void
+cifs_chan_set_in_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
+void
+cifs_chan_clear_in_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
+bool
+cifs_chan_in_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
+void
cifs_chan_set_need_reconnect(struct cifs_ses *ses,
struct TCP_Server_Info *server);
void
@@ -627,6 +633,13 @@ cifs_chan_clear_need_reconnect(struct cifs_ses *ses,
bool
cifs_chan_needs_reconnect(struct cifs_ses *ses,
struct TCP_Server_Info *server);
+bool
+cifs_chan_is_iface_active(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
+int
+cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server);
+int
+SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon);
void extract_unc_hostname(const char *unc, const char **h, size_t *len);
int copy_path_name(char *dst, const char *src);
diff --git a/fs/cifs/cifsroot.c b/fs/cifs/cifsroot.c
index 9e91a5a40aae..56ec1b233f52 100644
--- a/fs/cifs/cifsroot.c
+++ b/fs/cifs/cifsroot.c
@@ -59,7 +59,7 @@ static int __init cifs_root_setup(char *line)
pr_err("Root-CIFS: UNC path too long\n");
return 1;
}
- strlcpy(root_dev, line, len);
+ strscpy(root_dev, line, len);
srvaddr = parse_srvaddr(&line[2], s);
if (*s) {
int n = snprintf(root_opts,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 47e927c4ff8d..7aa91e272027 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -29,7 +29,6 @@
#include "cifsproto.h"
#include "cifs_unicode.h"
#include "cifs_debug.h"
-#include "smb2proto.h"
#include "fscache.h"
#include "smbdirect.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -62,52 +61,6 @@ static struct {
#define CIFS_NUM_PROT 1
#endif /* CIFS_POSIX */
-/*
- * Mark as invalid, all open files on tree connections since they
- * were closed when session to server was lost.
- */
-void
-cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
-{
- struct cifsFileInfo *open_file = NULL;
- struct list_head *tmp;
- struct list_head *tmp1;
-
- /* only send once per connect */
- spin_lock(&cifs_tcp_ses_lock);
- if ((tcon->ses->status != CifsGood) || (tcon->status != TID_NEED_RECON)) {
- spin_unlock(&cifs_tcp_ses_lock);
- return;
- }
- tcon->status = TID_IN_FILES_INVALIDATE;
- spin_unlock(&cifs_tcp_ses_lock);
-
- /* list all files open on tree connection and mark them invalid */
- spin_lock(&tcon->open_file_lock);
- list_for_each_safe(tmp, tmp1, &tcon->openFileList) {
- open_file = list_entry(tmp, struct cifsFileInfo, tlist);
- open_file->invalidHandle = true;
- open_file->oplock_break_cancelled = true;
- }
- spin_unlock(&tcon->open_file_lock);
-
- mutex_lock(&tcon->crfid.fid_mutex);
- tcon->crfid.is_valid = false;
- /* cached handle is not valid, so SMB2_CLOSE won't be sent below */
- close_cached_dir_lease_locked(&tcon->crfid);
- memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid));
- mutex_unlock(&tcon->crfid.fid_mutex);
-
- spin_lock(&cifs_tcp_ses_lock);
- if (tcon->status == TID_IN_FILES_INVALIDATE)
- tcon->status = TID_NEED_TCON;
- spin_unlock(&cifs_tcp_ses_lock);
-
- /*
- * BB Add call to invalidate_inodes(sb) for all superblocks mounted
- * to this tcon.
- */
-}
/* reconnect the socket, tcon, and smb session if needed */
static int
@@ -134,18 +87,18 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
* only tree disconnect, open, and write, (and ulogoff which does not
* have tcon) are allowed as we start force umount
*/
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (tcon->status == TID_EXITING) {
if (smb_command != SMB_COM_WRITE_ANDX &&
smb_command != SMB_COM_OPEN_ANDX &&
smb_command != SMB_COM_TREE_DISCONNECT) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
cifs_dbg(FYI, "can not send cmd %d while umounting\n",
smb_command);
return -ENODEV;
}
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
retries = server->nr_targets;
@@ -165,12 +118,12 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
}
/* are we still trying to reconnect? */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsNeedReconnect) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
break;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
if (retries && --retries)
continue;
@@ -201,13 +154,13 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
* and the server never sends an answer the socket will be closed
* and tcpStatus set to reconnect.
*/
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedReconnect) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
rc = -EHOSTDOWN;
goto out;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
/*
* need to prevent multiple threads trying to simultaneously
@@ -457,52 +410,6 @@ decode_ext_sec_blob(struct cifs_ses *ses, NEGOTIATE_RSP *pSMBr)
return 0;
}
-int
-cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
-{
- bool srv_sign_required = server->sec_mode & server->vals->signing_required;
- bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled;
- bool mnt_sign_enabled = global_secflags & CIFSSEC_MAY_SIGN;
-
- /*
- * Is signing required by mnt options? If not then check
- * global_secflags to see if it is there.
- */
- if (!mnt_sign_required)
- mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) ==
- CIFSSEC_MUST_SIGN);
-
- /*
- * If signing is required then it's automatically enabled too,
- * otherwise, check to see if the secflags allow it.
- */
- mnt_sign_enabled = mnt_sign_required ? mnt_sign_required :
- (global_secflags & CIFSSEC_MAY_SIGN);
-
- /* If server requires signing, does client allow it? */
- if (srv_sign_required) {
- if (!mnt_sign_enabled) {
- cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!\n");
- return -ENOTSUPP;
- }
- server->sign = true;
- }
-
- /* If client requires signing, does server allow it? */
- if (mnt_sign_required) {
- if (!srv_sign_enabled) {
- cifs_dbg(VFS, "Server does not support signing!\n");
- return -ENOTSUPP;
- }
- server->sign = true;
- }
-
- if (cifs_rdma_enabled(server) && server->sign)
- cifs_dbg(VFS, "Signing is enabled, and RDMA read/write will be disabled\n");
-
- return 0;
-}
-
static bool
should_set_ext_sec_flag(enum securityEnum sectype)
{
@@ -684,7 +591,7 @@ cifs_echo_callback(struct mid_q_entry *mid)
struct TCP_Server_Info *server = mid->callback_data;
struct cifs_credits credits = { .value = 1, .instance = 0 };
- DeleteMidQEntry(mid);
+ release_mid(mid);
add_credits(server, &credits, CIFS_ECHO_OP);
}
@@ -1379,184 +1286,6 @@ openRetry:
return rc;
}
-/*
- * Discard any remaining data in the current SMB. To do this, we borrow the
- * current bigbuf.
- */
-int
-cifs_discard_remaining_data(struct TCP_Server_Info *server)
-{
- unsigned int rfclen = server->pdu_size;
- int remaining = rfclen + server->vals->header_preamble_size -
- server->total_read;
-
- while (remaining > 0) {
- int length;
-
- length = cifs_discard_from_socket(server,
- min_t(size_t, remaining,
- CIFSMaxBufSize + MAX_HEADER_SIZE(server)));
- if (length < 0)
- return length;
- server->total_read += length;
- remaining -= length;
- }
-
- return 0;
-}
-
-static int
-__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid,
- bool malformed)
-{
- int length;
-
- length = cifs_discard_remaining_data(server);
- dequeue_mid(mid, malformed);
- mid->resp_buf = server->smallbuf;
- server->smallbuf = NULL;
- return length;
-}
-
-static int
-cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
-{
- struct cifs_readdata *rdata = mid->callback_data;
-
- return __cifs_readv_discard(server, mid, rdata->result);
-}
-
-int
-cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
-{
- int length, len;
- unsigned int data_offset, data_len;
- struct cifs_readdata *rdata = mid->callback_data;
- char *buf = server->smallbuf;
- unsigned int buflen = server->pdu_size +
- server->vals->header_preamble_size;
- bool use_rdma_mr = false;
-
- cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n",
- __func__, mid->mid, rdata->offset, rdata->bytes);
-
- /*
- * read the rest of READ_RSP header (sans Data array), or whatever we
- * can if there's not enough data. At this point, we've read down to
- * the Mid.
- */
- len = min_t(unsigned int, buflen, server->vals->read_rsp_size) -
- HEADER_SIZE(server) + 1;
-
- length = cifs_read_from_socket(server,
- buf + HEADER_SIZE(server) - 1, len);
- if (length < 0)
- return length;
- server->total_read += length;
-
- if (server->ops->is_session_expired &&
- server->ops->is_session_expired(buf)) {
- cifs_reconnect(server, true);
- return -1;
- }
-
- if (server->ops->is_status_pending &&
- server->ops->is_status_pending(buf, server)) {
- cifs_discard_remaining_data(server);
- return -1;
- }
-
- /* set up first two iov for signature check and to get credits */
- rdata->iov[0].iov_base = buf;
- rdata->iov[0].iov_len = server->vals->header_preamble_size;
- rdata->iov[1].iov_base = buf + server->vals->header_preamble_size;
- rdata->iov[1].iov_len =
- server->total_read - server->vals->header_preamble_size;
- cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
- rdata->iov[0].iov_base, rdata->iov[0].iov_len);
- cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
- rdata->iov[1].iov_base, rdata->iov[1].iov_len);
-
- /* Was the SMB read successful? */
- rdata->result = server->ops->map_error(buf, false);
- if (rdata->result != 0) {
- cifs_dbg(FYI, "%s: server returned error %d\n",
- __func__, rdata->result);
- /* normal error on read response */
- return __cifs_readv_discard(server, mid, false);
- }
-
- /* Is there enough to get to the rest of the READ_RSP header? */
- if (server->total_read < server->vals->read_rsp_size) {
- cifs_dbg(FYI, "%s: server returned short header. got=%u expected=%zu\n",
- __func__, server->total_read,
- server->vals->read_rsp_size);
- rdata->result = -EIO;
- return cifs_readv_discard(server, mid);
- }
-
- data_offset = server->ops->read_data_offset(buf) +
- server->vals->header_preamble_size;
- if (data_offset < server->total_read) {
- /*
- * win2k8 sometimes sends an offset of 0 when the read
- * is beyond the EOF. Treat it as if the data starts just after
- * the header.
- */
- cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n",
- __func__, data_offset);
- data_offset = server->total_read;
- } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
- /* data_offset is beyond the end of smallbuf */
- cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n",
- __func__, data_offset);
- rdata->result = -EIO;
- return cifs_readv_discard(server, mid);
- }
-
- cifs_dbg(FYI, "%s: total_read=%u data_offset=%u\n",
- __func__, server->total_read, data_offset);
-
- len = data_offset - server->total_read;
- if (len > 0) {
- /* read any junk before data into the rest of smallbuf */
- length = cifs_read_from_socket(server,
- buf + server->total_read, len);
- if (length < 0)
- return length;
- server->total_read += length;
- }
-
- /* how much data is in the response? */
-#ifdef CONFIG_CIFS_SMB_DIRECT
- use_rdma_mr = rdata->mr;
-#endif
- data_len = server->ops->read_data_length(buf, use_rdma_mr);
- if (!use_rdma_mr && (data_offset + data_len > buflen)) {
- /* data_len is corrupt -- discard frame */
- rdata->result = -EIO;
- return cifs_readv_discard(server, mid);
- }
-
- length = rdata->read_into_pages(server, rdata, data_len);
- if (length < 0)
- return length;
-
- server->total_read += length;
-
- cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
- server->total_read, buflen, data_len);
-
- /* discard anything left over */
- if (server->total_read < buflen)
- return cifs_readv_discard(server, mid);
-
- dequeue_mid(mid, false);
- mid->resp_buf = server->smallbuf;
- server->smallbuf = NULL;
- return length;
-}
-
static void
cifs_readv_callback(struct mid_q_entry *mid)
{
@@ -1607,7 +1336,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
}
queue_work(cifsiod_wq, &rdata->work);
- DeleteMidQEntry(mid);
+ release_mid(mid);
add_credits(server, &credits, 0);
}
@@ -1909,183 +1638,6 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
return rc;
}
-void
-cifs_writedata_release(struct kref *refcount)
-{
- struct cifs_writedata *wdata = container_of(refcount,
- struct cifs_writedata, refcount);
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (wdata->mr) {
- smbd_deregister_mr(wdata->mr);
- wdata->mr = NULL;
- }
-#endif
-
- if (wdata->cfile)
- cifsFileInfo_put(wdata->cfile);
-
- kvfree(wdata->pages);
- kfree(wdata);
-}
-
-/*
- * Write failed with a retryable error. Resend the write request. It's also
- * possible that the page was redirtied so re-clean the page.
- */
-static void
-cifs_writev_requeue(struct cifs_writedata *wdata)
-{
- int i, rc = 0;
- struct inode *inode = d_inode(wdata->cfile->dentry);
- struct TCP_Server_Info *server;
- unsigned int rest_len;
-
- server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- i = 0;
- rest_len = wdata->bytes;
- do {
- struct cifs_writedata *wdata2;
- unsigned int j, nr_pages, wsize, tailsz, cur_len;
-
- wsize = server->ops->wp_retry_size(inode);
- if (wsize < rest_len) {
- nr_pages = wsize / PAGE_SIZE;
- if (!nr_pages) {
- rc = -ENOTSUPP;
- break;
- }
- cur_len = nr_pages * PAGE_SIZE;
- tailsz = PAGE_SIZE;
- } else {
- nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE);
- cur_len = rest_len;
- tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE;
- }
-
- wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
- if (!wdata2) {
- rc = -ENOMEM;
- break;
- }
-
- for (j = 0; j < nr_pages; j++) {
- wdata2->pages[j] = wdata->pages[i + j];
- lock_page(wdata2->pages[j]);
- clear_page_dirty_for_io(wdata2->pages[j]);
- }
-
- wdata2->sync_mode = wdata->sync_mode;
- wdata2->nr_pages = nr_pages;
- wdata2->offset = page_offset(wdata2->pages[0]);
- wdata2->pagesz = PAGE_SIZE;
- wdata2->tailsz = tailsz;
- wdata2->bytes = cur_len;
-
- rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY,
- &wdata2->cfile);
- if (!wdata2->cfile) {
- cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n",
- rc);
- if (!is_retryable_error(rc))
- rc = -EBADF;
- } else {
- wdata2->pid = wdata2->cfile->pid;
- rc = server->ops->async_writev(wdata2,
- cifs_writedata_release);
- }
-
- for (j = 0; j < nr_pages; j++) {
- unlock_page(wdata2->pages[j]);
- if (rc != 0 && !is_retryable_error(rc)) {
- SetPageError(wdata2->pages[j]);
- end_page_writeback(wdata2->pages[j]);
- put_page(wdata2->pages[j]);
- }
- }
-
- kref_put(&wdata2->refcount, cifs_writedata_release);
- if (rc) {
- if (is_retryable_error(rc))
- continue;
- i += nr_pages;
- break;
- }
-
- rest_len -= cur_len;
- i += nr_pages;
- } while (i < wdata->nr_pages);
-
- /* cleanup remaining pages from the original wdata */
- for (; i < wdata->nr_pages; i++) {
- SetPageError(wdata->pages[i]);
- end_page_writeback(wdata->pages[i]);
- put_page(wdata->pages[i]);
- }
-
- if (rc != 0 && !is_retryable_error(rc))
- mapping_set_error(inode->i_mapping, rc);
- kref_put(&wdata->refcount, cifs_writedata_release);
-}
-
-void
-cifs_writev_complete(struct work_struct *work)
-{
- struct cifs_writedata *wdata = container_of(work,
- struct cifs_writedata, work);
- struct inode *inode = d_inode(wdata->cfile->dentry);
- int i = 0;
-
- if (wdata->result == 0) {
- spin_lock(&inode->i_lock);
- cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes);
- spin_unlock(&inode->i_lock);
- cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink),
- wdata->bytes);
- } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN)
- return cifs_writev_requeue(wdata);
-
- for (i = 0; i < wdata->nr_pages; i++) {
- struct page *page = wdata->pages[i];
- if (wdata->result == -EAGAIN)
- __set_page_dirty_nobuffers(page);
- else if (wdata->result < 0)
- SetPageError(page);
- end_page_writeback(page);
- cifs_readpage_to_fscache(inode, page);
- put_page(page);
- }
- if (wdata->result != -EAGAIN)
- mapping_set_error(inode->i_mapping, wdata->result);
- kref_put(&wdata->refcount, cifs_writedata_release);
-}
-
-struct cifs_writedata *
-cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
-{
- struct page **pages =
- kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (pages)
- return cifs_writedata_direct_alloc(pages, complete);
-
- return NULL;
-}
-
-struct cifs_writedata *
-cifs_writedata_direct_alloc(struct page **pages, work_func_t complete)
-{
- struct cifs_writedata *wdata;
-
- wdata = kzalloc(sizeof(*wdata), GFP_NOFS);
- if (wdata != NULL) {
- wdata->pages = pages;
- kref_init(&wdata->refcount);
- INIT_LIST_HEAD(&wdata->list);
- init_completion(&wdata->done);
- INIT_WORK(&wdata->work, complete);
- }
- return wdata;
-}
-
/*
* Check the mid_state and signature on received buffer (if any), and queue the
* workqueue completion task.
@@ -2132,7 +1684,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
}
queue_work(cifsiod_wq, &wdata->work);
- DeleteMidQEntry(mid);
+ release_mid(mid);
add_credits(tcon->ses->server, &credits, 0);
}
@@ -2558,7 +2110,8 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
pLockData->fl_start = le64_to_cpu(parm_data->start);
pLockData->fl_end = pLockData->fl_start +
- le64_to_cpu(parm_data->length) - 1;
+ (le64_to_cpu(parm_data->length) ?
+ le64_to_cpu(parm_data->length) - 1 : 0);
pLockData->fl_pid = -le32_to_cpu(parm_data->pid);
}
}
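The guard above matters because the inclusive end offset is fl_start + length - 1, which underflows to fl_start - 1 when the server reports a zero-length lock; a sketch of the corrected arithmetic:

	__u64 length = le64_to_cpu(parm_data->length);
	/* a zero-length lock degenerates to fl_end == fl_start instead of underflowing */
	pLockData->fl_end = pLockData->fl_start + (length ? length - 1 : 0);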
@@ -3659,7 +3212,6 @@ setACLerrorExit:
return rc;
}
-/* BB fix tabs in this function FIXME BB */
int
CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
const int netfid, __u64 *pExtAttrBits, __u64 *pMask)
@@ -3676,7 +3228,7 @@ CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
GetExtAttrRetry:
rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
- (void **) &pSMBr);
+ (void **) &pSMBr);
if (rc)
return rc;
@@ -3722,7 +3274,7 @@ GetExtAttrRetry:
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
__u16 count = le16_to_cpu(pSMBr->t2.DataCount);
struct file_chattr_info *pfinfo;
- /* BB Do we need a cast or hash here ? */
+
if (count != 16) {
cifs_dbg(FYI, "Invalid size ret in GetExtAttr\n");
rc = -EIO;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index ee3b7c15e884..7ae6f2c08153 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -97,6 +97,10 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
if (!server->hostname)
return -EINVAL;
+ /* if server hostname isn't populated, there's nothing to do here */
+ if (server->hostname[0] == '\0')
+ return 0;
+
len = strlen(server->hostname) + 3;
unc = kmalloc(len, GFP_KERNEL);
@@ -115,10 +119,10 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
goto requeue_resolve;
}
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr,
strlen(ipaddr));
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
kfree(ipaddr);
/* rc == 1 means success here */
@@ -141,6 +145,25 @@ requeue_resolve:
return rc;
}
+static void smb2_query_server_interfaces(struct work_struct *work)
+{
+ int rc;
+ struct cifs_tcon *tcon = container_of(work,
+ struct cifs_tcon,
+ query_interfaces.work);
+
+ /*
+ * query server network interfaces, in case they change
+ */
+ rc = SMB3_request_interfaces(0, tcon);
+ if (rc) {
+ cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
+ __func__, rc);
+ }
+
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+}
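The worker re-queues itself, giving a simple periodic poll. A sketch of how it is armed and torn down (the call sites are assumptions; this hunk does not show them):

	/* at tcon setup (presumably tconInfoAlloc or mount time) */
	INIT_DELAYED_WORK(&tcon->query_interfaces, smb2_query_server_interfaces);
	queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
			   (SMB_INTERFACE_POLL_INTERVAL * HZ));

	/* before freeing the tcon */
	cancel_delayed_work_sync(&tcon->query_interfaces);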
static void cifs_resolve_server(struct work_struct *work)
{
@@ -148,7 +171,7 @@ static void cifs_resolve_server(struct work_struct *work)
struct TCP_Server_Info *server = container_of(work,
struct TCP_Server_Info, resolve.work);
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
/*
* Resolve the hostname again to make sure that IP address is up-to-date.
@@ -159,7 +182,7 @@ static void cifs_resolve_server(struct work_struct *work)
__func__, rc);
}
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
}
/*
@@ -182,17 +205,22 @@ cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
/* If server is a channel, select the primary channel */
pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&pserver->srv_lock);
if (!all_channels) {
pserver->tcpStatus = CifsNeedReconnect;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&pserver->srv_lock);
return;
}
+ spin_unlock(&pserver->srv_lock);
+ spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
spin_lock(&ses->chan_lock);
- for (i = 0; i < ses->chan_count; i++)
+ for (i = 0; i < ses->chan_count; i++) {
+ spin_lock(&ses->chans[i].server->srv_lock);
ses->chans[i].server->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&ses->chans[i].server->srv_lock);
+ }
spin_unlock(&ses->chan_lock);
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -213,7 +241,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
bool mark_smb_session)
{
struct TCP_Server_Info *pserver;
- struct cifs_ses *ses;
+ struct cifs_ses *ses, *nses;
struct cifs_tcon *tcon;
/*
@@ -227,7 +255,11 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) {
+ /* check if iface is still active */
+ if (!cifs_chan_is_iface_active(ses, server))
+ cifs_chan_update_iface(ses, server);
+
spin_lock(&ses->chan_lock);
if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server))
goto next_session;
@@ -241,7 +273,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses))
goto next_session;
- ses->status = CifsNeedReconnect;
+ ses->ses_status = SES_NEED_RECON;
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
tcon->need_reconnect = true;
@@ -267,7 +299,7 @@ cifs_abort_connection(struct TCP_Server_Info *server)
/* do not want to be sending data on a socket we are freeing */
cifs_dbg(FYI, "%s: tearing down socket\n", __func__);
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
if (server->ssocket) {
cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n", server->ssocket->state,
server->ssocket->flags);
@@ -287,7 +319,7 @@ cifs_abort_connection(struct TCP_Server_Info *server)
/* mark submitted MIDs for retry and issue callback */
INIT_LIST_HEAD(&retry_list);
cifs_dbg(FYI, "%s: moving mids to private list\n", __func__);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
list_for_each_entry_safe(mid, nmid, &server->pending_mid_q, qhead) {
kref_get(&mid->refcount);
if (mid->mid_state == MID_REQUEST_SUBMITTED)
@@ -295,30 +327,30 @@ cifs_abort_connection(struct TCP_Server_Info *server)
list_move(&mid->qhead, &retry_list);
mid->mid_flags |= MID_DELETED;
}
- spin_unlock(&GlobalMid_Lock);
- mutex_unlock(&server->srv_mutex);
+ spin_unlock(&server->mid_lock);
+ cifs_server_unlock(server);
cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
list_for_each_entry_safe(mid, nmid, &retry_list, qhead) {
list_del_init(&mid->qhead);
mid->callback(mid);
- cifs_mid_q_entry_release(mid);
+ release_mid(mid);
}
if (cifs_rdma_enabled(server)) {
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
smbd_destroy(server);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
}
}
static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num_targets)
{
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
server->nr_targets = num_targets;
if (server->tcpStatus == CifsExiting) {
/* the demux thread will exit normally next time through the loop */
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
wake_up(&server->response_q);
return false;
}
@@ -328,7 +360,7 @@ static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num
server->hostname);
server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return true;
}
@@ -359,7 +391,7 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
do {
try_to_freeze();
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
if (!cifs_swn_set_server_dstaddr(server)) {
/* resolve the hostname again to make sure that IP address is up-to-date */
@@ -372,26 +404,26 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
else
rc = generic_ip_connect(server);
if (rc) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc);
msleep(3000);
} else {
atomic_inc(&tcpSesReconnectCount);
set_credits(server, 1);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsExiting)
server->tcpStatus = CifsNeedNegotiate;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
cifs_swn_reset_server_dstaddr(server);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
}
} while (server->tcpStatus == CifsNeedReconnect);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedNegotiate)
mod_delayed_work(cifsiod_wq, &server->echo, 0);
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
wake_up(&server->response_q);
return rc;
@@ -453,9 +485,7 @@ static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_
return rc;
}
-static int
-reconnect_dfs_server(struct TCP_Server_Info *server,
- bool mark_smb_session)
+static int reconnect_dfs_server(struct TCP_Server_Info *server)
{
int rc = 0;
const char *refpath = server->current_fullpath + 1;
@@ -479,18 +509,23 @@ reconnect_dfs_server(struct TCP_Server_Info *server,
if (!cifs_tcp_ses_needs_reconnect(server, num_targets))
return 0;
- cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session);
+ /*
+ * Unconditionally mark all sessions & tcons for reconnect as we might be connecting to a
+ * different server or share during failover. This could be improved by only
+ * doing so when we actually connect to a different server or share, though.
+ */
+ cifs_mark_tcp_ses_conns_for_reconnect(server, true);
cifs_abort_connection(server);
do {
try_to_freeze();
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
rc = reconnect_target_unlocked(server, &tl, &target_hint);
if (rc) {
/* Failed to reconnect socket */
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc);
msleep(3000);
continue;
@@ -502,12 +537,12 @@ reconnect_dfs_server(struct TCP_Server_Info *server,
*/
atomic_inc(&tcpSesReconnectCount);
set_credits(server, 1);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsExiting)
server->tcpStatus = CifsNeedNegotiate;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
cifs_swn_reset_server_dstaddr(server);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
} while (server->tcpStatus == CifsNeedReconnect);
@@ -517,11 +552,10 @@ reconnect_dfs_server(struct TCP_Server_Info *server,
dfs_cache_free_tgts(&tl);
/* Need to set up echo worker again once connection has been established */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedNegotiate)
mod_delayed_work(cifsiod_wq, &server->echo, 0);
-
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
wake_up(&server->response_q);
return rc;
@@ -530,14 +564,21 @@ reconnect_dfs_server(struct TCP_Server_Info *server,
int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
{
/* If tcp session is not a dfs connection, then reconnect to last target server */
- spin_lock(&cifs_tcp_ses_lock);
- if (!server->is_dfs_conn || !server->origin_fullpath || !server->leaf_fullpath) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
+ if (!server->is_dfs_conn) {
+ spin_unlock(&server->srv_lock);
return __cifs_reconnect(server, mark_smb_session);
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
- return reconnect_dfs_server(server, mark_smb_session);
+ mutex_lock(&server->refpath_lock);
+ if (!server->origin_fullpath || !server->leaf_fullpath) {
+ mutex_unlock(&server->refpath_lock);
+ return __cifs_reconnect(server, mark_smb_session);
+ }
+ mutex_unlock(&server->refpath_lock);
+
+ return reconnect_dfs_server(server);
}
#else
int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
@@ -624,18 +665,18 @@ server_unresponsive(struct TCP_Server_Info *server)
* 65s kernel_recvmsg times out, and we see that we haven't gotten
* a response in >60s.
*/
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if ((server->tcpStatus == CifsGood ||
server->tcpStatus == CifsNeedNegotiate) &&
(!server->ops->can_echo || server->ops->can_echo(server)) &&
time_after(jiffies, server->lstrp + 3 * server->echo_interval)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n",
(3 * server->echo_interval) / HZ);
cifs_reconnect(server, false);
return true;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return false;
}
@@ -661,9 +702,6 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
int length = 0;
int total_read;
- smb_msg->msg_control = NULL;
- smb_msg->msg_controllen = 0;
-
for (total_read = 0; msg_data_left(smb_msg); total_read += length) {
try_to_freeze();
@@ -680,18 +718,18 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
else
length = sock_recvmsg(server->ssocket, smb_msg, 0);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -ESHUTDOWN;
}
if (server->tcpStatus == CifsNeedReconnect) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
cifs_reconnect(server, false);
return -ECONNABORTED;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
if (length == -ERESTARTSYS ||
length == -EAGAIN ||
@@ -719,7 +757,7 @@ int
cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
unsigned int to_read)
{
- struct msghdr smb_msg;
+ struct msghdr smb_msg = {};
struct kvec iov = {.iov_base = buf, .iov_len = to_read};
iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read);
@@ -729,15 +767,13 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
ssize_t
cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read)
{
- struct msghdr smb_msg;
+ struct msghdr smb_msg = {};
/*
* iov_iter_discard already sets smb_msg.type and count and iov_offset,
* and the empty initializer above zeroes the rest of struct msghdr,
* so there is little left to initialize
*/
- smb_msg.msg_name = NULL;
- smb_msg.msg_namelen = 0;
iov_iter_discard(&smb_msg.msg_iter, READ, to_read);
return cifs_readv_from_socket(server, &smb_msg);
@@ -747,7 +783,7 @@ int
cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
unsigned int page_offset, unsigned int to_read)
{
- struct msghdr smb_msg;
+ struct msghdr smb_msg = {};
struct bio_vec bv = {
.bv_page = page, .bv_len = to_read, .bv_offset = page_offset};
iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read);
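All three receive paths now rely on the empty initializer instead of piecemeal NULL assignments; one sketch covers the idiom:

	/* '= {}' zeroes msg_name, msg_namelen, msg_control, msg_controllen and
	 * friends in one shot, so the assignments removed from
	 * cifs_readv_from_socket() and cifs_discard_from_socket() above are
	 * no longer needed
	 */
	struct msghdr smb_msg = {};
	iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read);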
@@ -803,7 +839,7 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
#ifdef CONFIG_CIFS_STATS2
mid->when_received = jiffies;
#endif
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&mid->server->mid_lock);
if (!malformed)
mid->mid_state = MID_RESPONSE_RECEIVED;
else
@@ -813,12 +849,12 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
* function has finished processing it is a bug.
*/
if (mid->mid_flags & MID_DELETED) {
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&mid->server->mid_lock);
pr_warn_once("trying to dequeue a deleted mid\n");
} else {
list_del_init(&mid->qhead);
mid->mid_flags |= MID_DELETED;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&mid->server->mid_lock);
}
}
@@ -830,7 +866,7 @@ smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
/*
* SMB1 does not use credits.
*/
- if (server->vals->header_preamble_size)
+ if (is_smb1(server))
return 0;
return le16_to_cpu(shdr->CreditRequest);
@@ -857,21 +893,68 @@ handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
dequeue_mid(mid, malformed);
}
+int
+cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
+{
+ bool srv_sign_required = server->sec_mode & server->vals->signing_required;
+ bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled;
+ bool mnt_sign_enabled;
+
+ /*
+ * Is signing required by mnt options? If not then check
+ * global_secflags to see if it is there.
+ */
+ if (!mnt_sign_required)
+ mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) ==
+ CIFSSEC_MUST_SIGN);
+
+ /*
+ * If signing is required then it's automatically enabled too;
+ * otherwise, check whether the secflags allow it.
+ */
+ mnt_sign_enabled = mnt_sign_required ? mnt_sign_required :
+ (global_secflags & CIFSSEC_MAY_SIGN);
+
+ /* If server requires signing, does client allow it? */
+ if (srv_sign_required) {
+ if (!mnt_sign_enabled) {
+ cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!\n");
+ return -EOPNOTSUPP;
+ }
+ server->sign = true;
+ }
+
+ /* If client requires signing, does server allow it? */
+ if (mnt_sign_required) {
+ if (!srv_sign_enabled) {
+ cifs_dbg(VFS, "Server does not support signing!\n");
+ return -EOPNOTSUPP;
+ }
+ server->sign = true;
+ }
+
+ if (cifs_rdma_enabled(server) && server->sign)
+ cifs_dbg(VFS, "Signing is enabled, and RDMA read/write will be disabled\n");
+
+ return 0;
+}
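The checks in cifs_enable_signing() reduce to a small decision table (a reading aid, assuming global_secflags has already been folded into the mnt_* flags as above):

	server requires signing, mount allows it    -> server->sign = true
	server requires signing, mount forbids it   -> -EOPNOTSUPP
	mount requires signing, server supports it  -> server->sign = true
	mount requires signing, server lacks it     -> -EOPNOTSUPP
	neither side requires it                    -> server->sign left as-is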
+
static void clean_demultiplex_info(struct TCP_Server_Info *server)
{
int length;
/* take it off the list, if it's not already */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
list_del_init(&server->tcp_ses_list);
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
cancel_delayed_work_sync(&server->echo);
cancel_delayed_work_sync(&server->resolve);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
server->tcpStatus = CifsExiting;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
wake_up_all(&server->response_q);
/* check if we have blocked requests that need to free */
@@ -902,7 +985,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
struct list_head *tmp, *tmp2;
INIT_LIST_HEAD(&dispose_list);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
cifs_dbg(FYI, "Clearing mid %llu\n", mid_entry->mid);
@@ -911,7 +994,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
list_move(&mid_entry->qhead, &dispose_list);
mid_entry->mid_flags |= MID_DELETED;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
/* now walk dispose list and issue callbacks */
list_for_each_safe(tmp, tmp2, &dispose_list) {
@@ -919,7 +1002,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
cifs_dbg(FYI, "Callback mid %llu\n", mid_entry->mid);
list_del_init(&mid_entry->qhead);
mid_entry->callback(mid_entry);
- cifs_mid_q_entry_release(mid_entry);
+ release_mid(mid_entry);
}
/* 1/8th of sec is more than enough time for them to exit */
msleep(125);
@@ -962,7 +1045,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
/* make sure this will fit in a large buffer */
if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) -
- server->vals->header_preamble_size) {
+ HEADER_PREAMBLE_SIZE(server)) {
cifs_server_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
cifs_reconnect(server, true);
return -ECONNABORTED;
@@ -977,8 +1060,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
/* now read the rest */
length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1,
- pdu_length - HEADER_SIZE(server) + 1
- + server->vals->header_preamble_size);
+ pdu_length - MID_HEADER_SIZE(server));
if (length < 0)
return length;
@@ -993,19 +1075,18 @@ int
cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
char *buf = server->large_buf ? server->bigbuf : server->smallbuf;
- int length;
+ int rc;
/*
* We know that we received enough to get to the MID as we
* checked the pdu_length earlier. Now check to see
- * if the rest of the header is OK. We borrow the length
- * var for the rest of the loop to avoid a new stack var.
+ * if the rest of the header is OK.
*
* 48 bytes is enough to display the header and a little bit
* into the payload for debugging purposes.
*/
- length = server->ops->check_message(buf, server->total_read, server);
- if (length != 0)
+ rc = server->ops->check_message(buf, server->total_read, server);
+ if (rc)
cifs_dump_mem("Bad SMB: ", buf,
min_t(unsigned int, server->total_read, 48));
@@ -1020,9 +1101,9 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return -1;
if (!mid)
- return length;
+ return rc;
- handle_mid(mid, server, buf, length);
+ handle_mid(mid, server, buf, rc);
return 0;
}
@@ -1035,7 +1116,7 @@ smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
/*
* SMB1 does not use credits.
*/
- if (server->vals->header_preamble_size)
+ if (is_smb1(server))
return;
if (shdr->CreditRequest) {
@@ -1046,7 +1127,7 @@ smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_hdr_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits,
le16_to_cpu(shdr->CreditRequest), in_flight);
cifs_server_dbg(FYI, "%s: added %u credits total=%d\n",
@@ -1093,10 +1174,10 @@ cifs_demultiplex_thread(void *p)
if (length < 0)
continue;
- if (server->vals->header_preamble_size == 0)
- server->total_read = 0;
- else
+ if (is_smb1(server))
server->total_read = length;
+ else
+ server->total_read = 0;
/*
* The right amount was read from socket - 4 bytes,
@@ -1111,8 +1192,7 @@ next_pdu:
server->pdu_size = pdu_length;
/* make sure we have enough to get to the MID */
- if (server->pdu_size < HEADER_SIZE(server) - 1 -
- server->vals->header_preamble_size) {
+ if (server->pdu_size < MID_HEADER_SIZE(server)) {
cifs_server_dbg(VFS, "SMB response too short (%u bytes)\n",
server->pdu_size);
cifs_reconnect(server, true);
@@ -1121,9 +1201,8 @@ next_pdu:
/* read down to the MID */
length = cifs_read_from_socket(server,
- buf + server->vals->header_preamble_size,
- HEADER_SIZE(server) - 1
- - server->vals->header_preamble_size);
+ buf + HEADER_PREAMBLE_SIZE(server),
+ MID_HEADER_SIZE(server));
if (length < 0)
continue;
server->total_read += length;
@@ -1159,7 +1238,7 @@ next_pdu:
if (length < 0) {
for (i = 0; i < num_mids; i++)
if (mids[i])
- cifs_mid_q_entry_release(mids[i]);
+ release_mid(mids[i]);
continue;
}
@@ -1186,7 +1265,7 @@ next_pdu:
if (!mids[i]->multiRsp || mids[i]->multiEnd)
mids[i]->callback(mids[i]);
- cifs_mid_q_entry_release(mids[i]);
+ release_mid(mids[i]);
} else if (server->ops->is_oplock_break &&
server->ops->is_oplock_break(bufs[i],
server)) {
@@ -1194,7 +1273,7 @@ next_pdu:
cifs_dbg(FYI, "Received oplock break\n");
} else {
cifs_server_dbg(VFS, "No task to wake, unknown frame received! NumMids %d\n",
- atomic_read(&midCount));
+ atomic_read(&mid_count));
cifs_dump_mem("Received Data is: ", bufs[i],
HEADER_SIZE(server));
smb2_add_credits_from_hdr(bufs[i], server);
@@ -1365,6 +1444,7 @@ match_security(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
return true;
}
+/* this function must be called with srv_lock held */
static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
{
struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
@@ -1425,6 +1505,7 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+ spin_lock(&server->srv_lock);
#ifdef CONFIG_CIFS_DFS_UPCALL
/*
* DFS failover implementation in cifs_reconnect() requires unique tcp sessions for
@@ -1432,15 +1513,20 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx)
* shares or even links that may connect to same server but having completely
* different failover targets.
*/
- if (server->is_dfs_conn)
+ if (server->is_dfs_conn) {
+ spin_unlock(&server->srv_lock);
continue;
+ }
#endif
/*
* Skip ses channels since they're only handled in lower layers
* (e.g. cifs_send_recv).
*/
- if (CIFS_SERVER_IS_CHAN(server) || !match_server(server, ctx))
+ if (CIFS_SERVER_IS_CHAN(server) || !match_server(server, ctx)) {
+ spin_unlock(&server->srv_lock);
continue;
+ }
+ spin_unlock(&server->srv_lock);
++server->srv_count;
spin_unlock(&cifs_tcp_ses_lock);
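The hunk above is an instance of the per-object locking pattern this series moves to: the global list lock is held only for iteration, and each candidate's own spinlock is taken just long enough to inspect its state before the refcount is bumped under the list lock. A hypothetical sketch (names illustrative, not from the patch):

	spin_lock(&global_list_lock);
	list_for_each_entry(obj, &global_list, link) {
		spin_lock(&obj->lock);
		if (!matches(obj, ctx)) {
			spin_unlock(&obj->lock);
			continue;
		}
		spin_unlock(&obj->lock);
		obj->refcount++;	/* still protected by global_list_lock */
		spin_unlock(&global_list_lock);
		return obj;
	}
	spin_unlock(&global_list_lock);
	return NULL;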
@@ -1488,9 +1574,9 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
else
cancel_delayed_work_sync(&server->reconnect);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
server->tcpStatus = CifsExiting;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
cifs_crypto_secmech_release(server);
@@ -1549,13 +1635,13 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
if (primary_server) {
spin_lock(&cifs_tcp_ses_lock);
++primary_server->srv_count;
- tcp_ses->primary_server = primary_server;
spin_unlock(&cifs_tcp_ses_lock);
+ tcp_ses->primary_server = primary_server;
}
init_waitqueue_head(&tcp_ses->response_q);
init_waitqueue_head(&tcp_ses->request_q);
INIT_LIST_HEAD(&tcp_ses->pending_mid_q);
- mutex_init(&tcp_ses->srv_mutex);
+ mutex_init(&tcp_ses->_srv_mutex);
memcpy(tcp_ses->workstation_RFC1001_name,
ctx->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
memcpy(tcp_ses->server_RFC1001_name,
@@ -1566,6 +1652,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
tcp_ses->lstrp = jiffies;
tcp_ses->compress_algorithm = cpu_to_le16(ctx->compression);
spin_lock_init(&tcp_ses->req_lock);
+ spin_lock_init(&tcp_ses->srv_lock);
+ spin_lock_init(&tcp_ses->mid_lock);
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
@@ -1639,9 +1727,9 @@ smbd_connected:
* to the struct since the kernel thread is not created yet; srv_lock
* is taken anyway to keep tcpStatus updates uniform
*/
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcp_ses->srv_lock);
tcp_ses->tcpStatus = CifsNeedNegotiate;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcp_ses->srv_lock);
if ((ctx->max_credits < 20) || (ctx->max_credits > 60000))
tcp_ses->max_credits = SMB2_MAX_CREDITS_AVAILABLE;
@@ -1683,6 +1771,7 @@ out_err:
return ERR_PTR(rc);
}
+/* this function must be called with ses_lock held */
static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx)
{
if (ctx->sectype != Unspecified &&
@@ -1779,7 +1868,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
goto out;
}
- cifs_dbg(FYI, "IPC tcon rc = %d ipc tid = %d\n", rc, tcon->tid);
+ cifs_dbg(FYI, "IPC tcon rc=%d ipc tid=0x%x\n", rc, tcon->tid);
ses->tcon_ipc = tcon;
out:
@@ -1818,10 +1907,17 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
- if (ses->status == CifsExiting)
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_EXITING) {
+ spin_unlock(&ses->ses_lock);
continue;
- if (!match_session(ses, ctx))
+ }
+ if (!match_session(ses, ctx)) {
+ spin_unlock(&ses->ses_lock);
continue;
+ }
+ spin_unlock(&ses->ses_lock);
+
++ses->ses_count;
spin_unlock(&cifs_tcp_ses_lock);
return ses;
@@ -1835,32 +1931,33 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
unsigned int rc, xid;
unsigned int chan_count;
struct TCP_Server_Info *server = ses->server;
- cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
- spin_lock(&cifs_tcp_ses_lock);
- if (ses->status == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_EXITING) {
+ spin_unlock(&ses->ses_lock);
return;
}
+ spin_unlock(&ses->ses_lock);
cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
cifs_dbg(FYI, "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->treeName : "NONE");
+ spin_lock(&cifs_tcp_ses_lock);
if (--ses->ses_count > 0) {
spin_unlock(&cifs_tcp_ses_lock);
return;
}
+ spin_unlock(&cifs_tcp_ses_lock);
/* ses_count can never go negative */
WARN_ON(ses->ses_count < 0);
- if (ses->status == CifsGood)
- ses->status = CifsExiting;
- spin_unlock(&cifs_tcp_ses_lock);
+ if (ses->ses_status == SES_GOOD)
+ ses->ses_status = SES_EXITING;
cifs_free_ipc(ses);
- if (ses->status == CifsExiting && server->ops->logoff) {
+ if (ses->ses_status == SES_EXITING && server->ops->logoff) {
xid = get_xid();
rc = server->ops->logoff(xid, ses);
if (rc)
@@ -1873,7 +1970,6 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
list_del_init(&ses->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- spin_lock(&ses->chan_lock);
chan_count = ses->chan_count;
/* close any extra channels */
@@ -1881,13 +1977,14 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
int i;
for (i = 1; i < chan_count; i++) {
- spin_unlock(&ses->chan_lock);
+ if (ses->chans[i].iface) {
+ kref_put(&ses->chans[i].iface->refcount, release_iface);
+ ses->chans[i].iface = NULL;
+ }
cifs_put_tcp_session(ses->chans[i].server, 0);
- spin_lock(&ses->chan_lock);
ses->chans[i].server = NULL;
}
}
- spin_unlock(&ses->chan_lock);
sesInfoFree(ses);
cifs_put_tcp_session(server, 0);
@@ -2027,18 +2124,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses)
}
}
- ctx->workstation_name = kstrdup(ses->workstation_name, GFP_KERNEL);
- if (!ctx->workstation_name) {
- cifs_dbg(FYI, "Unable to allocate memory for workstation_name\n");
- rc = -ENOMEM;
- kfree(ctx->username);
- ctx->username = NULL;
- kfree_sensitive(ctx->password);
- ctx->password = NULL;
- kfree(ctx->domainname);
- ctx->domainname = NULL;
- goto out_key_put;
- }
+ strscpy(ctx->workstation_name, ses->workstation_name, sizeof(ctx->workstation_name));
out_key_put:
up_read(&key->sem);
@@ -2080,7 +2166,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
ses = cifs_find_smb_ses(server, ctx);
if (ses) {
cifs_dbg(FYI, "Existing smb sess found (status=%d)\n",
- ses->status);
+ ses->ses_status);
spin_lock(&ses->chan_lock);
if (cifs_chan_needs_reconnect(ses, server)) {
@@ -2147,12 +2233,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
if (!ses->domainName)
goto get_ses_fail;
}
- if (ctx->workstation_name) {
- ses->workstation_name = kstrdup(ctx->workstation_name,
- GFP_KERNEL);
- if (!ses->workstation_name)
- goto get_ses_fail;
- }
+
+ strscpy(ses->workstation_name, ctx->workstation_name, sizeof(ses->workstation_name));
+
if (ctx->domainauto)
ses->domainAuto = ctx->domainauto;
ses->cred_uid = ctx->cred_uid;
@@ -2205,6 +2288,7 @@ get_ses_fail:
return ERR_PTR(rc);
}
+/* this function must be called with tc_lock held */
static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
{
if (tcon->status == TID_EXITING)
@@ -2227,16 +2311,17 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
static struct cifs_tcon *
cifs_find_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
{
- struct list_head *tmp;
struct cifs_tcon *tcon;
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &ses->tcon_list) {
- tcon = list_entry(tmp, struct cifs_tcon, tcon_list);
-
- if (!match_tcon(tcon, ctx))
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+ spin_lock(&tcon->tc_lock);
+ if (!match_tcon(tcon, ctx)) {
+ spin_unlock(&tcon->tc_lock);
continue;
+ }
++tcon->tc_count;
+ spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
return tcon;
}
@@ -2260,7 +2345,9 @@ cifs_put_tcon(struct cifs_tcon *tcon)
ses = tcon->ses;
cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (--tcon->tc_count > 0) {
+ spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
return;
}
@@ -2269,8 +2356,12 @@ cifs_put_tcon(struct cifs_tcon *tcon)
WARN_ON(tcon->tc_count < 0);
list_del_init(&tcon->tcon_list);
+ spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
+ /* cancel polling of interfaces */
+ cancel_delayed_work_sync(&tcon->query_interfaces);
+
if (tcon->use_witness) {
int rc;
@@ -2499,6 +2590,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
*/
tcon->retry = ctx->retry;
tcon->nocase = ctx->nocase;
+ tcon->broken_sparse_sup = ctx->no_sparse;
if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING)
tcon->nohandlecache = ctx->nohandlecache;
else
@@ -2507,6 +2599,12 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
tcon->local_lease = ctx->local_lease;
INIT_LIST_HEAD(&tcon->pending_opens);
+ /* schedule query interfaces poll */
+ INIT_DELAYED_WORK(&tcon->query_interfaces,
+ smb2_query_server_interfaces);
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+
spin_lock(&cifs_tcp_ses_lock);
list_add(&tcon->tcon_list, &ses->tcon_list);
spin_unlock(&cifs_tcp_ses_lock);
@@ -2578,6 +2676,8 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
return 0;
if (old->ctx->acdirmax != new->ctx->acdirmax)
return 0;
+ if (old->ctx->closetimeo != new->ctx->closetimeo)
+ return 0;
return 1;
}
@@ -2603,7 +2703,7 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data)
int
cifs_match_super(struct super_block *sb, void *data)
{
- struct cifs_mnt_data *mnt_data = (struct cifs_mnt_data *)data;
+ struct cifs_mnt_data *mnt_data = data;
struct smb3_fs_context *ctx;
struct cifs_sb_info *cifs_sb;
struct TCP_Server_Info *tcp_srv;
@@ -2626,6 +2726,9 @@ cifs_match_super(struct super_block *sb, void *data)
ctx = mnt_data->ctx;
+ spin_lock(&tcp_srv->srv_lock);
+ spin_lock(&ses->ses_lock);
+ spin_lock(&tcon->tc_lock);
if (!match_server(tcp_srv, ctx) ||
!match_session(ses, ctx) ||
!match_tcon(tcon, ctx) ||
@@ -2636,6 +2739,10 @@ cifs_match_super(struct super_block *sb, void *data)
rc = compare_mount_options(sb, mnt_data);
out:
+ spin_unlock(&tcon->tc_lock);
+ spin_unlock(&ses->ses_lock);
+ spin_unlock(&tcp_srv->srv_lock);
+
spin_unlock(&cifs_tcp_ses_lock);
cifs_put_tlink(tlink);
return rc;
@@ -2913,6 +3020,7 @@ ip_connect(struct TCP_Server_Info *server)
return generic_ip_connect(server);
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
{
@@ -3018,6 +3126,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
}
}
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb)
{
@@ -3134,6 +3243,7 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx)
if (tcon->posix_extensions)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/* tell server which Unix caps we support */
if (cap_unix(tcon->ses)) {
/*
@@ -3141,16 +3251,17 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx)
* for just this mount.
*/
reset_cifs_unix_caps(xid, tcon, cifs_sb, ctx);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->ses->server->srv_lock);
if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
(le64_to_cpu(tcon->fsUnixInfo.Capability) &
CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->ses->server->srv_lock);
rc = -EACCES;
goto out;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->ses->server->srv_lock);
} else
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
tcon->unix_ext = 0; /* server does not support them */
/* do not care if a following call succeed - informational */
@@ -3232,9 +3343,9 @@ static int mount_get_dfs_conns(struct mount_ctx *mnt_ctx)
rc = mount_get_conns(mnt_ctx);
if (mnt_ctx->server) {
cifs_dbg(FYI, "%s: marking tcp session as a dfs connection\n", __func__);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&mnt_ctx->server->srv_lock);
mnt_ctx->server->is_dfs_conn = true;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&mnt_ctx->server->srv_lock);
}
return rc;
}
@@ -3410,8 +3521,9 @@ cifs_are_all_path_components_accessible(struct TCP_Server_Info *server,
}
/*
- * Check if path is remote (e.g. a DFS share). Return -EREMOTE if it is,
- * otherwise 0.
+ * Check if path is remote (i.e. a DFS share).
+ *
+ * Return -EREMOTE if it is, otherwise 0 or -errno.
*/
static int is_path_remote(struct mount_ctx *mnt_ctx)
{
@@ -3422,6 +3534,9 @@ static int is_path_remote(struct mount_ctx *mnt_ctx)
struct cifs_tcon *tcon = mnt_ctx->tcon;
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
char *full_path;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ bool nodfs = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS;
+#endif
if (!server->ops->is_path_accessible)
return -EOPNOTSUPP;
@@ -3439,14 +3554,20 @@ static int is_path_remote(struct mount_ctx *mnt_ctx)
rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
full_path);
#ifdef CONFIG_CIFS_DFS_UPCALL
+ if (nodfs) {
+ if (rc == -EREMOTE)
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /*
+ * Path *might* exist with non-ASCII characters in DFS root;
+ * try again with full path (only if nodfs is not set).
+ */
if (rc == -ENOENT && is_tcon_dfs(tcon))
rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, cifs_sb,
full_path);
#endif
- if (rc != 0 && rc != -EREMOTE) {
- kfree(full_path);
- return rc;
- }
+ if (rc != 0 && rc != -EREMOTE)
+ goto out;
if (rc != -EREMOTE) {
rc = cifs_are_all_path_components_accessible(server, xid, tcon,
@@ -3458,6 +3579,7 @@ static int is_path_remote(struct mount_ctx *mnt_ctx)
}
}
+out:
kfree(full_path);
return rc;
}
@@ -3672,9 +3794,11 @@ static void setup_server_referral_paths(struct mount_ctx *mnt_ctx)
{
struct TCP_Server_Info *server = mnt_ctx->server;
+ mutex_lock(&server->refpath_lock);
server->origin_fullpath = mnt_ctx->origin_fullpath;
server->leaf_fullpath = mnt_ctx->leaf_fullpath;
server->current_fullpath = mnt_ctx->leaf_fullpath;
+ mutex_unlock(&server->refpath_lock);
mnt_ctx->origin_fullpath = mnt_ctx->leaf_fullpath = NULL;
}
@@ -3691,6 +3815,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
if (!isdfs)
goto out;
+ /* proceed as DFS mount */
uuid_gen(&mnt_ctx.mount_id);
rc = connect_dfs_root(&mnt_ctx, &tl);
dfs_cache_free_tgts(&tl);
@@ -3864,7 +3989,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
}
bcc_ptr += length + 1;
bytes_left -= (length + 1);
- strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
+ strscpy(tcon->treeName, tree, sizeof(tcon->treeName));
/* mostly informational -- no need to fail on error here */
kfree(tcon->nativeFileSystem);
@@ -3935,28 +4060,28 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
return -ENOSYS;
/* only send once per connect */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (!server->ops->need_neg(server) ||
server->tcpStatus != CifsNeedNegotiate) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return 0;
}
server->tcpStatus = CifsInNegotiate;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
rc = server->ops->negotiate(xid, ses, server);
if (rc == 0) {
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsInNegotiate)
- server->tcpStatus = CifsNeedSessSetup;
+ server->tcpStatus = CifsGood;
else
rc = -EHOSTDOWN;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
} else {
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsInNegotiate)
server->tcpStatus = CifsNeedNegotiate;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
}
return rc;
@@ -3968,22 +4093,39 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
struct nls_table *nls_info)
{
int rc = -ENOSYS;
+ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
+ struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
bool is_binding = false;
- /* only send once per connect */
- spin_lock(&cifs_tcp_ses_lock);
- if ((server->tcpStatus != CifsNeedSessSetup) &&
- (ses->status == CifsGood)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
+ if (server->dstaddr.ss_family == AF_INET6)
+ scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr);
+ else
+ scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr);
+
+ if (ses->ses_status != SES_GOOD &&
+ ses->ses_status != SES_NEW &&
+ ses->ses_status != SES_NEED_RECON) {
+ spin_unlock(&ses->ses_lock);
return 0;
}
- server->tcpStatus = CifsInSessSetup;
- spin_unlock(&cifs_tcp_ses_lock);
+ /* only send once per connect */
spin_lock(&ses->chan_lock);
+ if (CIFS_ALL_CHANS_GOOD(ses) ||
+ cifs_chan_in_reconnect(ses, server)) {
+ spin_unlock(&ses->chan_lock);
+ spin_unlock(&ses->ses_lock);
+ return 0;
+ }
is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
+ cifs_chan_set_in_reconnect(ses, server);
spin_unlock(&ses->chan_lock);
+ if (!is_binding)
+ ses->ses_status = SES_IN_SETUP;
+ spin_unlock(&ses->ses_lock);
+
if (!is_binding) {
ses->capabilities = server->capabilities;
if (!linuxExtEnabled)
@@ -4006,21 +4148,22 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
if (rc) {
cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc);
- spin_lock(&cifs_tcp_ses_lock);
- if (server->tcpStatus == CifsInSessSetup)
- server->tcpStatus = CifsNeedSessSetup;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_IN_SETUP)
+ ses->ses_status = SES_NEED_RECON;
+ spin_lock(&ses->chan_lock);
+ cifs_chan_clear_in_reconnect(ses, server);
+ spin_unlock(&ses->chan_lock);
+ spin_unlock(&ses->ses_lock);
} else {
- spin_lock(&cifs_tcp_ses_lock);
- if (server->tcpStatus == CifsInSessSetup)
- server->tcpStatus = CifsGood;
- /* Even if one channel is active, session is in good state */
- ses->status = CifsGood;
- spin_unlock(&cifs_tcp_ses_lock);
-
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_IN_SETUP)
+ ses->ses_status = SES_GOOD;
spin_lock(&ses->chan_lock);
+ cifs_chan_clear_in_reconnect(ses, server);
cifs_chan_clear_need_reconnect(ses, server);
spin_unlock(&ses->chan_lock);
+ spin_unlock(&ses->ses_lock);
}
return rc;
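The rewritten setup path also establishes a consistent lock nesting order: ses->ses_lock is always taken outside ses->chan_lock. A sketch of the pairing (illustrative):

	spin_lock(&ses->ses_lock);	/* outer: session status */
	spin_lock(&ses->chan_lock);	/* inner: per-channel flags */
	/* ... inspect/update ... */
	spin_unlock(&ses->chan_lock);
	spin_unlock(&ses->ses_lock);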
@@ -4094,8 +4237,10 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
goto out;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(ses))
reset_cifs_unix_caps(0, tcon, NULL, ctx);
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
out:
kfree(ctx->username);
@@ -4465,7 +4610,7 @@ static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tco
*/
if (rc && server->current_fullpath != server->origin_fullpath) {
server->current_fullpath = server->origin_fullpath;
- cifs_reconnect(tcon->ses->server, true);
+ cifs_signal_cifsd_for_reconnect(server, true);
}
dfs_cache_free_tgts(tl);
@@ -4484,15 +4629,15 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
struct dfs_info3_param ref = {0};
/* only send once per connect */
- spin_lock(&cifs_tcp_ses_lock);
- if (tcon->ses->status != CifsGood ||
+ spin_lock(&tcon->tc_lock);
+ if (tcon->ses->ses_status != SES_GOOD ||
(tcon->status != TID_NEW &&
tcon->status != TID_NEED_TCON)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
return 0;
}
tcon->status = TID_IN_TCON;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
if (!tree) {
@@ -4531,15 +4676,15 @@ out:
cifs_put_tcp_super(sb);
if (rc) {
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (tcon->status == TID_IN_TCON)
tcon->status = TID_NEED_TCON;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
} else {
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (tcon->status == TID_IN_TCON)
tcon->status = TID_GOOD;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
tcon->need_reconnect = false;
}
@@ -4552,28 +4697,28 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
const struct smb_version_operations *ops = tcon->ses->server->ops;
/* only send once per connect */
- spin_lock(&cifs_tcp_ses_lock);
- if (tcon->ses->status != CifsGood ||
+ spin_lock(&tcon->tc_lock);
+ if (tcon->ses->ses_status != SES_GOOD ||
(tcon->status != TID_NEW &&
tcon->status != TID_NEED_TCON)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
return 0;
}
tcon->status = TID_IN_TCON;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc);
if (rc) {
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (tcon->status == TID_IN_TCON)
tcon->status = TID_NEED_TCON;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
} else {
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (tcon->status == TID_IN_TCON)
tcon->status = TID_GOOD;
- spin_unlock(&cifs_tcp_ses_lock);
tcon->need_reconnect = false;
+ spin_unlock(&tcon->tc_lock);
}
return rc;
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 30e040da4f09..a9b6c3eba6de 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -654,7 +654,7 @@ static struct cache_entry *__lookup_cache_entry(const char *path, unsigned int h
return ce;
}
}
- return ERR_PTR(-EEXIST);
+ return ERR_PTR(-ENOENT);
}
/*
@@ -662,7 +662,7 @@ static struct cache_entry *__lookup_cache_entry(const char *path, unsigned int h
*
* Use whole path components in the match. Must be called with htable_rw_lock held.
*
- * Return ERR_PTR(-EEXIST) if the entry is not found.
+ * Return ERR_PTR(-ENOENT) if the entry is not found.
*/
static struct cache_entry *lookup_cache_entry(const char *path)
{
@@ -710,7 +710,7 @@ static struct cache_entry *lookup_cache_entry(const char *path)
while (e > s && *e != sep)
e--;
}
- return ERR_PTR(-EEXIST);
+ return ERR_PTR(-ENOENT);
}
/**
@@ -1229,6 +1229,30 @@ void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id)
kref_put(&mg->refcount, mount_group_release);
}
+/* Extract share from DFS target; return a pointer to the prefix path (possibly empty) or an ERR_PTR on failure */
+static const char *parse_target_share(const char *target, char **share)
+{
+ const char *s, *seps = "/\\";
+ size_t len;
+
+ s = strpbrk(target + 1, seps);
+ if (!s)
+ return ERR_PTR(-EINVAL);
+
+ len = strcspn(s + 1, seps);
+ if (!len)
+ return ERR_PTR(-EINVAL);
+ s += len;
+
+ len = s - target + 1;
+ *share = kstrndup(target, len, GFP_KERNEL);
+ if (!*share)
+ return ERR_PTR(-ENOMEM);
+
+ s = target + len;
+ return s + strspn(s, seps);
+}
+
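A worked example of parse_target_share() on a hypothetical referral target \srv\share\a\b (sep = '\'): strpbrk() lands on the separator after "srv", strcspn() measures the "share" component, so *share is duplicated as \srv\share and the returned pointer, once strspn() has skipped the separators, is the prefix a\b. For a bare \srv\share the function returns a pointer to the terminating NUL (an empty prefix), never NULL.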
/**
* dfs_cache_get_tgt_share - parse a DFS target
*
@@ -1242,56 +1266,46 @@ void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id)
int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, char **share,
char **prefix)
{
- char *s, sep, *p;
- size_t len;
- size_t plen1, plen2;
+ char sep;
+ char *target_share;
+ char *ppath = NULL;
+ const char *target_ppath, *dfsref_ppath;
+ size_t target_pplen, dfsref_pplen;
+ size_t len, c;
if (!it || !path || !share || !prefix || strlen(path) < it->it_path_consumed)
return -EINVAL;
- *share = NULL;
- *prefix = NULL;
-
sep = it->it_name[0];
if (sep != '\\' && sep != '/')
return -EINVAL;
- s = strchr(it->it_name + 1, sep);
- if (!s)
- return -EINVAL;
+ target_ppath = parse_target_share(it->it_name, &target_share);
+ if (IS_ERR(target_ppath))
+ return PTR_ERR(target_ppath);
- /* point to prefix in target node */
- s = strchrnul(s + 1, sep);
+ /* point to prefix in DFS referral path */
+ dfsref_ppath = path + it->it_path_consumed;
+ dfsref_ppath += strspn(dfsref_ppath, "/\\");
- /* extract target share */
- *share = kstrndup(it->it_name, s - it->it_name, GFP_KERNEL);
- if (!*share)
- return -ENOMEM;
+ target_pplen = strlen(target_ppath);
+ dfsref_pplen = strlen(dfsref_ppath);
- /* skip separator */
- if (*s)
- s++;
- /* point to prefix in DFS path */
- p = path + it->it_path_consumed;
- if (*p == sep)
- p++;
-
- /* merge prefix paths from DFS path and target node */
- plen1 = it->it_name + strlen(it->it_name) - s;
- plen2 = path + strlen(path) - p;
- if (plen1 || plen2) {
- len = plen1 + plen2 + 2;
- *prefix = kmalloc(len, GFP_KERNEL);
- if (!*prefix) {
- kfree(*share);
- *share = NULL;
+ /* merge prefix paths from DFS referral path and target node */
+ if (target_pplen || dfsref_pplen) {
+ len = target_pplen + dfsref_pplen + 2;
+ ppath = kzalloc(len, GFP_KERNEL);
+ if (!ppath) {
+ kfree(target_share);
return -ENOMEM;
}
- if (plen1)
- scnprintf(*prefix, len, "%.*s%c%.*s", (int)plen1, s, sep, (int)plen2, p);
- else
- strscpy(*prefix, p, len);
+ c = strscpy(ppath, target_ppath, len);
+ if (c && dfsref_pplen)
+ ppath[c] = sep;
+ strlcat(ppath, dfsref_ppath, len);
}
+ *share = target_share;
+ *prefix = ppath;
return 0;
}
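With the helper in place, the merge above is plain concatenation: a target-node prefix a\b plus a leftover referral path c (everything past it_path_consumed) yields *prefix = a\b\c; the separator is inserted only when both halves are non-empty, and *prefix stays NULL when both are empty.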
@@ -1327,9 +1341,9 @@ static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, c
cifs_dbg(VFS, "%s: failed to convert address \'%s\'. skip address matching.\n",
__func__, ip);
} else {
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
match = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, &sa);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
}
kfree(ip);
@@ -1422,12 +1436,14 @@ static int refresh_tcon(struct cifs_ses **sessions, struct cifs_tcon *tcon, bool
struct TCP_Server_Info *server = tcon->ses->server;
mutex_lock(&server->refpath_lock);
- if (strcasecmp(server->leaf_fullpath, server->origin_fullpath))
- __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh);
+ if (server->origin_fullpath) {
+ if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath,
+ server->origin_fullpath))
+ __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh);
+ __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh);
+ }
mutex_unlock(&server->refpath_lock);
- __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh);
-
return 0;
}
@@ -1510,15 +1526,21 @@ static void refresh_mounts(struct cifs_ses **sessions)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
- if (!server->is_dfs_conn)
+ spin_lock(&server->srv_lock);
+ if (!server->is_dfs_conn) {
+ spin_unlock(&server->srv_lock);
continue;
+ }
+ spin_unlock(&server->srv_lock);
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+ spin_lock(&tcon->tc_lock);
if (!tcon->ipc && !tcon->need_reconnect) {
tcon->tc_count++;
list_add_tail(&tcon->ulist, &tcons);
}
+ spin_unlock(&tcon->tc_lock);
}
}
}
@@ -1530,11 +1552,14 @@ static void refresh_mounts(struct cifs_ses **sessions)
list_del_init(&tcon->ulist);
mutex_lock(&server->refpath_lock);
- if (strcasecmp(server->leaf_fullpath, server->origin_fullpath))
- __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false);
+ if (server->origin_fullpath) {
+ if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath,
+ server->origin_fullpath))
+ __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false);
+ __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false);
+ }
mutex_unlock(&server->refpath_lock);
- __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false);
cifs_put_tcon(tcon);
}
}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index ce9b22aecfba..08f7392716e2 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -193,6 +193,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
return PTR_ERR(full_path);
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open &&
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
@@ -261,6 +262,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
* rare for path not covered on files)
*/
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
desired_access = 0;
if (OPEN_FMODE(oflags) & FMODE_READ)
@@ -316,6 +318,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
goto out;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/*
* If Open reported that we actually created a file then we now have to
* set the mode if possible.
@@ -357,6 +360,9 @@ cifs_create_get_file_info:
rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb,
xid);
else {
+#else
+ {
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* TODO: Add support for calling POSIX query info here, but passing in fid */
rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb,
xid, fid);
@@ -377,7 +383,9 @@ cifs_create_get_file_info:
}
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
cifs_create_set_dentry:
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
if (rc != 0) {
cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n",
rc);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d511a78383c3..6f38b134a346 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -26,6 +26,7 @@
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
+#include "smb2proto.h"
#include "cifs_unicode.h"
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
@@ -33,6 +34,48 @@
#include "smbdirect.h"
#include "fs_context.h"
#include "cifs_ioctl.h"
+#include "cached_dir.h"
+
+/*
+ * Mark all open files on the tree connection as invalid, since they
+ * were closed when the session to the server was lost.
+ */
+void
+cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
+{
+ struct cifsFileInfo *open_file = NULL;
+ struct list_head *tmp;
+ struct list_head *tmp1;
+
+ /* only send once per connect */
+ spin_lock(&tcon->ses->ses_lock);
+ if ((tcon->ses->ses_status != SES_GOOD) || (tcon->status != TID_NEED_RECON)) {
+ spin_unlock(&tcon->ses->ses_lock);
+ return;
+ }
+ tcon->status = TID_IN_FILES_INVALIDATE;
+ spin_unlock(&tcon->ses->ses_lock);
+
+ /* list all files open on tree connection and mark them invalid */
+ spin_lock(&tcon->open_file_lock);
+ list_for_each_safe(tmp, tmp1, &tcon->openFileList) {
+ open_file = list_entry(tmp, struct cifsFileInfo, tlist);
+ open_file->invalidHandle = true;
+ open_file->oplock_break_cancelled = true;
+ }
+ spin_unlock(&tcon->open_file_lock);
+
+ invalidate_all_cached_dirs(tcon);
+ spin_lock(&tcon->tc_lock);
+ if (tcon->status == TID_IN_FILES_INVALIDATE)
+ tcon->status = TID_NEED_TCON;
+ spin_unlock(&tcon->tc_lock);
+
+ /*
+ * BB Add call to invalidate_inodes(sb) for all superblocks mounted
+ * to this tcon.
+ */
+}
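The tcon state transition this function drives, using the states named in the connect.c hunks above (a reading aid):

	/* TID_NEED_RECON -> TID_IN_FILES_INVALIDATE -> TID_NEED_TCON,
	 * after which cifs_tree_connect() moves the tcon
	 * TID_NEED_TCON -> TID_IN_TCON -> TID_GOOD on success.
	 */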
static inline int cifs_convert_flags(unsigned int flags)
{
@@ -52,6 +95,7 @@ static inline int cifs_convert_flags(unsigned int flags)
FILE_READ_DATA);
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static u32 cifs_posix_convert_flags(unsigned int flags)
{
u32 posix_flags = 0;
@@ -85,6 +129,7 @@ static u32 cifs_posix_convert_flags(unsigned int flags)
return posix_flags;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
static inline int cifs_get_disposition(unsigned int flags)
{
@@ -100,6 +145,7 @@ static inline int cifs_get_disposition(unsigned int flags)
return FILE_OPEN;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
int cifs_posix_open(const char *full_path, struct inode **pinode,
struct super_block *sb, int mode, unsigned int f_flags,
__u32 *poplock, __u16 *pnetfid, unsigned int xid)
@@ -161,6 +207,7 @@ posix_open_ret:
kfree(presp_data);
return rc;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
static int
cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
@@ -579,6 +626,7 @@ int cifs_open(struct inode *inode, struct file *file)
else
oplock = 0;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (!tcon->broken_posix_open && tcon->unix_ext &&
cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
@@ -603,6 +651,7 @@ int cifs_open(struct inode *inode, struct file *file)
* or DFS errors.
*/
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
if (server->ops->get_lease_key)
server->ops->get_lease_key(inode, &fid);
@@ -630,6 +679,7 @@ int cifs_open(struct inode *inode, struct file *file)
goto out;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) {
/*
* Time to set mode which we can not set earlier due to
@@ -647,6 +697,7 @@ int cifs_open(struct inode *inode, struct file *file)
CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid.netfid,
cfile->pid);
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
use_cache:
fscache_use_cookie(cifs_inode_cookie(file_inode(file)),
@@ -664,7 +715,9 @@ out:
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static int cifs_push_posix_locks(struct cifsFileInfo *cfile);
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/*
* Try to reacquire byte range locks that were released when session
@@ -673,10 +726,12 @@ static int cifs_push_posix_locks(struct cifsFileInfo *cfile);
static int
cifs_relock_file(struct cifsFileInfo *cfile)
{
- struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
int rc = 0;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
down_read_nested(&cinode->lock_sem, SINGLE_DEPTH_NESTING);
if (cinode->can_cache_brlcks) {
@@ -685,11 +740,13 @@ cifs_relock_file(struct cifsFileInfo *cfile)
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
rc = cifs_push_posix_locks(cfile);
else
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
rc = tcon->ses->server->ops->push_mand_locks(cfile);
up_read(&cinode->lock_sem);
@@ -750,6 +807,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
else
oplock = 0;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (tcon->unix_ext && cap_unix(tcon->ses) &&
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
@@ -773,6 +831,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
* in the reconnect path it is important to retry hard
*/
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
desired_access = cifs_convert_flags(cfile->f_flags);
@@ -817,7 +876,9 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
goto reopen_error_exit;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
reopen_success:
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
cfile->invalidHandle = false;
mutex_unlock(&cfile->fh_mutex);
cinode = CIFS_I(inode);
@@ -903,12 +964,12 @@ int cifs_close(struct inode *inode, struct file *file)
* So, Increase the ref count to avoid use-after-free.
*/
if (!mod_delayed_work(deferredclose_wq,
- &cfile->deferred, cifs_sb->ctx->acregmax))
+ &cfile->deferred, cifs_sb->ctx->closetimeo))
cifsFileInfo_get(cfile);
} else {
/* Deferred close for files */
queue_delayed_work(deferredclose_wq,
- &cfile->deferred, cifs_sb->ctx->acregmax);
+ &cfile->deferred, cifs_sb->ctx->closetimeo);
cfile->deferred_close_scheduled = true;
spin_unlock(&cinode->deferred_lock);
return 0;
@@ -928,9 +989,7 @@ int cifs_close(struct inode *inode, struct file *file)
void
cifs_reopen_persistent_handles(struct cifs_tcon *tcon)
{
- struct cifsFileInfo *open_file;
- struct list_head *tmp;
- struct list_head *tmp1;
+ struct cifsFileInfo *open_file, *tmp;
struct list_head tmp_list;
if (!tcon->use_persistent || !tcon->need_reopen_files)
@@ -943,8 +1002,7 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon)
/* list all files open on tree connection, reopen resilient handles */
spin_lock(&tcon->open_file_lock);
- list_for_each(tmp, &tcon->openFileList) {
- open_file = list_entry(tmp, struct cifsFileInfo, tlist);
+ list_for_each_entry(open_file, &tcon->openFileList, tlist) {
if (!open_file->invalidHandle)
continue;
cifsFileInfo_get(open_file);
@@ -952,8 +1010,7 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon)
}
spin_unlock(&tcon->open_file_lock);
- list_for_each_safe(tmp, tmp1, &tmp_list) {
- open_file = list_entry(tmp, struct cifsFileInfo, rlist);
+ list_for_each_entry_safe(open_file, tmp, &tmp_list, rlist) {
if (cifs_reopen_file(open_file, false /* do not flush */))
tcon->need_reopen_files = true;
list_del_init(&open_file->rlist);
@@ -1196,6 +1253,7 @@ try_again:
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/*
* Check if there is another lock that prevents us to set the lock (posix
* style). If such a lock exists, update the flock structure with its
@@ -1334,6 +1392,7 @@ hash_lockowner(fl_owner_t owner)
{
return cifs_lock_secret ^ hash32_ptr((const void *)owner);
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
struct lock_to_push {
struct list_head llist;
@@ -1344,6 +1403,7 @@ struct lock_to_push {
__u8 type;
};
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static int
cifs_push_posix_locks(struct cifsFileInfo *cfile)
{
@@ -1395,7 +1455,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
cifs_dbg(VFS, "Can't push all brlocks!\n");
break;
}
- length = 1 + flock->fl_end - flock->fl_start;
+ length = cifs_flock_len(flock);
if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
type = CIFS_RDLCK;
else
@@ -1431,14 +1491,17 @@ err_out:
}
goto out;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
static int
cifs_push_locks(struct cifsFileInfo *cfile)
{
- struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
int rc = 0;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* we are going to update can_cache_brlcks here - need a write access */
cifs_down_write(&cinode->lock_sem);
@@ -1447,11 +1510,13 @@ cifs_push_locks(struct cifsFileInfo *cfile)
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
rc = cifs_push_posix_locks(cfile);
else
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
rc = tcon->ses->server->ops->push_mand_locks(cfile);
cinode->can_cache_brlcks = false;
@@ -1511,10 +1576,11 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
bool wait_flag, bool posix_lck, unsigned int xid)
{
int rc = 0;
- __u64 length = 1 + flock->fl_end - flock->fl_start;
+ __u64 length = cifs_flock_len(flock);
struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
__u16 netfid = cfile->fid.netfid;
if (posix_lck) {
@@ -1534,6 +1600,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
posix_lock_type, wait_flag);
return rc;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
rc = cifs_lock_test(cfile, flock->fl_start, length, type, flock);
if (!rc)
@@ -1594,6 +1661,7 @@ cifs_free_llist(struct list_head *llist)
}
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
int
cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
unsigned int xid)
@@ -1609,7 +1677,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct cifsLockInfo *li, *tmp;
- __u64 length = 1 + flock->fl_end - flock->fl_start;
+ __u64 length = cifs_flock_len(flock);
struct list_head tmp_llist;
INIT_LIST_HEAD(&tmp_llist);
@@ -1706,6 +1774,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
kfree(buf);
return rc;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
static int
cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
@@ -1713,12 +1782,13 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
unsigned int xid)
{
int rc = 0;
- __u64 length = 1 + flock->fl_end - flock->fl_start;
+ __u64 length = cifs_flock_len(flock);
struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
struct inode *inode = d_inode(cfile->dentry);
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (posix_lck) {
int posix_lock_type;
@@ -1740,7 +1810,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
NULL, posix_lock_type, wait_flag);
goto out;
}
-
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
if (lock) {
struct cifsLockInfo *lock;
@@ -1861,9 +1931,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
rc = -EACCES;
xid = get_xid();
- cifs_dbg(FYI, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld end: %lld\n",
- cmd, flock->fl_flags, flock->fl_type,
- flock->fl_start, flock->fl_end);
+ cifs_dbg(FYI, "%s: %pD2 cmd=0x%x type=0x%x flags=0x%x r=%lld:%lld\n", __func__, file, cmd,
+ flock->fl_flags, flock->fl_type, (long long)flock->fl_start,
+ (long long)flock->fl_end);
cfile = (struct cifsFileInfo *)file->private_data;
tcon = tlink_tcon(cfile->tlink);
@@ -2004,7 +2074,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
bool fsuid_only)
{
struct cifsFileInfo *open_file = NULL;
- struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb);
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
@@ -2060,7 +2130,7 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags,
return rc;
}
- cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+ cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb);
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
@@ -2204,6 +2274,185 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
return -ENOENT;
}
+void
+cifs_writedata_release(struct kref *refcount)
+{
+ struct cifs_writedata *wdata = container_of(refcount,
+ struct cifs_writedata, refcount);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (wdata->mr) {
+ smbd_deregister_mr(wdata->mr);
+ wdata->mr = NULL;
+ }
+#endif
+
+ if (wdata->cfile)
+ cifsFileInfo_put(wdata->cfile);
+
+ kvfree(wdata->pages);
+ kfree(wdata);
+}
+
+/*
+ * Write failed with a retryable error. Resend the write request. It's also
+ * possible that the pages were redirtied, so re-clean them before resending.
+ */
+static void
+cifs_writev_requeue(struct cifs_writedata *wdata)
+{
+ int i, rc = 0;
+ struct inode *inode = d_inode(wdata->cfile->dentry);
+ struct TCP_Server_Info *server;
+ unsigned int rest_len;
+
+ server = tlink_tcon(wdata->cfile->tlink)->ses->server;
+ i = 0;
+ rest_len = wdata->bytes;
+ do {
+ struct cifs_writedata *wdata2;
+ unsigned int j, nr_pages, wsize, tailsz, cur_len;
+
+ wsize = server->ops->wp_retry_size(inode);
+ if (wsize < rest_len) {
+ nr_pages = wsize / PAGE_SIZE;
+ if (!nr_pages) {
+ rc = -EOPNOTSUPP;
+ break;
+ }
+ cur_len = nr_pages * PAGE_SIZE;
+ tailsz = PAGE_SIZE;
+ } else {
+ nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE);
+ cur_len = rest_len;
+ tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE;
+ }
+
+ wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
+ if (!wdata2) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ for (j = 0; j < nr_pages; j++) {
+ wdata2->pages[j] = wdata->pages[i + j];
+ lock_page(wdata2->pages[j]);
+ clear_page_dirty_for_io(wdata2->pages[j]);
+ }
+
+ wdata2->sync_mode = wdata->sync_mode;
+ wdata2->nr_pages = nr_pages;
+ wdata2->offset = page_offset(wdata2->pages[0]);
+ wdata2->pagesz = PAGE_SIZE;
+ wdata2->tailsz = tailsz;
+ wdata2->bytes = cur_len;
+
+ rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY,
+ &wdata2->cfile);
+ if (!wdata2->cfile) {
+ cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n",
+ rc);
+ if (!is_retryable_error(rc))
+ rc = -EBADF;
+ } else {
+ wdata2->pid = wdata2->cfile->pid;
+ rc = server->ops->async_writev(wdata2,
+ cifs_writedata_release);
+ }
+
+ for (j = 0; j < nr_pages; j++) {
+ unlock_page(wdata2->pages[j]);
+ if (rc != 0 && !is_retryable_error(rc)) {
+ SetPageError(wdata2->pages[j]);
+ end_page_writeback(wdata2->pages[j]);
+ put_page(wdata2->pages[j]);
+ }
+ }
+
+ kref_put(&wdata2->refcount, cifs_writedata_release);
+ if (rc) {
+ if (is_retryable_error(rc))
+ continue;
+ i += nr_pages;
+ break;
+ }
+
+ rest_len -= cur_len;
+ i += nr_pages;
+ } while (i < wdata->nr_pages);
+
+ /* cleanup remaining pages from the original wdata */
+ for (; i < wdata->nr_pages; i++) {
+ SetPageError(wdata->pages[i]);
+ end_page_writeback(wdata->pages[i]);
+ put_page(wdata->pages[i]);
+ }
+
+ if (rc != 0 && !is_retryable_error(rc))
+ mapping_set_error(inode->i_mapping, rc);
+ kref_put(&wdata->refcount, cifs_writedata_release);
+}
+
+void
+cifs_writev_complete(struct work_struct *work)
+{
+ struct cifs_writedata *wdata = container_of(work,
+ struct cifs_writedata, work);
+ struct inode *inode = d_inode(wdata->cfile->dentry);
+ int i = 0;
+
+ if (wdata->result == 0) {
+ spin_lock(&inode->i_lock);
+ cifs_update_eof(CIFS_I(inode), wdata->offset, wdata->bytes);
+ spin_unlock(&inode->i_lock);
+ cifs_stats_bytes_written(tlink_tcon(wdata->cfile->tlink),
+ wdata->bytes);
+ } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN)
+ return cifs_writev_requeue(wdata);
+
+ for (i = 0; i < wdata->nr_pages; i++) {
+ struct page *page = wdata->pages[i];
+
+ if (wdata->result == -EAGAIN)
+ __set_page_dirty_nobuffers(page);
+ else if (wdata->result < 0)
+ SetPageError(page);
+ end_page_writeback(page);
+ cifs_readpage_to_fscache(inode, page);
+ put_page(page);
+ }
+ if (wdata->result != -EAGAIN)
+ mapping_set_error(inode->i_mapping, wdata->result);
+ kref_put(&wdata->refcount, cifs_writedata_release);
+}
+
+struct cifs_writedata *
+cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
+{
+ struct page **pages =
+ kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (pages)
+ return cifs_writedata_direct_alloc(pages, complete);
+
+ return NULL;
+}
+
+struct cifs_writedata *
+cifs_writedata_direct_alloc(struct page **pages, work_func_t complete)
+{
+ struct cifs_writedata *wdata;
+
+ wdata = kzalloc(sizeof(*wdata), GFP_NOFS);
+ if (wdata != NULL) {
+ wdata->pages = pages;
+ kref_init(&wdata->refcount);
+ INIT_LIST_HEAD(&wdata->list);
+ init_completion(&wdata->done);
+ INIT_WORK(&wdata->work, complete);
+ }
+ return wdata;
+}
+
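A hypothetical submitter of the write machinery above, mirroring cifs_writev_requeue(): the caller owns the reference created by kref_init() and drops it once the async path holds its own (error handling elided; names from the hunk):

	struct cifs_writedata *wdata;
	int rc;

	wdata = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
	if (!wdata)
		return -ENOMEM;
	/* fill wdata->pages[], offset, bytes, cfile, ... */
	rc = server->ops->async_writev(wdata, cifs_writedata_release);
	kref_put(&wdata->refcount, cifs_writedata_release);	/* drop submitter's ref */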
static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
{
struct address_space *mapping = page->mapping;
@@ -2777,8 +3026,11 @@ int cifs_flush(struct file *file, fl_owner_t id)
rc = filemap_write_and_wait(inode->i_mapping);
cifs_dbg(FYI, "Flush inode %p file %p rc %d\n", inode, file, rc);
- if (rc)
+ if (rc) {
+ /* get more nuanced writeback errors */
+ rc = filemap_check_wb_err(file->f_mapping, 0);
trace_cifs_flush_err(inode->i_ino, rc);
+ }
return rc;
}
@@ -3019,7 +3271,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
if (ctx->direct_io) {
ssize_t result;
- result = iov_iter_get_pages_alloc(
+ result = iov_iter_get_pages_alloc2(
from, &pagevec, cur_len, &start);
if (result < 0) {
cifs_dbg(VFS,
@@ -3033,7 +3285,6 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
break;
}
cur_len = (size_t)result;
- iov_iter_advance(from, cur_len);
nr_pages =
(cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE;
@@ -3324,6 +3575,9 @@ static ssize_t __cifs_writev(
ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from)
{
+ struct file *file = iocb->ki_filp;
+
+ cifs_revalidate_mapping(file->f_inode);
return __cifs_writev(iocb, from, true);
}
@@ -3755,7 +4009,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
if (ctx->direct_io) {
ssize_t result;
- result = iov_iter_get_pages_alloc(
+ result = iov_iter_get_pages_alloc2(
&direct_iov, &pagevec,
cur_len, &start);
if (result < 0) {
@@ -3771,7 +4025,6 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
break;
}
cur_len = (size_t)result;
- iov_iter_advance(&direct_iov, cur_len);
rdata = cifs_readdata_direct_alloc(
pagevec, cifs_uncached_readv_complete);
@@ -4001,7 +4254,7 @@ static ssize_t __cifs_readv(
if (!is_sync_kiocb(iocb))
ctx->iocb = iocb;
- if (iter_is_iovec(to))
+ if (user_backed_iter(to))
ctx->should_dirty = true;
if (direct) {
@@ -4456,10 +4709,11 @@ static void cifs_readahead(struct readahead_control *ractl)
* TODO: Send a whole batch of pages to be read
* by the cache.
*/
- page = readahead_page(ractl);
- last_batch_size = 1 << thp_order(page);
+ struct folio *folio = readahead_folio(ractl);
+
+ last_batch_size = folio_nr_pages(folio);
if (cifs_readpage_from_fscache(ractl->mapping->host,
- page) < 0) {
+ &folio->page) < 0) {
/*
* TODO: Deal with cache read failure
* here, but for the moment, delegate
@@ -4467,7 +4721,7 @@ static void cifs_readahead(struct readahead_control *ractl)
*/
caching = false;
}
- unlock_page(page);
+ folio_unlock(folio);
next_cached++;
cache_nr_pages--;
if (cache_nr_pages == 0)
@@ -4612,8 +4866,9 @@ read_complete:
return rc;
}
-static int cifs_readpage(struct file *file, struct page *page)
+static int cifs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
loff_t offset = page_file_offset(page);
int rc = -EACCES;
unsigned int xid;
@@ -4626,7 +4881,7 @@ static int cifs_readpage(struct file *file, struct page *page)
return rc;
}
- cifs_dbg(FYI, "readpage %p at offset %d 0x%x\n",
+ cifs_dbg(FYI, "read_folio %p at offset %d 0x%x\n",
page, (int)offset, (int)offset);
rc = cifs_readpage_worker(file, page, &offset);
@@ -4665,14 +4920,14 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
/* This inode is open for write at least once */
struct cifs_sb_info *cifs_sb;
- cifs_sb = CIFS_SB(cifsInode->vfs_inode.i_sb);
+ cifs_sb = CIFS_SB(cifsInode->netfs.inode.i_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
/* since no page cache to corrupt on directio
we can change size safely */
return true;
}
- if (i_size_read(&cifsInode->vfs_inode) < end_of_file)
+ if (i_size_read(&cifsInode->netfs.inode) < end_of_file)
return true;
return false;
@@ -4681,7 +4936,7 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
}
static int cifs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int oncethru = 0;
@@ -4695,7 +4950,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len);
start:
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page) {
rc = -ENOMEM;
goto out;
@@ -4757,16 +5012,16 @@ out:
return rc;
}
-static int cifs_release_page(struct page *page, gfp_t gfp)
+static bool cifs_release_folio(struct folio *folio, gfp_t gfp)
{
- if (PagePrivate(page))
+ if (folio_test_private(folio))
return 0;
- if (PageFsCache(page)) {
+ if (folio_test_fscache(folio)) {
if (current_is_kswapd() || !(gfp & __GFP_FS))
return false;
- wait_on_page_fscache(page);
+ folio_wait_fscache(folio);
}
- fscache_note_page_release(cifs_inode_cookie(page->mapping->host));
+ fscache_note_page_release(cifs_inode_cookie(folio->mapping->host));
return true;
}
@@ -4807,8 +5062,6 @@ void cifs_oplock_break(struct work_struct *work)
struct TCP_Server_Info *server = tcon->ses->server;
int rc = 0;
bool purge_cache = false;
- bool is_deferred = false;
- struct cifs_deferred_close *dclose;
wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
TASK_UNINTERRUPTIBLE);
@@ -4845,22 +5098,6 @@ void cifs_oplock_break(struct work_struct *work)
oplock_break_ack:
/*
- * When oplock break is received and there are no active
- * file handles but cached, then schedule deferred close immediately.
- * So, new open will not use cached handle.
- */
- spin_lock(&CIFS_I(inode)->deferred_lock);
- is_deferred = cifs_is_deferred_close(cfile, &dclose);
- spin_unlock(&CIFS_I(inode)->deferred_lock);
- if (is_deferred &&
- cfile->deferred_close_scheduled &&
- delayed_work_pending(&cfile->deferred)) {
- if (cancel_delayed_work(&cfile->deferred)) {
- _cifsFileInfo_put(cfile, false, false);
- goto oplock_break_done;
- }
- }
- /*
* releasing stale oplock after recent reconnect of smb session using
* a now incorrect file handle is not a data integrity issue but do
* not bother sending an oplock release if session to server still is
@@ -4871,7 +5108,7 @@ oplock_break_ack:
cinode);
cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
}
-oplock_break_done:
+
_cifsFileInfo_put(cfile, false /* do not wait for ourself */, false);
cifs_done_oplock_break(cinode);
}
@@ -4905,6 +5142,10 @@ static int cifs_swap_activate(struct swap_info_struct *sis,
cifs_dbg(FYI, "swap activate\n");
+ if (!swap_file->f_mapping->a_ops->swap_rw)
+ /* Cannot support swap */
+ return -EINVAL;
+
spin_lock(&inode->i_lock);
blocks = inode->i_blocks;
isize = inode->i_size;
@@ -4933,7 +5174,8 @@ static int cifs_swap_activate(struct swap_info_struct *sis,
* from reading or writing the file
*/
- return 0;
+ sis->flags |= SWP_FS_OPS;
+ return add_swap_extent(sis, 0, sis->max, 0);
}
static void cifs_swap_deactivate(struct file *file)
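With the return value switched from 0 to add_swap_extent(), cifs now registers one extent spanning the whole swapfile and marks the device SWP_FS_OPS, so swap I/O is routed through the filesystem's ->swap_rw rather than raw block I/O. A hedged sketch of the minimal contract — the span convention follows other network filesystems and is an assumption here:

	/*
	 * Sketch: minimum swap_activate for a filesystem that services
	 * all swap I/O through ->swap_rw.
	 */
	static int example_swap_activate(struct swap_info_struct *sis,
					 struct file *swap_file, sector_t *span)
	{
		if (!swap_file->f_mapping->a_ops->swap_rw)
			return -EINVAL;		/* cannot support swap */

		*span = sis->pages;		/* whole file backs swap */
		sis->flags |= SWP_FS_OPS;	/* use ->swap_rw */
		return add_swap_extent(sis, 0, sis->max, 0);
	}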
@@ -4965,14 +5207,14 @@ static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio)
#endif
const struct address_space_operations cifs_addr_ops = {
- .readpage = cifs_readpage,
+ .read_folio = cifs_read_folio,
.readahead = cifs_readahead,
.writepage = cifs_writepage,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
.dirty_folio = cifs_dirty_folio,
- .releasepage = cifs_release_page,
+ .release_folio = cifs_release_folio,
.direct_IO = cifs_direct_io,
.invalidate_folio = cifs_invalidate_folio,
.launder_folio = cifs_launder_folio,
@@ -4986,18 +5228,18 @@ const struct address_space_operations cifs_addr_ops = {
};
/*
- * cifs_readpages requires the server to support a buffer large enough to
+ * cifs_readahead requires the server to support a buffer large enough to
* contain the header plus one complete page of data. Otherwise, we need
- * to leave cifs_readpages out of the address space operations.
+ * to leave cifs_readahead out of the address space operations.
*/
const struct address_space_operations cifs_addr_ops_smallbuf = {
- .readpage = cifs_readpage,
+ .read_folio = cifs_read_folio,
.writepage = cifs_writepage,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
.dirty_folio = cifs_dirty_folio,
- .releasepage = cifs_release_page,
+ .release_folio = cifs_release_folio,
.invalidate_folio = cifs_invalidate_folio,
.launder_folio = cifs_launder_folio,
};
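Both operation tables above complete the folio conversion for cifs: ->read_folio and ->release_folio replace ->readpage and ->releasepage. cifs_read_folio() is a transitional shim that still works one page at a time through &folio->page; a hedged sketch of that pattern, with a hypothetical legacy worker standing in for cifs_readpage_worker():

	/*
	 * Sketch: transitional read_folio that delegates to page-based
	 * internals. &folio->page is valid here because cifs does not
	 * yet allocate large folios on this path.
	 */
	static int example_read_folio(struct file *file, struct folio *folio)
	{
		struct page *page = &folio->page;
		loff_t offset = page_file_offset(page);

		return legacy_readpage_worker(file, page, &offset); /* hypothetical */
	}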
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index a92e9eec521f..0e13dec86b25 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -119,6 +119,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_flag_no("persistenthandles", Opt_persistent),
fsparam_flag_no("resilienthandles", Opt_resilient),
fsparam_flag_no("tcpnodelay", Opt_tcp_nodelay),
+ fsparam_flag("nosparse", Opt_nosparse),
fsparam_flag("domainauto", Opt_domainauto),
fsparam_flag("rdma", Opt_rdma),
fsparam_flag("modesid", Opt_modesid),
@@ -146,6 +147,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_u32("actimeo", Opt_actimeo),
fsparam_u32("acdirmax", Opt_acdirmax),
fsparam_u32("acregmax", Opt_acregmax),
+ fsparam_u32("closetimeo", Opt_closetimeo),
fsparam_u32("echo_interval", Opt_echo_interval),
fsparam_u32("max_credits", Opt_max_credits),
fsparam_u32("handletimeout", Opt_handletimeout),
@@ -312,7 +314,6 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
new_ctx->password = NULL;
new_ctx->server_hostname = NULL;
new_ctx->domainname = NULL;
- new_ctx->workstation_name = NULL;
new_ctx->UNC = NULL;
new_ctx->source = NULL;
new_ctx->iocharset = NULL;
@@ -327,7 +328,6 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
DUP_CTX_STR(UNC);
DUP_CTX_STR(source);
DUP_CTX_STR(domainname);
- DUP_CTX_STR(workstation_name);
DUP_CTX_STR(nodename);
DUP_CTX_STR(iocharset);
@@ -766,8 +766,7 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc,
cifs_errorf(fc, "can not change domainname during remount\n");
return -EINVAL;
}
- if (new_ctx->workstation_name &&
- (!old_ctx->workstation_name || strcmp(new_ctx->workstation_name, old_ctx->workstation_name))) {
+ if (strcmp(new_ctx->workstation_name, old_ctx->workstation_name)) {
cifs_errorf(fc, "can not change workstation_name during remount\n");
return -EINVAL;
}
@@ -814,7 +813,6 @@ static int smb3_reconfigure(struct fs_context *fc)
STEAL_STRING(cifs_sb, ctx, username);
STEAL_STRING(cifs_sb, ctx, password);
STEAL_STRING(cifs_sb, ctx, domainname);
- STEAL_STRING(cifs_sb, ctx, workstation_name);
STEAL_STRING(cifs_sb, ctx, nodename);
STEAL_STRING(cifs_sb, ctx, iocharset);
@@ -943,6 +941,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_nolease:
ctx->no_lease = 1;
break;
+ case Opt_nosparse:
+ ctx->no_sparse = 1;
+ break;
case Opt_nodelete:
ctx->nodelete = 1;
break;
@@ -1074,6 +1075,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
}
ctx->acdirmax = ctx->acregmax = HZ * result.uint_32;
break;
+ case Opt_closetimeo:
+ ctx->closetimeo = HZ * result.uint_32;
+ if (ctx->closetimeo > SMB3_MAX_DCLOSETIMEO) {
+ cifs_errorf(fc, "closetimeo too large\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
case Opt_echo_interval:
ctx->echo_interval = result.uint_32;
break;
@@ -1467,22 +1475,15 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
int smb3_init_fs_context(struct fs_context *fc)
{
- int rc;
struct smb3_fs_context *ctx;
char *nodename = utsname()->nodename;
int i;
ctx = kzalloc(sizeof(struct smb3_fs_context), GFP_KERNEL);
- if (unlikely(!ctx)) {
- rc = -ENOMEM;
- goto err_exit;
- }
+ if (unlikely(!ctx))
+ return -ENOMEM;
- ctx->workstation_name = kstrdup(nodename, GFP_KERNEL);
- if (unlikely(!ctx->workstation_name)) {
- rc = -ENOMEM;
- goto err_exit;
- }
+ strscpy(ctx->workstation_name, nodename, sizeof(ctx->workstation_name));
/*
* does not have to be perfect mapping since field is
@@ -1528,6 +1529,7 @@ int smb3_init_fs_context(struct fs_context *fc)
ctx->acregmax = CIFS_DEF_ACTIMEO;
ctx->acdirmax = CIFS_DEF_ACTIMEO;
+ ctx->closetimeo = SMB3_DEF_DCLOSETIMEO;
/* Most clients set timeout to 0, allows server to use its default */
ctx->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */
@@ -1555,14 +1557,6 @@ int smb3_init_fs_context(struct fs_context *fc)
fc->fs_private = ctx;
fc->ops = &smb3_fs_context_ops;
return 0;
-
-err_exit:
- if (ctx) {
- kfree(ctx->workstation_name);
- kfree(ctx);
- }
-
- return rc;
}
void
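Turning workstation_name into a fixed-size array (see the fs_context.h hunk below) is what lets smb3_init_fs_context() shed its error path: the strscpy() copy cannot fail in a way that matters here. A hedged sketch of the contract being relied on:

	/*
	 * Sketch: strscpy() bounds the copy and always NUL-terminates;
	 * it returns -E2BIG on truncation, which is acceptable for a
	 * workstation name, so no error path is needed.
	 */
	static void example_set_workstation(struct smb3_fs_context *ctx)
	{
		strscpy(ctx->workstation_name, utsname()->nodename,
			sizeof(ctx->workstation_name));
	}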
@@ -1588,8 +1582,6 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
ctx->source = NULL;
kfree(ctx->domainname);
ctx->domainname = NULL;
- kfree(ctx->workstation_name);
- ctx->workstation_name = NULL;
kfree(ctx->nodename);
ctx->nodename = NULL;
kfree(ctx->iocharset);
diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h
index e54090d9ef36..bbaee4c2281f 100644
--- a/fs/cifs/fs_context.h
+++ b/fs/cifs/fs_context.h
@@ -62,6 +62,7 @@ enum cifs_param {
Opt_noblocksend,
Opt_noautotune,
Opt_nolease,
+ Opt_nosparse,
Opt_hard,
Opt_soft,
Opt_perm,
@@ -124,6 +125,7 @@ enum cifs_param {
Opt_actimeo,
Opt_acdirmax,
Opt_acregmax,
+ Opt_closetimeo,
Opt_echo_interval,
Opt_max_credits,
Opt_snapshot,
@@ -170,7 +172,7 @@ struct smb3_fs_context {
char *server_hostname;
char *UNC;
char *nodename;
- char *workstation_name;
+ char workstation_name[CIFS_MAX_WORKSTATION_LEN];
char *iocharset; /* local code page for mapping to and from Unicode */
char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
@@ -222,6 +224,7 @@ struct smb3_fs_context {
bool noautotune:1;
bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
bool no_lease:1; /* disable requesting leases */
+ bool no_sparse:1; /* do not attempt to set files sparse */
bool fsc:1; /* enable fscache */
bool mfsymlinks:1; /* use Minshall+French Symlinks */
bool multiuser:1;
@@ -245,6 +248,8 @@ struct smb3_fs_context {
 /* attribute cache timeout for files and directories in jiffies */
unsigned long acregmax;
unsigned long acdirmax;
+ /* timeout for deferred close of files in jiffies */
+ unsigned long closetimeo;
struct smb_version_operations *ops;
struct smb_version_values *vals;
char *prepath;
@@ -277,4 +282,9 @@ static inline struct smb3_fs_context *smb3_fc2context(const struct fs_context *f
extern int smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx);
extern void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb);
+/*
+ * max deferred close timeout (jiffies) - 2^30
+ */
+#define SMB3_MAX_DCLOSETIMEO (1 << 30)
+#define SMB3_DEF_DCLOSETIMEO (5 * HZ) /* can be increased later; other clients use larger values */
#endif
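These two limits close the loop on the closetimeo option parsed in fs_context.c: the value is stored in jiffies (HZ * seconds) and rejected above 2^30 jiffies. A hedged sketch of how a deferred close would consume it — the deferredclose_wq and cfile->deferred names mirror fields referenced elsewhere in this patch, but the helper itself is illustrative:

	/*
	 * Sketch: arming a deferred close with the mount-time timeout.
	 * ctx->closetimeo is already in jiffies.
	 */
	static void example_defer_close(struct cifsFileInfo *cfile,
					struct cifs_sb_info *cifs_sb)
	{
		queue_delayed_work(deferredclose_wq, &cfile->deferred,
				   cifs_sb->ctx->closetimeo);
	}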
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index a638b29e9062..23ef56f55ce5 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -101,13 +101,13 @@ void cifs_fscache_get_inode_cookie(struct inode *inode)
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
- cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd);
+ cifs_fscache_fill_coherency(&cifsi->netfs.inode, &cd);
- cifsi->netfs_ctx.cache =
+ cifsi->netfs.cache =
fscache_acquire_cookie(tcon->fscache, 0,
&cifsi->uniqueid, sizeof(cifsi->uniqueid),
&cd, sizeof(cd),
- i_size_read(&cifsi->vfs_inode));
+ i_size_read(&cifsi->netfs.inode));
}
void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update)
@@ -131,7 +131,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode)
if (cookie) {
cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cookie);
fscache_relinquish_cookie(cookie, false);
- cifsi->netfs_ctx.cache = NULL;
+ cifsi->netfs.cache = NULL;
}
}
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 52355c0912ae..67b601041f0a 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -52,16 +52,16 @@ void cifs_fscache_fill_coherency(struct inode *inode,
struct cifsInodeInfo *cifsi = CIFS_I(inode);
memset(cd, 0, sizeof(*cd));
- cd->last_write_time_sec = cpu_to_le64(cifsi->vfs_inode.i_mtime.tv_sec);
- cd->last_write_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_mtime.tv_nsec);
- cd->last_change_time_sec = cpu_to_le64(cifsi->vfs_inode.i_ctime.tv_sec);
- cd->last_change_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_ctime.tv_nsec);
+ cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec);
+ cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec);
+ cd->last_change_time_sec = cpu_to_le64(cifsi->netfs.inode.i_ctime.tv_sec);
+ cd->last_change_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_ctime.tv_nsec);
}
static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode)
{
- return netfs_i_cookie(inode);
+ return netfs_i_cookie(&CIFS_I(inode)->netfs);
}
static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags)
@@ -108,17 +108,6 @@ static inline void cifs_readpage_to_fscache(struct inode *inode,
__cifs_readpage_to_fscache(inode, page);
}
-static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)
-{
- if (PageFsCache(page)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- wait_on_page_fscache(page);
- fscache_note_page_release(cifs_inode_cookie(page->mapping->host));
- }
- return true;
-}
-
#else /* CONFIG_CIFS_FSCACHE */
static inline
void cifs_fscache_fill_coherency(struct inode *inode,
@@ -154,11 +143,6 @@ cifs_readpage_from_fscache(struct inode *inode, struct page *page)
static inline
void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {}
-static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
-{
- return true; /* May release page */
-}
-
#endif /* CONFIG_CIFS_FSCACHE */
#endif /* _CIFS_FSCACHE_H */
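The fscache.h cleanups track two renames: the netfs context embedded in cifsInodeInfo is now reached as ->netfs (with the VFS inode at ->netfs.inode), and netfs_i_cookie() takes the netfs_inode rather than the bare inode. The deleted release helpers (note the stray nfs_ prefix on the old !FSCACHE stub) were folded into cifs_release_folio() in file.c. A hedged sketch of the container pattern:

	/*
	 * Sketch: with netfs_inode embedded in cifsInodeInfo, inode,
	 * netfs context and fscache cookie are one container_of() apart.
	 */
	static struct fscache_cookie *example_cookie(struct inode *inode)
	{
		struct cifsInodeInfo *cifsi =
			container_of(inode, struct cifsInodeInfo, netfs.inode);

		return netfs_i_cookie(&cifsi->netfs);
	}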
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 2f9e7d2f81b6..bac08c20f559 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -25,6 +25,7 @@
#include "fscache.h"
#include "fs_context.h"
#include "cifs_ioctl.h"
+#include "cached_dir.h"
static void cifs_set_ops(struct inode *inode)
{
@@ -115,7 +116,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
__func__, cifs_i->uniqueid);
set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags);
/* Invalidate fscache cookie */
- cifs_fscache_fill_coherency(&cifs_i->vfs_inode, &cd);
+ cifs_fscache_fill_coherency(&cifs_i->netfs.inode, &cd);
fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0);
}
@@ -339,6 +340,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
fattr->cf_flags = CIFS_FATTR_DFS_REFERRAL;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static int
cifs_get_file_info_unix(struct file *filp)
{
@@ -432,6 +434,14 @@ int cifs_get_inode_info_unix(struct inode **pinode,
cgiiu_exit:
return rc;
}
+#else
+int cifs_get_inode_info_unix(struct inode **pinode,
+ const unsigned char *full_path,
+ struct super_block *sb, unsigned int xid)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
static int
cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
@@ -795,6 +805,7 @@ static __u64 simple_hashstr(const char *str)
return hash;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/**
* cifs_backup_query_path_info - SMB1 fallback code to get ino
*
@@ -847,6 +858,7 @@ cifs_backup_query_path_info(int xid,
*data = (FILE_ALL_INFO *)info.srch_entries_start;
return 0;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
static void
cifs_set_fattr_ino(int xid,
@@ -991,6 +1003,7 @@ cifs_get_inode_info(struct inode **inode,
rc = 0;
break;
case -EACCES:
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/*
* perm errors, try again with backup flags if possible
*
@@ -1022,6 +1035,9 @@ cifs_get_inode_info(struct inode **inode,
/* nothing we can do, bail out */
goto out;
}
+#else
+ goto out;
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
break;
default:
cifs_dbg(FYI, "%s: unhandled err rc %d\n", __func__, rc);
@@ -1037,8 +1053,9 @@ cifs_get_inode_info(struct inode **inode,
/*
* 4. Tweak fattr based on mount options
*/
-
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
handle_mnt_opt:
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* query for SFU type info if supported and needed */
if (fattr.cf_cifsattrs & ATTR_SYSTEM &&
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
@@ -1223,7 +1240,7 @@ static const struct inode_operations cifs_ipc_inode_ops = {
static int
cifs_find_inode(struct inode *inode, void *opaque)
{
- struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
+ struct cifs_fattr *fattr = opaque;
/* don't match inode with different uniqueid */
if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
@@ -1247,7 +1264,7 @@ cifs_find_inode(struct inode *inode, void *opaque)
static int
cifs_init_inode(struct inode *inode, void *opaque)
{
- struct cifs_fattr *fattr = (struct cifs_fattr *) opaque;
+ struct cifs_fattr *fattr = opaque;
CIFS_I(inode)->uniqueid = fattr->cf_uniqueid;
CIFS_I(inode)->createtime = fattr->cf_createtime;
@@ -1435,6 +1452,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid,
return server->ops->set_file_info(inode, full_path, &info_buf, xid);
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/*
* Open the given file (if it isn't already), set the DELETE_ON_CLOSE bit
* and rename it to a random name that hopefully won't conflict with
@@ -1565,6 +1583,7 @@ undo_setattr:
goto out_close;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* copied from fs/nfs/dir.c with small changes */
static void
@@ -1627,6 +1646,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
}
cifs_close_deferred_file_under_dentry(tcon, full_path);
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
rc = CIFSPOSIXDelFile(xid, tcon, full_path,
@@ -1636,6 +1656,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
if ((rc == 0) || (rc == -ENOENT))
goto psx_del_no_retry;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
retry_std_delete:
if (!server->ops->unlink) {
@@ -1714,9 +1735,11 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
if (tcon->posix_extensions)
rc = smb311_posix_get_inode_info(&inode, full_path, parent->i_sb, xid);
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
else if (tcon->unix_ext)
rc = cifs_get_inode_info_unix(&inode, full_path, parent->i_sb,
xid);
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
else
rc = cifs_get_inode_info(&inode, full_path, NULL, parent->i_sb,
xid, NULL);
@@ -1746,6 +1769,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
if (parent->i_mode & S_ISGID)
mode |= S_ISGID;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (tcon->unix_ext) {
struct cifs_unix_set_info_args args = {
.mode = mode,
@@ -1768,6 +1792,9 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
cifs_sb->local_nls,
cifs_remap(cifs_sb));
} else {
+#else
+ {
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
struct TCP_Server_Info *server = tcon->ses->server;
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
(mode & S_IWUGO) == 0 && server->ops->mkdir_setinfo)
@@ -1788,6 +1815,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
return 0;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static int
cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
const char *full_path, struct cifs_sb_info *cifs_sb,
@@ -1850,6 +1878,7 @@ posix_mkdir_get_info:
xid);
goto posix_mkdir_out;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode,
struct dentry *direntry, umode_t mode)
@@ -1892,6 +1921,7 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode,
goto mkdir_out;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
le64_to_cpu(tcon->fsUnixInfo.Capability))) {
rc = cifs_posix_mkdir(inode, direntry, mode, full_path, cifs_sb,
@@ -1899,6 +1929,7 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode,
if (rc != -EOPNOTSUPP)
goto mkdir_out;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
if (!server->ops->mkdir) {
rc = -ENOSYS;
@@ -2015,9 +2046,12 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
struct tcon_link *tlink;
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct cifs_fid fid;
struct cifs_open_parms oparms;
- int oplock, rc;
+ int oplock;
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
+ int rc;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -2043,6 +2077,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
if (server->vals->protocol_id != 0)
goto do_rename_exit;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/* open-file renames don't work across directories */
if (to_dentry->d_parent != from_dentry->d_parent)
goto do_rename_exit;
@@ -2064,6 +2099,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
cifs_sb->local_nls, cifs_remap(cifs_sb));
CIFSSMBClose(xid, tcon, fid.netfid);
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
do_rename_exit:
if (rc == 0)
d_move(from_dentry, to_dentry);
@@ -2081,11 +2117,13 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
struct cifs_tcon *tcon;
- FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
- FILE_UNIX_BASIC_INFO *info_buf_target;
unsigned int xid;
int rc, tmprc;
int retry_count = 0;
+ FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
+ FILE_UNIX_BASIC_INFO *info_buf_target;
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
@@ -2139,6 +2177,7 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
if (flags & RENAME_NOREPLACE)
goto cifs_rename_exit;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (rc == -EEXIST && tcon->unix_ext) {
/*
* Are src and dst hardlinks of same inode? We can only tell
@@ -2178,6 +2217,8 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
*/
unlink_target:
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
+
/* Try unlinking the target dentry if it's not negative */
if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) {
if (d_is_dir(target_dentry))
@@ -2337,14 +2378,18 @@ int cifs_revalidate_file_attr(struct file *filp)
{
int rc = 0;
struct dentry *dentry = file_dentry(filp);
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
if (!cifs_dentry_needs_reval(dentry))
return rc;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (tlink_tcon(cfile->tlink)->unix_ext)
rc = cifs_get_file_info_unix(filp);
else
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
rc = cifs_get_file_info(filp);
return rc;
@@ -2499,7 +2544,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
u64 len)
{
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
- struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->vfs_inode.i_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->netfs.inode.i_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct TCP_Server_Info *server = tcon->ses->server;
struct cifsFileInfo *cfile;
@@ -2653,6 +2698,7 @@ set_size_out:
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static int
cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
{
@@ -2800,6 +2846,7 @@ out:
free_xid(xid);
return rc;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
static int
cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
@@ -2995,16 +3042,20 @@ cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry,
struct iattr *attrs)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
- struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
int rc, retries = 0;
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
+ struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
if (unlikely(cifs_forced_shutdown(cifs_sb)))
return -EIO;
do {
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (pTcon->unix_ext)
rc = cifs_setattr_unix(direntry, attrs);
else
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
rc = cifs_setattr_nounix(direntry, attrs);
retries++;
} while (is_retryable_error(rc) && retries < 2);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 0359b604bdbc..b6e6e5d6c8dd 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -333,6 +333,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
tcon = tlink_tcon(pSMBFile->tlink);
caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
#ifdef CONFIG_CIFS_POSIX
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (CIFS_UNIX_EXTATTR_CAP & caps) {
__u64 ExtAttrMask = 0;
rc = CIFSGetExtAttr(xid, tcon,
@@ -345,6 +346,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
if (rc != EOPNOTSUPP)
break;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
#endif /* CONFIG_CIFS_POSIX */
rc = 0;
if (CIFS_I(inode)->cifsAttrs & ATTR_COMPRESSED) {
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 852e54ee82c2..6803cb27eecc 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -85,6 +85,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
if (rc != 1)
return -EINVAL;
+ if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
+ return -EINVAL;
+
rc = symlink_hash(link_len, link_str, md5_hash);
if (rc) {
cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
@@ -283,6 +286,7 @@ out:
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/*
* SMB 1.0 Protocol specific functions
*/
@@ -365,6 +369,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
CIFSSMBClose(xid, tcon, fid.netfid);
return rc;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/*
* SMB 2.1/SMB3 Protocol specific functions
@@ -529,11 +534,15 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
goto cifs_hl_exit;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
if (tcon->unix_ext)
rc = CIFSUnixCreateHardLink(xid, tcon, from_name, to_name,
cifs_sb->local_nls,
cifs_remap(cifs_sb));
else {
+#else
+ {
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
server = tcon->ses->server;
if (!server->ops->create_hardlink) {
rc = -ENOSYS;
@@ -701,10 +710,12 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode,
/* BB what if DFS and this volume is on different share? BB */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname);
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
else if (pTcon->unix_ext)
rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
cifs_sb->local_nls,
cifs_remap(cifs_sb));
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* else
rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName,
cifs_sb_target->local_nls); */
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index afaf59c22193..87f60f736731 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -23,6 +23,7 @@
#include "dns_resolve.h"
#endif
#include "fs_context.h"
+#include "cached_dir.h"
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
@@ -69,12 +70,14 @@ sesInfoAlloc(void)
ret_buf = kzalloc(sizeof(struct cifs_ses), GFP_KERNEL);
if (ret_buf) {
atomic_inc(&sesInfoAllocCount);
- ret_buf->status = CifsNew;
+ spin_lock_init(&ret_buf->ses_lock);
+ ret_buf->ses_status = SES_NEW;
++ret_buf->ses_count;
INIT_LIST_HEAD(&ret_buf->smb_ses_list);
INIT_LIST_HEAD(&ret_buf->tcon_list);
mutex_init(&ret_buf->session_mutex);
spin_lock_init(&ret_buf->iface_lock);
+ INIT_LIST_HEAD(&ret_buf->iface_list);
spin_lock_init(&ret_buf->chan_lock);
}
return ret_buf;
@@ -83,6 +86,8 @@ sesInfoAlloc(void)
void
sesInfoFree(struct cifs_ses *buf_to_free)
{
+ struct cifs_server_iface *iface = NULL, *niface = NULL;
+
if (buf_to_free == NULL) {
cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n");
return;
@@ -95,9 +100,12 @@ sesInfoFree(struct cifs_ses *buf_to_free)
kfree_sensitive(buf_to_free->password);
kfree(buf_to_free->user_name);
kfree(buf_to_free->domainName);
- kfree(buf_to_free->workstation_name);
kfree_sensitive(buf_to_free->auth_key.response);
- kfree(buf_to_free->iface_list);
+ spin_lock(&buf_to_free->iface_lock);
+ list_for_each_entry_safe(iface, niface, &buf_to_free->iface_list,
+ iface_head)
+ kref_put(&iface->refcount, release_iface);
+ spin_unlock(&buf_to_free->iface_lock);
kfree_sensitive(buf_to_free);
}
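Session teardown now walks the interface list and drops one reference per entry instead of freeing a flat array; interfaces are kept alive by kref so channels can hold them across list rebuilds. A hedged sketch of the release callback named above (release_iface); its body here is an assumption consistent with the list usage:

	/*
	 * Sketch: kref destructor for a server interface entry — unlink
	 * it from the session's list and free it once the last ref is gone.
	 */
	static void release_iface(struct kref *ref)
	{
		struct cifs_server_iface *iface =
			container_of(ref, struct cifs_server_iface, refcount);

		list_del_init(&iface->iface_head);
		kfree(iface);
	}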
@@ -109,8 +117,8 @@ tconInfoAlloc(void)
ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL);
if (!ret_buf)
return NULL;
- ret_buf->crfid.fid = kzalloc(sizeof(*ret_buf->crfid.fid), GFP_KERNEL);
- if (!ret_buf->crfid.fid) {
+ ret_buf->cfid = init_cached_dir();
+ if (!ret_buf->cfid) {
kfree(ret_buf);
return NULL;
}
@@ -118,10 +126,10 @@ tconInfoAlloc(void)
atomic_inc(&tconInfoAllocCount);
ret_buf->status = TID_NEW;
++ret_buf->tc_count;
+ spin_lock_init(&ret_buf->tc_lock);
INIT_LIST_HEAD(&ret_buf->openFileList);
INIT_LIST_HEAD(&ret_buf->tcon_list);
spin_lock_init(&ret_buf->open_file_lock);
- mutex_init(&ret_buf->crfid.fid_mutex);
spin_lock_init(&ret_buf->stat_lock);
atomic_set(&ret_buf->num_local_opens, 0);
atomic_set(&ret_buf->num_remote_opens, 0);
@@ -130,17 +138,17 @@ tconInfoAlloc(void)
}
void
-tconInfoFree(struct cifs_tcon *buf_to_free)
+tconInfoFree(struct cifs_tcon *tcon)
{
- if (buf_to_free == NULL) {
+ if (tcon == NULL) {
cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n");
return;
}
+ free_cached_dir(tcon);
atomic_dec(&tconInfoAllocCount);
- kfree(buf_to_free->nativeFileSystem);
- kfree_sensitive(buf_to_free->password);
- kfree(buf_to_free->crfid.fid);
- kfree(buf_to_free);
+ kfree(tcon->nativeFileSystem);
+ kfree_sensitive(tcon->password);
+ kfree(tcon);
}
struct smb_hdr *
@@ -164,9 +172,9 @@ cifs_buf_get(void)
/* clear the first few header bytes */
/* for most paths, more is cleared in header_assemble */
memset(ret_buf, 0, buf_size + 3);
- atomic_inc(&bufAllocCount);
+ atomic_inc(&buf_alloc_count);
#ifdef CONFIG_CIFS_STATS2
- atomic_inc(&totBufAllocCount);
+ atomic_inc(&total_buf_alloc_count);
#endif /* CONFIG_CIFS_STATS2 */
return ret_buf;
@@ -181,7 +189,7 @@ cifs_buf_release(void *buf_to_free)
}
mempool_free(buf_to_free, cifs_req_poolp);
- atomic_dec(&bufAllocCount);
+ atomic_dec(&buf_alloc_count);
return;
}
@@ -197,9 +205,9 @@ cifs_small_buf_get(void)
ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS);
/* No need to clear memory here, cleared in header assemble */
/* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
- atomic_inc(&smBufAllocCount);
+ atomic_inc(&small_buf_alloc_count);
#ifdef CONFIG_CIFS_STATS2
- atomic_inc(&totSmBufAllocCount);
+ atomic_inc(&total_small_buf_alloc_count);
#endif /* CONFIG_CIFS_STATS2 */
return ret_buf;
@@ -215,7 +223,7 @@ cifs_small_buf_release(void *buf_to_free)
}
mempool_free(buf_to_free, cifs_sm_req_poolp);
- atomic_dec(&smBufAllocCount);
+ atomic_dec(&small_buf_alloc_count);
return;
}
@@ -346,7 +354,7 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server)
/* otherwise, there is enough to get to the BCC */
if (check_smb_hdr(smb))
return -EIO;
- clc_len = smbCalcSize(smb, server);
+ clc_len = smbCalcSize(smb);
if (4 + rfclen != total_read) {
cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n",
@@ -392,7 +400,6 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
{
struct smb_hdr *buf = (struct smb_hdr *)buffer;
struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf;
- struct list_head *tmp, *tmp1, *tmp2;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifsInodeInfo *pCifsInode;
@@ -459,18 +466,14 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &srv->smb_ses_list) {
- ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
- list_for_each(tmp1, &ses->tcon_list) {
- tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+ list_for_each_entry(ses, &srv->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
if (tcon->tid != buf->Tid)
continue;
cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
spin_lock(&tcon->open_file_lock);
- list_for_each(tmp2, &tcon->openFileList) {
- netfile = list_entry(tmp2, struct cifsFileInfo,
- tlist);
+ list_for_each_entry(netfile, &tcon->openFileList, tlist) {
if (pSMB->Fid != netfile->fid.netfid)
continue;
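The conversions above are mechanical: list_for_each() plus list_entry() collapses into list_for_each_entry(), which types the cursor and drops the separate struct list_head iterators. The general shape:

	/* Sketch: typed list iteration replacing cursor + list_entry(). */
	struct item {
		int id;
		struct list_head link;
	};

	static struct item *example_find(struct list_head *head, int id)
	{
		struct item *it;

		list_for_each_entry(it, head, link)
			if (it->id == id)
				return it;
		return NULL;
	}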
@@ -536,11 +539,11 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
if (oplock == OPLOCK_EXCLUSIVE) {
cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG;
cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else if (oplock == OPLOCK_READ) {
cinode->oplock = CIFS_CACHE_READ_FLG;
cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else
cinode->oplock = 0;
}
@@ -734,6 +737,8 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode)
list_for_each_entry(cfile, &cifs_inode->openFileList, flist) {
if (delayed_work_pending(&cfile->deferred)) {
if (cancel_delayed_work(&cfile->deferred)) {
+ cifs_del_deferred_close(cfile);
+
tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
if (tmp_list == NULL)
break;
@@ -755,16 +760,16 @@ void
cifs_close_all_deferred_files(struct cifs_tcon *tcon)
{
struct cifsFileInfo *cfile;
- struct list_head *tmp;
struct file_list *tmp_list, *tmp_next_list;
struct list_head file_head;
INIT_LIST_HEAD(&file_head);
spin_lock(&tcon->open_file_lock);
- list_for_each(tmp, &tcon->openFileList) {
- cfile = list_entry(tmp, struct cifsFileInfo, tlist);
+ list_for_each_entry(cfile, &tcon->openFileList, tlist) {
if (delayed_work_pending(&cfile->deferred)) {
if (cancel_delayed_work(&cfile->deferred)) {
+ cifs_del_deferred_close(cfile);
+
tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
if (tmp_list == NULL)
break;
@@ -785,7 +790,6 @@ void
cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
{
struct cifsFileInfo *cfile;
- struct list_head *tmp;
struct file_list *tmp_list, *tmp_next_list;
struct list_head file_head;
void *page;
@@ -794,12 +798,13 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
INIT_LIST_HEAD(&file_head);
page = alloc_dentry_path();
spin_lock(&tcon->open_file_lock);
- list_for_each(tmp, &tcon->openFileList) {
- cfile = list_entry(tmp, struct cifsFileInfo, tlist);
+ list_for_each_entry(cfile, &tcon->openFileList, tlist) {
full_path = build_path_from_dentry(cfile->dentry, page);
if (strstr(full_path, path)) {
if (delayed_work_pending(&cfile->deferred)) {
if (cancel_delayed_work(&cfile->deferred)) {
+ cifs_del_deferred_close(cfile);
+
tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC);
if (tmp_list == NULL)
break;
@@ -1021,7 +1026,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
saved_len = count;
while (count && npages < max_pages) {
- rc = iov_iter_get_pages(iter, pages, count, max_pages, &start);
+ rc = iov_iter_get_pages2(iter, pages, count, max_pages, &start);
if (rc < 0) {
cifs_dbg(VFS, "Couldn't get user pages (rc=%zd)\n", rc);
break;
@@ -1033,7 +1038,6 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
break;
}
- iov_iter_advance(iter, rc);
count -= rc;
rc += start;
cur_npages = DIV_ROUND_UP(rc, PAGE_SIZE);
@@ -1210,18 +1214,23 @@ static struct super_block *__cifs_get_super(void (*f)(struct super_block *, void
.data = data,
.sb = NULL,
};
+ struct file_system_type **fs_type = (struct file_system_type *[]) {
+ &cifs_fs_type, &smb3_fs_type, NULL,
+ };
- iterate_supers_type(&cifs_fs_type, f, &sd);
-
- if (!sd.sb)
- return ERR_PTR(-EINVAL);
- /*
- * Grab an active reference in order to prevent automounts (DFS links)
- * of expiring and then freeing up our cifs superblock pointer while
- * we're doing failover.
- */
- cifs_sb_active(sd.sb);
- return sd.sb;
+ for (; *fs_type; fs_type++) {
+ iterate_supers_type(*fs_type, f, &sd);
+ if (sd.sb) {
+ /*
+ * Grab an active reference in order to prevent automounts (DFS links)
+ * from expiring and then freeing up our cifs superblock pointer while
+ * we're doing failover.
+ */
+ cifs_sb_active(sd.sb);
+ return sd.sb;
+ }
+ }
+ return ERR_PTR(-EINVAL);
}
static void __cifs_put_super(struct super_block *sb)
@@ -1309,7 +1318,7 @@ int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix)
* for "\<server>\<dfsname>\<linkpath>" DFS reference,
* where <dfsname> contains non-ASCII unicode symbols.
*
- * Check such DFS reference and emulate -ENOENT if it is actual.
+ * Check such DFS reference.
*/
int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid,
struct cifs_tcon *tcon,
@@ -1341,10 +1350,6 @@ int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid,
cifs_dbg(FYI, "DFS ref '%s' is found, emulate -EREMOTE\n",
dfspath);
rc = -EREMOTE;
- } else if (rc == -EEXIST) {
- cifs_dbg(FYI, "DFS ref '%s' is not found, emulate -ENOENT\n",
- dfspath);
- rc = -ENOENT;
} else {
cifs_dbg(FYI, "%s: dfs_cache_find returned %d\n", __func__, rc);
}
diff --git a/fs/cifs/netlink.c b/fs/cifs/netlink.c
index 291cb606f149..147d9409252c 100644
--- a/fs/cifs/netlink.c
+++ b/fs/cifs/netlink.c
@@ -51,6 +51,7 @@ struct genl_family cifs_genl_family = {
.policy = cifs_genl_policy,
.ops = cifs_genl_ops,
.n_ops = ARRAY_SIZE(cifs_genl_ops),
+ .resv_start_op = CIFS_GENL_CMD_SWN_NOTIFY + 1,
.mcgrps = cifs_genl_mcgrps,
.n_mcgrps = ARRAY_SIZE(cifs_genl_mcgrps),
};
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index ebe236b9d9f5..1b52e6ac431c 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -896,7 +896,7 @@ map_and_check_smb_error(struct mid_q_entry *mid, bool logErr)
if (class == ERRSRV && code == ERRbaduid) {
cifs_dbg(FYI, "Server returned 0x%x, reconnecting session...\n",
code);
- cifs_reconnect(mid->server, false);
+ cifs_signal_cifsd_for_reconnect(mid->server, false);
}
}
@@ -909,9 +909,9 @@ map_and_check_smb_error(struct mid_q_entry *mid, bool logErr)
* portion, the number of word parameters and the data portion of the message
*/
unsigned int
-smbCalcSize(void *buf, struct TCP_Server_Info *server)
+smbCalcSize(void *buf)
{
- struct smb_hdr *ptr = (struct smb_hdr *)buf;
+ struct smb_hdr *ptr = buf;
return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
2 /* size of the bcc field */ + get_bcc(ptr));
}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 1929e80c09ee..8e060c00c969 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -21,6 +21,7 @@
#include "cifsfs.h"
#include "smb2proto.h"
#include "fs_context.h"
+#include "cached_dir.h"
/*
* To be safe - for UCS to UTF-8 with strings loaded with the rare long
@@ -805,8 +806,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
end_of_smb = cfile->srch_inf.ntwrk_buf_start +
server->ops->calc_smb_size(
- cfile->srch_inf.ntwrk_buf_start,
- server);
+ cfile->srch_inf.ntwrk_buf_start);
cur_ent = cfile->srch_inf.srch_entries_start;
first_entry_in_buffer = cfile->srch_inf.index_of_last_entry
@@ -840,9 +840,109 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
return rc;
}
+static bool emit_cached_dirents(struct cached_dirents *cde,
+ struct dir_context *ctx)
+{
+ struct cached_dirent *dirent;
+ int rc;
+
+ list_for_each_entry(dirent, &cde->entries, entry) {
+ if (ctx->pos >= dirent->pos)
+ continue;
+ ctx->pos = dirent->pos;
+ rc = dir_emit(ctx, dirent->name, dirent->namelen,
+ dirent->fattr.cf_uniqueid,
+ dirent->fattr.cf_dtype);
+ if (!rc)
+ return rc;
+ }
+ return true;
+}
+
+static void update_cached_dirents_count(struct cached_dirents *cde,
+ struct dir_context *ctx)
+{
+ if (cde->ctx != ctx)
+ return;
+ if (cde->is_valid || cde->is_failed)
+ return;
+
+ cde->pos++;
+}
+
+static void finished_cached_dirents_count(struct cached_dirents *cde,
+ struct dir_context *ctx)
+{
+ if (cde->ctx != ctx)
+ return;
+ if (cde->is_valid || cde->is_failed)
+ return;
+ if (ctx->pos != cde->pos)
+ return;
+
+ cde->is_valid = 1;
+}
+
+static void add_cached_dirent(struct cached_dirents *cde,
+ struct dir_context *ctx,
+ const char *name, int namelen,
+ struct cifs_fattr *fattr)
+{
+ struct cached_dirent *de;
+
+ if (cde->ctx != ctx)
+ return;
+ if (cde->is_valid || cde->is_failed)
+ return;
+ if (ctx->pos != cde->pos) {
+ cde->is_failed = 1;
+ return;
+ }
+ de = kzalloc(sizeof(*de), GFP_ATOMIC);
+ if (de == NULL) {
+ cde->is_failed = 1;
+ return;
+ }
+ de->namelen = namelen;
+ de->name = kstrndup(name, namelen, GFP_ATOMIC);
+ if (de->name == NULL) {
+ kfree(de);
+ cde->is_failed = 1;
+ return;
+ }
+ de->pos = ctx->pos;
+
+ memcpy(&de->fattr, fattr, sizeof(struct cifs_fattr));
+
+ list_add_tail(&de->entry, &cde->entries);
+}
+
+static bool cifs_dir_emit(struct dir_context *ctx,
+ const char *name, int namelen,
+ struct cifs_fattr *fattr,
+ struct cached_fid *cfid)
+{
+ bool rc;
+ ino_t ino = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
+
+ rc = dir_emit(ctx, name, namelen, ino, fattr->cf_dtype);
+ if (!rc)
+ return rc;
+
+ if (cfid) {
+ mutex_lock(&cfid->dirents.de_mutex);
+ add_cached_dirent(&cfid->dirents, ctx, name, namelen,
+ fattr);
+ mutex_unlock(&cfid->dirents.de_mutex);
+ }
+
+ return rc;
+}
+
static int cifs_filldir(char *find_entry, struct file *file,
- struct dir_context *ctx,
- char *scratch_buf, unsigned int max_len)
+ struct dir_context *ctx,
+ char *scratch_buf, unsigned int max_len,
+ struct cached_fid *cfid)
{
struct cifsFileInfo *file_info = file->private_data;
struct super_block *sb = file_inode(file)->i_sb;
@@ -851,7 +951,6 @@ static int cifs_filldir(char *find_entry, struct file *file,
struct cifs_fattr fattr;
struct qstr name;
int rc = 0;
- ino_t ino;
rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level,
file_info->srch_inf.unicode);
@@ -931,8 +1030,8 @@ static int cifs_filldir(char *find_entry, struct file *file,
cifs_prime_dcache(file_dentry(file), &name, &fattr);
- ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
- return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype);
+ return !cifs_dir_emit(ctx, name.name, name.len,
+ &fattr, cfid);
}
@@ -941,8 +1040,9 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
int rc = 0;
unsigned int xid;
int i;
+ struct tcon_link *tlink = NULL;
struct cifs_tcon *tcon;
- struct cifsFileInfo *cifsFile = NULL;
+ struct cifsFileInfo *cifsFile;
char *current_entry;
int num_to_fill = 0;
char *tmp_buf = NULL;
@@ -950,6 +1050,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
unsigned int max_len;
const char *full_path;
void *page = alloc_dentry_path();
+ struct cached_fid *cfid = NULL;
+ struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
xid = get_xid();
@@ -959,6 +1061,54 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
goto rddir2_exit;
}
+ if (file->private_data == NULL) {
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ goto cache_not_found;
+ tcon = tlink_tcon(tlink);
+ } else {
+ cifsFile = file->private_data;
+ tcon = tlink_tcon(cifsFile->tlink);
+ }
+
+ rc = open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid);
+ cifs_put_tlink(tlink);
+ if (rc)
+ goto cache_not_found;
+
+ mutex_lock(&cfid->dirents.de_mutex);
+ /*
+ * If this read started from the beginning of the directory,
+ * we need to initialize scanning and storing of the directory
+ * content.
+ */
+ if (ctx->pos == 0 && cfid->dirents.ctx == NULL) {
+ cfid->dirents.ctx = ctx;
+ cfid->dirents.pos = 2;
+ }
+ /*
+ * If we already have the entire directory cached then
+ * we can just serve the cache.
+ */
+ if (cfid->dirents.is_valid) {
+ if (!dir_emit_dots(file, ctx)) {
+ mutex_unlock(&cfid->dirents.de_mutex);
+ goto rddir2_exit;
+ }
+ emit_cached_dirents(&cfid->dirents, ctx);
+ mutex_unlock(&cfid->dirents.de_mutex);
+ goto rddir2_exit;
+ }
+ mutex_unlock(&cfid->dirents.de_mutex);
+
+ /*
+ * Drop the cache while calling initiate_cifs_search and
+ * find_cifs_entry, in case there are reconnects during
+ * query_directory.
+ */
+ close_cached_dir(cfid);
+ cfid = NULL;
+
+ cache_not_found:
/*
* Ensure FindFirst doesn't fail before doing filldir() for '.' and
* '..'. Otherwise we won't be able to notify VFS in case of failure.
@@ -977,7 +1127,6 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
is in current search buffer?
if it before then restart search
if after then keep searching till find it */
-
cifsFile = file->private_data;
if (cifsFile->srch_inf.endOfSearch) {
if (cifsFile->srch_inf.emptyDir) {
@@ -993,20 +1142,25 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
tcon = tlink_tcon(cifsFile->tlink);
rc = find_cifs_entry(xid, tcon, ctx->pos, file, full_path,
&current_entry, &num_to_fill);
+ open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid);
if (rc) {
cifs_dbg(FYI, "fce error %d\n", rc);
goto rddir2_exit;
} else if (current_entry != NULL) {
cifs_dbg(FYI, "entry %lld found\n", ctx->pos);
} else {
+ if (cfid) {
+ mutex_lock(&cfid->dirents.de_mutex);
+ finished_cached_dirents_count(&cfid->dirents, ctx);
+ mutex_unlock(&cfid->dirents.de_mutex);
+ }
cifs_dbg(FYI, "Could not find entry\n");
goto rddir2_exit;
}
cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
max_len = tcon->ses->server->ops->calc_smb_size(
- cifsFile->srch_inf.ntwrk_buf_start,
- tcon->ses->server);
+ cifsFile->srch_inf.ntwrk_buf_start);
end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
@@ -1028,7 +1182,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
*/
*tmp_buf = 0;
rc = cifs_filldir(current_entry, file, ctx,
- tmp_buf, max_len);
+ tmp_buf, max_len, cfid);
if (rc) {
if (rc > 0)
rc = 0;
@@ -1036,6 +1190,12 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
}
ctx->pos++;
+ if (cfid) {
+ mutex_lock(&cfid->dirents.de_mutex);
+ update_cached_dirents_count(&cfid->dirents, ctx);
+ mutex_unlock(&cfid->dirents.de_mutex);
+ }
+
if (ctx->pos ==
cifsFile->srch_inf.index_of_last_entry) {
cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
@@ -1050,6 +1210,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
kfree(tmp_buf);
rddir2_exit:
+ if (cfid)
+ close_cached_dir(cfid);
free_dentry_path(page);
free_xid(xid);
return rc;
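Taken together, the readdir.c additions cache a directory's entries on the cached handle the first time it is scanned from the start: dirents.ctx pins the fill to a single dir_context, dirents.pos tracks an in-order scan, add_cached_dirent() appends entries, and finished_cached_dirents_count() marks the cache valid only when an uninterrupted scan reaches the end; a seek or allocation failure sets is_failed instead. The lifecycle, restated as a sketch (the real code tracks it with the is_valid/is_failed bits rather than an enum):

	/* Sketch of the cache lifecycle driven by the helpers above. */
	enum example_dirent_cache_state {
		EXAMPLE_FILLING,	/* ctx bound, pos advancing in order */
		EXAMPLE_VALID,		/* full scan completed; serve from cache */
		EXAMPLE_FAILED,		/* out-of-order pos or -ENOMEM; bypass */
	};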
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 32f478c7a66d..3af3b05b6c74 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -58,7 +58,7 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface)
spin_lock(&ses->chan_lock);
for (i = 0; i < ses->chan_count; i++) {
- if (is_server_using_iface(ses->chans[i].server, iface)) {
+ if (ses->chans[i].iface == iface) {
spin_unlock(&ses->chan_lock);
return true;
}
@@ -81,11 +81,41 @@ cifs_ses_get_chan_index(struct cifs_ses *ses,
}
/* If we didn't find the channel, it is likely a bug */
+ if (server)
+ cifs_dbg(VFS, "unable to get chan index for server: 0x%llx",
+ server->conn_id);
WARN_ON(1);
return 0;
}
void
+cifs_chan_set_in_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
+{
+ unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ ses->chans[chan_index].in_reconnect = true;
+}
+
+void
+cifs_chan_clear_in_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
+{
+ unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ ses->chans[chan_index].in_reconnect = false;
+}
+
+bool
+cifs_chan_in_reconnect(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
+{
+ unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ return CIFS_CHAN_IN_RECONNECT(ses, chan_index);
+}
+
+void
cifs_chan_set_need_reconnect(struct cifs_ses *ses,
struct TCP_Server_Info *server)
{
@@ -116,16 +146,24 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses,
return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index);
}
+bool
+cifs_chan_is_iface_active(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
+{
+ unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ return ses->chans[chan_index].iface &&
+ ses->chans[chan_index].iface->is_active;
+}
+
/* returns number of channels added */
int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
{
int old_chan_count, new_chan_count;
int left;
- int i = 0;
int rc = 0;
int tries = 0;
- struct cifs_server_iface *ifaces = NULL;
- size_t iface_count;
+ struct cifs_server_iface *iface = NULL, *niface = NULL;
spin_lock(&ses->chan_lock);
@@ -155,32 +193,16 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
spin_unlock(&ses->chan_lock);
/*
- * Make a copy of the iface list at the time and use that
- * instead so as to not hold the iface spinlock for opening
- * channels
- */
- spin_lock(&ses->iface_lock);
- iface_count = ses->iface_count;
- if (iface_count <= 0) {
- spin_unlock(&ses->iface_lock);
- cifs_dbg(VFS, "no iface list available to open channels\n");
- return 0;
- }
- ifaces = kmemdup(ses->iface_list, iface_count*sizeof(*ifaces),
- GFP_ATOMIC);
- if (!ifaces) {
- spin_unlock(&ses->iface_lock);
- return 0;
- }
- spin_unlock(&ses->iface_lock);
-
- /*
* Keep connecting to same, fastest, iface for all channels as
* long as its RSS. Try next fastest one if not RSS or channel
* creation fails.
*/
+ spin_lock(&ses->iface_lock);
+ iface = list_first_entry(&ses->iface_list, struct cifs_server_iface,
+ iface_head);
+ spin_unlock(&ses->iface_lock);
+
while (left > 0) {
- struct cifs_server_iface *iface;
tries++;
if (tries > 3*ses->chan_max) {
@@ -189,31 +211,128 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
break;
}
- iface = &ifaces[i];
- if (is_ses_using_iface(ses, iface) && !iface->rss_capable) {
- i = (i+1) % iface_count;
- continue;
+ spin_lock(&ses->iface_lock);
+ if (!ses->iface_count) {
+ spin_unlock(&ses->iface_lock);
+ break;
}
- rc = cifs_ses_add_channel(cifs_sb, ses, iface);
- if (rc) {
- cifs_dbg(FYI, "failed to open extra channel on iface#%d rc=%d\n",
- i, rc);
- i = (i+1) % iface_count;
- continue;
+ list_for_each_entry_safe_from(iface, niface, &ses->iface_list,
+ iface_head) {
+ /* skip ifaces that are unusable */
+ if (!iface->is_active ||
+ (is_ses_using_iface(ses, iface) &&
+ !iface->rss_capable)) {
+ continue;
+ }
+
+ /* take ref before unlock */
+ kref_get(&iface->refcount);
+
+ spin_unlock(&ses->iface_lock);
+ rc = cifs_ses_add_channel(cifs_sb, ses, iface);
+ spin_lock(&ses->iface_lock);
+
+ if (rc) {
+ cifs_dbg(VFS, "failed to open extra channel on iface:%pIS rc=%d\n",
+ &iface->sockaddr,
+ rc);
+ kref_put(&iface->refcount, release_iface);
+ continue;
+ }
+
+ cifs_dbg(FYI, "successfully opened new channel on iface:%pIS\n",
+ &iface->sockaddr);
+ break;
}
+ spin_unlock(&ses->iface_lock);
- cifs_dbg(FYI, "successfully opened new channel on iface#%d\n",
- i);
left--;
new_chan_count++;
}
- kfree(ifaces);
return new_chan_count - old_chan_count;
}
/*
+ * update the iface for the channel if necessary.
+ * will return 0 when iface is updated, 1 if removed, 2 otherwise
+ * Must be called with chan_lock held.
+ */
+int
+cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
+{
+ unsigned int chan_index;
+ struct cifs_server_iface *iface = NULL;
+ struct cifs_server_iface *old_iface = NULL;
+ int rc = 0;
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (!chan_index) {
+ spin_unlock(&ses->chan_lock);
+ return 0;
+ }
+
+ if (ses->chans[chan_index].iface) {
+ old_iface = ses->chans[chan_index].iface;
+ if (old_iface->is_active) {
+ spin_unlock(&ses->chan_lock);
+ return 1;
+ }
+ }
+ spin_unlock(&ses->chan_lock);
+
+ spin_lock(&ses->iface_lock);
+ /* then look for a new one */
+ list_for_each_entry(iface, &ses->iface_list, iface_head) {
+ if (!iface->is_active ||
+ (is_ses_using_iface(ses, iface) &&
+ !iface->rss_capable)) {
+ continue;
+ }
+ kref_get(&iface->refcount);
+ break;
+ }
+
+ if (list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ rc = 1;
+ iface = NULL;
+ cifs_dbg(FYI, "unable to find a suitable iface\n");
+ }
+
+ /* now drop the ref to the current iface */
+ if (old_iface && iface) {
+ kref_put(&old_iface->refcount, release_iface);
+ cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n",
+ &old_iface->sockaddr,
+ &iface->sockaddr);
+ } else if (old_iface) {
+ kref_put(&old_iface->refcount, release_iface);
+ cifs_dbg(FYI, "releasing ref to iface: %pIS\n",
+ &old_iface->sockaddr);
+ } else {
+ WARN_ON(!iface);
+ cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr);
+ }
+ spin_unlock(&ses->iface_lock);
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ ses->chans[chan_index].iface = iface;
+
+ /* no iface found; if this is a secondary channel, drop the connection */
+ if (!iface && CIFS_SERVER_IS_CHAN(server))
+ ses->chans[chan_index].server = NULL;
+
+ spin_unlock(&ses->chan_lock);
+
+ if (!iface && CIFS_SERVER_IS_CHAN(server))
+ cifs_put_tcp_session(server, false);
+
+ return rc;
+}
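cifs_chan_update_iface() rebinds a channel to a live interface after the list has been refreshed. Note the lock discipline: the replacement is selected and referenced under iface_lock, then published to the channel slot under chan_lock, so the two locks are never nested. A hedged sketch of that publish step in isolation:

	/* Sketch: reference-then-publish, taking each lock separately. */
	static void example_publish(struct cifs_ses *ses, unsigned int idx,
				    struct cifs_server_iface *new_iface,
				    struct cifs_server_iface *old_iface)
	{
		/* take a reference for the channel slot */
		spin_lock(&ses->iface_lock);
		kref_get(&new_iface->refcount);
		spin_unlock(&ses->iface_lock);

		/* publish under chan_lock; the two locks never nest */
		spin_lock(&ses->chan_lock);
		ses->chans[idx].iface = new_iface;
		spin_unlock(&ses->chan_lock);

		/* release_iface() unlinks the entry, so take iface_lock
		 * around the final put on the old interface
		 */
		if (old_iface) {
			spin_lock(&ses->iface_lock);
			kref_put(&old_iface->refcount, release_iface);
			spin_unlock(&ses->iface_lock);
		}
	}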
+
+/*
* If server is a channel of ses, return the corresponding enclosing
* cifs_chan otherwise return NULL.
*/
@@ -274,7 +393,10 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
/* Auth */
ctx.domainauto = ses->domainAuto;
ctx.domainname = ses->domainName;
- ctx.server_hostname = ses->server->hostname;
+
+ /* no hostname for extra channels */
+ ctx.server_hostname = "";
+
ctx.username = ses->user_name;
ctx.password = ses->password;
ctx.sectype = ses->sectype;
@@ -322,6 +444,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
spin_unlock(&ses->chan_lock);
goto out;
}
+ chan->iface = iface;
ses->chan_count++;
atomic_set(&ses->chan_seq, 0);
@@ -351,6 +474,14 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
out:
if (rc && chan->server) {
+ /*
+ * we should avoid race with these delayed works before we
+ * remove this channel
+ */
+ cancel_delayed_work_sync(&chan->server->echo);
+ cancel_delayed_work_sync(&chan->server->resolve);
+ cancel_delayed_work_sync(&chan->server->reconnect);
+
spin_lock(&ses->chan_lock);
/* we rely on all bits beyond chan_count to be clear */
cifs_chan_clear_need_reconnect(ses, chan->server);
@@ -361,14 +492,14 @@ out:
*/
WARN_ON(ses->chan_count < 1);
spin_unlock(&ses->chan_lock);
- }
- if (rc && chan->server)
cifs_put_tcp_session(chan->server, 0);
+ }
return rc;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static __u32 cifs_ssetup_hdr(struct cifs_ses *ses,
struct TCP_Server_Info *server,
SESSION_SETUP_ANDX *pSMB)
@@ -461,7 +592,6 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
*pbcc_area = bcc_ptr;
}
-
static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
@@ -623,6 +753,7 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
for it later, but it is not very important */
cifs_dbg(FYI, "ascii: bytes left %d\n", bleft);
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
struct cifs_ses *ses)
@@ -714,9 +845,9 @@ static int size_of_ntlmssp_blob(struct cifs_ses *ses, int base_size)
else
sz += sizeof(__le16);
- if (ses->workstation_name)
+ if (ses->workstation_name[0])
sz += sizeof(__le16) * strnlen(ses->workstation_name,
- CIFS_MAX_WORKSTATION_LEN);
+ ntlmssp_workstation_name_size(ses));
else
sz += sizeof(__le16);
@@ -960,7 +1091,7 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer,
cifs_security_buffer_from_str(&sec_blob->WorkstationName,
ses->workstation_name,
- CIFS_MAX_WORKSTATION_LEN,
+ ntlmssp_workstation_name_size(ses),
*pbuffer, &tmp,
nls_cp);
@@ -1040,6 +1171,7 @@ struct sess_data {
struct kvec iov[3];
};
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static int
sess_alloc_buffer(struct sess_data *sess_data, int wct)
{
@@ -1093,14 +1225,14 @@ sess_establish_session(struct sess_data *sess_data)
struct cifs_ses *ses = sess_data->ses;
struct TCP_Server_Info *server = sess_data->server;
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
if (!server->session_estab) {
if (server->sign) {
server->session_key.response =
kmemdup(ses->auth_key.response,
ses->auth_key.len, GFP_KERNEL);
if (!server->session_key.response) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
return -ENOMEM;
}
server->session_key.len =
@@ -1109,7 +1241,7 @@ sess_establish_session(struct sess_data *sess_data)
server->sequence_number = 0x2;
server->session_estab = true;
}
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
cifs_dbg(FYI, "CIFS session established successfully\n");
return 0;
@@ -1716,3 +1848,4 @@ out:
kfree(sess_data);
return rc;
}
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
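The gating idiom applied to sess.c above, reduced to its skeleton; legacy_setup() and do_setup() are hypothetical stand-ins for the SMB1 session-setup routines being fenced off:

#include <linux/errno.h>

#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/* built only when the insecure-dialect Kconfig option is enabled */
static int legacy_setup(void)
{
	return 0;
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */

static int do_setup(void)
{
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
	return legacy_setup();
#else
	return -EOPNOTSUPP;	/* SMB1/SMB2.0 support compiled out */
#endif
}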
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index c71c9a44bef4..f36b2d2d40ca 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -38,10 +38,10 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_rqst *rqst,
in_buf->WordCount = 0;
put_bcc(0, in_buf);
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
if (rc) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
return rc;
}
@@ -55,7 +55,7 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_rqst *rqst,
if (rc < 0)
server->sequence_number--;
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n",
get_mid(in_buf), rc);
@@ -92,17 +92,17 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
struct smb_hdr *buf = (struct smb_hdr *)buffer;
struct mid_q_entry *mid;
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
list_for_each_entry(mid, &server->pending_mid_q, qhead) {
if (compare_mid(mid->mid, buf) &&
mid->mid_state == MID_REQUEST_SUBMITTED &&
le16_to_cpu(mid->command) == buf->Command) {
kref_get(&mid->refcount);
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
return mid;
}
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
return NULL;
}
@@ -166,7 +166,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
__u16 last_mid, cur_mid;
bool collision, reconnect = false;
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
/* mid is 16 bit only for CIFS/SMB */
cur_mid = (__u16)((server->CurrentMid) & 0xffff);
@@ -225,7 +225,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
}
cur_mid++;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
if (reconnect) {
cifs_signal_cifsd_for_reconnect(server, false);
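The GlobalMid_Lock-to-mid_lock conversion above follows one shape everywhere: take the per-connection lock, walk that connection's queue, pin the match before dropping the lock. A self-contained sketch with simplified types (mid_entry and srv are illustrative):

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/kref.h>

struct mid_entry {
	struct list_head qhead;
	__u64 mid;
	struct kref refcount;
};

struct srv {
	spinlock_t mid_lock;		/* guards only this server's pending_mid_q */
	struct list_head pending_mid_q;
};

static struct mid_entry *find_mid(struct srv *server, __u64 wire_mid)
{
	struct mid_entry *mid;

	spin_lock(&server->mid_lock);
	list_for_each_entry(mid, &server->pending_mid_q, qhead) {
		if (mid->mid == wire_mid) {
			kref_get(&mid->refcount);	/* pin before unlocking */
			spin_unlock(&server->mid_lock);
			return mid;
		}
	}
	spin_unlock(&server->mid_lock);
	return NULL;
}

Scoping the lock per TCP_Server_Info removes the cross-connection contention the single GlobalMid_Lock used to impose.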
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index f5dcc4940b6d..9dfd2dd612c2 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -61,7 +61,6 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
nr_ioctl_req.Reserved = 0;
rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid,
fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY,
- true /* is_fsctl */,
(char *)&nr_ioctl_req, sizeof(nr_ioctl_req),
CIFSMaxBufSize, NULL, NULL /* no return info */);
if (rc == -EOPNOTSUPP) {
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index fe5bfa245fa7..b83f59051b26 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -23,6 +23,7 @@
#include "smb2glob.h"
#include "smb2pdu.h"
#include "smb2proto.h"
+#include "cached_dir.h"
static void
free_set_inf_compound(struct smb_rqst *rqst)
@@ -362,8 +363,6 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
num_rqst++;
if (cfile) {
- cifsFileInfo_put(cfile);
- cfile = NULL;
rc = compound_send_recv(xid, ses, server,
flags, num_rqst - 2,
&rqst[1], &resp_buftype[1],
@@ -514,16 +513,19 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
if (smb2_data == NULL)
return -ENOMEM;
+ if (strcmp(full_path, ""))
+ rc = -ENOENT;
+ else
+ rc = open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid);
/* If it is a root and its handle is cached then use it */
- rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid);
if (!rc) {
- if (tcon->crfid.file_all_info_is_valid) {
+ if (cfid->file_all_info_is_valid) {
move_smb2_info_to_cifs(data,
- &tcon->crfid.file_all_info);
+ &cfid->file_all_info);
} else {
rc = SMB2_query_info(xid, tcon,
- cfid->fid->persistent_fid,
- cfid->fid->volatile_fid, smb2_data);
+ cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid, smb2_data);
if (!rc)
move_smb2_info_to_cifs(data, smb2_data);
}
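The cached handle only ever covers the share root, so the query path now rejects non-empty paths before touching the cache. A sketch of that calling convention, assuming the open_cached_dir()/close_cached_dir() pair this patch moves into cached_dir.c:

static int query_root_via_cache(unsigned int xid, struct cifs_tcon *tcon,
				const char *full_path,
				struct cifs_sb_info *cifs_sb)
{
	struct cached_fid *cfid;
	int rc;

	if (strcmp(full_path, ""))
		return -ENOENT;		/* cache covers only the root */

	rc = open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid);
	if (rc)
		return rc;

	/* cfid->fid.persistent_fid / cfid->fid.volatile_fid usable here */
	close_cached_dir(cfid);		/* drop the reference open took */
	return 0;
}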
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index c653beb735b8..d73e5672aac4 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -16,6 +16,7 @@
#include "smb2status.h"
#include "smb2glob.h"
#include "nterr.h"
+#include "cached_dir.h"
static int
check_smb2_hdr(struct smb2_hdr *shdr, __u64 mid)
@@ -132,15 +133,15 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len,
}
int
-smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
+smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
{
struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
struct smb2_pdu *pdu = (struct smb2_pdu *)shdr;
- __u64 mid;
- __u32 clc_len; /* calculated length */
- int command;
- int pdu_size = sizeof(struct smb2_pdu);
int hdr_size = sizeof(struct smb2_hdr);
+ int pdu_size = sizeof(struct smb2_pdu);
+ int command;
+ __u32 calc_len; /* calculated length */
+ __u64 mid;
/*
* Add function to do table lookup of StructureSize by command
@@ -150,16 +151,18 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
struct smb2_transform_hdr *thdr =
(struct smb2_transform_hdr *)buf;
struct cifs_ses *ses = NULL;
+ struct cifs_ses *iter;
/* decrypt frame now that it is completely read in */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &srvr->smb_ses_list, smb_ses_list) {
- if (ses->Suid == le64_to_cpu(thdr->SessionId))
+ list_for_each_entry(iter, &server->smb_ses_list, smb_ses_list) {
+ if (iter->Suid == le64_to_cpu(thdr->SessionId)) {
+ ses = iter;
break;
+ }
}
spin_unlock(&cifs_tcp_ses_lock);
- if (list_entry_is_head(ses, &srvr->smb_ses_list,
- smb_ses_list)) {
+ if (!ses) {
cifs_dbg(VFS, "no decryption - session id not found\n");
return 1;
}
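The rewrite above drops the list_entry_is_head() test in favour of an explicit result pointer: the list_for_each_entry() cursor is never NULL after the loop, so a found/not-found decision needs its own variable. The safe shape, condensed:

struct cifs_ses *found = NULL, *iter;

spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(iter, &server->smb_ses_list, smb_ses_list) {
	if (iter->Suid == sid) {	/* sid: the session id being matched */
		found = iter;		/* record the match explicitly */
		break;
	}
}
spin_unlock(&cifs_tcp_ses_lock);

if (!found)
	return 1;			/* no session: frame cannot be decrypted */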
@@ -219,30 +222,33 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
}
}
- clc_len = smb2_calc_size(buf, srvr);
+ calc_len = smb2_calc_size(buf);
+
+
+ /*
+ * For SMB2_IOCTL, OutputOffset and OutputLength are optional, so they
+ * might be 0 without indicating a real miscalculation.
+ */
+ return 0;
- if (shdr->Command == SMB2_NEGOTIATE)
- clc_len += get_neg_ctxt_len(shdr, len, clc_len);
+ if (command == SMB2_NEGOTIATE_HE)
+ calc_len += get_neg_ctxt_len(shdr, len, calc_len);
- if (len != clc_len) {
- cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n",
- clc_len, len, mid);
+ if (len != calc_len) {
/* create failed on symlink */
if (command == SMB2_CREATE_HE &&
shdr->Status == STATUS_STOPPED_ON_SYMLINK)
return 0;
/* Windows 7 server returns 24 bytes more */
- if (clc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE)
+ if (calc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE)
return 0;
/* server can return one byte more due to implied bcc[0] */
- if (clc_len == len + 1)
+ if (calc_len == len + 1)
return 0;
/*
* Some windows servers (win2016) will pad also the final
* PDU in a compound to 8 bytes.
*/
- if (((clc_len + 7) & ~7) == len)
+ if (((calc_len + 7) & ~7) == len)
return 0;
/*
@@ -251,12 +257,18 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
* SMB2/SMB3 frame length (header + smb2 response specific data)
* Some windows servers also pad up to 8 bytes when compounding.
*/
- if (clc_len < len)
+ if (calc_len < len)
return 0;
- pr_warn_once(
- "srv rsp too short, len %d not %d. cmd:%d mid:%llu\n",
- len, clc_len, command, mid);
+ /* Only log a message if len was really miscalculated */
+ if (unlikely(cifsFYI))
+ cifs_dbg(FYI, "Server response too short: calculated "
+ "length %u doesn't match read length %u (cmd=%d, mid=%llu)\n",
+ calc_len, len, command, mid);
+ else
+ pr_warn("Server response too short: calculated length "
+ "%u doesn't match read length %u (cmd=%d, mid=%llu)\n",
+ calc_len, len, command, mid);
return 1;
}
@@ -398,9 +410,9 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr)
* portion, the number of word parameters and the data portion of the message.
*/
unsigned int
-smb2_calc_size(void *buf, struct TCP_Server_Info *srvr)
+smb2_calc_size(void *buf)
{
- struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
+ struct smb2_pdu *pdu = buf;
struct smb2_hdr *shdr = &pdu->hdr;
int offset; /* the offset from the beginning of SMB to data area */
int data_length; /* the length of the variable length data area */
@@ -637,15 +649,7 @@ smb2_is_valid_lease_break(char *buffer)
}
spin_unlock(&tcon->open_file_lock);
- if (tcon->crfid.is_valid &&
- !memcmp(rsp->LeaseKey,
- tcon->crfid.fid->lease_key,
- SMB2_LEASE_KEY_SIZE)) {
- tcon->crfid.time = 0;
- INIT_WORK(&tcon->crfid.lease_break,
- smb2_cached_lease_break);
- queue_work(cifsiod_wq,
- &tcon->crfid.lease_break);
+ if (cached_dir_lease_break(tcon, rsp->LeaseKey)) {
spin_unlock(&cifs_tcp_ses_lock);
return true;
}
@@ -654,6 +658,12 @@ smb2_is_valid_lease_break(char *buffer)
}
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "Can not process lease break - no lease matched\n");
+ trace_smb3_lease_not_found(le32_to_cpu(rsp->CurrentLeaseState),
+ le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
+ le64_to_cpu(rsp->hdr.SessionId),
+ *((u64 *)rsp->LeaseKey),
+ *((u64 *)&rsp->LeaseKey[8]));
+
return false;
}
@@ -724,6 +734,10 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
}
spin_unlock(&cifs_tcp_ses_lock);
cifs_dbg(FYI, "No file id matched, oplock break ignored\n");
+ trace_smb3_oplock_not_found(0 /* no xid */, rsp->PersistentFid,
+ le32_to_cpu(rsp->hdr.Id.SyncId.TreeId),
+ le64_to_cpu(rsp->hdr.SessionId));
+
return true;
}
@@ -796,7 +810,7 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
if (tcon->ses)
server = tcon->ses->server;
- cifs_server_dbg(FYI, "tid=%u: tcon is closing, skipping async close retry of fid %llu %llu\n",
+ cifs_server_dbg(FYI, "tid=0x%x: tcon is closing, skipping async close retry of fid %llu %llu\n",
tcon->tid, persistent_fid, volatile_fid);
return 0;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index db23f5b404ba..421be43af425 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -27,6 +27,7 @@
#include "smbdirect.h"
#include "fscache.h"
#include "fs_context.h"
+#include "cached_dir.h"
/* Change credits for different ops and return the total number of credits */
static int
@@ -86,6 +87,9 @@ smb2_add_credits(struct TCP_Server_Info *server,
if (*val > 65000) {
*val = 65000; /* Don't get near 64K credits, avoid srv bugs */
pr_warn_once("server overflowed SMB3 credits\n");
+ trace_smb3_overflow_credits(server->CurrentMid,
+ server->conn_id, server->hostname, *val,
+ add, server->in_flight);
}
server->in_flight--;
if (server->in_flight == 0 &&
@@ -123,13 +127,13 @@ smb2_add_credits(struct TCP_Server_Info *server,
optype, scredits, add);
}
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedReconnect
|| server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
switch (rc) {
case -1:
@@ -215,12 +219,12 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
spin_lock(&server->req_lock);
} else {
spin_unlock(&server->req_lock);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -ENOENT;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
spin_lock(&server->req_lock);
scredits = server->credits;
@@ -251,7 +255,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_wait_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits, -(credits->value), in_flight);
cifs_dbg(FYI, "%s: removed %u credits total=%d\n",
__func__, credits->value, scredits);
@@ -300,7 +304,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_adj_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits,
credits->value - new_val, in_flight);
cifs_dbg(FYI, "%s: adjust added %u credits total=%d\n",
@@ -316,19 +320,19 @@ smb2_get_next_mid(struct TCP_Server_Info *server)
{
__u64 mid;
/* for SMB2 we need the current value */
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
mid = server->CurrentMid++;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
return mid;
}
static void
smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
{
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
if (server->CurrentMid >= val)
server->CurrentMid -= val;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
}
static struct mid_q_entry *
@@ -343,7 +347,7 @@ __smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue)
return NULL;
}
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
list_for_each_entry(mid, &server->pending_mid_q, qhead) {
if ((mid->mid == wire_mid) &&
(mid->mid_state == MID_REQUEST_SUBMITTED) &&
@@ -353,11 +357,11 @@ __smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue)
list_del_init(&mid->qhead);
mid->mid_flags |= MID_DELETED;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
return mid;
}
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
return NULL;
}
@@ -383,7 +387,7 @@ smb2_dump_detail(void *buf, struct TCP_Server_Info *server)
shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId,
shdr->Id.SyncId.ProcessId);
cifs_server_dbg(VFS, "smb buf %p len %u\n", buf,
- server->ops->calc_smb_size(buf, server));
+ server->ops->calc_smb_size(buf));
#endif
}
@@ -400,9 +404,9 @@ smb2_negotiate(const unsigned int xid,
{
int rc;
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
server->CurrentMid = 0;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
rc = SMB2_negotiate(xid, ses, server);
/* BB we probably don't need to retry with modern servers */
if (rc == -EAGAIN)
@@ -509,73 +513,41 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
static int
parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
size_t buf_len,
- struct cifs_server_iface **iface_list,
- size_t *iface_count)
+ struct cifs_ses *ses)
{
struct network_interface_info_ioctl_rsp *p;
struct sockaddr_in *addr4;
struct sockaddr_in6 *addr6;
struct iface_info_ipv4 *p4;
struct iface_info_ipv6 *p6;
- struct cifs_server_iface *info;
+ struct cifs_server_iface *info = NULL, *iface = NULL, *niface = NULL;
+ struct cifs_server_iface tmp_iface;
ssize_t bytes_left;
size_t next = 0;
int nb_iface = 0;
- int rc = 0;
-
- *iface_list = NULL;
- *iface_count = 0;
-
- /*
- * Fist pass: count and sanity check
- */
+ int rc = 0, ret = 0;
bytes_left = buf_len;
p = buf;
- while (bytes_left >= sizeof(*p)) {
- nb_iface++;
- next = le32_to_cpu(p->Next);
- if (!next) {
- bytes_left -= sizeof(*p);
- break;
- }
- p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next);
- bytes_left -= next;
- }
-
- if (!nb_iface) {
- cifs_dbg(VFS, "%s: malformed interface info\n", __func__);
- rc = -EINVAL;
- goto out;
- }
-
- /* Azure rounds the buffer size up 8, to a 16 byte boundary */
- if ((bytes_left > 8) || p->Next)
- cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
-
+ spin_lock(&ses->iface_lock);
/*
- * Second pass: extract info to internal structure
+ * Go through iface_list and do kref_put to remove
+ * any unused ifaces. ifaces in use will be removed
+ * when the last user calls a kref_put on it
*/
-
- *iface_list = kcalloc(nb_iface, sizeof(**iface_list), GFP_KERNEL);
- if (!*iface_list) {
- rc = -ENOMEM;
- goto out;
+ list_for_each_entry_safe(iface, niface, &ses->iface_list,
+ iface_head) {
+ iface->is_active = 0;
+ kref_put(&iface->refcount, release_iface);
}
+ spin_unlock(&ses->iface_lock);
- info = *iface_list;
- bytes_left = buf_len;
- p = buf;
while (bytes_left >= sizeof(*p)) {
- info->speed = le64_to_cpu(p->LinkSpeed);
- info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
- info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0;
-
- cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count);
- cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed);
- cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__,
- le32_to_cpu(p->Capability));
+ memset(&tmp_iface, 0, sizeof(tmp_iface));
+ tmp_iface.speed = le64_to_cpu(p->LinkSpeed);
+ tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
+ tmp_iface.rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0;
switch (p->Family) {
/*
@@ -584,7 +556,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
* conversion explicit in case either one changes.
*/
case INTERNETWORK:
- addr4 = (struct sockaddr_in *)&info->sockaddr;
+ addr4 = (struct sockaddr_in *)&tmp_iface.sockaddr;
p4 = (struct iface_info_ipv4 *)p->Buffer;
addr4->sin_family = AF_INET;
memcpy(&addr4->sin_addr, &p4->IPv4Address, 4);
@@ -596,7 +568,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
&addr4->sin_addr);
break;
case INTERNETWORKV6:
- addr6 = (struct sockaddr_in6 *)&info->sockaddr;
+ addr6 = (struct sockaddr_in6 *)&tmp_iface.sockaddr;
p6 = (struct iface_info_ipv6 *)p->Buffer;
addr6->sin6_family = AF_INET6;
memcpy(&addr6->sin6_addr, &p6->IPv6Address, 16);
@@ -616,50 +588,100 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
goto next_iface;
}
- (*iface_count)++;
- info++;
+ /*
+ * The iface_list is assumed to be sorted by speed.
+ * Check if the new interface exists in that list.
+ * NEVER change an existing iface entry; it could be in use.
+ * Add a new one instead.
+ */
+ spin_lock(&ses->iface_lock);
+ iface = niface = NULL;
+ list_for_each_entry_safe(iface, niface, &ses->iface_list,
+ iface_head) {
+ ret = iface_cmp(iface, &tmp_iface);
+ if (!ret) {
+ /* just get a ref so that it doesn't get picked/freed */
+ iface->is_active = 1;
+ kref_get(&iface->refcount);
+ spin_unlock(&ses->iface_lock);
+ goto next_iface;
+ } else if (ret < 0) {
+ /* all remaining ifaces are slower */
+ kref_get(&iface->refcount);
+ break;
+ }
+ }
+ spin_unlock(&ses->iface_lock);
+
+ /* no match. insert the entry in the list */
+ info = kmalloc(sizeof(struct cifs_server_iface),
+ GFP_KERNEL);
+ if (!info) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ memcpy(info, &tmp_iface, sizeof(tmp_iface));
+
+ /* initialize the new entry before linking it into the list */
+ kref_init(&info->refcount);
+ info->is_active = 1;
+
+ cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, ses->iface_count);
+ cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed);
+ cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__,
+ le32_to_cpu(p->Capability));
+
+ spin_lock(&ses->iface_lock);
+ if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ list_add_tail(&info->iface_head, &iface->iface_head);
+ kref_put(&iface->refcount, release_iface);
+ } else
+ list_add_tail(&info->iface_head, &ses->iface_list);
+ spin_unlock(&ses->iface_lock);
+
+ ses->iface_count++;
+ ses->iface_last_update = jiffies;
next_iface:
+ nb_iface++;
next = le32_to_cpu(p->Next);
- if (!next)
+ if (!next) {
+ bytes_left -= sizeof(*p);
break;
+ }
p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next);
bytes_left -= next;
}
- if (!*iface_count) {
+ if (!nb_iface) {
+ cifs_dbg(VFS, "%s: malformed interface info\n", __func__);
rc = -EINVAL;
goto out;
}
-out:
- if (rc) {
- kfree(*iface_list);
- *iface_count = 0;
- *iface_list = NULL;
- }
- return rc;
-}
+ /* Azure rounds the buffer size up by 8, to a 16-byte boundary */
+ if ((bytes_left > 8) || p->Next)
+ cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
-static int compare_iface(const void *ia, const void *ib)
-{
- const struct cifs_server_iface *a = (struct cifs_server_iface *)ia;
- const struct cifs_server_iface *b = (struct cifs_server_iface *)ib;
- return a->speed == b->speed ? 0 : (a->speed > b->speed ? -1 : 1);
+ if (!ses->iface_count) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+out:
+ return rc;
}
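The insertion loop above keeps iface_list sorted fastest-first. Condensed to the ordering logic alone (the real code also refcounts the cursor and handles duplicates via the iface_cmp() == 0 case):

static void insert_by_speed(struct cifs_ses *ses,
			    struct cifs_server_iface *new_entry)
{
	struct cifs_server_iface *iface;

	spin_lock(&ses->iface_lock);
	list_for_each_entry(iface, &ses->iface_list, iface_head)
		if (iface_cmp(iface, new_entry) < 0)
			break;		/* first entry slower than the new one */
	/*
	 * list_add_tail() on a member node links new_entry just before it;
	 * if the loop ran off the end, iface->iface_head aliases the list
	 * head and this appends at the tail instead.
	 */
	list_add_tail(&new_entry->iface_head, &iface->iface_head);
	spin_unlock(&ses->iface_lock);
}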
-static int
+int
SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
{
int rc;
unsigned int ret_data_len = 0;
struct network_interface_info_ioctl_rsp *out_buf = NULL;
- struct cifs_server_iface *iface_list;
- size_t iface_count;
struct cifs_ses *ses = tcon->ses;
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
- FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */,
+ FSCTL_QUERY_NETWORK_INTERFACE_INFO,
NULL /* no data input */, 0 /* no data input */,
CIFSMaxBufSize, (char **)&out_buf, &ret_data_len);
if (rc == -EOPNOTSUPP) {
@@ -671,300 +693,16 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
goto out;
}
- rc = parse_server_interfaces(out_buf, ret_data_len,
- &iface_list, &iface_count);
+ rc = parse_server_interfaces(out_buf, ret_data_len, ses);
if (rc)
goto out;
- /* sort interfaces from fastest to slowest */
- sort(iface_list, iface_count, sizeof(*iface_list), compare_iface, NULL);
-
- spin_lock(&ses->iface_lock);
- kfree(ses->iface_list);
- ses->iface_list = iface_list;
- ses->iface_count = iface_count;
- ses->iface_last_update = jiffies;
- spin_unlock(&ses->iface_lock);
-
out:
kfree(out_buf);
return rc;
}
static void
-smb2_close_cached_fid(struct kref *ref)
-{
- struct cached_fid *cfid = container_of(ref, struct cached_fid,
- refcount);
-
- if (cfid->is_valid) {
- cifs_dbg(FYI, "clear cached root file handle\n");
- SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid,
- cfid->fid->volatile_fid);
- }
-
- /*
- * We only check validity above to send SMB2_close,
- * but we still need to invalidate these entries
- * when this function is called
- */
- cfid->is_valid = false;
- cfid->file_all_info_is_valid = false;
- cfid->has_lease = false;
- if (cfid->dentry) {
- dput(cfid->dentry);
- cfid->dentry = NULL;
- }
-}
-
-void close_cached_dir(struct cached_fid *cfid)
-{
- mutex_lock(&cfid->fid_mutex);
- kref_put(&cfid->refcount, smb2_close_cached_fid);
- mutex_unlock(&cfid->fid_mutex);
-}
-
-void close_cached_dir_lease_locked(struct cached_fid *cfid)
-{
- if (cfid->has_lease) {
- cfid->has_lease = false;
- kref_put(&cfid->refcount, smb2_close_cached_fid);
- }
-}
-
-void close_cached_dir_lease(struct cached_fid *cfid)
-{
- mutex_lock(&cfid->fid_mutex);
- close_cached_dir_lease_locked(cfid);
- mutex_unlock(&cfid->fid_mutex);
-}
-
-void
-smb2_cached_lease_break(struct work_struct *work)
-{
- struct cached_fid *cfid = container_of(work,
- struct cached_fid, lease_break);
-
- close_cached_dir_lease(cfid);
-}
-
-/*
- * Open the and cache a directory handle.
- * Only supported for the root handle.
- */
-int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
- const char *path,
- struct cifs_sb_info *cifs_sb,
- struct cached_fid **cfid)
-{
- struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = ses->server;
- struct cifs_open_parms oparms;
- struct smb2_create_rsp *o_rsp = NULL;
- struct smb2_query_info_rsp *qi_rsp = NULL;
- int resp_buftype[2];
- struct smb_rqst rqst[2];
- struct kvec rsp_iov[2];
- struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
- struct kvec qi_iov[1];
- int rc, flags = 0;
- __le16 utf16_path = 0; /* Null - since an open of top of share */
- u8 oplock = SMB2_OPLOCK_LEVEL_II;
- struct cifs_fid *pfid;
- struct dentry *dentry;
-
- if (tcon->nohandlecache)
- return -ENOTSUPP;
-
- if (cifs_sb->root == NULL)
- return -ENOENT;
-
- if (strlen(path))
- return -ENOENT;
-
- dentry = cifs_sb->root;
-
- mutex_lock(&tcon->crfid.fid_mutex);
- if (tcon->crfid.is_valid) {
- cifs_dbg(FYI, "found a cached root file handle\n");
- *cfid = &tcon->crfid;
- kref_get(&tcon->crfid.refcount);
- mutex_unlock(&tcon->crfid.fid_mutex);
- return 0;
- }
-
- /*
- * We do not hold the lock for the open because in case
- * SMB2_open needs to reconnect, it will end up calling
- * cifs_mark_open_files_invalid() which takes the lock again
- * thus causing a deadlock
- */
-
- mutex_unlock(&tcon->crfid.fid_mutex);
-
- if (smb3_encryption_required(tcon))
- flags |= CIFS_TRANSFORM_REQ;
-
- if (!server->ops->new_lease_key)
- return -EIO;
-
- pfid = tcon->crfid.fid;
- server->ops->new_lease_key(pfid);
-
- memset(rqst, 0, sizeof(rqst));
- resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
- memset(rsp_iov, 0, sizeof(rsp_iov));
-
- /* Open */
- memset(&open_iov, 0, sizeof(open_iov));
- rqst[0].rq_iov = open_iov;
- rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
-
- oparms.tcon = tcon;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
- oparms.desired_access = FILE_READ_ATTRIBUTES;
- oparms.disposition = FILE_OPEN;
- oparms.fid = pfid;
- oparms.reconnect = false;
-
- rc = SMB2_open_init(tcon, server,
- &rqst[0], &oplock, &oparms, &utf16_path);
- if (rc)
- goto oshr_free;
- smb2_set_next_command(tcon, &rqst[0]);
-
- memset(&qi_iov, 0, sizeof(qi_iov));
- rqst[1].rq_iov = qi_iov;
- rqst[1].rq_nvec = 1;
-
- rc = SMB2_query_info_init(tcon, server,
- &rqst[1], COMPOUND_FID,
- COMPOUND_FID, FILE_ALL_INFORMATION,
- SMB2_O_INFO_FILE, 0,
- sizeof(struct smb2_file_all_info) +
- PATH_MAX * 2, 0, NULL);
- if (rc)
- goto oshr_free;
-
- smb2_set_related(&rqst[1]);
-
- rc = compound_send_recv(xid, ses, server,
- flags, 2, rqst,
- resp_buftype, rsp_iov);
- mutex_lock(&tcon->crfid.fid_mutex);
-
- /*
- * Now we need to check again as the cached root might have
- * been successfully re-opened from a concurrent process
- */
-
- if (tcon->crfid.is_valid) {
- /* work was already done */
-
- /* stash fids for close() later */
- struct cifs_fid fid = {
- .persistent_fid = pfid->persistent_fid,
- .volatile_fid = pfid->volatile_fid,
- };
-
- /*
- * caller expects this func to set the fid in crfid to valid
- * cached root, so increment the refcount.
- */
- kref_get(&tcon->crfid.refcount);
-
- mutex_unlock(&tcon->crfid.fid_mutex);
-
- if (rc == 0) {
- /* close extra handle outside of crit sec */
- SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
- }
- rc = 0;
- goto oshr_free;
- }
-
- /* Cached root is still invalid, continue normaly */
-
- if (rc) {
- if (rc == -EREMCHG) {
- tcon->need_reconnect = true;
- pr_warn_once("server share %s deleted\n",
- tcon->treeName);
- }
- goto oshr_exit;
- }
-
- atomic_inc(&tcon->num_remote_opens);
-
- o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
- oparms.fid->persistent_fid = o_rsp->PersistentFileId;
- oparms.fid->volatile_fid = o_rsp->VolatileFileId;
-#ifdef CONFIG_CIFS_DEBUG2
- oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId);
-#endif /* CIFS_DEBUG2 */
-
- tcon->crfid.tcon = tcon;
- tcon->crfid.is_valid = true;
- tcon->crfid.dentry = dentry;
- dget(dentry);
- kref_init(&tcon->crfid.refcount);
-
- /* BB TBD check to see if oplock level check can be removed below */
- if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) {
- /*
- * See commit 2f94a3125b87. Increment the refcount when we
- * get a lease for root, release it if lease break occurs
- */
- kref_get(&tcon->crfid.refcount);
- tcon->crfid.has_lease = true;
- smb2_parse_contexts(server, o_rsp,
- &oparms.fid->epoch,
- oparms.fid->lease_key, &oplock,
- NULL, NULL);
- } else
- goto oshr_exit;
-
- qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
- if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
- goto oshr_exit;
- if (!smb2_validate_and_copy_iov(
- le16_to_cpu(qi_rsp->OutputBufferOffset),
- sizeof(struct smb2_file_all_info),
- &rsp_iov[1], sizeof(struct smb2_file_all_info),
- (char *)&tcon->crfid.file_all_info))
- tcon->crfid.file_all_info_is_valid = true;
- tcon->crfid.time = jiffies;
-
-
-oshr_exit:
- mutex_unlock(&tcon->crfid.fid_mutex);
-oshr_free:
- SMB2_open_free(&rqst[0]);
- SMB2_query_info_free(&rqst[1]);
- free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
- free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
- if (rc == 0)
- *cfid = &tcon->crfid;
- return rc;
-}
-
-int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
- struct dentry *dentry,
- struct cached_fid **cfid)
-{
- mutex_lock(&tcon->crfid.fid_mutex);
- if (tcon->crfid.dentry == dentry) {
- cifs_dbg(FYI, "found a cached root file handle by dentry\n");
- *cfid = &tcon->crfid;
- kref_get(&tcon->crfid.refcount);
- mutex_unlock(&tcon->crfid.fid_mutex);
- return 0;
- }
- mutex_unlock(&tcon->crfid.fid_mutex);
- return -ENOENT;
-}
-
-static void
smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb)
{
@@ -982,9 +720,9 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = open_cached_dir(xid, tcon, "", cifs_sb, &cfid);
+ rc = open_cached_dir(xid, tcon, "", cifs_sb, false, &cfid);
if (rc == 0)
- memcpy(&fid, cfid->fid, sizeof(struct cifs_fid));
+ memcpy(&fid, &cfid->fid, sizeof(struct cifs_fid));
else
rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL,
NULL, NULL);
@@ -1045,9 +783,16 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_open_parms oparms;
struct cifs_fid fid;
+ struct cached_fid *cfid;
- if ((*full_path == 0) && tcon->crfid.is_valid)
- return 0;
+ rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true, &cfid);
+ if (!rc) {
+ if (cfid->is_valid) {
+ close_cached_dir(cfid);
+ return 0;
+ }
+ close_cached_dir(cfid);
+ }
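The probe above passes true for the flag that in cached_dir.c means lookup-only (return a handle only if one is already cached, never open a new one) — that semantic is inferred from the call sites in this patch, so treat this sketch as an assumption:

struct cached_fid *cfid;

rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true /* lookup only */, &cfid);
if (!rc) {
	bool valid = cfid->is_valid;

	close_cached_dir(cfid);		/* balance the lookup reference */
	if (valid)
		return 0;		/* root already known accessible */
}
/* cache miss: fall through to a real SMB2 create below */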
utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
if (!utf16_path)
@@ -1114,9 +859,7 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size,
size_t name_len, value_len, user_name_len;
while (src_size > 0) {
- name = &src->ea_data[0];
name_len = (size_t)src->ea_name_length;
- value = &src->ea_data[src->ea_name_length + 1];
value_len = (size_t)le16_to_cpu(src->ea_value_length);
if (name_len == 0)
@@ -1128,6 +871,9 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size,
goto out;
}
+ name = &src->ea_data[0];
+ value = &src->ea_data[src->ea_name_length + 1];
+
if (ea_name) {
if (ea_name_len == name_len &&
memcmp(ea_name, name, name_len) == 0) {
@@ -1577,9 +1323,8 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon,
struct resume_key_req *res_key;
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
- FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */,
- NULL, 0 /* no input */, CIFSMaxBufSize,
- (char **)&res_key, &ret_data_len);
+ FSCTL_SRV_REQUEST_RESUME_KEY, NULL, 0 /* no input */,
+ CIFSMaxBufSize, (char **)&res_key, &ret_data_len);
if (rc == -EOPNOTSUPP) {
pr_warn_once("Server share %s does not support copy range\n", tcon->treeName);
@@ -1721,7 +1466,7 @@ smb2_ioctl_query_info(const unsigned int xid,
rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
rc = SMB2_ioctl_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID,
- qi.info_type, true, buffer, qi.output_buffer_length,
+ qi.info_type, buffer, qi.output_buffer_length,
CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE -
MAX_SMB2_CLOSE_RESPONSE_SIZE);
free_req1_func = SMB2_ioctl_free;
@@ -1857,7 +1602,6 @@ smb2_copychunk_range(const unsigned int xid,
ssize_t bytes_written, total_bytes_written = 0;
pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
-
if (pcchunk == NULL)
return -ENOMEM;
@@ -1889,9 +1633,8 @@ smb2_copychunk_range(const unsigned int xid,
retbuf = NULL;
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
- true /* is_fsctl */, (char *)pcchunk,
- sizeof(struct copychunk_ioctl), CIFSMaxBufSize,
- (char **)&retbuf, &ret_data_len);
+ (char *)pcchunk, sizeof(struct copychunk_ioctl),
+ CIFSMaxBufSize, (char **)&retbuf, &ret_data_len);
if (rc == 0) {
if (ret_data_len !=
sizeof(struct copychunk_ioctl_rsp)) {
@@ -2051,7 +1794,6 @@ static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, FSCTL_SET_SPARSE,
- true /* is_fctl */,
&setsparse, 1, CIFSMaxBufSize, NULL, NULL);
if (rc) {
tcon->broken_sparse_sup = true;
@@ -2134,7 +1876,6 @@ smb2_duplicate_extents(const unsigned int xid,
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
trgtfile->fid.volatile_fid,
FSCTL_DUPLICATE_EXTENTS_TO_FILE,
- true /* is_fsctl */,
(char *)&dup_ext_buf,
sizeof(struct duplicate_extents_to_file),
CIFSMaxBufSize, NULL,
@@ -2169,7 +1910,6 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon,
return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
FSCTL_SET_INTEGRITY_INFORMATION,
- true /* is_fsctl */,
(char *)&integr_info,
sizeof(struct fsctl_set_integrity_information_req),
CIFSMaxBufSize, NULL,
@@ -2222,7 +1962,6 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
FSCTL_SRV_ENUMERATE_SNAPSHOTS,
- true /* is_fsctl */,
NULL, 0 /* no input data */, max_response_size,
(char **)&retbuf,
&ret_data_len);
@@ -2492,7 +2231,7 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_pend_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits,
le16_to_cpu(shdr->CreditRequest), in_flight);
cifs_dbg(FYI, "%s: status pending add %u credits total=%d\n",
@@ -2535,7 +2274,6 @@ static void
smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
{
struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
- struct list_head *tmp, *tmp1;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -2543,12 +2281,12 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
return;
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &server->smb_ses_list) {
- ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
- list_for_each(tmp1, &ses->tcon_list) {
- tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) {
+ spin_lock(&tcon->tc_lock);
tcon->need_reconnect = true;
+ spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
pr_warn_once("Server share %s deleted.\n",
tcon->treeName);
@@ -2684,7 +2422,12 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
memset(rsp_iov, 0, sizeof(rsp_iov));
- rc = open_cached_dir(xid, tcon, path, cifs_sb, &cfid);
+ /*
+ * We can only call this for things we know are directories.
+ */
+ if (!strcmp(path, ""))
+ open_cached_dir(xid, tcon, path, cifs_sb, false,
+ &cfid); /* cfid null if open dir failed */
memset(&open_iov, 0, sizeof(open_iov));
rqst[0].rq_iov = open_iov;
@@ -2710,8 +2453,8 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
if (cfid) {
rc = SMB2_query_info_init(tcon, server,
&rqst[1],
- cfid->fid->persistent_fid,
- cfid->fid->volatile_fid,
+ cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid,
class, type, 0,
output_len, 0,
NULL);
@@ -2941,7 +2684,6 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
do {
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_DFS_GET_REFERRALS,
- true /* is_fsctl */,
(char *)dfs_req, dfs_req_size, CIFSMaxBufSize,
(char **)&dfs_rsp, &dfs_rsp_size);
if (!is_retryable_error(rc))
@@ -3148,8 +2890,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl_init(tcon, server,
&rqst[1], fid.persistent_fid,
- fid.volatile_fid, FSCTL_GET_REPARSE_POINT,
- true /* is_fctl */, NULL, 0,
+ fid.volatile_fid, FSCTL_GET_REPARSE_POINT, NULL, 0,
CIFSMaxBufSize -
MAX_SMB2_CREATE_RESPONSE_SIZE -
MAX_SMB2_CLOSE_RESPONSE_SIZE);
@@ -3329,8 +3070,7 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl_init(tcon, server,
&rqst[1], COMPOUND_FID,
- COMPOUND_FID, FSCTL_GET_REPARSE_POINT,
- true /* is_fctl */, NULL, 0,
+ COMPOUND_FID, FSCTL_GET_REPARSE_POINT, NULL, 0,
CIFSMaxBufSize -
MAX_SMB2_CREATE_RESPONSE_SIZE -
MAX_SMB2_CLOSE_RESPONSE_SIZE);
@@ -3558,26 +3298,43 @@ get_smb2_acl(struct cifs_sb_info *cifs_sb,
return pntsd;
}
+static long smb3_zero_data(struct file *file, struct cifs_tcon *tcon,
+ loff_t offset, loff_t len, unsigned int xid)
+{
+ struct cifsFileInfo *cfile = file->private_data;
+ struct file_zero_data_information fsctl_buf;
+
+ cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len);
+
+ fsctl_buf.FileOffset = cpu_to_le64(offset);
+ fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
+
+ return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
+ (char *)&fsctl_buf,
+ sizeof(struct file_zero_data_information),
+ 0, NULL, NULL);
+}
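With the FSCTL hoisted into smb3_zero_data(), both fallocate paths share it; the zero-range caller, under the locking introduced in the hunks that follow, reduces to roughly:

inode_lock(inode);
filemap_invalidate_lock(inode->i_mapping);

/* drop cached pages so they can't go stale against the server */
truncate_pagecache_range(inode, offset, offset + len - 1);

rc = smb3_zero_data(file, tcon, offset, len, xid);

filemap_invalidate_unlock(inode->i_mapping);
inode_unlock(inode);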
+
static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
loff_t offset, loff_t len, bool keep_size)
{
struct cifs_ses *ses = tcon->ses;
- struct inode *inode;
- struct cifsInodeInfo *cifsi;
+ struct inode *inode = file_inode(file);
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct cifsFileInfo *cfile = file->private_data;
- struct file_zero_data_information fsctl_buf;
long rc;
unsigned int xid;
__le64 eof;
xid = get_xid();
- inode = d_inode(cfile->dentry);
- cifsi = CIFS_I(inode);
-
trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid,
ses->Suid, offset, len);
+ inode_lock(inode);
+ filemap_invalidate_lock(inode->i_mapping);
+
/*
* We zero the range through ioctl, so we need to remove the page caches
* first, otherwise the data may be inconsistent with the server.
@@ -3585,26 +3342,12 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
truncate_pagecache_range(inode, offset, offset + len - 1);
/* if file not oplocked can't be sure whether asking to extend size */
- if (!CIFS_CACHE_READ(cifsi))
- if (keep_size == false) {
- rc = -EOPNOTSUPP;
- trace_smb3_zero_err(xid, cfile->fid.persistent_fid,
- tcon->tid, ses->Suid, offset, len, rc);
- free_xid(xid);
- return rc;
- }
-
- cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len);
-
- fsctl_buf.FileOffset = cpu_to_le64(offset);
- fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
+ rc = -EOPNOTSUPP;
+ if (keep_size == false && !CIFS_CACHE_READ(cifsi))
+ goto zero_range_exit;
- rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, true,
- (char *)&fsctl_buf,
- sizeof(struct file_zero_data_information),
- 0, NULL, NULL);
- if (rc)
+ rc = smb3_zero_data(file, tcon, offset, len, xid);
+ if (rc < 0)
goto zero_range_exit;
/*
@@ -3617,6 +3360,8 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
}
zero_range_exit:
+ filemap_invalidate_unlock(inode->i_mapping);
+ inode_unlock(inode);
free_xid(xid);
if (rc)
trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid,
@@ -3630,7 +3375,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
loff_t offset, loff_t len)
{
- struct inode *inode;
+ struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
struct file_zero_data_information fsctl_buf;
long rc;
@@ -3639,14 +3384,12 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
xid = get_xid();
- inode = d_inode(cfile->dentry);
-
+ inode_lock(inode);
/* Need to make file sparse, if not already, before freeing range. */
/* Consider adding equivalent for compressed since it could also work */
if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) {
rc = -EOPNOTSUPP;
- free_xid(xid);
- return rc;
+ goto out;
}
filemap_invalidate_lock(inode->i_mapping);
@@ -3663,11 +3406,13 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
- true /* is_fctl */, (char *)&fsctl_buf,
+ (char *)&fsctl_buf,
sizeof(struct file_zero_data_information),
CIFSMaxBufSize, NULL, NULL);
- free_xid(xid);
filemap_invalidate_unlock(inode->i_mapping);
+out:
+ inode_unlock(inode);
+ free_xid(xid);
return rc;
}
@@ -3723,7 +3468,7 @@ static int smb3_simple_fallocate_range(unsigned int xid,
in_data.length = cpu_to_le64(len);
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
- FSCTL_QUERY_ALLOCATED_RANGES, true,
+ FSCTL_QUERY_ALLOCATED_RANGES,
(char *)&in_data, sizeof(in_data),
1024 * sizeof(struct file_allocated_range_buffer),
(char **)&out_data, &out_data_len);
@@ -3826,7 +3571,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
if (rc)
goto out;
- if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0)
+ if (cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)
smb2_set_sparse(xid, tcon, cfile, inode, false);
eof = cpu_to_le64(off + len);
@@ -3924,39 +3669,50 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
{
int rc;
unsigned int xid;
- struct inode *inode;
+ struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
- struct cifsInodeInfo *cifsi;
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
__le64 eof;
+ loff_t old_eof;
xid = get_xid();
- inode = d_inode(cfile->dentry);
- cifsi = CIFS_I(inode);
+ inode_lock(inode);
- if (off >= i_size_read(inode) ||
- off + len >= i_size_read(inode)) {
+ old_eof = i_size_read(inode);
+ if ((off >= old_eof) ||
+ off + len >= old_eof) {
rc = -EINVAL;
goto out;
}
+ filemap_invalidate_lock(inode->i_mapping);
+ rc = filemap_write_and_wait_range(inode->i_mapping, off, old_eof - 1);
+ if (rc < 0)
+ goto out_2;
+
+ truncate_pagecache_range(inode, off, old_eof);
+
rc = smb2_copychunk_range(xid, cfile, cfile, off + len,
- i_size_read(inode) - off - len, off);
+ old_eof - off - len, off);
if (rc < 0)
- goto out;
+ goto out_2;
- eof = cpu_to_le64(i_size_read(inode) - len);
+ eof = cpu_to_le64(old_eof - len);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, cfile->pid, &eof);
if (rc < 0)
- goto out;
+ goto out_2;
rc = 0;
cifsi->server_eof = i_size_read(inode) - len;
truncate_setsize(inode, cifsi->server_eof);
fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof);
+out_2:
+ filemap_invalidate_unlock(inode->i_mapping);
out:
+ inode_unlock(inode);
free_xid(xid);
return rc;
}
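Collapse-range is two server-side steps: shift the tail down with a copychunk, then trim the file. With old_eof sampled under inode_lock, the arithmetic is:

/* move [off + len, old_eof) down so it starts at off */
rc = smb2_copychunk_range(xid, cfile, cfile,
			  off + len,		/* source offset */
			  old_eof - off - len,	/* bytes to move */
			  off);			/* destination   */

/* then shrink the file by the collapsed length */
eof = cpu_to_le64(old_eof - len);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
		  cfile->fid.volatile_fid, cfile->pid, &eof);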
@@ -3967,34 +3723,47 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
int rc;
unsigned int xid;
struct cifsFileInfo *cfile = file->private_data;
+ struct inode *inode = file_inode(file);
__le64 eof;
- __u64 count;
+ __u64 count, old_eof;
xid = get_xid();
- if (off >= i_size_read(file->f_inode)) {
+ inode_lock(inode);
+
+ old_eof = i_size_read(inode);
+ if (off >= old_eof) {
rc = -EINVAL;
goto out;
}
- count = i_size_read(file->f_inode) - off;
- eof = cpu_to_le64(i_size_read(file->f_inode) + len);
+ count = old_eof - off;
+ eof = cpu_to_le64(old_eof + len);
+
+ filemap_invalidate_lock(inode->i_mapping);
+ rc = filemap_write_and_wait_range(inode->i_mapping, off, old_eof + len - 1);
+ if (rc < 0)
+ goto out_2;
+ truncate_pagecache_range(inode, off, old_eof);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, cfile->pid, &eof);
if (rc < 0)
- goto out;
+ goto out_2;
rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len);
if (rc < 0)
- goto out;
+ goto out_2;
- rc = smb3_zero_range(file, tcon, off, len, 1);
+ rc = smb3_zero_data(file, tcon, off, len, xid);
if (rc < 0)
- goto out;
+ goto out_2;
rc = 0;
+out_2:
+ filemap_invalidate_unlock(inode->i_mapping);
out:
+ inode_unlock(inode);
free_xid(xid);
return rc;
}
@@ -4044,7 +3813,7 @@ static loff_t smb3_llseek(struct file *file, struct cifs_tcon *tcon, loff_t offs
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
- FSCTL_QUERY_ALLOCATED_RANGES, true,
+ FSCTL_QUERY_ALLOCATED_RANGES,
(char *)&in_data, sizeof(in_data),
sizeof(struct file_allocated_range_buffer),
(char **)&out_data, &out_data_len);
@@ -4104,7 +3873,7 @@ static int smb3_fiemap(struct cifs_tcon *tcon,
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
- FSCTL_QUERY_ALLOCATED_RANGES, true,
+ FSCTL_QUERY_ALLOCATED_RANGES,
(char *)&in_data, sizeof(in_data),
1024 * sizeof(struct file_allocated_range_buffer),
(char **)&out_data, &out_data_len);
@@ -4227,15 +3996,15 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
if (oplock == SMB2_OPLOCK_LEVEL_BATCH) {
cinode->oplock = CIFS_CACHE_RHW_FLG;
cifs_dbg(FYI, "Batch Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
cinode->oplock = CIFS_CACHE_RW_FLG;
cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else if (oplock == SMB2_OPLOCK_LEVEL_II) {
cinode->oplock = CIFS_CACHE_READ_FLG;
cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else
cinode->oplock = 0;
}
@@ -4274,7 +4043,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
cinode->oplock = new_oplock;
cifs_dbg(FYI, "%s Lease granted on inode %p\n", message,
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
}
static void
@@ -4312,11 +4081,13 @@ smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
}
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static bool
smb2_is_read_op(__u32 oplock)
{
return oplock == SMB2_OPLOCK_LEVEL_II;
}
+#endif /* CIFS_ALLOW_INSECURE_LEGACY */
static bool
smb21_is_read_op(__u32 oplock)
@@ -4521,9 +4292,11 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
if (ses->Suid == ses_id) {
+ spin_lock(&ses->ses_lock);
ses_enc_key = enc ? ses->smb3encryptionkey :
ses->smb3decryptionkey;
memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
+ spin_unlock(&ses->ses_lock);
spin_unlock(&cifs_tcp_ses_lock);
return 0;
}
@@ -5038,23 +4811,24 @@ static void smb2_decrypt_offload(struct work_struct *work)
mid->callback(mid);
} else {
- spin_lock(&cifs_tcp_ses_lock);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&dw->server->srv_lock);
if (dw->server->tcpStatus == CifsNeedReconnect) {
+ spin_lock(&dw->server->mid_lock);
mid->mid_state = MID_RETRY_NEEDED;
- spin_unlock(&GlobalMid_Lock);
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&dw->server->mid_lock);
+ spin_unlock(&dw->server->srv_lock);
mid->callback(mid);
} else {
+ spin_lock(&dw->server->mid_lock);
mid->mid_state = MID_REQUEST_SUBMITTED;
mid->mid_flags &= ~(MID_DELETED);
list_add_tail(&mid->qhead,
&dw->server->pending_mid_q);
- spin_unlock(&GlobalMid_Lock);
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&dw->server->mid_lock);
+ spin_unlock(&dw->server->srv_lock);
}
}
- cifs_mid_q_entry_release(mid);
+ release_mid(mid);
}
free_pages:
@@ -5415,7 +5189,7 @@ out:
return rc;
}
-
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct smb_version_operations smb20_operations = {
.compare_fids = smb2_compare_fids,
.setup_request = smb2_setup_request,
@@ -5514,6 +5288,7 @@ struct smb_version_operations smb20_operations = {
.is_status_io_timeout = smb2_is_status_io_timeout,
.is_network_name_deleted = smb2_is_network_name_deleted,
};
+#endif /* CIFS_ALLOW_INSECURE_LEGACY */
struct smb_version_operations smb21_operations = {
.compare_fids = smb2_compare_fids,
@@ -5845,6 +5620,7 @@ struct smb_version_operations smb311_operations = {
.is_network_name_deleted = smb2_is_network_name_deleted,
};
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct smb_version_values smb20_values = {
.version_string = SMB20_VERSION_STRING,
.protocol_id = SMB20_PROT_ID,
@@ -5865,6 +5641,7 @@ struct smb_version_values smb20_values = {
.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
.create_lease_size = sizeof(struct create_lease),
};
+#endif /* ALLOW_INSECURE_LEGACY */
struct smb_version_values smb21_values = {
.version_string = SMB21_VERSION_STRING,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 1b7ad0c09566..6352ab32c7e7 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -39,6 +39,7 @@
#ifdef CONFIG_CIFS_DFS_UPCALL
#include "dfs_cache.h"
#endif
+#include "cached_dir.h"
/*
* The following table defines the expected "StructureSize" of SMB2 requests
@@ -162,7 +163,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
if (smb2_command == SMB2_TREE_CONNECT || smb2_command == SMB2_IOCTL)
return 0;
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&tcon->tc_lock);
if (tcon->status == TID_EXITING) {
/*
* only tree disconnect, open, and write,
@@ -172,14 +173,14 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
if ((smb2_command != SMB2_WRITE) &&
(smb2_command != SMB2_CREATE) &&
(smb2_command != SMB2_TREE_DISCONNECT)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&tcon->tc_lock);
cifs_dbg(FYI, "can not send cmd %d while umounting\n",
smb2_command);
return -ENODEV;
}
}
- spin_unlock(&cifs_tcp_ses_lock);
- if ((!tcon->ses) || (tcon->ses->status == CifsExiting) ||
+ spin_unlock(&tcon->tc_lock);
+ if ((!tcon->ses) || (tcon->ses->ses_status == SES_EXITING) ||
(!tcon->ses->server) || !server)
return -EIO;
@@ -217,12 +218,12 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
}
/* are we still trying to reconnect? */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsNeedReconnect) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
break;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
if (retries && --retries)
continue;
@@ -256,13 +257,13 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
* and the server never sends an answer the socket will be closed
* and tcpStatus set to reconnect.
*/
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedReconnect) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
rc = -EHOSTDOWN;
goto out;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
/*
* need to prevent multiple threads trying to simultaneously
@@ -288,6 +289,9 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
mutex_unlock(&ses->session_mutex);
rc = -EHOSTDOWN;
goto failed;
+ } else if (rc) {
+ mutex_unlock(&ses->session_mutex);
+ goto out;
}
} else {
mutex_unlock(&ses->session_mutex);
@@ -351,7 +355,7 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon,
void *buf,
unsigned int *total_len)
{
- struct smb2_pdu *spdu = (struct smb2_pdu *)buf;
+ struct smb2_pdu *spdu = buf;
/* lookup word count ie StructureSize from table */
__u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_command)];
@@ -540,6 +544,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
struct TCP_Server_Info *server, unsigned int *total_len)
{
char *pneg_ctxt;
+ char *hostname = NULL;
unsigned int ctxt_len, neg_context_count;
if (*total_len > 200) {
@@ -567,16 +572,25 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
*total_len += ctxt_len;
pneg_ctxt += ctxt_len;
- ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
- server->hostname);
- *total_len += ctxt_len;
- pneg_ctxt += ctxt_len;
+ /*
+ * secondary channels don't have the hostname field populated
+ * use the hostname field in the primary channel instead
+ */
+ hostname = CIFS_SERVER_IS_CHAN(server) ?
+ server->primary_server->hostname : server->hostname;
+ if (hostname && (hostname[0] != 0)) {
+ ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
+ hostname);
+ *total_len += ctxt_len;
+ pneg_ctxt += ctxt_len;
+ neg_context_count = 3;
+ } else
+ neg_context_count = 2;
build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
*total_len += sizeof(struct smb2_posix_neg_context);
pneg_ctxt += sizeof(struct smb2_posix_neg_context);
-
- neg_context_count = 4;
+ neg_context_count++;
if (server->compress_algorithm) {
build_compression_ctxt((struct smb2_compression_capabilities_context *)
@@ -951,16 +965,17 @@ SMB2_negotiate(const unsigned int xid,
} else if (rc != 0)
goto neg_exit;
+ rc = -EIO;
if (strcmp(server->vals->version_string,
SMB3ANY_VERSION_STRING) == 0) {
if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) {
cifs_server_dbg(VFS,
"SMB2 dialect returned but not requested\n");
- return -EIO;
+ goto neg_exit;
} else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) {
cifs_server_dbg(VFS,
"SMB2.1 dialect returned but not requested\n");
- return -EIO;
+ goto neg_exit;
} else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) {
/* ops set to 3.0 by default for default so update */
server->ops = &smb311_operations;
@@ -971,7 +986,7 @@ SMB2_negotiate(const unsigned int xid,
if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID)) {
cifs_server_dbg(VFS,
"SMB2 dialect returned but not requested\n");
- return -EIO;
+ goto neg_exit;
} else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) {
/* ops set to 3.0 by default for default so update */
server->ops = &smb21_operations;
@@ -985,7 +1000,7 @@ SMB2_negotiate(const unsigned int xid,
/* if requested single dialect ensure returned dialect matched */
cifs_server_dbg(VFS, "Invalid 0x%x dialect returned: not requested\n",
le16_to_cpu(rsp->DialectRevision));
- return -EIO;
+ goto neg_exit;
}
cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode);
@@ -1003,9 +1018,10 @@ SMB2_negotiate(const unsigned int xid,
else {
cifs_server_dbg(VFS, "Invalid dialect returned by server 0x%x\n",
le16_to_cpu(rsp->DialectRevision));
- rc = -EIO;
goto neg_exit;
}
+
+ rc = 0;
server->dialect = le16_to_cpu(rsp->DialectRevision);
/*
@@ -1159,7 +1175,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
}
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
- FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
+ FSCTL_VALIDATE_NEGOTIATE_INFO,
(char *)pneg_inbuf, inbuflen, CIFSMaxBufSize,
(char **)&pneg_rsp, &rsplen);
if (rc == -EOPNOTSUPP) {
@@ -1369,13 +1385,13 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data)
struct cifs_ses *ses = sess_data->ses;
struct TCP_Server_Info *server = sess_data->server;
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
if (server->ops->generate_signingkey) {
rc = server->ops->generate_signingkey(ses, server);
if (rc) {
cifs_dbg(FYI,
"SMB3 session key generation failed\n");
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
return rc;
}
}
@@ -1383,7 +1399,7 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data)
server->sequence_number = 0x2;
server->session_estab = true;
}
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
cifs_dbg(FYI, "SMB2/3 session established successfully\n");
return rc;
@@ -1914,7 +1930,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */
tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId);
- strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
+ strscpy(tcon->treeName, tree, sizeof(tcon->treeName));
if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
@@ -1965,7 +1981,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
}
spin_unlock(&ses->chan_lock);
- close_cached_dir_lease(&tcon->crfid);
+ invalidate_all_cached_dirs(tcon);
rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server,
(void **) &req,
@@ -2558,19 +2574,15 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
path_len = UniStrnlen((wchar_t *)path, PATH_MAX);
- /*
- * make room for one path separator between the treename and
- * path
- */
- *out_len = treename_len + 1 + path_len;
+ /* make room for one path separator only if @path isn't empty */
+ *out_len = treename_len + (path[0] ? 1 : 0) + path_len;
/*
- * final path needs to be null-terminated UTF16 with a
- * size aligned to 8
+ * final path needs to be 8-byte aligned as specified in
+ * MS-SMB2 2.2.13 SMB2 CREATE Request.
*/
-
- *out_size = roundup((*out_len+1)*2, 8);
- *out_path = kzalloc(*out_size, GFP_KERNEL);
+ *out_size = roundup(*out_len * sizeof(__le16), 8);
+ *out_path = kzalloc(*out_size + sizeof(__le16) /* null */, GFP_KERNEL);
if (!*out_path)
return -ENOMEM;
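A worked example of the new sizing, with illustrative lengths only: an 11-code-unit combined name occupies 22 bytes, rounds up to 24 for the 8-byte alignment MS-SMB2 requires, and the NUL terminator is allocated on top so padding can never swallow it:

int treename_len = 7, path_len = 3;			/* illustrative */
int out_len = treename_len + 1 + path_len;		/* 11 UTF-16 units */
int out_size = roundup(out_len * sizeof(__le16), 8);	/* 22 -> 24 bytes */
__le16 *out_path = kzalloc(out_size + sizeof(__le16), GFP_KERNEL);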
@@ -3042,7 +3054,7 @@ int
SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
struct smb_rqst *rqst,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
- bool is_fsctl, char *in_data, u32 indatalen,
+ char *in_data, u32 indatalen,
__u32 max_response_size)
{
struct smb2_ioctl_req *req;
@@ -3117,10 +3129,8 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
req->hdr.CreditCharge =
cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size),
SMB2_MAX_BUFFER_SIZE));
- if (is_fsctl)
- req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
- else
- req->Flags = 0;
+ /* always an FSCTL (for now) */
+ req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
/* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
@@ -3147,9 +3157,9 @@ SMB2_ioctl_free(struct smb_rqst *rqst)
*/
int
SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
- u64 volatile_fid, u32 opcode, bool is_fsctl,
- char *in_data, u32 indatalen, u32 max_out_data_len,
- char **out_data, u32 *plen /* returned data len */)
+ u64 volatile_fid, u32 opcode, char *in_data, u32 indatalen,
+ u32 max_out_data_len, char **out_data,
+ u32 *plen /* returned data len */)
{
struct smb_rqst rqst;
struct smb2_ioctl_rsp *rsp = NULL;
@@ -3191,7 +3201,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
rc = SMB2_ioctl_init(tcon, server,
&rqst, persistent_fid, volatile_fid, opcode,
- is_fsctl, in_data, indatalen, max_out_data_len);
+ in_data, indatalen, max_out_data_len);
if (rc)
goto ioctl_exit;
@@ -3283,7 +3293,7 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
- FSCTL_SET_COMPRESSION, true /* is_fsctl */,
+ FSCTL_SET_COMPRESSION,
(char *)&fsctl_input /* data input */,
2 /* in data len */, CIFSMaxBufSize /* max out data */,
&ret_data /* out data */, NULL);
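With the is_fsctl flag gone, every call site collapses to the same shape. A minimal sketch of an updated caller (buffer names hypothetical):

	char *out = NULL;
	u32 out_len = 0;

	/* the request is implicitly an FSCTL; there is no boolean to pass */
	rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
			FSCTL_SET_COMPRESSION,
			(char *)&fsctl_input, sizeof(fsctl_input),
			CIFSMaxBufSize, &out, &out_len);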
@@ -3763,7 +3773,7 @@ smb2_echo_callback(struct mid_q_entry *mid)
credits.instance = server->reconnect_instance;
}
- DeleteMidQEntry(mid);
+ release_mid(mid);
add_credits(server, &credits, CIFS_ECHO_OP);
}
@@ -3898,14 +3908,15 @@ SMB2_echo(struct TCP_Server_Info *server)
cifs_dbg(FYI, "In echo request for conn_id %lld\n", server->conn_id);
- spin_lock(&cifs_tcp_ses_lock);
- if (server->tcpStatus == CifsNeedNegotiate) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
+ if (server->ops->need_neg &&
+ server->ops->need_neg(server)) {
+ spin_unlock(&server->srv_lock);
/* No need to send echo on newly established connections */
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
return rc;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
rc = smb2_plain_req_init(SMB2_ECHO, NULL, server,
(void **)&req, &total_len);
@@ -4187,7 +4198,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
rdata->offset, rdata->got_bytes);
queue_work(cifsiod_wq, &rdata->work);
- DeleteMidQEntry(mid);
+ release_mid(mid);
add_credits(server, &credits, 0);
}
@@ -4426,7 +4437,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->offset, wdata->bytes);
queue_work(cifsiod_wq, &wdata->work);
- DeleteMidQEntry(mid);
+ release_mid(mid);
add_credits(server, &credits, 0);
}
@@ -5150,6 +5161,8 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
data = &info;
size = sizeof(struct smb2_file_eof_info);
+ trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, le64_to_cpu(*eof));
+
return send_set_info(xid, tcon, persistent_fid, volatile_fid,
pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE,
0, 1, &data, &size);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index d8c4388b190d..f57881b8464f 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -260,28 +260,6 @@ struct get_retrieval_pointers_refcount_rsp {
struct smb3_extents extents[];
} __packed;
-struct fsctl_set_integrity_information_req {
- __le16 ChecksumAlgorithm;
- __le16 Reserved;
- __le32 Flags;
-} __packed;
-
-struct fsctl_get_integrity_information_rsp {
- __le16 ChecksumAlgorithm;
- __le16 Reserved;
- __le32 Flags;
- __le32 ChecksumChunkSizeInBytes;
- __le32 ClusterSizeInBytes;
-} __packed;
-
-/* Integrity ChecksumAlgorithm choices for above */
-#define CHECKSUM_TYPE_NONE 0x0000
-#define CHECKSUM_TYPE_CRC64 0x0002
-#define CHECKSUM_TYPE_UNCHANGED 0xFFFF /* set only */
-
-/* Integrity flags for above */
-#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001
-
/* See MS-DFSC 2.2.2 */
struct fsctl_get_dfs_referral_req {
__le16 MaxReferralLevel;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index a69f1eed1cfe..3f740f24b96a 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -23,7 +23,7 @@ struct smb_rqst;
extern int map_smb2_to_linux_error(char *buf, bool log_err);
extern int smb2_check_message(char *buf, unsigned int length,
struct TCP_Server_Info *server);
-extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server);
+extern unsigned int smb2_calc_size(void *buf);
extern char *smb2_get_data_area_len(int *off, int *len,
struct smb2_hdr *shdr);
extern __le16 *cifs_convert_path_to_utf16(const char *from,
@@ -54,16 +54,6 @@ extern bool smb2_is_valid_oplock_break(char *buffer,
extern int smb3_handle_read_data(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
-extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
- const char *path,
- struct cifs_sb_info *cifs_sb,
- struct cached_fid **cfid);
-extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
- struct dentry *dentry,
- struct cached_fid **cfid);
-extern void close_cached_dir(struct cached_fid *cfid);
-extern void close_cached_dir_lease(struct cached_fid *cfid);
-extern void close_cached_dir_lease_locked(struct cached_fid *cfid);
extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
struct smb2_file_all_info *src);
extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
@@ -147,13 +137,13 @@ extern int SMB2_open_init(struct cifs_tcon *tcon,
extern void SMB2_open_free(struct smb_rqst *rqst);
extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
- bool is_fsctl, char *in_data, u32 indatalen, u32 maxoutlen,
+ char *in_data, u32 indatalen, u32 maxoutlen,
char **out_data, u32 *plen /* returned data len */);
extern int SMB2_ioctl_init(struct cifs_tcon *tcon,
struct TCP_Server_Info *server,
struct smb_rqst *rqst,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
- bool is_fsctl, char *in_data, u32 indatalen,
+ char *in_data, u32 indatalen,
__u32 max_response_size);
extern void SMB2_ioctl_free(struct smb_rqst *rqst);
extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 2af79093b78b..1a5fc3314dbf 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -640,12 +640,13 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (!is_signed)
return 0;
- spin_lock(&cifs_tcp_ses_lock);
- if (server->tcpStatus == CifsNeedNegotiate) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
+ if (server->ops->need_neg &&
+ server->ops->need_neg(server)) {
+ spin_unlock(&server->srv_lock);
return 0;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
if (!is_binding && !server->session_estab) {
strncpy(shdr->Signature, "BSRSPYL", 8);
return 0;
@@ -749,7 +750,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *shdr,
temp->callback = cifs_wake_up_task;
temp->callback_data = current;
- atomic_inc(&midCount);
+ atomic_inc(&mid_count);
temp->mid_state = MID_REQUEST_ALLOCATED;
trace_smb3_cmd_enter(le32_to_cpu(shdr->Id.SyncId.TreeId),
le64_to_cpu(shdr->SessionId),
@@ -761,48 +762,50 @@ static int
smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server,
struct smb2_hdr *shdr, struct mid_q_entry **mid)
{
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -ENOENT;
}
if (server->tcpStatus == CifsNeedReconnect) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
cifs_dbg(FYI, "tcp session dead - return to caller to retry\n");
return -EAGAIN;
}
if (server->tcpStatus == CifsNeedNegotiate &&
shdr->Command != SMB2_NEGOTIATE) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -EAGAIN;
}
+ spin_unlock(&server->srv_lock);
- if (ses->status == CifsNew) {
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_NEW) {
if ((shdr->Command != SMB2_SESSION_SETUP) &&
(shdr->Command != SMB2_NEGOTIATE)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
return -EAGAIN;
}
/* else ok - we are setting up session */
}
- if (ses->status == CifsExiting) {
+ if (ses->ses_status == SES_EXITING) {
if (shdr->Command != SMB2_LOGOFF) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
return -EAGAIN;
}
/* else ok - we are shutting down the session */
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
*mid = smb2_mid_entry_alloc(shdr, server);
if (*mid == NULL)
return -ENOMEM;
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
list_add_tail(&(*mid)->qhead, &server->pending_mid_q);
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
return 0;
}
@@ -853,7 +856,7 @@ smb2_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *server,
rc = smb2_sign_rqst(rqst, server);
if (rc) {
revert_current_mid_from_hdr(server, shdr);
- cifs_delete_mid(mid);
+ delete_mid(mid);
return ERR_PTR(rc);
}
@@ -868,13 +871,13 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
(struct smb2_hdr *)rqst->rq_iov[0].iov_base;
struct mid_q_entry *mid;
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsNeedNegotiate &&
shdr->Command != SMB2_NEGOTIATE) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return ERR_PTR(-EAGAIN);
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
smb2_seq_num_into_buf(server, shdr);
@@ -887,7 +890,7 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
rc = smb2_sign_rqst(rqst, server);
if (rc) {
revert_current_mid_from_hdr(server, shdr);
- DeleteMidQEntry(mid);
+ release_mid(mid);
return ERR_PTR(rc);
}
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 31ef64eb7fbb..5fbbec22bcc8 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -649,7 +649,7 @@ static int smbd_ia_open(
smbd_max_frmr_depth,
info->id->device->attrs.max_fast_reg_page_list_len);
info->mr_type = IB_MR_TYPE_MEM_REG;
- if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
+ if (info->id->device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
info->mr_type = IB_MR_TYPE_SG_GAPS;
info->pd = ib_alloc_pd(info->id->device, 0);
@@ -1350,7 +1350,7 @@ void smbd_destroy(struct TCP_Server_Info *server)
wait_event(info->wait_send_pending,
atomic_read(&info->send_pending) == 0);
- /* It's not posssible for upper layer to get to reassembly */
+ /* It's not possible for upper layer to get to reassembly */
log_rdma_event(INFO, "drain the reassembly queue\n");
do {
spin_lock_irqsave(&info->reassembly_queue_lock, flags);
@@ -1382,9 +1382,9 @@ void smbd_destroy(struct TCP_Server_Info *server)
log_rdma_event(INFO, "freeing mr list\n");
wake_up_interruptible_all(&info->wait_mr);
while (atomic_read(&info->mr_used_count)) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
msleep(1000);
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
}
destroy_mr_list(info);
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index 6cecf302dcfd..6b88dc2e364f 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -121,6 +121,44 @@ DEFINE_SMB3_RW_DONE_EVENT(query_dir_done);
DEFINE_SMB3_RW_DONE_EVENT(zero_done);
DEFINE_SMB3_RW_DONE_EVENT(falloc_done);
+/* For logging successful set EOF (truncate) */
+DECLARE_EVENT_CLASS(smb3_eof_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u64 offset),
+ TP_ARGS(xid, fid, tid, sesid, offset),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, offset)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->offset = offset;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->offset)
+)
+
+#define DEFINE_SMB3_EOF_EVENT(name) \
+DEFINE_EVENT(smb3_eof_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 offset), \
+ TP_ARGS(xid, fid, tid, sesid, offset))
+
+DEFINE_SMB3_EOF_EVENT(set_eof);
+
/*
* For handle based calls other than read and write, and get/set info
*/
@@ -158,6 +196,7 @@ DEFINE_SMB3_FD_EVENT(flush_enter);
DEFINE_SMB3_FD_EVENT(flush_done);
DEFINE_SMB3_FD_EVENT(close_enter);
DEFINE_SMB3_FD_EVENT(close_done);
+DEFINE_SMB3_FD_EVENT(oplock_not_found);
DECLARE_EVENT_CLASS(smb3_fd_err_class,
TP_PROTO(unsigned int xid,
@@ -814,6 +853,7 @@ DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \
TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high))
DEFINE_SMB3_LEASE_DONE_EVENT(lease_done);
+DEFINE_SMB3_LEASE_DONE_EVENT(lease_not_found);
DECLARE_EVENT_CLASS(smb3_lease_err_class,
TP_PROTO(__u32 lease_state,
@@ -1006,6 +1046,13 @@ DEFINE_SMB3_CREDIT_EVENT(credit_timeout);
DEFINE_SMB3_CREDIT_EVENT(insufficient_credits);
DEFINE_SMB3_CREDIT_EVENT(too_many_credits);
DEFINE_SMB3_CREDIT_EVENT(add_credits);
+DEFINE_SMB3_CREDIT_EVENT(adj_credits);
+DEFINE_SMB3_CREDIT_EVENT(hdr_credits);
+DEFINE_SMB3_CREDIT_EVENT(nblk_credits);
+DEFINE_SMB3_CREDIT_EVENT(pend_credits);
+DEFINE_SMB3_CREDIT_EVENT(wait_credits);
+DEFINE_SMB3_CREDIT_EVENT(waitff_credits);
+DEFINE_SMB3_CREDIT_EVENT(overflow_credits);
DEFINE_SMB3_CREDIT_EVENT(set_credits);
#endif /* _CIFS_TRACE_H */
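As with the other classes in this file, each DEFINE_SMB3_EOF_EVENT() name is invoked through the generated trace_smb3_<name>() wrapper; the call added in smb2pdu.c above is the canonical use:

	trace_smb3_set_eof(xid, persistent_fid, tcon->tid,
			   tcon->ses->Suid, le64_to_cpu(*eof));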
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index eeb1a699bd6f..9a2753e21170 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -21,6 +21,7 @@
#include <asm/processor.h>
#include <linux/mempool.h>
#include <linux/sched/signal.h>
+#include <linux/task_io_accounting_ops.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -37,13 +38,13 @@ cifs_wake_up_task(struct mid_q_entry *mid)
wake_up_process(mid->callback_data);
}
-struct mid_q_entry *
-AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
+static struct mid_q_entry *
+alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
{
struct mid_q_entry *temp;
if (server == NULL) {
- cifs_dbg(VFS, "Null TCP session in AllocMidQEntry\n");
+ cifs_dbg(VFS, "%s: null TCP session\n", __func__);
return NULL;
}
@@ -68,12 +69,12 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
temp->callback = cifs_wake_up_task;
temp->callback_data = current;
- atomic_inc(&midCount);
+ atomic_inc(&mid_count);
temp->mid_state = MID_REQUEST_ALLOCATED;
return temp;
}
-static void _cifs_mid_q_entry_release(struct kref *refcount)
+static void __release_mid(struct kref *refcount)
{
struct mid_q_entry *midEntry =
container_of(refcount, struct mid_q_entry, refcount);
@@ -91,7 +92,7 @@ static void _cifs_mid_q_entry_release(struct kref *refcount)
server->ops->handle_cancelled_mid(midEntry, server);
midEntry->mid_state = MID_FREE;
- atomic_dec(&midCount);
+ atomic_dec(&mid_count);
if (midEntry->large_buf)
cifs_buf_release(midEntry->resp_buf);
else
@@ -152,29 +153,26 @@ static void _cifs_mid_q_entry_release(struct kref *refcount)
mempool_free(midEntry, cifs_mid_poolp);
}
-void cifs_mid_q_entry_release(struct mid_q_entry *midEntry)
+void release_mid(struct mid_q_entry *mid)
{
- spin_lock(&GlobalMid_Lock);
- kref_put(&midEntry->refcount, _cifs_mid_q_entry_release);
- spin_unlock(&GlobalMid_Lock);
-}
+ struct TCP_Server_Info *server = mid->server;
-void DeleteMidQEntry(struct mid_q_entry *midEntry)
-{
- cifs_mid_q_entry_release(midEntry);
+ spin_lock(&server->mid_lock);
+ kref_put(&mid->refcount, __release_mid);
+ spin_unlock(&server->mid_lock);
}
void
-cifs_delete_mid(struct mid_q_entry *mid)
+delete_mid(struct mid_q_entry *mid)
{
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&mid->server->mid_lock);
if (!(mid->mid_flags & MID_DELETED)) {
list_del_init(&mid->qhead);
mid->mid_flags |= MID_DELETED;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&mid->server->mid_lock);
- DeleteMidQEntry(mid);
+ release_mid(mid);
}
/*
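Taken together, the two helpers above fix the new locking rule: a mid's refcount and queue linkage are protected by its owning server's mid_lock, never by the old GlobalMid_Lock. Condensed, the pattern is:

	spin_lock(&mid->server->mid_lock);	/* per-server, not global */
	kref_put(&mid->refcount, __release_mid);
	spin_unlock(&mid->server->mid_lock);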
@@ -196,10 +194,6 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg,
*sent = 0;
- smb_msg->msg_name = (struct sockaddr *) &server->dstaddr;
- smb_msg->msg_namelen = sizeof(struct sockaddr);
- smb_msg->msg_control = NULL;
- smb_msg->msg_controllen = 0;
if (server->noblocksnd)
smb_msg->msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
else
@@ -263,8 +257,8 @@ smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst)
int nvec;
unsigned long buflen = 0;
- if (server->vals->header_preamble_size == 0 &&
- rqst->rq_nvec >= 2 && rqst->rq_iov[0].iov_len == 4) {
+ if (!is_smb1(server) && rqst->rq_nvec >= 2 &&
+ rqst->rq_iov[0].iov_len == 4) {
iov = &rqst->rq_iov[1];
nvec = rqst->rq_nvec - 1;
} else {
@@ -311,7 +305,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
sigset_t mask, oldmask;
size_t total_len = 0, sent, size;
struct socket *ssocket = server->ssocket;
- struct msghdr smb_msg;
+ struct msghdr smb_msg = {};
__be32 rfc1002_marker;
if (cifs_rdma_enabled(server)) {
@@ -348,7 +342,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
sigprocmask(SIG_BLOCK, &mask, &oldmask);
/* Generate a rfc1002 marker for SMB2+ */
- if (server->vals->header_preamble_size == 0) {
+ if (!is_smb1(server)) {
struct kvec hiov = {
.iov_base = &rfc1002_marker,
.iov_len = 4
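Both checks now go through helpers instead of open-coding the preamble test. Judging from the expressions they replace, the definitions are presumably along these lines (an inferred sketch, not quoted from this patch):

	/* inferred: wraps the per-dialect preamble size used above */
	#define HEADER_PREAMBLE_SIZE(server) ((server)->vals->header_preamble_size)

	static inline bool is_smb1(struct TCP_Server_Info *server)
	{
		/* only SMB1 frames carry a non-zero preamble */
		return HEADER_PREAMBLE_SIZE(server) != 0;
	}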
@@ -464,13 +458,12 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
return -EIO;
}
- tr_hdr = kmalloc(sizeof(*tr_hdr), GFP_NOFS);
+ tr_hdr = kzalloc(sizeof(*tr_hdr), GFP_NOFS);
if (!tr_hdr)
return -ENOMEM;
memset(&cur_rqst[0], 0, sizeof(cur_rqst));
memset(&iov, 0, sizeof(iov));
- memset(tr_hdr, 0, sizeof(*tr_hdr));
iov.iov_base = tr_hdr;
iov.iov_len = sizeof(*tr_hdr);
@@ -542,7 +535,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_nblk_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits, -1, in_flight);
cifs_dbg(FYI, "%s: remove %u credits total=%d\n",
__func__, 1, scredits);
@@ -578,12 +571,12 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
} else {
spin_unlock(&server->req_lock);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -ENOENT;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
/*
* For normal commands, reserve the last MAX_COMPOUND
@@ -648,7 +641,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_waitff_credits(server->CurrentMid,
server->conn_id, server->hostname, scredits,
-(num_credits), in_flight);
cifs_dbg(FYI, "%s: remove %u credits total=%d\n",
@@ -726,32 +719,32 @@ cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
struct mid_q_entry **ppmidQ)
{
- spin_lock(&cifs_tcp_ses_lock);
- if (ses->status == CifsNew) {
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_NEW) {
if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
(in_buf->Command != SMB_COM_NEGOTIATE)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
return -EAGAIN;
}
/* else ok - we are setting up session */
}
- if (ses->status == CifsExiting) {
+ if (ses->ses_status == SES_EXITING) {
/* check if SMB session is bad because we are setting it up */
if (in_buf->Command != SMB_COM_LOGOFF_ANDX) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
return -EAGAIN;
}
/* else ok - we are shutting down session */
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
- *ppmidQ = AllocMidQEntry(in_buf, ses->server);
+ *ppmidQ = alloc_mid(in_buf, ses->server);
if (*ppmidQ == NULL)
return -ENOMEM;
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&ses->server->mid_lock);
list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q);
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&ses->server->mid_lock);
return 0;
}
@@ -783,13 +776,13 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
if (server->sign)
hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
- mid = AllocMidQEntry(hdr, server);
+ mid = alloc_mid(hdr, server);
if (mid == NULL)
return ERR_PTR(-ENOMEM);
rc = cifs_sign_rqst(rqst, server, &mid->sequence_number);
if (rc) {
- DeleteMidQEntry(mid);
+ release_mid(mid);
return ERR_PTR(rc);
}
@@ -823,7 +816,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
} else
instance = exist_credits->instance;
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
/*
* We can't use credits obtained from the previous session to send this
@@ -831,14 +824,14 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
* return -EAGAIN in such cases to let callers handle it.
*/
if (instance != server->reconnect_instance) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
add_credits_and_wake_if(server, &credits, optype);
return -EAGAIN;
}
mid = server->ops->setup_async_request(server, rqst);
if (IS_ERR(mid)) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
add_credits_and_wake_if(server, &credits, optype);
return PTR_ERR(mid);
}
@@ -850,9 +843,9 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
mid->mid_state = MID_REQUEST_SUBMITTED;
/* put it on the pending_mid_q */
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
list_add_tail(&mid->qhead, &server->pending_mid_q);
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
/*
* Need to store the time in mid before calling I/O. For call_async,
@@ -866,10 +859,10 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
if (rc < 0) {
revert_current_mid(server, mid->credits);
server->sequence_number -= 2;
- cifs_delete_mid(mid);
+ delete_mid(mid);
}
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
if (rc == 0)
return 0;
@@ -913,10 +906,10 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
cifs_dbg(FYI, "%s: cmd=%d mid=%llu state=%d\n",
__func__, le16_to_cpu(mid->command), mid->mid, mid->mid_state);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
return rc;
case MID_RETRY_NEEDED:
rc = -EAGAIN;
@@ -936,9 +929,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
__func__, mid->mid, mid->mid_state);
rc = -EIO;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
- DeleteMidQEntry(mid);
+ release_mid(mid);
return rc;
}
@@ -998,7 +991,7 @@ cifs_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *ignored,
return ERR_PTR(rc);
rc = cifs_sign_rqst(rqst, ses->server, &mid->sequence_number);
if (rc) {
- cifs_delete_mid(mid);
+ delete_mid(mid);
return ERR_PTR(rc);
}
return mid;
@@ -1027,7 +1020,7 @@ static void
cifs_cancelled_callback(struct mid_q_entry *mid)
{
cifs_compound_callback(mid);
- DeleteMidQEntry(mid);
+ release_mid(mid);
}
/*
@@ -1079,12 +1072,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -ENOENT;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
/*
* Wait for all the requests to become available.
@@ -1110,7 +1103,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
* of smb data.
*/
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
/*
* All the parts of the compound chain belong obtained credits from the
@@ -1120,7 +1113,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
* handle it.
*/
if (instance != server->reconnect_instance) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
for (j = 0; j < num_rqst; j++)
add_credits(server, &credits[j], optype);
return -EAGAIN;
@@ -1131,8 +1124,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
if (IS_ERR(midQ[i])) {
revert_current_mid(server, i);
for (j = 0; j < i; j++)
- cifs_delete_mid(midQ[j]);
- mutex_unlock(&server->srv_mutex);
+ delete_mid(midQ[j]);
+ cifs_server_unlock(server);
/* Update # of requests on wire to server */
for (j = 0; j < num_rqst; j++)
@@ -1164,7 +1157,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
server->sequence_number -= 2;
}
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
/*
* If sending failed for some reason or it is an oplock break that we
@@ -1187,17 +1180,17 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
/*
* Compounding is never used during session establish.
*/
- spin_lock(&cifs_tcp_ses_lock);
- if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
+ if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
+ spin_unlock(&ses->ses_lock);
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
smb311_update_preauth_hash(ses, server, rqst[0].rq_iov, rqst[0].rq_nvec);
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&ses->ses_lock);
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
for (i = 0; i < num_rqst; i++) {
rc = wait_for_response(server, midQ[i]);
@@ -1209,14 +1202,14 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
cifs_server_dbg(FYI, "Cancelling wait for mid %llu cmd: %d\n",
midQ[i]->mid, le16_to_cpu(midQ[i]->command));
send_cancel(server, &rqst[i], midQ[i]);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) {
midQ[i]->callback = cifs_cancelled_callback;
cancelled_mid[i] = true;
credits[i].value = 0;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
}
}
@@ -1241,7 +1234,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
buf = (char *)midQ[i]->resp_buf;
resp_iov[i].iov_base = buf;
resp_iov[i].iov_len = midQ[i]->resp_buf_size +
- server->vals->header_preamble_size;
+ HEADER_PREAMBLE_SIZE(server);
if (midQ[i]->large_buf)
resp_buf_type[i] = CIFS_LARGE_BUFFER;
@@ -1251,7 +1244,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
rc = server->ops->check_receive(midQ[i], server,
flags & CIFS_LOG_ERROR);
- /* mark it so buf will not be freed by cifs_delete_mid */
+ /* mark it so buf will not be freed by delete_mid */
if ((flags & CIFS_NO_RSP_BUF) == 0)
midQ[i]->resp_buf = NULL;
@@ -1260,19 +1253,19 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
/*
* Compounding is never used during session establish.
*/
- spin_lock(&cifs_tcp_ses_lock);
- if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
+ spin_lock(&ses->ses_lock);
+ if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) {
struct kvec iov = {
.iov_base = resp_iov[0].iov_base,
.iov_len = resp_iov[0].iov_len
};
- spin_unlock(&cifs_tcp_ses_lock);
- mutex_lock(&server->srv_mutex);
+ spin_unlock(&ses->ses_lock);
+ cifs_server_lock(server);
smb311_update_preauth_hash(ses, server, &iov, 1);
- mutex_unlock(&server->srv_mutex);
- spin_lock(&cifs_tcp_ses_lock);
+ cifs_server_unlock(server);
+ spin_lock(&ses->ses_lock);
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&ses->ses_lock);
out:
/*
@@ -1283,7 +1276,7 @@ out:
*/
for (i = 0; i < num_rqst; i++) {
if (!cancelled_mid[i])
- cifs_delete_mid(midQ[i]);
+ delete_mid(midQ[i]);
}
return rc;
@@ -1361,12 +1354,12 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -ENOENT;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
/* Ensure that we do not send more than 50 overlapping requests
to the same server. We may make this configurable later or
@@ -1386,11 +1379,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
and avoid races inside tcp sendmsg code that could cause corruption
of smb data */
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
rc = allocate_mid(ses, in_buf, &midQ);
if (rc) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
/* Update # of requests on wire to server */
add_credits(server, &credits, 0);
return rc;
@@ -1398,7 +1391,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number);
if (rc) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
goto out;
}
@@ -1412,7 +1405,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc < 0)
server->sequence_number -= 2;
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
if (rc < 0)
goto out;
@@ -1420,15 +1413,15 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = wait_for_response(server, midQ);
if (rc != 0) {
send_cancel(server, &rqst, midQ);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
/* no longer considered to be "in-flight" */
- midQ->callback = DeleteMidQEntry;
- spin_unlock(&GlobalMid_Lock);
+ midQ->callback = release_mid;
+ spin_unlock(&server->mid_lock);
add_credits(server, &credits, 0);
return rc;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
}
rc = cifs_sync_mid_result(midQ, server);
@@ -1448,7 +1441,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
rc = cifs_check_receive(midQ, server, 0);
out:
- cifs_delete_mid(midQ);
+ delete_mid(midQ);
add_credits(server, &credits, 0);
return rc;
@@ -1506,12 +1499,12 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
return -EIO;
}
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsExiting) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
return -ENOENT;
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
/* Ensure that we do not send more than 50 overlapping requests
to the same server. We may make this configurable later or
@@ -1531,18 +1524,18 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
and avoid races inside tcp sendmsg code that could cause corruption
of smb data */
- mutex_lock(&server->srv_mutex);
+ cifs_server_lock(server);
rc = allocate_mid(ses, in_buf, &midQ);
if (rc) {
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
return rc;
}
rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number);
if (rc) {
- cifs_delete_mid(midQ);
- mutex_unlock(&server->srv_mutex);
+ delete_mid(midQ);
+ cifs_server_unlock(server);
return rc;
}
@@ -1555,10 +1548,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
if (rc < 0)
server->sequence_number -= 2;
- mutex_unlock(&server->srv_mutex);
+ cifs_server_unlock(server);
if (rc < 0) {
- cifs_delete_mid(midQ);
+ delete_mid(midQ);
return rc;
}
@@ -1569,19 +1562,19 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
(server->tcpStatus != CifsNew)));
/* Were we interrupted by a signal ? */
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
if ((rc == -ERESTARTSYS) &&
(midQ->mid_state == MID_REQUEST_SUBMITTED) &&
((server->tcpStatus == CifsGood) ||
(server->tcpStatus == CifsNew))) {
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
if (in_buf->Command == SMB_COM_TRANSACTION2) {
/* POSIX lock. We send a NT_CANCEL SMB to cause the
blocking lock to return. */
rc = send_cancel(server, &rqst, midQ);
if (rc) {
- cifs_delete_mid(midQ);
+ delete_mid(midQ);
return rc;
}
} else {
@@ -1593,7 +1586,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
/* If we get -ENOLCK back the lock may have
already been removed. Don't exit in this case. */
if (rc && rc != -ENOLCK) {
- cifs_delete_mid(midQ);
+ delete_mid(midQ);
return rc;
}
}
@@ -1601,21 +1594,21 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
rc = wait_for_response(server, midQ);
if (rc) {
send_cancel(server, &rqst, midQ);
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&server->mid_lock);
if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
/* no longer considered to be "in-flight" */
- midQ->callback = DeleteMidQEntry;
- spin_unlock(&GlobalMid_Lock);
+ midQ->callback = release_mid;
+ spin_unlock(&server->mid_lock);
return rc;
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&server->mid_lock);
}
/* We got the response - restart system call. */
rstart = 1;
- spin_lock(&cifs_tcp_ses_lock);
+ spin_lock(&server->srv_lock);
}
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&server->srv_lock);
rc = cifs_sync_mid_result(midQ, server);
if (rc != 0)
@@ -1632,8 +1625,185 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
rc = cifs_check_receive(midQ, server, 0);
out:
- cifs_delete_mid(midQ);
+ delete_mid(midQ);
if (rstart && rc == -EACCES)
return -ERESTARTSYS;
return rc;
}
+
+/*
+ * Discard any remaining data in the current SMB. To do this, we borrow the
+ * current bigbuf.
+ */
+int
+cifs_discard_remaining_data(struct TCP_Server_Info *server)
+{
+ unsigned int rfclen = server->pdu_size;
+ int remaining = rfclen + HEADER_PREAMBLE_SIZE(server) -
+ server->total_read;
+
+ while (remaining > 0) {
+ int length;
+
+ length = cifs_discard_from_socket(server,
+ min_t(size_t, remaining,
+ CIFSMaxBufSize + MAX_HEADER_SIZE(server)));
+ if (length < 0)
+ return length;
+ server->total_read += length;
+ remaining -= length;
+ }
+
+ return 0;
+}
+
+static int
+__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid,
+ bool malformed)
+{
+ int length;
+
+ length = cifs_discard_remaining_data(server);
+ dequeue_mid(mid, malformed);
+ mid->resp_buf = server->smallbuf;
+ server->smallbuf = NULL;
+ return length;
+}
+
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ struct cifs_readdata *rdata = mid->callback_data;
+
+ return __cifs_readv_discard(server, mid, rdata->result);
+}
+
+int
+cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ int length, len;
+ unsigned int data_offset, data_len;
+ struct cifs_readdata *rdata = mid->callback_data;
+ char *buf = server->smallbuf;
+ unsigned int buflen = server->pdu_size + HEADER_PREAMBLE_SIZE(server);
+ bool use_rdma_mr = false;
+
+ cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n",
+ __func__, mid->mid, rdata->offset, rdata->bytes);
+
+ /*
+	 * read the rest of the READ_RSP header (sans Data array), or whatever we
+ * can if there's not enough data. At this point, we've read down to
+ * the Mid.
+ */
+ len = min_t(unsigned int, buflen, server->vals->read_rsp_size) -
+ HEADER_SIZE(server) + 1;
+
+ length = cifs_read_from_socket(server,
+ buf + HEADER_SIZE(server) - 1, len);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+
+ if (server->ops->is_session_expired &&
+ server->ops->is_session_expired(buf)) {
+ cifs_reconnect(server, true);
+ return -1;
+ }
+
+ if (server->ops->is_status_pending &&
+ server->ops->is_status_pending(buf, server)) {
+ cifs_discard_remaining_data(server);
+ return -1;
+ }
+
+ /* set up first two iov for signature check and to get credits */
+ rdata->iov[0].iov_base = buf;
+ rdata->iov[0].iov_len = HEADER_PREAMBLE_SIZE(server);
+ rdata->iov[1].iov_base = buf + HEADER_PREAMBLE_SIZE(server);
+ rdata->iov[1].iov_len =
+ server->total_read - HEADER_PREAMBLE_SIZE(server);
+ cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+ rdata->iov[0].iov_base, rdata->iov[0].iov_len);
+ cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
+ rdata->iov[1].iov_base, rdata->iov[1].iov_len);
+
+ /* Was the SMB read successful? */
+ rdata->result = server->ops->map_error(buf, false);
+ if (rdata->result != 0) {
+ cifs_dbg(FYI, "%s: server returned error %d\n",
+ __func__, rdata->result);
+ /* normal error on read response */
+ return __cifs_readv_discard(server, mid, false);
+ }
+
+ /* Is there enough to get to the rest of the READ_RSP header? */
+ if (server->total_read < server->vals->read_rsp_size) {
+ cifs_dbg(FYI, "%s: server returned short header. got=%u expected=%zu\n",
+ __func__, server->total_read,
+ server->vals->read_rsp_size);
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ data_offset = server->ops->read_data_offset(buf) +
+ HEADER_PREAMBLE_SIZE(server);
+ if (data_offset < server->total_read) {
+ /*
+ * win2k8 sometimes sends an offset of 0 when the read
+ * is beyond the EOF. Treat it as if the data starts just after
+ * the header.
+ */
+ cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n",
+ __func__, data_offset);
+ data_offset = server->total_read;
+ } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
+ /* data_offset is beyond the end of smallbuf */
+ cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n",
+ __func__, data_offset);
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ cifs_dbg(FYI, "%s: total_read=%u data_offset=%u\n",
+ __func__, server->total_read, data_offset);
+
+ len = data_offset - server->total_read;
+ if (len > 0) {
+ /* read any junk before data into the rest of smallbuf */
+ length = cifs_read_from_socket(server,
+ buf + server->total_read, len);
+ if (length < 0)
+ return length;
+ server->total_read += length;
+ }
+
+ /* how much data is in the response? */
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ use_rdma_mr = rdata->mr;
+#endif
+ data_len = server->ops->read_data_length(buf, use_rdma_mr);
+ if (!use_rdma_mr && (data_offset + data_len > buflen)) {
+ /* data_len is corrupt -- discard frame */
+ rdata->result = -EIO;
+ return cifs_readv_discard(server, mid);
+ }
+
+ length = rdata->read_into_pages(server, rdata, data_len);
+ if (length < 0)
+ return length;
+
+ server->total_read += length;
+
+ cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
+ server->total_read, buflen, data_len);
+
+ /* discard anything left over */
+ if (server->total_read < buflen)
+ return cifs_readv_discard(server, mid);
+
+ dequeue_mid(mid, false);
+ mid->resp_buf = server->smallbuf;
+ server->smallbuf = NULL;
+ return length;
+}
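For reference, the discard arithmetic in cifs_discard_remaining_data() with hypothetical sizes:

	/*
	 * SMB1 example: pdu_size = 4096, preamble = 4, total_read = 64:
	 *   remaining = 4096 + 4 - 64 = 4036 bytes
	 * drained in reads of at most
	 *   CIFSMaxBufSize + MAX_HEADER_SIZE(server)
	 * until remaining reaches zero or the socket read fails.
	 */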
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 9d486fbbfbbd..998fa51f9b68 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -201,6 +201,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
break;
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
case XATTR_ACL_ACCESS:
#ifdef CONFIG_CIFS_POSIX
if (!value)
@@ -224,6 +225,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
cifs_remap(cifs_sb));
#endif /* CONFIG_CIFS_POSIX */
break;
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
}
out:
@@ -364,7 +366,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
}
break;
}
-
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
case XATTR_ACL_ACCESS:
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & SB_POSIXACL)
@@ -384,6 +386,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
cifs_remap(cifs_sb));
#endif /* CONFIG_CIFS_POSIX */
break;
+#endif /* ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
}
/* We could add an additional check for streams ie
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index 8907d0508198..ccdbec388091 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -20,29 +20,29 @@
#include "coda_psdev.h"
#include "coda_linux.h"
-static int coda_symlink_filler(struct file *file, struct page *page)
+static int coda_symlink_filler(struct file *file, struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
int error;
struct coda_inode_info *cii;
unsigned int len = PAGE_SIZE;
- char *p = page_address(page);
+ char *p = folio_address(folio);
cii = ITOC(inode);
error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
if (error)
goto fail;
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
fail:
- SetPageError(page);
- unlock_page(page);
+ folio_set_error(folio);
+ folio_unlock(folio);
return error;
}
const struct address_space_operations coda_symlink_aops = {
- .readpage = coda_symlink_filler,
+ .read_folio = coda_symlink_filler,
};
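The coda change follows the generic readpage-to-read_folio conversion. A minimal template for the same conversion elsewhere (fill_link_target() is a hypothetical stand-in for the fs-specific fetch):

	static int example_symlink_read_folio(struct file *file, struct folio *folio)
	{
		char *buf = folio_address(folio);	/* folio arrives locked */
		int error = fill_link_target(folio->mapping->host, buf, PAGE_SIZE);

		if (!error)
			folio_mark_uptodate(folio);
		else
			folio_set_error(folio);
		folio_unlock(folio);			/* always unlock before returning */
		return error;
	}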
diff --git a/fs/coredump.c b/fs/coredump.c
index ebc43f960b64..7c6bb4e9db20 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -354,7 +354,7 @@ static int zap_process(struct task_struct *start, int exit_code)
struct task_struct *t;
int nr = 0;
- /* ignore all signals except SIGKILL, see prepare_signal() */
+ /* Allow SIGKILL, see prepare_signal() */
start->signal->flags = SIGNAL_GROUP_EXIT;
start->signal->group_exit_code = exit_code;
start->signal->group_stop_count = 0;
@@ -816,9 +816,9 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr)
{
static char zeroes[PAGE_SIZE];
struct file *file = cprm->file;
- if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+ if (file->f_mode & FMODE_LSEEK) {
if (dump_interrupted() ||
- file->f_op->llseek(file, nr, SEEK_CUR) < 0)
+ vfs_llseek(file, nr, SEEK_CUR) < 0)
return 0;
cprm->pos += nr;
return 1;
@@ -832,6 +832,39 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr)
}
}
+static int dump_emit_page(struct coredump_params *cprm, struct page *page)
+{
+ struct bio_vec bvec = {
+ .bv_page = page,
+ .bv_offset = 0,
+ .bv_len = PAGE_SIZE,
+ };
+ struct iov_iter iter;
+ struct file *file = cprm->file;
+ loff_t pos;
+ ssize_t n;
+
+ if (cprm->to_skip) {
+ if (!__dump_skip(cprm, cprm->to_skip))
+ return 0;
+ cprm->to_skip = 0;
+ }
+ if (cprm->written + PAGE_SIZE > cprm->limit)
+ return 0;
+ if (dump_interrupted())
+ return 0;
+ pos = file->f_pos;
+ iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE);
+ n = __kernel_write_iter(cprm->file, &iter, &pos);
+ if (n != PAGE_SIZE)
+ return 0;
+ file->f_pos = pos;
+ cprm->written += PAGE_SIZE;
+ cprm->pos += PAGE_SIZE;
+
+ return 1;
+}
+
int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
{
if (cprm->to_skip) {
@@ -863,7 +896,6 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
for (addr = start; addr < start + len; addr += PAGE_SIZE) {
struct page *page;
- int stop;
/*
* To avoid having to allocate page tables for virtual address
@@ -874,10 +906,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
*/
page = get_dump_page(addr);
if (page) {
- void *kaddr = kmap_local_page(page);
-
- stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
- kunmap_local(kaddr);
+ int stop = !dump_emit_page(cprm, page);
put_page(page);
if (stop)
return 0;
diff --git a/fs/cramfs/README b/fs/cramfs/README
index d71b27e0ff15..778df5c4d70b 100644
--- a/fs/cramfs/README
+++ b/fs/cramfs/README
@@ -115,7 +115,7 @@ Block Size
(Block size in cramfs refers to the size of input data that is
compressed at a time. It's intended to be somewhere around
-PAGE_SIZE for cramfs_readpage's convenience.)
+PAGE_SIZE for cramfs_read_folio's convenience.)
The superblock ought to indicate the block size that the fs was
written for, since comments in <linux/pagemap.h> indicate that
@@ -161,7 +161,7 @@ size. The options are:
PAGE_SIZE.
It's easy enough to change the kernel to use a smaller value than
-PAGE_SIZE: just make cramfs_readpage read multiple blocks.
+PAGE_SIZE: just make cramfs_read_folio read multiple blocks.
The cost of option 1 is that kernels with a larger PAGE_SIZE
value don't get as good compression as they can.
@@ -173,9 +173,9 @@ they don't mind their cramfs being inaccessible to kernels with
smaller PAGE_SIZE values.
Option 3 is easy to implement if we don't mind being CPU-inefficient:
-e.g. get readpage to decompress to a buffer of size MAX_BLKSIZE (which
+e.g. get read_folio to decompress to a buffer of size MAX_BLKSIZE (which
must be no larger than 32KB) and discard what it doesn't need.
-Getting readpage to read into all the covered pages is harder.
+Getting read_folio to read into all the covered pages is harder.
The main advantage of option 3 over 1, 2, is better compression. The
cost is greater complexity. Probably not worth it, but I hope someone
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 666aa380011e..61ccf7722fc3 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -183,6 +183,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
unsigned int len)
{
struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct file_ra_state ra;
struct page *pages[BLKS_PER_BUF];
unsigned i, blocknr, buffer;
unsigned long devsize;
@@ -212,6 +213,9 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT;
/* Ok, read in BLKS_PER_BUF pages completely first. */
+ file_ra_state_init(&ra, mapping);
+ page_cache_sync_readahead(mapping, &ra, NULL, blocknr, BLKS_PER_BUF);
+
for (i = 0; i < BLKS_PER_BUF; i++) {
struct page *page = NULL;
@@ -224,19 +228,6 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
pages[i] = page;
}
- for (i = 0; i < BLKS_PER_BUF; i++) {
- struct page *page = pages[i];
-
- if (page) {
- wait_on_page_locked(page);
- if (!PageUptodate(page)) {
- /* asynchronous error */
- put_page(page);
- pages[i] = NULL;
- }
- }
- }
-
buffer = next_buffer;
next_buffer = NEXT_BUFFER(buffer);
buffer_blocknr[buffer] = blocknr;
@@ -414,7 +405,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
/*
* Let's create a mixed map if we can't map it all.
* The normal paging machinery will take care of the
- * unpopulated ptes via cramfs_readpage().
+ * unpopulated ptes via cramfs_read_folio().
*/
int i;
vma->vm_flags |= VM_MIXEDMAP;
@@ -814,8 +805,9 @@ out:
return d_splice_alias(inode, dentry);
}
-static int cramfs_readpage(struct file *file, struct page *page)
+static int cramfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
u32 maxblock;
int bytes_filled;
@@ -925,7 +917,7 @@ err:
}
static const struct address_space_operations cramfs_aops = {
- .readpage = cramfs_readpage
+ .read_folio = cramfs_read_folio
};
/*
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 2217fe5ece6f..1b4403136d05 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -25,21 +25,25 @@
* then this function isn't applicable. This function may sleep, so it must be
* called from a workqueue rather than from the bio's bi_end_io callback.
*
- * This function sets PG_error on any pages that contain any blocks that failed
- * to be decrypted. The filesystem must not mark such pages uptodate.
+ * Return: %true on success; %false on failure. On failure, bio->bi_status is
+ * also set to an error status.
*/
-void fscrypt_decrypt_bio(struct bio *bio)
+bool fscrypt_decrypt_bio(struct bio *bio)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bv, bio, iter_all) {
struct page *page = bv->bv_page;
- int ret = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len,
+ int err = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len,
bv->bv_offset);
- if (ret)
- SetPageError(page);
+
+ if (err) {
+ bio->bi_status = errno_to_blk_status(err);
+ return false;
+ }
}
+ return true;
}
EXPORT_SYMBOL(fscrypt_decrypt_bio);
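Callers are now expected to consult the return value and let bi_status carry the error, rather than checking PG_error per page. A sketch of a post-read worker under that contract (mark_bio_pages_uptodate() is a hypothetical helper):

	static void example_decrypt_work(struct bio *bio)
	{
		/* on failure, fscrypt_decrypt_bio() has set bio->bi_status */
		if (fscrypt_decrypt_bio(bio))
			mark_bio_pages_uptodate(bio);	/* hypothetical */
		bio_endio(bio);
	}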
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 526a4c1bed99..e78be66bbf01 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -113,7 +113,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
if (WARN_ON_ONCE(len <= 0))
return -EINVAL;
- if (WARN_ON_ONCE(len % FS_CRYPTO_BLOCK_SIZE != 0))
+ if (WARN_ON_ONCE(len % FSCRYPT_CONTENTS_ALIGNMENT != 0))
return -EINVAL;
fscrypt_generate_iv(&iv, lblk_num, ci);
@@ -213,8 +213,8 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
* fscrypt_encrypt_block_inplace() - Encrypt a filesystem block in-place
* @inode: The inode to which this block belongs
* @page: The page containing the block to encrypt
- * @len: Size of block to encrypt. Doesn't need to be a multiple of the
- * fs block size, but must be a multiple of FS_CRYPTO_BLOCK_SIZE.
+ * @len: Size of block to encrypt. This must be a multiple of
+ * FSCRYPT_CONTENTS_ALIGNMENT.
* @offs: Byte offset within @page at which the block to encrypt begins
* @lblk_num: Filesystem logical block number of the block, i.e. the 0-based
* number of the block within the file
@@ -283,8 +283,8 @@ EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks);
* fscrypt_decrypt_block_inplace() - Decrypt a filesystem block in-place
* @inode: The inode to which this block belongs
* @page: The page containing the block to decrypt
- * @len: Size of block to decrypt. Doesn't need to be a multiple of the
- * fs block size, but must be a multiple of FS_CRYPTO_BLOCK_SIZE.
+ * @len: Size of block to decrypt. This must be a multiple of
+ * FSCRYPT_CONTENTS_ALIGNMENT.
* @offs: Byte offset within @page at which the block to decrypt begins
* @lblk_num: Filesystem logical block number of the block, i.e. the 0-based
* number of the block within the file
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index a9be4bc74a94..12bd61d20f69 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -19,6 +19,13 @@
#include "fscrypt_private.h"
/*
+ * The minimum message length (input and output length), in bytes, for all
+ * filenames encryption modes. Filenames shorter than this will be zero-padded
+ * before being encrypted.
+ */
+#define FSCRYPT_FNAME_MIN_MSG_LEN 16
+
+/*
* struct fscrypt_nokey_name - identifier for directory entry when key is absent
*
* When userspace lists an encrypted directory without access to the key, the
@@ -79,7 +86,8 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
/**
* fscrypt_fname_encrypt() - encrypt a filename
* @inode: inode of the parent directory (for regular filenames)
- * or of the symlink (for symlink targets)
+ * or of the symlink (for symlink targets). Key must already be
+ * set up.
* @iname: the filename to encrypt
* @out: (output) the encrypted filename
* @olen: size of the encrypted filename. It must be at least @iname->len.
@@ -130,6 +138,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
return 0;
}
+EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt);
/**
* fname_decrypt() - decrypt a filename
@@ -257,9 +266,9 @@ static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst)
return bp - dst;
}
-bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
- u32 orig_len, u32 max_len,
- u32 *encrypted_len_ret)
+bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret)
{
int padding = 4 << (fscrypt_policy_flags(policy) &
FSCRYPT_POLICY_FLAGS_PAD_MASK);
@@ -267,13 +276,36 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
if (orig_len > max_len)
return false;
- encrypted_len = max(orig_len, (u32)FS_CRYPTO_BLOCK_SIZE);
+ encrypted_len = max_t(u32, orig_len, FSCRYPT_FNAME_MIN_MSG_LEN);
encrypted_len = round_up(encrypted_len, padding);
*encrypted_len_ret = min(encrypted_len, max_len);
return true;
}
/**
+ * fscrypt_fname_encrypted_size() - calculate length of encrypted filename
+ * @inode: parent inode of dentry name being encrypted. Key must
+ * already be set up.
+ * @orig_len: length of the original filename
+ * @max_len: maximum length to return
+ * @encrypted_len_ret: where calculated length should be returned (on success)
+ *
+ * Filenames that are shorter than the maximum length may have their lengths
+ * increased slightly by encryption, due to padding that is applied.
+ *
+ * Return: false if orig_len is greater than max_len. Otherwise, true, with
+ * encrypted_len_ret filled with the encrypted length (capped at max_len).
+ */
+bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
+ u32 max_len, u32 *encrypted_len_ret)
+{
+ return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy,
+ orig_len, max_len,
+ encrypted_len_ret);
+}
+EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size);
+
+/**
* fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames
* @max_encrypted_len: maximum length of encrypted filenames the buffer will be
* used to present
@@ -350,7 +382,7 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
return 0;
}
- if (iname->len < FS_CRYPTO_BLOCK_SIZE)
+ if (iname->len < FSCRYPT_FNAME_MIN_MSG_LEN)
return -EUCLEAN;
if (fscrypt_has_encryption_key(inode))
@@ -428,8 +460,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
return ret;
if (fscrypt_has_encryption_key(dir)) {
- if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
- iname->len, NAME_MAX,
+ if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX,
&fname->crypto_buf.len))
return -ENAMETOOLONG;
fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
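A worked example of the padding arithmetic behind the renamed helper:

	/*
	 * pad flag = 3  =>  padding = 4 << 3 = 32
	 * orig_len = 5  =>  max(5, FSCRYPT_FNAME_MIN_MSG_LEN) = 16
	 *               =>  round_up(16, 32) = 32
	 * *encrypted_len_ret = min(32, max_len)
	 */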
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 5b0a9e6478b5..d5f68a0c5d15 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -31,7 +31,7 @@
#define FSCRYPT_CONTEXT_V2 2
/* Keep this in sync with include/uapi/linux/fscrypt.h */
-#define FSCRYPT_MODE_MAX FSCRYPT_MODE_ADIANTUM
+#define FSCRYPT_MODE_MAX FSCRYPT_MODE_AES_256_HCTR2
struct fscrypt_context_v1 {
u8 version; /* FSCRYPT_CONTEXT_V1 */
@@ -184,7 +184,7 @@ struct fscrypt_symlink_data {
struct fscrypt_prepared_key {
struct crypto_skcipher *tfm;
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
- struct fscrypt_blk_crypto_key *blk_key;
+ struct blk_crypto_key *blk_key;
#endif
};
@@ -225,7 +225,7 @@ struct fscrypt_info {
* will be NULL if the master key was found in a process-subscribed
* keyring rather than in the filesystem-level keyring.
*/
- struct key *ci_master_key;
+ struct fscrypt_master_key *ci_master_key;
/*
* Link in list of inodes that were unlocked with the master key.
@@ -297,14 +297,11 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
const struct fscrypt_info *ci);
/* fname.c */
-int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
- u8 *out, unsigned int olen);
-bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
- u32 orig_len, u32 max_len,
- u32 *encrypted_len_ret);
+bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret);
/* hkdf.c */
-
struct fscrypt_hkdf {
struct crypto_shash *hmac_tfm;
};
@@ -347,7 +344,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key,
const struct fscrypt_info *ci);
-void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key);
+void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
+ struct fscrypt_prepared_key *prep_key);
/*
* Check whether the crypto transform or blk-crypto key has been allocated in
@@ -393,7 +391,8 @@ fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
}
static inline void
-fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key)
+fscrypt_destroy_inline_crypt_key(struct super_block *sb,
+ struct fscrypt_prepared_key *prep_key)
{
}
@@ -440,6 +439,40 @@ struct fscrypt_master_key_secret {
struct fscrypt_master_key {
/*
+ * Back-pointer to the super_block of the filesystem to which this
+ * master key has been added. Only valid if ->mk_active_refs > 0.
+ */
+ struct super_block *mk_sb;
+
+ /*
+ * Link in ->mk_sb->s_master_keys->key_hashtable.
+ * Only valid if ->mk_active_refs > 0.
+ */
+ struct hlist_node mk_node;
+
+ /* Semaphore that protects ->mk_secret and ->mk_users */
+ struct rw_semaphore mk_sem;
+
+ /*
+ * Active and structural reference counts. An active ref guarantees
+ * that the struct continues to exist, continues to be in the keyring
+ * ->mk_sb->s_master_keys, and that any embedded subkeys (e.g.
+ * ->mk_direct_keys) that have been prepared continue to exist.
+ * A structural ref only guarantees that the struct continues to exist.
+ *
+ * There is one active ref associated with ->mk_secret being present,
+ * and one active ref for each inode in ->mk_decrypted_inodes.
+ *
+ * There is one structural ref associated with the active refcount being
+ * nonzero. Finding a key in the keyring also takes a structural ref,
+ * which is then held temporarily while the key is operated on.
+ */
+ refcount_t mk_active_refs;
+ refcount_t mk_struct_refs;
+
+ struct rcu_head mk_rcu_head;
+
+ /*
* The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is
* executed, this is wiped and no new inodes can be unlocked with this
* key; however, there may still be inodes in ->mk_decrypted_inodes
@@ -447,7 +480,10 @@ struct fscrypt_master_key {
* FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or
* FS_IOC_ADD_ENCRYPTION_KEY can add the secret again.
*
- * Locking: protected by this master key's key->sem.
+ * While ->mk_secret is present, one ref in ->mk_active_refs is held.
+ *
+ * Locking: protected by ->mk_sem. The manipulation of ->mk_active_refs
+ * associated with this field is protected by ->mk_sem as well.
*/
struct fscrypt_master_key_secret mk_secret;
@@ -468,23 +504,13 @@ struct fscrypt_master_key {
*
* This is NULL for v1 policy keys; those can only be added by root.
*
- * Locking: in addition to this keyring's own semaphore, this is
- * protected by this master key's key->sem, so we can do atomic
- * search+insert. It can also be searched without taking any locks, but
- * in that case the returned key may have already been removed.
+ * Locking: protected by ->mk_sem. (We don't just rely on the keyrings
+ * subsystem semaphore ->mk_users->sem, as we need support for atomic
+ * search+insert along with proper synchronization with ->mk_secret.)
*/
struct key *mk_users;
/*
- * Length of ->mk_decrypted_inodes, plus one if mk_secret is present.
- * Once this goes to 0, the master key is removed from ->s_master_keys.
- * The 'struct fscrypt_master_key' will continue to live as long as the
- * 'struct key' whose payload it is, but we won't let this reference
- * count rise again.
- */
- refcount_t mk_refcount;
-
- /*
* List of inodes that were unlocked using this key. This allows the
* inodes to be evicted efficiently if the key is removed.
*/
@@ -509,10 +535,10 @@ static inline bool
is_master_key_secret_present(const struct fscrypt_master_key_secret *secret)
{
/*
- * The READ_ONCE() is only necessary for fscrypt_drop_inode() and
- * fscrypt_key_describe(). These run in atomic context, so they can't
- * take the key semaphore and thus 'secret' can change concurrently
- * which would be a data race. But they only need to know whether the
+ * The READ_ONCE() is only necessary for fscrypt_drop_inode().
+ * fscrypt_drop_inode() runs in atomic context, so it can't take the key
+ * semaphore and thus 'secret' can change concurrently which would be a
+ * data race. But fscrypt_drop_inode() only needs to know whether the
* secret *was* present at the time of check, so READ_ONCE() suffices.
*/
return READ_ONCE(secret->size) != 0;
@@ -541,12 +567,16 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec)
return 0;
}
-struct key *
+void fscrypt_put_master_key(struct fscrypt_master_key *mk);
+
+void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk);
+
+struct fscrypt_master_key *
fscrypt_find_master_key(struct super_block *sb,
const struct fscrypt_key_specifier *mk_spec);
-int fscrypt_add_test_dummy_key(struct super_block *sb,
- struct fscrypt_key_specifier *key_spec);
+int fscrypt_get_test_dummy_key_identifier(
+ u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);
int fscrypt_verify_key_added(struct super_block *sb,
const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);
@@ -561,7 +591,9 @@ struct fscrypt_mode {
int keysize; /* key size in bytes */
int security_strength; /* security strength in bytes */
int ivsize; /* IV size in bytes */
- int logged_impl_name;
+ int logged_cryptoapi_impl;
+ int logged_blk_crypto_native;
+ int logged_blk_crypto_fallback;
enum blk_crypto_mode_num blk_crypto_mode;
};
@@ -570,7 +602,8 @@ extern struct fscrypt_mode fscrypt_modes[];
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key, const struct fscrypt_info *ci);
-void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key);
+void fscrypt_destroy_prepared_key(struct super_block *sb,
+ struct fscrypt_prepared_key *prep_key);
int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
@@ -621,6 +654,8 @@ int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci);
bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
const union fscrypt_policy *policy2);
+int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy,
+ struct fscrypt_key_specifier *key_spec);
bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
const struct inode *inode);
int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
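The mk_active_refs/mk_struct_refs split documented above is the heart of this refactor. Simplified from the keyring.c hunks later in this diff, the two put paths nest as in the following sketch, with the unhash and subkey-wipe work elided:

/* Dropping the last structural ref frees the struct (after RCU). */
static void put_struct_ref_sketch(struct fscrypt_master_key *mk)
{
	if (refcount_dec_and_test(&mk->mk_struct_refs))
		call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key);
}

/*
 * Dropping the last active ref unhashes the key and destroys its prepared
 * subkeys, then releases the structural ref that the active count held.
 */
static void put_active_ref_sketch(struct fscrypt_master_key *mk)
{
	if (!refcount_dec_and_test(&mk->mk_active_refs))
		return;
	/* ... hlist_del_rcu(), fscrypt_destroy_prepared_key() calls ... */
	put_struct_ref_sketch(mk);
}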
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index af74599ae1cf..7b8c5a1104b5 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -5,8 +5,6 @@
* Encryption hooks for higher-level filesystem operations.
*/
-#include <linux/key.h>
-
#include "fscrypt_private.h"
/**
@@ -142,7 +140,6 @@ int fscrypt_prepare_setflags(struct inode *inode,
unsigned int oldflags, unsigned int flags)
{
struct fscrypt_info *ci;
- struct key *key;
struct fscrypt_master_key *mk;
int err;
@@ -158,14 +155,13 @@ int fscrypt_prepare_setflags(struct inode *inode,
ci = inode->i_crypt_info;
if (ci->ci_policy.version != FSCRYPT_POLICY_V2)
return -EINVAL;
- key = ci->ci_master_key;
- mk = key->payload.data[0];
- down_read(&key->sem);
+ mk = ci->ci_master_key;
+ down_read(&mk->mk_sem);
if (is_master_key_secret_present(&mk->mk_secret))
err = fscrypt_derive_dirhash_key(ci, mk);
else
err = -ENOKEY;
- up_read(&key->sem);
+ up_read(&mk->mk_sem);
return err;
}
return 0;
@@ -228,9 +224,9 @@ int fscrypt_prepare_symlink(struct inode *dir, const char *target,
* counting it (even though it is meaningless for ciphertext) is simpler
* for now since filesystems will assume it is there and subtract it.
*/
- if (!fscrypt_fname_encrypted_size(policy, len,
- max_len - sizeof(struct fscrypt_symlink_data),
- &disk_link->len))
+ if (!__fscrypt_fname_encrypted_size(policy, len,
+ max_len - sizeof(struct fscrypt_symlink_data),
+ &disk_link->len))
return -ENAMETOOLONG;
disk_link->len += sizeof(struct fscrypt_symlink_data);
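The sizeof(struct fscrypt_symlink_data) added back onto disk_link->len accounts for the header that precedes the ciphertext on disk. For orientation, the layout is roughly the following (per fscrypt_private.h; the exact flexible-array spelling is an assumption here):

struct fscrypt_symlink_data {
	__le16 len;		/* length of the ciphertext that follows */
	char encrypted_path[1];	/* ciphertext; the nominal NUL terminator
				 * is the extra byte discussed above */
} __packed;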
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 93c2ca858092..cea8b14007e6 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -12,7 +12,7 @@
* provides the key and IV to use.
*/
-#include <linux/blk-crypto.h>
+#include <linux/blk-crypto-profile.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/sched/mm.h>
@@ -21,26 +21,22 @@
#include "fscrypt_private.h"
-struct fscrypt_blk_crypto_key {
- struct blk_crypto_key base;
- int num_devs;
- struct request_queue *devs[];
-};
-
-static int fscrypt_get_num_devices(struct super_block *sb)
+static struct block_device **fscrypt_get_devices(struct super_block *sb,
+ unsigned int *num_devs)
{
- if (sb->s_cop->get_num_devices)
- return sb->s_cop->get_num_devices(sb);
- return 1;
-}
+ struct block_device **devs;
-static void fscrypt_get_devices(struct super_block *sb, int num_devs,
- struct request_queue **devs)
-{
- if (num_devs == 1)
- devs[0] = bdev_get_queue(sb->s_bdev);
- else
- sb->s_cop->get_devices(sb, devs);
+ if (sb->s_cop->get_devices) {
+ devs = sb->s_cop->get_devices(sb, num_devs);
+ if (devs)
+ return devs;
+ }
+ devs = kmalloc(sizeof(*devs), GFP_KERNEL);
+ if (!devs)
+ return ERR_PTR(-ENOMEM);
+ devs[0] = sb->s_bdev;
+ *num_devs = 1;
+ return devs;
}
static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
@@ -64,15 +60,46 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
return DIV_ROUND_UP(lblk_bits, 8);
}
+/*
+ * Log a message when starting to use blk-crypto (native) or blk-crypto-fallback
+ * for an encryption mode for the first time. This is the blk-crypto
+ * counterpart to the message logged when starting to use the crypto API for the
+ * first time. A limitation is that these messages don't convey which specific
+ * filesystems or files are using each implementation. However, *usually*
+ * systems use just one implementation per mode, which makes these messages
+ * helpful for debugging problems where the "wrong" implementation is used.
+ */
+static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode,
+ struct block_device **devs,
+ unsigned int num_devs,
+ const struct blk_crypto_config *cfg)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_devs; i++) {
+ struct request_queue *q = bdev_get_queue(devs[i]);
+
+ if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
+ __blk_crypto_cfg_supported(q->crypto_profile, cfg)) {
+ if (!xchg(&mode->logged_blk_crypto_native, 1))
+ pr_info("fscrypt: %s using blk-crypto (native)\n",
+ mode->friendly_name);
+ } else if (!xchg(&mode->logged_blk_crypto_fallback, 1)) {
+ pr_info("fscrypt: %s using blk-crypto-fallback\n",
+ mode->friendly_name);
+ }
+ }
+}
+
/* Enable inline encryption for this file if supported. */
int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
{
const struct inode *inode = ci->ci_inode;
struct super_block *sb = inode->i_sb;
struct blk_crypto_config crypto_cfg;
- int num_devs;
- struct request_queue **devs;
- int i;
+ struct block_device **devs;
+ unsigned int num_devs;
+ unsigned int i;
/* The file must need contents encryption, not filenames encryption */
if (!S_ISREG(inode->i_mode))
@@ -100,23 +127,25 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
return 0;
/*
- * On all the filesystem's devices, blk-crypto must support the crypto
- * configuration that the file would use.
+ * On all the filesystem's block devices, blk-crypto must support the
+ * crypto configuration that the file would use.
*/
crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode;
crypto_cfg.data_unit_size = sb->s_blocksize;
crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
- num_devs = fscrypt_get_num_devices(sb);
- devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL);
- if (!devs)
- return -ENOMEM;
- fscrypt_get_devices(sb, num_devs, devs);
+
+ devs = fscrypt_get_devices(sb, &num_devs);
+ if (IS_ERR(devs))
+ return PTR_ERR(devs);
for (i = 0; i < num_devs; i++) {
- if (!blk_crypto_config_supported(devs[i], &crypto_cfg))
+ if (!blk_crypto_config_supported(bdev_get_queue(devs[i]),
+ &crypto_cfg))
goto out_free_devs;
}
+ fscrypt_log_blk_crypto_impl(ci->ci_mode, devs, num_devs, &crypto_cfg);
+
ci->ci_inlinecrypt = true;
out_free_devs:
kfree(devs);
@@ -131,49 +160,41 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
const struct inode *inode = ci->ci_inode;
struct super_block *sb = inode->i_sb;
enum blk_crypto_mode_num crypto_mode = ci->ci_mode->blk_crypto_mode;
- int num_devs = fscrypt_get_num_devices(sb);
- int queue_refs = 0;
- struct fscrypt_blk_crypto_key *blk_key;
+ struct blk_crypto_key *blk_key;
+ struct block_device **devs;
+ unsigned int num_devs;
+ unsigned int i;
int err;
- int i;
- blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_KERNEL);
+ blk_key = kmalloc(sizeof(*blk_key), GFP_KERNEL);
if (!blk_key)
return -ENOMEM;
- blk_key->num_devs = num_devs;
- fscrypt_get_devices(sb, num_devs, blk_key->devs);
-
- err = blk_crypto_init_key(&blk_key->base, raw_key, crypto_mode,
+ err = blk_crypto_init_key(blk_key, raw_key, crypto_mode,
fscrypt_get_dun_bytes(ci), sb->s_blocksize);
if (err) {
fscrypt_err(inode, "error %d initializing blk-crypto key", err);
goto fail;
}
- /*
- * We have to start using blk-crypto on all the filesystem's devices.
- * We also have to save all the request_queue's for later so that the
- * key can be evicted from them. This is needed because some keys
- * aren't destroyed until after the filesystem was already unmounted
- * (namely, the per-mode keys in struct fscrypt_master_key).
- */
+ /* Start using blk-crypto on all the filesystem's block devices. */
+ devs = fscrypt_get_devices(sb, &num_devs);
+ if (IS_ERR(devs)) {
+ err = PTR_ERR(devs);
+ goto fail;
+ }
for (i = 0; i < num_devs; i++) {
- if (!blk_get_queue(blk_key->devs[i])) {
- fscrypt_err(inode, "couldn't get request_queue");
- err = -EAGAIN;
- goto fail;
- }
- queue_refs++;
-
- err = blk_crypto_start_using_key(&blk_key->base,
- blk_key->devs[i]);
- if (err) {
- fscrypt_err(inode,
- "error %d starting to use blk-crypto", err);
- goto fail;
- }
+ err = blk_crypto_start_using_key(blk_key,
+ bdev_get_queue(devs[i]));
+ if (err)
+ break;
+ }
+ kfree(devs);
+ if (err) {
+ fscrypt_err(inode, "error %d starting to use blk-crypto", err);
+ goto fail;
}
+
/*
* Pairs with the smp_load_acquire() in fscrypt_is_key_prepared().
* I.e., here we publish ->blk_key with a RELEASE barrier so that
@@ -184,24 +205,29 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
return 0;
fail:
- for (i = 0; i < queue_refs; i++)
- blk_put_queue(blk_key->devs[i]);
kfree_sensitive(blk_key);
return err;
}
-void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key)
+void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
+ struct fscrypt_prepared_key *prep_key)
{
- struct fscrypt_blk_crypto_key *blk_key = prep_key->blk_key;
- int i;
+ struct blk_crypto_key *blk_key = prep_key->blk_key;
+ struct block_device **devs;
+ unsigned int num_devs;
+ unsigned int i;
- if (blk_key) {
- for (i = 0; i < blk_key->num_devs; i++) {
- blk_crypto_evict_key(blk_key->devs[i], &blk_key->base);
- blk_put_queue(blk_key->devs[i]);
- }
- kfree_sensitive(blk_key);
+ if (!blk_key)
+ return;
+
+ /* Evict the key from all the filesystem's block devices. */
+ devs = fscrypt_get_devices(sb, &num_devs);
+ if (!IS_ERR(devs)) {
+ for (i = 0; i < num_devs; i++)
+ blk_crypto_evict_key(bdev_get_queue(devs[i]), blk_key);
+ kfree(devs);
}
+ kfree_sensitive(blk_key);
}
bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
@@ -251,7 +277,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
ci = inode->i_crypt_info;
fscrypt_generate_dun(ci, first_lblk, dun);
- bio_crypt_set_ctx(bio, &ci->ci_enc_key.blk_key->base, dun, gfp_mask);
+ bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask);
}
EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx);
@@ -338,7 +364,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
* uses the same pointer. I.e., there's currently no need to support
* merging requests where the keys are the same but the pointers differ.
*/
- if (bc->bc_key != &inode->i_crypt_info->ci_enc_key.blk_key->base)
+ if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key)
return false;
fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun);
@@ -370,46 +396,45 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio,
EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh);
/**
- * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is
- * supported as far as encryption is concerned
- * @iocb: the file and position the I/O is targeting
- * @iter: the I/O data segment(s)
+ * fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an
+ * inode, as far as encryption is concerned
+ * @inode: the inode in question
*
* Return: %true if there are no encryption constraints that prevent DIO from
* being supported; %false if DIO is unsupported. (Note that in the
* %true case, the filesystem might have other, non-encryption-related
- * constraints that prevent DIO from actually being supported.)
+ * constraints that prevent DIO from actually being supported. Also, on
+ * encrypted files the filesystem is still responsible for only allowing
+ * DIO when requests are filesystem-block-aligned.)
*/
-bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+bool fscrypt_dio_supported(struct inode *inode)
{
- const struct inode *inode = file_inode(iocb->ki_filp);
- const unsigned int blocksize = i_blocksize(inode);
+ int err;
/* If the file is unencrypted, no veto from us. */
if (!fscrypt_needs_contents_encryption(inode))
return true;
- /* We only support DIO with inline crypto, not fs-layer crypto. */
- if (!fscrypt_inode_uses_inline_crypto(inode))
- return false;
-
/*
- * Since the granularity of encryption is filesystem blocks, the file
- * position and total I/O length must be aligned to the filesystem block
- * size -- not just to the block device's logical block size as is
- * traditionally the case for DIO on many filesystems.
+ * We only support DIO with inline crypto, not fs-layer crypto.
*
- * We require that the user-provided memory buffers be filesystem block
- * aligned too. It is simpler to have a single alignment value required
- * for all properties of the I/O, as is normally the case for DIO.
- * Also, allowing less aligned buffers would imply that data units could
- * cross bvecs, which would greatly complicate the I/O stack, which
- * assumes that bios can be split at any bvec boundary.
+ * To determine whether the inode is using inline crypto, we have to set
+ * up the key if it wasn't already done. This is because in the current
+ * design of fscrypt, the decision of whether to use inline crypto or
+ * not isn't made until the inode's encryption key is being set up. In
+ * the DIO read/write case, the key will always be set up already, since
+ * the file will be open. But in the case of statx(), the key might not
+ * be set up yet, as the file might not have been opened yet.
*/
- if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize))
+ err = fscrypt_require_key(inode);
+ if (err) {
+ /*
+ * Key unavailable or couldn't be set up. This edge case isn't
+ * worth worrying about; just report that DIO is unsupported.
+ */
return false;
-
- return true;
+ }
+ return fscrypt_inode_uses_inline_crypto(inode);
}
EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
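With fscrypt_dio_supported() now taking just the inode, a filesystem caller is expected to keep the alignment checks that used to live here. A hypothetical caller (myfs_* is a placeholder, not a real filesystem) might look like:

/* Sketch: gate a DIO request on both encryption support and alignment. */
static bool myfs_can_dio(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (!fscrypt_dio_supported(inode))
		return false;	/* fs-layer crypto: use buffered I/O */
	/* Encrypted DIO must stay filesystem-block-aligned. */
	if (fscrypt_needs_contents_encryption(inode) &&
	    !IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter),
			i_blocksize(inode)))
		return false;
	return true;
}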
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 0b3ffbb4faf4..1cca09aa43f8 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -18,6 +18,7 @@
* information about these ioctls.
*/
+#include <asm/unaligned.h>
#include <crypto/skcipher.h>
#include <linux/key-type.h>
#include <linux/random.h>
@@ -25,6 +26,18 @@
#include "fscrypt_private.h"
+/* The master encryption keys for a filesystem (->s_master_keys) */
+struct fscrypt_keyring {
+ /*
+ * Lock that protects ->key_hashtable. It does *not* protect the
+ * fscrypt_master_key structs themselves.
+ */
+ spinlock_t lock;
+
+ /* Hash table that maps fscrypt_key_specifier to fscrypt_master_key */
+ struct hlist_head key_hashtable[128];
+};
+
static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret)
{
fscrypt_destroy_hkdf(&secret->hkdf);
@@ -38,66 +51,81 @@ static void move_master_key_secret(struct fscrypt_master_key_secret *dst,
memzero_explicit(src, sizeof(*src));
}
-static void free_master_key(struct fscrypt_master_key *mk)
+static void fscrypt_free_master_key(struct rcu_head *head)
{
- size_t i;
-
- wipe_master_key_secret(&mk->mk_secret);
-
- for (i = 0; i <= FSCRYPT_MODE_MAX; i++) {
- fscrypt_destroy_prepared_key(&mk->mk_direct_keys[i]);
- fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_64_keys[i]);
- fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_32_keys[i]);
- }
-
- key_put(mk->mk_users);
+ struct fscrypt_master_key *mk =
+ container_of(head, struct fscrypt_master_key, mk_rcu_head);
+ /*
+ * The master key secret and any embedded subkeys should have already
+ * been wiped when the last active reference to the fscrypt_master_key
+ * struct was dropped; doing it here would be unnecessarily late.
+ * Nevertheless, use kfree_sensitive() in case anything was missed.
+ */
kfree_sensitive(mk);
}
-static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
+void fscrypt_put_master_key(struct fscrypt_master_key *mk)
{
- if (spec->__reserved)
- return false;
- return master_key_spec_len(spec) != 0;
+ if (!refcount_dec_and_test(&mk->mk_struct_refs))
+ return;
+ /*
+ * No structural references left, so free ->mk_users, and also free the
+ * fscrypt_master_key struct itself after an RCU grace period ensures
+ * that concurrent keyring lookups can no longer find it.
+ */
+ WARN_ON(refcount_read(&mk->mk_active_refs) != 0);
+ key_put(mk->mk_users);
+ mk->mk_users = NULL;
+ call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key);
}
-static int fscrypt_key_instantiate(struct key *key,
- struct key_preparsed_payload *prep)
+void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk)
{
- key->payload.data[0] = (struct fscrypt_master_key *)prep->data;
- return 0;
-}
+ struct super_block *sb = mk->mk_sb;
+ struct fscrypt_keyring *keyring = sb->s_master_keys;
+ size_t i;
-static void fscrypt_key_destroy(struct key *key)
-{
- free_master_key(key->payload.data[0]);
-}
+ if (!refcount_dec_and_test(&mk->mk_active_refs))
+ return;
+ /*
+ * No active references left, so complete the full removal of this
+ * fscrypt_master_key struct by removing it from the keyring and
+ * destroying any subkeys embedded in it.
+ */
-static void fscrypt_key_describe(const struct key *key, struct seq_file *m)
-{
- seq_puts(m, key->description);
+ spin_lock(&keyring->lock);
+ hlist_del_rcu(&mk->mk_node);
+ spin_unlock(&keyring->lock);
- if (key_is_positive(key)) {
- const struct fscrypt_master_key *mk = key->payload.data[0];
+ /*
+ * ->mk_active_refs == 0 implies that ->mk_secret is not present and
+ * that ->mk_decrypted_inodes is empty.
+ */
+ WARN_ON(is_master_key_secret_present(&mk->mk_secret));
+ WARN_ON(!list_empty(&mk->mk_decrypted_inodes));
- if (!is_master_key_secret_present(&mk->mk_secret))
- seq_puts(m, ": secret removed");
+ for (i = 0; i <= FSCRYPT_MODE_MAX; i++) {
+ fscrypt_destroy_prepared_key(
+ sb, &mk->mk_direct_keys[i]);
+ fscrypt_destroy_prepared_key(
+ sb, &mk->mk_iv_ino_lblk_64_keys[i]);
+ fscrypt_destroy_prepared_key(
+ sb, &mk->mk_iv_ino_lblk_32_keys[i]);
}
+ memzero_explicit(&mk->mk_ino_hash_key,
+ sizeof(mk->mk_ino_hash_key));
+ mk->mk_ino_hash_key_initialized = false;
+
+ /* Drop the structural ref associated with the active refs. */
+ fscrypt_put_master_key(mk);
}
-/*
- * Type of key in ->s_master_keys. Each key of this type represents a master
- * key which has been added to the filesystem. Its payload is a
- * 'struct fscrypt_master_key'. The "." prefix in the key type name prevents
- * users from adding keys of this type via the keyrings syscalls rather than via
- * the intended method of FS_IOC_ADD_ENCRYPTION_KEY.
- */
-static struct key_type key_type_fscrypt = {
- .name = "._fscrypt",
- .instantiate = fscrypt_key_instantiate,
- .destroy = fscrypt_key_destroy,
- .describe = fscrypt_key_describe,
-};
+static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
+{
+ if (spec->__reserved)
+ return false;
+ return master_key_spec_len(spec) != 0;
+}
static int fscrypt_user_key_instantiate(struct key *key,
struct key_preparsed_payload *prep)
@@ -131,32 +159,6 @@ static struct key_type key_type_fscrypt_user = {
.describe = fscrypt_user_key_describe,
};
-/* Search ->s_master_keys or ->mk_users */
-static struct key *search_fscrypt_keyring(struct key *keyring,
- struct key_type *type,
- const char *description)
-{
- /*
- * We need to mark the keyring reference as "possessed" so that we
- * acquire permission to search it, via the KEY_POS_SEARCH permission.
- */
- key_ref_t keyref = make_key_ref(keyring, true /* possessed */);
-
- keyref = keyring_search(keyref, type, description, false);
- if (IS_ERR(keyref)) {
- if (PTR_ERR(keyref) == -EAGAIN || /* not found */
- PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */
- keyref = ERR_PTR(-ENOKEY);
- return ERR_CAST(keyref);
- }
- return key_ref_to_ptr(keyref);
-}
-
-#define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE \
- (CONST_STRLEN("fscrypt-") + sizeof_field(struct super_block, s_id))
-
-#define FSCRYPT_MK_DESCRIPTION_SIZE (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1)
-
#define FSCRYPT_MK_USERS_DESCRIPTION_SIZE \
(CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \
CONST_STRLEN("-users") + 1)
@@ -164,21 +166,6 @@ static struct key *search_fscrypt_keyring(struct key *keyring,
#define FSCRYPT_MK_USER_DESCRIPTION_SIZE \
(2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1)
-static void format_fs_keyring_description(
- char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE],
- const struct super_block *sb)
-{
- sprintf(description, "fscrypt-%s", sb->s_id);
-}
-
-static void format_mk_description(
- char description[FSCRYPT_MK_DESCRIPTION_SIZE],
- const struct fscrypt_key_specifier *mk_spec)
-{
- sprintf(description, "%*phN",
- master_key_spec_len(mk_spec), (u8 *)&mk_spec->u);
-}
-
static void format_mk_users_keyring_description(
char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE],
const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
@@ -199,20 +186,15 @@ static void format_mk_user_description(
/* Create ->s_master_keys if needed. Synchronized by fscrypt_add_key_mutex. */
static int allocate_filesystem_keyring(struct super_block *sb)
{
- char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE];
- struct key *keyring;
+ struct fscrypt_keyring *keyring;
if (sb->s_master_keys)
return 0;
- format_fs_keyring_description(description, sb);
- keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
- current_cred(), KEY_POS_SEARCH |
- KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW,
- KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
- if (IS_ERR(keyring))
- return PTR_ERR(keyring);
-
+ keyring = kzalloc(sizeof(*keyring), GFP_KERNEL);
+ if (!keyring)
+ return -ENOMEM;
+ spin_lock_init(&keyring->lock);
/*
* Pairs with the smp_load_acquire() in fscrypt_find_master_key().
* I.e., here we publish ->s_master_keys with a RELEASE barrier so that
@@ -222,21 +204,75 @@ static int allocate_filesystem_keyring(struct super_block *sb)
return 0;
}
-void fscrypt_sb_free(struct super_block *sb)
+/*
+ * This is called at unmount time to release all encryption keys that have been
+ * added to the filesystem, along with the keyring that contains them.
+ *
+ * Note that besides clearing and freeing memory, this might need to evict keys
+ * from the keyslots of an inline crypto engine. Therefore, this must be called
+ * while the filesystem's underlying block device(s) are still available.
+ */
+void fscrypt_sb_delete(struct super_block *sb)
{
- key_put(sb->s_master_keys);
+ struct fscrypt_keyring *keyring = sb->s_master_keys;
+ size_t i;
+
+ if (!keyring)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(keyring->key_hashtable); i++) {
+ struct hlist_head *bucket = &keyring->key_hashtable[i];
+ struct fscrypt_master_key *mk;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(mk, tmp, bucket, mk_node) {
+ /*
+ * Since all inodes were already evicted, every key
+ * remaining in the keyring should have an empty inode
+ * list, and should only still be in the keyring due to
+ * the single active ref associated with ->mk_secret.
+ * There should be no structural refs beyond the one
+ * associated with the active ref.
+ */
+ WARN_ON(refcount_read(&mk->mk_active_refs) != 1);
+ WARN_ON(refcount_read(&mk->mk_struct_refs) != 1);
+ WARN_ON(!is_master_key_secret_present(&mk->mk_secret));
+ wipe_master_key_secret(&mk->mk_secret);
+ fscrypt_put_master_key_activeref(mk);
+ }
+ }
+ kfree_sensitive(keyring);
sb->s_master_keys = NULL;
}
+static struct hlist_head *
+fscrypt_mk_hash_bucket(struct fscrypt_keyring *keyring,
+ const struct fscrypt_key_specifier *mk_spec)
+{
+ /*
+ * Since key specifiers should be "random" values, it is sufficient to
+ * use a trivial hash function that just takes the first several bits of
+ * the key specifier.
+ */
+ unsigned long i = get_unaligned((unsigned long *)&mk_spec->u);
+
+ return &keyring->key_hashtable[i % ARRAY_SIZE(keyring->key_hashtable)];
+}
+
/*
- * Find the specified master key in ->s_master_keys.
- * Returns ERR_PTR(-ENOKEY) if not found.
+ * Find the specified master key struct in ->s_master_keys and take a structural
+ * ref to it. The structural ref guarantees that the key struct continues to
+ * exist, but it does *not* guarantee that ->s_master_keys continues to contain
+ * the key struct. The structural ref needs to be dropped by
+ * fscrypt_put_master_key(). Returns NULL if the key struct is not found.
*/
-struct key *fscrypt_find_master_key(struct super_block *sb,
- const struct fscrypt_key_specifier *mk_spec)
+struct fscrypt_master_key *
+fscrypt_find_master_key(struct super_block *sb,
+ const struct fscrypt_key_specifier *mk_spec)
{
- struct key *keyring;
- char description[FSCRYPT_MK_DESCRIPTION_SIZE];
+ struct fscrypt_keyring *keyring;
+ struct hlist_head *bucket;
+ struct fscrypt_master_key *mk;
/*
* Pairs with the smp_store_release() in allocate_filesystem_keyring().
@@ -246,10 +282,38 @@ struct key *fscrypt_find_master_key(struct super_block *sb,
*/
keyring = smp_load_acquire(&sb->s_master_keys);
if (keyring == NULL)
- return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */
-
- format_mk_description(description, mk_spec);
- return search_fscrypt_keyring(keyring, &key_type_fscrypt, description);
+ return NULL; /* No keyring yet, so no keys yet. */
+
+ bucket = fscrypt_mk_hash_bucket(keyring, mk_spec);
+ rcu_read_lock();
+ switch (mk_spec->type) {
+ case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
+ hlist_for_each_entry_rcu(mk, bucket, mk_node) {
+ if (mk->mk_spec.type ==
+ FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR &&
+ memcmp(mk->mk_spec.u.descriptor,
+ mk_spec->u.descriptor,
+ FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 &&
+ refcount_inc_not_zero(&mk->mk_struct_refs))
+ goto out;
+ }
+ break;
+ case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER:
+ hlist_for_each_entry_rcu(mk, bucket, mk_node) {
+ if (mk->mk_spec.type ==
+ FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER &&
+ memcmp(mk->mk_spec.u.identifier,
+ mk_spec->u.identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE) == 0 &&
+ refcount_inc_not_zero(&mk->mk_struct_refs))
+ goto out;
+ }
+ break;
+ }
+ mk = NULL;
+out:
+ rcu_read_unlock();
+ return mk;
}
static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk)
@@ -277,17 +341,30 @@ static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk)
static struct key *find_master_key_user(struct fscrypt_master_key *mk)
{
char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE];
+ key_ref_t keyref;
format_mk_user_description(description, mk->mk_spec.u.identifier);
- return search_fscrypt_keyring(mk->mk_users, &key_type_fscrypt_user,
- description);
+
+ /*
+ * We need to mark the keyring reference as "possessed" so that we
+ * acquire permission to search it, via the KEY_POS_SEARCH permission.
+ */
+ keyref = keyring_search(make_key_ref(mk->mk_users, true /*possessed*/),
+ &key_type_fscrypt_user, description, false);
+ if (IS_ERR(keyref)) {
+ if (PTR_ERR(keyref) == -EAGAIN || /* not found */
+ PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */
+ keyref = ERR_PTR(-ENOKEY);
+ return ERR_CAST(keyref);
+ }
+ return key_ref_to_ptr(keyref);
}
/*
* Give the current user a "key" in ->mk_users. This charges the user's quota
* and marks the master key as added by the current user, so that it cannot be
- * removed by another user with the key. Either the master key's key->sem must
- * be held for write, or the master key must be still undergoing initialization.
+ * removed by another user with the key. Either ->mk_sem must be held for
+ * write, or the master key must be still undergoing initialization.
*/
static int add_master_key_user(struct fscrypt_master_key *mk)
{
@@ -309,7 +386,7 @@ static int add_master_key_user(struct fscrypt_master_key *mk)
/*
* Remove the current user's "key" from ->mk_users.
- * The master key's key->sem must be held for write.
+ * ->mk_sem must be held for write.
*
* Returns 0 if removed, -ENOKEY if not found, or another -errno code.
*/
@@ -327,63 +404,49 @@ static int remove_master_key_user(struct fscrypt_master_key *mk)
}
/*
- * Allocate a new fscrypt_master_key which contains the given secret, set it as
- * the payload of a new 'struct key' of type fscrypt, and link the 'struct key'
- * into the given keyring. Synchronized by fscrypt_add_key_mutex.
+ * Allocate a new fscrypt_master_key, transfer the given secret over to it, and
+ * insert it into sb->s_master_keys.
*/
-static int add_new_master_key(struct fscrypt_master_key_secret *secret,
- const struct fscrypt_key_specifier *mk_spec,
- struct key *keyring)
+static int add_new_master_key(struct super_block *sb,
+ struct fscrypt_master_key_secret *secret,
+ const struct fscrypt_key_specifier *mk_spec)
{
+ struct fscrypt_keyring *keyring = sb->s_master_keys;
struct fscrypt_master_key *mk;
- char description[FSCRYPT_MK_DESCRIPTION_SIZE];
- struct key *key;
int err;
mk = kzalloc(sizeof(*mk), GFP_KERNEL);
if (!mk)
return -ENOMEM;
+ mk->mk_sb = sb;
+ init_rwsem(&mk->mk_sem);
+ refcount_set(&mk->mk_struct_refs, 1);
mk->mk_spec = *mk_spec;
- move_master_key_secret(&mk->mk_secret, secret);
-
- refcount_set(&mk->mk_refcount, 1); /* secret is present */
INIT_LIST_HEAD(&mk->mk_decrypted_inodes);
spin_lock_init(&mk->mk_decrypted_inodes_lock);
if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) {
err = allocate_master_key_users_keyring(mk);
if (err)
- goto out_free_mk;
+ goto out_put;
err = add_master_key_user(mk);
if (err)
- goto out_free_mk;
+ goto out_put;
}
- /*
- * Note that we don't charge this key to anyone's quota, since when
- * ->mk_users is in use those keys are charged instead, and otherwise
- * (when ->mk_users isn't in use) only root can add these keys.
- */
- format_mk_description(description, mk_spec);
- key = key_alloc(&key_type_fscrypt, description,
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
- KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_VIEW,
- KEY_ALLOC_NOT_IN_QUOTA, NULL);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto out_free_mk;
- }
- err = key_instantiate_and_link(key, mk, sizeof(*mk), keyring, NULL);
- key_put(key);
- if (err)
- goto out_free_mk;
+ move_master_key_secret(&mk->mk_secret, secret);
+ refcount_set(&mk->mk_active_refs, 1); /* ->mk_secret is present */
+ spin_lock(&keyring->lock);
+ hlist_add_head_rcu(&mk->mk_node,
+ fscrypt_mk_hash_bucket(keyring, mk_spec));
+ spin_unlock(&keyring->lock);
return 0;
-out_free_mk:
- free_master_key(mk);
+out_put:
+ fscrypt_put_master_key(mk);
return err;
}
@@ -392,42 +455,34 @@ out_free_mk:
static int add_existing_master_key(struct fscrypt_master_key *mk,
struct fscrypt_master_key_secret *secret)
{
- struct key *mk_user;
- bool rekey;
int err;
/*
* If the current user is already in ->mk_users, then there's nothing to
- * do. (Not applicable for v1 policy keys, which have NULL ->mk_users.)
+ * do. Otherwise, we need to add the user to ->mk_users. (Neither is
+ * applicable for v1 policy keys, which have NULL ->mk_users.)
*/
if (mk->mk_users) {
- mk_user = find_master_key_user(mk);
+ struct key *mk_user = find_master_key_user(mk);
+
if (mk_user != ERR_PTR(-ENOKEY)) {
if (IS_ERR(mk_user))
return PTR_ERR(mk_user);
key_put(mk_user);
return 0;
}
- }
-
- /* If we'll be re-adding ->mk_secret, try to take the reference. */
- rekey = !is_master_key_secret_present(&mk->mk_secret);
- if (rekey && !refcount_inc_not_zero(&mk->mk_refcount))
- return KEY_DEAD;
-
- /* Add the current user to ->mk_users, if applicable. */
- if (mk->mk_users) {
err = add_master_key_user(mk);
- if (err) {
- if (rekey && refcount_dec_and_test(&mk->mk_refcount))
- return KEY_DEAD;
+ if (err)
return err;
- }
}
/* Re-add the secret if needed. */
- if (rekey)
+ if (!is_master_key_secret_present(&mk->mk_secret)) {
+ if (!refcount_inc_not_zero(&mk->mk_active_refs))
+ return KEY_DEAD;
move_master_key_secret(&mk->mk_secret, secret);
+ }
+
return 0;
}
@@ -436,38 +491,36 @@ static int do_add_master_key(struct super_block *sb,
const struct fscrypt_key_specifier *mk_spec)
{
static DEFINE_MUTEX(fscrypt_add_key_mutex);
- struct key *key;
+ struct fscrypt_master_key *mk;
int err;
mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */
-retry:
- key = fscrypt_find_master_key(sb, mk_spec);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- if (err != -ENOKEY)
- goto out_unlock;
+
+ mk = fscrypt_find_master_key(sb, mk_spec);
+ if (!mk) {
/* Didn't find the key in ->s_master_keys. Add it. */
err = allocate_filesystem_keyring(sb);
- if (err)
- goto out_unlock;
- err = add_new_master_key(secret, mk_spec, sb->s_master_keys);
+ if (!err)
+ err = add_new_master_key(sb, secret, mk_spec);
} else {
/*
* Found the key in ->s_master_keys. Re-add the secret if
* needed, and add the user to ->mk_users if needed.
*/
- down_write(&key->sem);
- err = add_existing_master_key(key->payload.data[0], secret);
- up_write(&key->sem);
+ down_write(&mk->mk_sem);
+ err = add_existing_master_key(mk, secret);
+ up_write(&mk->mk_sem);
if (err == KEY_DEAD) {
- /* Key being removed or needs to be removed */
- key_invalidate(key);
- key_put(key);
- goto retry;
+ /*
+ * We found a key struct, but it's already been fully
+ * removed. Ignore the old struct and add a new one.
+ * fscrypt_add_key_mutex means we don't need to worry
+ * about concurrent adds.
+ */
+ err = add_new_master_key(sb, secret, mk_spec);
}
- key_put(key);
+ fscrypt_put_master_key(mk);
}
-out_unlock:
mutex_unlock(&fscrypt_add_key_mutex);
return err;
}
@@ -688,28 +741,68 @@ out_wipe_secret:
}
EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key);
-/*
- * Add the key for '-o test_dummy_encryption' to the filesystem keyring.
- *
- * Use a per-boot random key to prevent people from misusing this option.
- */
-int fscrypt_add_test_dummy_key(struct super_block *sb,
- struct fscrypt_key_specifier *key_spec)
+static void
+fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret)
{
static u8 test_key[FSCRYPT_MAX_KEY_SIZE];
+
+ get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE);
+
+ memset(secret, 0, sizeof(*secret));
+ secret->size = FSCRYPT_MAX_KEY_SIZE;
+ memcpy(secret->raw, test_key, FSCRYPT_MAX_KEY_SIZE);
+}
+
+int fscrypt_get_test_dummy_key_identifier(
+ u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
+{
struct fscrypt_master_key_secret secret;
int err;
- get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE);
+ fscrypt_get_test_dummy_secret(&secret);
- memset(&secret, 0, sizeof(secret));
- secret.size = FSCRYPT_MAX_KEY_SIZE;
- memcpy(secret.raw, test_key, FSCRYPT_MAX_KEY_SIZE);
+ err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size);
+ if (err)
+ goto out;
+ err = fscrypt_hkdf_expand(&secret.hkdf, HKDF_CONTEXT_KEY_IDENTIFIER,
+ NULL, 0, key_identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE);
+out:
+ wipe_master_key_secret(&secret);
+ return err;
+}
+
+/**
+ * fscrypt_add_test_dummy_key() - add the test dummy encryption key
+ * @sb: the filesystem instance to add the key to
+ * @dummy_policy: the encryption policy for test_dummy_encryption
+ *
+ * If needed, add the key for the test_dummy_encryption mount option to the
+ * filesystem. To prevent misuse of this mount option, a per-boot random key is
+ * used instead of a hardcoded one. This makes it so that any encrypted files
+ * created using this option won't be accessible after a reboot.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fscrypt_add_test_dummy_key(struct super_block *sb,
+ const struct fscrypt_dummy_policy *dummy_policy)
+{
+ const union fscrypt_policy *policy = dummy_policy->policy;
+ struct fscrypt_key_specifier key_spec;
+ struct fscrypt_master_key_secret secret;
+ int err;
- err = add_master_key(sb, &secret, key_spec);
+ if (!policy)
+ return 0;
+ err = fscrypt_policy_to_key_spec(policy, &key_spec);
+ if (err)
+ return err;
+ fscrypt_get_test_dummy_secret(&secret);
+ err = add_master_key(sb, &secret, &key_spec);
wipe_master_key_secret(&secret);
return err;
}
+EXPORT_SYMBOL_GPL(fscrypt_add_test_dummy_key);
/*
* Verify that the current user has added a master key with the given identifier
@@ -731,19 +824,19 @@ int fscrypt_verify_key_added(struct super_block *sb,
const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
{
struct fscrypt_key_specifier mk_spec;
- struct key *key, *mk_user;
struct fscrypt_master_key *mk;
+ struct key *mk_user;
int err;
mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE);
- key = fscrypt_find_master_key(sb, &mk_spec);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
+ mk = fscrypt_find_master_key(sb, &mk_spec);
+ if (!mk) {
+ err = -ENOKEY;
goto out;
}
- mk = key->payload.data[0];
+ down_read(&mk->mk_sem);
mk_user = find_master_key_user(mk);
if (IS_ERR(mk_user)) {
err = PTR_ERR(mk_user);
@@ -751,7 +844,8 @@ int fscrypt_verify_key_added(struct super_block *sb,
key_put(mk_user);
err = 0;
}
- key_put(key);
+ up_read(&mk->mk_sem);
+ fscrypt_put_master_key(mk);
out:
if (err == -ENOKEY && capable(CAP_FOWNER))
err = 0;
@@ -913,11 +1007,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
struct super_block *sb = file_inode(filp)->i_sb;
struct fscrypt_remove_key_arg __user *uarg = _uarg;
struct fscrypt_remove_key_arg arg;
- struct key *key;
struct fscrypt_master_key *mk;
u32 status_flags = 0;
int err;
- bool dead;
+ bool inodes_remain;
if (copy_from_user(&arg, uarg, sizeof(arg)))
return -EFAULT;
@@ -937,12 +1030,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
return -EACCES;
/* Find the key being removed. */
- key = fscrypt_find_master_key(sb, &arg.key_spec);
- if (IS_ERR(key))
- return PTR_ERR(key);
- mk = key->payload.data[0];
-
- down_write(&key->sem);
+ mk = fscrypt_find_master_key(sb, &arg.key_spec);
+ if (!mk)
+ return -ENOKEY;
+ down_write(&mk->mk_sem);
/* If relevant, remove current user's (or all users') claim to the key */
if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) {
@@ -951,7 +1042,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
else
err = remove_master_key_user(mk);
if (err) {
- up_write(&key->sem);
+ up_write(&mk->mk_sem);
goto out_put_key;
}
if (mk->mk_users->keys.nr_leaves_on_tree != 0) {
@@ -963,26 +1054,22 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
status_flags |=
FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS;
err = 0;
- up_write(&key->sem);
+ up_write(&mk->mk_sem);
goto out_put_key;
}
}
/* No user claims remaining. Go ahead and wipe the secret. */
- dead = false;
+ err = -ENOKEY;
if (is_master_key_secret_present(&mk->mk_secret)) {
wipe_master_key_secret(&mk->mk_secret);
- dead = refcount_dec_and_test(&mk->mk_refcount);
- }
- up_write(&key->sem);
- if (dead) {
- /*
- * No inodes reference the key, and we wiped the secret, so the
- * key object is free to be removed from the keyring.
- */
- key_invalidate(key);
+ fscrypt_put_master_key_activeref(mk);
err = 0;
- } else {
+ }
+ inodes_remain = refcount_read(&mk->mk_active_refs) > 0;
+ up_write(&mk->mk_sem);
+
+ if (inodes_remain) {
/* Some inodes still reference this key; try to evict them. */
err = try_to_lock_encrypted_files(sb, mk);
if (err == -EBUSY) {
@@ -998,7 +1085,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
* has been fully removed including all files locked.
*/
out_put_key:
- key_put(key);
+ fscrypt_put_master_key(mk);
if (err == 0)
err = put_user(status_flags, &uarg->removal_status_flags);
return err;
@@ -1045,7 +1132,6 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
{
struct super_block *sb = file_inode(filp)->i_sb;
struct fscrypt_get_key_status_arg arg;
- struct key *key;
struct fscrypt_master_key *mk;
int err;
@@ -1062,19 +1148,18 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
arg.user_count = 0;
memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved));
- key = fscrypt_find_master_key(sb, &arg.key_spec);
- if (IS_ERR(key)) {
- if (key != ERR_PTR(-ENOKEY))
- return PTR_ERR(key);
+ mk = fscrypt_find_master_key(sb, &arg.key_spec);
+ if (!mk) {
arg.status = FSCRYPT_KEY_STATUS_ABSENT;
err = 0;
goto out;
}
- mk = key->payload.data[0];
- down_read(&key->sem);
+ down_read(&mk->mk_sem);
if (!is_master_key_secret_present(&mk->mk_secret)) {
- arg.status = FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED;
+ arg.status = refcount_read(&mk->mk_active_refs) > 0 ?
+ FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED :
+ FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */;
err = 0;
goto out_release_key;
}
@@ -1096,8 +1181,8 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
}
err = 0;
out_release_key:
- up_read(&key->sem);
- key_put(key);
+ up_read(&mk->mk_sem);
+ fscrypt_put_master_key(mk);
out:
if (!err && copy_to_user(uarg, &arg, sizeof(arg)))
err = -EFAULT;
@@ -1109,13 +1194,9 @@ int __init fscrypt_init_keyring(void)
{
int err;
- err = register_key_type(&key_type_fscrypt);
- if (err)
- return err;
-
err = register_key_type(&key_type_fscrypt_user);
if (err)
- goto err_unregister_fscrypt;
+ return err;
err = register_key_type(&key_type_fscrypt_provisioning);
if (err)
@@ -1125,7 +1206,5 @@ int __init fscrypt_init_keyring(void)
err_unregister_fscrypt_user:
unregister_key_type(&key_type_fscrypt_user);
-err_unregister_fscrypt:
- unregister_key_type(&key_type_fscrypt);
return err;
}
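For context, the hash-table keyring reworked above is still populated through the same ioctl interface. A hedged userspace sketch of adding a v2 key, with error handling trimmed:

#include <sys/ioctl.h>
#include <linux/fscrypt.h>
#include <stdlib.h>
#include <string.h>

/* Add a 64-byte raw master key; the kernel derives the key identifier. */
static int add_fscrypt_key(int mnt_fd, const unsigned char raw[64])
{
	struct fscrypt_add_key_arg *arg;
	int ret;

	arg = calloc(1, sizeof(*arg) + 64);
	if (!arg)
		return -1;
	arg->key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
	arg->raw_size = 64;
	memcpy(arg->raw, raw, 64);
	ret = ioctl(mnt_fd, FS_IOC_ADD_ENCRYPTION_KEY, arg);
	/* On success, arg->key_spec.u.identifier holds the new identifier. */
	free(arg);
	return ret;
}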
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index eede186b04ce..f7407071a952 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -9,7 +9,6 @@
*/
#include <crypto/skcipher.h>
-#include <linux/key.h>
#include <linux/random.h>
#include "fscrypt_private.h"
@@ -53,6 +52,13 @@ struct fscrypt_mode fscrypt_modes[] = {
.ivsize = 32,
.blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM,
},
+ [FSCRYPT_MODE_AES_256_HCTR2] = {
+ .friendly_name = "AES-256-HCTR2",
+ .cipher_str = "hctr2(aes)",
+ .keysize = 32,
+ .security_strength = 32,
+ .ivsize = 32,
+ },
};
static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex);
@@ -94,7 +100,7 @@ fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
mode->cipher_str, PTR_ERR(tfm));
return tfm;
}
- if (!xchg(&mode->logged_impl_name, 1)) {
+ if (!xchg(&mode->logged_cryptoapi_impl, 1)) {
/*
* fscrypt performance can vary greatly depending on which
* crypto algorithm implementation is used. Help people debug
@@ -148,10 +154,12 @@ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
}
/* Destroy a crypto transform object and/or blk-crypto key. */
-void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key)
+void fscrypt_destroy_prepared_key(struct super_block *sb,
+ struct fscrypt_prepared_key *prep_key)
{
crypto_free_skcipher(prep_key->tfm);
- fscrypt_destroy_inline_crypt_key(prep_key);
+ fscrypt_destroy_inline_crypt_key(sb, prep_key);
+ memzero_explicit(prep_key, sizeof(*prep_key));
}
/* Given a per-file encryption key, set up the file's crypto transform object */
@@ -405,49 +413,32 @@ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
/*
* Find the master key, then set up the inode's actual encryption key.
*
- * If the master key is found in the filesystem-level keyring, then the
- * corresponding 'struct key' is returned in *master_key_ret with its semaphore
- * read-locked. This is needed to ensure that only one task links the
- * fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race to create
- * an fscrypt_info for the same inode), and to synchronize the master key being
- * removed with a new inode starting to use it.
+ * If the master key is found in the filesystem-level keyring, then it is
+ * returned in *mk_ret with its semaphore read-locked. This is needed to ensure
+ * that only one task links the fscrypt_info into ->mk_decrypted_inodes (as
+ * multiple tasks may race to create an fscrypt_info for the same inode), and to
+ * synchronize the master key being removed with a new inode starting to use it.
*/
static int setup_file_encryption_key(struct fscrypt_info *ci,
bool need_dirhash_key,
- struct key **master_key_ret)
+ struct fscrypt_master_key **mk_ret)
{
- struct key *key;
- struct fscrypt_master_key *mk = NULL;
struct fscrypt_key_specifier mk_spec;
+ struct fscrypt_master_key *mk;
int err;
err = fscrypt_select_encryption_impl(ci);
if (err)
return err;
- switch (ci->ci_policy.version) {
- case FSCRYPT_POLICY_V1:
- mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
- memcpy(mk_spec.u.descriptor,
- ci->ci_policy.v1.master_key_descriptor,
- FSCRYPT_KEY_DESCRIPTOR_SIZE);
- break;
- case FSCRYPT_POLICY_V2:
- mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
- memcpy(mk_spec.u.identifier,
- ci->ci_policy.v2.master_key_identifier,
- FSCRYPT_KEY_IDENTIFIER_SIZE);
- break;
- default:
- WARN_ON(1);
- return -EINVAL;
- }
+ err = fscrypt_policy_to_key_spec(&ci->ci_policy, &mk_spec);
+ if (err)
+ return err;
- key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec);
- if (IS_ERR(key)) {
- if (key != ERR_PTR(-ENOKEY) ||
- ci->ci_policy.version != FSCRYPT_POLICY_V1)
- return PTR_ERR(key);
+ mk = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec);
+ if (!mk) {
+ if (ci->ci_policy.version != FSCRYPT_POLICY_V1)
+ return -ENOKEY;
/*
* As a legacy fallback for v1 policies, search for the key in
@@ -457,9 +448,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
*/
return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci);
}
-
- mk = key->payload.data[0];
- down_read(&key->sem);
+ down_read(&mk->mk_sem);
/* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */
if (!is_master_key_secret_present(&mk->mk_secret)) {
@@ -487,18 +476,18 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
if (err)
goto out_release_key;
- *master_key_ret = key;
+ *mk_ret = mk;
return 0;
out_release_key:
- up_read(&key->sem);
- key_put(key);
+ up_read(&mk->mk_sem);
+ fscrypt_put_master_key(mk);
return err;
}
static void put_crypt_info(struct fscrypt_info *ci)
{
- struct key *key;
+ struct fscrypt_master_key *mk;
if (!ci)
return;
@@ -506,26 +495,21 @@ static void put_crypt_info(struct fscrypt_info *ci)
if (ci->ci_direct_key)
fscrypt_put_direct_key(ci->ci_direct_key);
else if (ci->ci_owns_key)
- fscrypt_destroy_prepared_key(&ci->ci_enc_key);
-
- key = ci->ci_master_key;
- if (key) {
- struct fscrypt_master_key *mk = key->payload.data[0];
+ fscrypt_destroy_prepared_key(ci->ci_inode->i_sb,
+ &ci->ci_enc_key);
+ mk = ci->ci_master_key;
+ if (mk) {
/*
* Remove this inode from the list of inodes that were unlocked
- * with the master key.
- *
- * In addition, if we're removing the last inode from a key that
- * already had its secret removed, invalidate the key so that it
- * gets removed from ->s_master_keys.
+ * with the master key. In addition, if we're removing the last
+ * inode from a master key struct that already had its secret
+ * removed, then complete the full removal of the struct.
*/
spin_lock(&mk->mk_decrypted_inodes_lock);
list_del(&ci->ci_master_key_link);
spin_unlock(&mk->mk_decrypted_inodes_lock);
- if (refcount_dec_and_test(&mk->mk_refcount))
- key_invalidate(key);
- key_put(key);
+ fscrypt_put_master_key_activeref(mk);
}
memzero_explicit(ci, sizeof(*ci));
kmem_cache_free(fscrypt_info_cachep, ci);
@@ -539,7 +523,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
{
struct fscrypt_info *crypt_info;
struct fscrypt_mode *mode;
- struct key *master_key = NULL;
+ struct fscrypt_master_key *mk = NULL;
int res;
res = fscrypt_initialize(inode->i_sb->s_cop->flags);
@@ -562,8 +546,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
crypt_info->ci_mode = mode;
- res = setup_file_encryption_key(crypt_info, need_dirhash_key,
- &master_key);
+ res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk);
if (res)
goto out;
@@ -578,12 +561,9 @@ fscrypt_setup_encryption_info(struct inode *inode,
* We won the race and set ->i_crypt_info to our crypt_info.
* Now link it into the master key's inode list.
*/
- if (master_key) {
- struct fscrypt_master_key *mk =
- master_key->payload.data[0];
-
- refcount_inc(&mk->mk_refcount);
- crypt_info->ci_master_key = key_get(master_key);
+ if (mk) {
+ crypt_info->ci_master_key = mk;
+ refcount_inc(&mk->mk_active_refs);
spin_lock(&mk->mk_decrypted_inodes_lock);
list_add(&crypt_info->ci_master_key_link,
&mk->mk_decrypted_inodes);
@@ -593,9 +573,9 @@ fscrypt_setup_encryption_info(struct inode *inode,
}
res = 0;
out:
- if (master_key) {
- up_read(&master_key->sem);
- key_put(master_key);
+ if (mk) {
+ up_read(&mk->mk_sem);
+ fscrypt_put_master_key(mk);
}
put_crypt_info(crypt_info);
return res;
@@ -760,7 +740,6 @@ EXPORT_SYMBOL(fscrypt_free_inode);
int fscrypt_drop_inode(struct inode *inode)
{
const struct fscrypt_info *ci = fscrypt_get_info(inode);
- const struct fscrypt_master_key *mk;
/*
* If ci is NULL, then the inode doesn't have an encryption key set up
@@ -770,7 +749,6 @@ int fscrypt_drop_inode(struct inode *inode)
*/
if (!ci || !ci->ci_master_key)
return 0;
- mk = ci->ci_master_key->payload.data[0];
/*
* With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes
@@ -789,6 +767,6 @@ int fscrypt_drop_inode(struct inode *inode)
* then the thread removing the key will either evict the inode itself
* or will correctly detect that it wasn't evicted due to the race.
*/
- return !is_master_key_secret_present(&mk->mk_secret);
+ return !is_master_key_secret_present(&ci->ci_master_key->mk_secret);
}
EXPORT_SYMBOL_GPL(fscrypt_drop_inode);
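The AES-256-HCTR2 mode registered in fscrypt_modes[] above pairs with the fscrypt_valid_enc_modes_v2() change in policy.c below, which allows it only for filenames alongside AES-256-XTS contents. A hypothetical userspace sketch of selecting that combination:

#include <sys/ioctl.h>
#include <linux/fscrypt.h>
#include <string.h>

/* Set a v2 policy: XTS for contents, HCTR2 for filenames. */
static int set_hctr2_policy(int dir_fd,
			    const __u8 id[FSCRYPT_KEY_IDENTIFIER_SIZE])
{
	struct fscrypt_policy_v2 policy = {
		.version = FSCRYPT_POLICY_V2,
		.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS,
		.filenames_encryption_mode = FSCRYPT_MODE_AES_256_HCTR2,
		.flags = FSCRYPT_POLICY_FLAGS_PAD_32,	/* an arbitrary choice */
	};

	memcpy(policy.master_key_identifier, id,
	       FSCRYPT_KEY_IDENTIFIER_SIZE);
	return ioctl(dir_fd, FS_IOC_SET_ENCRYPTION_POLICY, &policy);
}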
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index 2762c5350432..75dabd9b27f9 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -143,6 +143,7 @@ invalid:
/* Master key referenced by DIRECT_KEY policy */
struct fscrypt_direct_key {
+ struct super_block *dk_sb;
struct hlist_node dk_node;
refcount_t dk_refcount;
const struct fscrypt_mode *dk_mode;
@@ -154,7 +155,7 @@ struct fscrypt_direct_key {
static void free_direct_key(struct fscrypt_direct_key *dk)
{
if (dk) {
- fscrypt_destroy_prepared_key(&dk->dk_key);
+ fscrypt_destroy_prepared_key(dk->dk_sb, &dk->dk_key);
kfree_sensitive(dk);
}
}
@@ -231,6 +232,7 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
dk = kzalloc(sizeof(*dk), GFP_KERNEL);
if (!dk)
return ERR_PTR(-ENOMEM);
+ dk->dk_sb = ci->ci_inode->i_sb;
refcount_set(&dk->dk_refcount, 1);
dk->dk_mode = ci->ci_mode;
err = fscrypt_prepare_key(&dk->dk_key, raw_key, ci);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index ed3d623724cd..46757c3052ef 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -10,6 +10,7 @@
* Modified by Eric Biggers, 2019 for v2 policy support.
*/
+#include <linux/fs_context.h>
#include <linux/random.h>
#include <linux/seq_file.h>
#include <linux/string.h>
@@ -32,6 +33,26 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
return !memcmp(policy1, policy2, fscrypt_policy_size(policy1));
}
+int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy,
+ struct fscrypt_key_specifier *key_spec)
+{
+ switch (policy->version) {
+ case FSCRYPT_POLICY_V1:
+ key_spec->type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
+ memcpy(key_spec->u.descriptor, policy->v1.master_key_descriptor,
+ FSCRYPT_KEY_DESCRIPTOR_SIZE);
+ return 0;
+ case FSCRYPT_POLICY_V2:
+ key_spec->type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
+ memcpy(key_spec->u.identifier, policy->v2.master_key_identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE);
+ return 0;
+ default:
+ WARN_ON(1);
+ return -EINVAL;
+ }
+}
+
static const union fscrypt_policy *
fscrypt_get_dummy_policy(struct super_block *sb)
{
@@ -40,7 +61,7 @@ fscrypt_get_dummy_policy(struct super_block *sb)
return sb->s_cop->get_dummy_policy(sb);
}
-static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode)
+static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode)
{
if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
filenames_mode == FSCRYPT_MODE_AES_256_CTS)
@@ -57,6 +78,14 @@ static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode)
return false;
}
+static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode)
+{
+ if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
+ filenames_mode == FSCRYPT_MODE_AES_256_HCTR2)
+ return true;
+ return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode);
+}
+
static bool supported_direct_key_modes(const struct inode *inode,
u32 contents_mode, u32 filenames_mode)
{
@@ -130,7 +159,7 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy,
const struct inode *inode)
{
- if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+ if (!fscrypt_valid_enc_modes_v1(policy->contents_encryption_mode,
policy->filenames_encryption_mode)) {
fscrypt_warn(inode,
"Unsupported encryption modes (contents %d, filenames %d)",
@@ -166,7 +195,7 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
{
int count = 0;
- if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+ if (!fscrypt_valid_enc_modes_v2(policy->contents_encryption_mode,
policy->filenames_encryption_mode)) {
fscrypt_warn(inode,
"Unsupported encryption modes (contents %d, filenames %d)",
@@ -665,6 +694,32 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
}
/**
+ * fscrypt_context_for_new_inode() - create an encryption context for a new inode
+ * @ctx: where context should be written
+ * @inode: inode from which to fetch policy and nonce
+ *
+ * Given an in-core "prepared" (via fscrypt_prepare_new_inode()) inode,
+ * generate a new context and write it to @ctx. @ctx _must_ be at least
+ * FSCRYPT_SET_CONTEXT_MAX_SIZE bytes.
+ *
+ * Return: size of the resulting context or a negative error code.
+ */
+int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
+{
+ struct fscrypt_info *ci = inode->i_crypt_info;
+
+ BUILD_BUG_ON(sizeof(union fscrypt_context) !=
+ FSCRYPT_SET_CONTEXT_MAX_SIZE);
+
+ /* fscrypt_prepare_new_inode() should have set up the key already. */
+ if (WARN_ON_ONCE(!ci))
+ return -ENOKEY;
+
+ return fscrypt_new_context(ctx, &ci->ci_policy, ci->ci_nonce);
+}
+EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);
+
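
The point of the split is that a filesystem can now obtain the serialized context before, or instead of, its ->set_context() hook running, e.g. to stage it in a journal or send it over the wire first. A minimal sketch, with demo_stage_context() and journal_buf as assumed names:

/* Hypothetical: stage the new inode's context before ->set_context(). */
static int demo_stage_context(struct inode *inode, u8 *journal_buf)
{
	u8 ctx[FSCRYPT_SET_CONTEXT_MAX_SIZE];
	int ctxsize;

	ctxsize = fscrypt_context_for_new_inode(ctx, inode);
	if (ctxsize < 0)
		return ctxsize;

	memcpy(journal_buf, ctx, ctxsize); /* assumed journal staging */
	return ctxsize;
}
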
+/**
* fscrypt_set_context() - Set the fscrypt context of a new inode
* @inode: a new inode
* @fs_data: private data given by FS and passed to ->set_context()
@@ -680,97 +735,62 @@ int fscrypt_set_context(struct inode *inode, void *fs_data)
union fscrypt_context ctx;
int ctxsize;
- /* fscrypt_prepare_new_inode() should have set up the key already. */
- if (WARN_ON_ONCE(!ci))
- return -ENOKEY;
-
- BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
- ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce);
+ ctxsize = fscrypt_context_for_new_inode(&ctx, inode);
+ if (ctxsize < 0)
+ return ctxsize;
/*
* This may be the first time the inode number is available, so do any
* delayed key setup that requires the inode number.
*/
if (ci->ci_policy.version == FSCRYPT_POLICY_V2 &&
- (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
- const struct fscrypt_master_key *mk =
- ci->ci_master_key->payload.data[0];
-
- fscrypt_hash_inode_number(ci, mk);
- }
+ (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
+ fscrypt_hash_inode_number(ci, ci->ci_master_key);
return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data);
}
EXPORT_SYMBOL_GPL(fscrypt_set_context);
/**
- * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption'
- * @sb: the filesystem on which test_dummy_encryption is being specified
- * @arg: the argument to the test_dummy_encryption option. May be NULL.
- * @dummy_policy: the filesystem's current dummy policy (input/output, see
- * below)
- *
- * Handle the test_dummy_encryption mount option by creating a dummy encryption
- * policy, saving it in @dummy_policy, and adding the corresponding dummy
- * encryption key to the filesystem. If the @dummy_policy is already set, then
- * instead validate that it matches @arg. Don't support changing it via
- * remount, as that is difficult to do safely.
+ * fscrypt_parse_test_dummy_encryption() - parse the test_dummy_encryption mount option
+ * @param: the mount option
+ * @dummy_policy: (input/output) the place to write the dummy policy that will
+ * result from parsing the option. Zero-initialize this. If a policy is
+ * already set here (due to test_dummy_encryption being given multiple
+ * times), then this function will verify that the policies are the same.
*
- * Return: 0 on success (dummy policy set, or the same policy is already set);
- * -EEXIST if a different dummy policy is already set;
- * or another -errno value.
+ * Return: 0 on success; -EINVAL if the argument is invalid; -EEXIST if the
+ * argument conflicts with one already specified; or -ENOMEM.
*/
-int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg,
- struct fscrypt_dummy_policy *dummy_policy)
+int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param,
+ struct fscrypt_dummy_policy *dummy_policy)
{
- struct fscrypt_key_specifier key_spec = { 0 };
- int version;
- union fscrypt_policy *policy = NULL;
+ const char *arg = "v2";
+ union fscrypt_policy *policy;
int err;
- if (!arg)
- arg = "v2";
-
- if (!strcmp(arg, "v1")) {
- version = FSCRYPT_POLICY_V1;
- key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
- memset(key_spec.u.descriptor, 0x42,
- FSCRYPT_KEY_DESCRIPTOR_SIZE);
- } else if (!strcmp(arg, "v2")) {
- version = FSCRYPT_POLICY_V2;
- key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
- /* key_spec.u.identifier gets filled in when adding the key */
- } else {
- err = -EINVAL;
- goto out;
- }
+ if (param->type == fs_value_is_string && *param->string)
+ arg = param->string;
policy = kzalloc(sizeof(*policy), GFP_KERNEL);
- if (!policy) {
- err = -ENOMEM;
- goto out;
- }
-
- err = fscrypt_add_test_dummy_key(sb, &key_spec);
- if (err)
- goto out;
+ if (!policy)
+ return -ENOMEM;
- policy->version = version;
- switch (policy->version) {
- case FSCRYPT_POLICY_V1:
+ if (!strcmp(arg, "v1")) {
+ policy->version = FSCRYPT_POLICY_V1;
policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- memcpy(policy->v1.master_key_descriptor, key_spec.u.descriptor,
+ memset(policy->v1.master_key_descriptor, 0x42,
FSCRYPT_KEY_DESCRIPTOR_SIZE);
- break;
- case FSCRYPT_POLICY_V2:
+ } else if (!strcmp(arg, "v2")) {
+ policy->version = FSCRYPT_POLICY_V2;
policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- memcpy(policy->v2.master_key_identifier, key_spec.u.identifier,
- FSCRYPT_KEY_IDENTIFIER_SIZE);
- break;
- default:
- WARN_ON(1);
+ err = fscrypt_get_test_dummy_key_identifier(
+ policy->v2.master_key_identifier);
+ if (err)
+ goto out;
+ } else {
err = -EINVAL;
goto out;
}
@@ -789,7 +809,25 @@ out:
kfree(policy);
return err;
}
-EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption);
+EXPORT_SYMBOL_GPL(fscrypt_parse_test_dummy_encryption);
+
+/**
+ * fscrypt_dummy_policies_equal() - check whether two dummy policies are equal
+ * @p1: the first test dummy policy (may be unset)
+ * @p2: the second test dummy policy (may be unset)
+ *
+ * Return: %true if the dummy policies are both set and equal, or both unset.
+ */
+bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1,
+ const struct fscrypt_dummy_policy *p2)
+{
+ if (!p1->policy && !p2->policy)
+ return true;
+ if (!p1->policy || !p2->policy)
+ return false;
+ return fscrypt_policies_equal(p1->policy, p2->policy);
+}
+EXPORT_SYMBOL_GPL(fscrypt_dummy_policies_equal);
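
Taken together with the parse helper above, a filesystem's fs_context code might use these roughly as follows (the demo_* names are hypothetical):

/* Hypothetical mount-option hook using the new parse helper. */
static int demo_parse_param(struct fs_context *fc, struct fs_parameter *param,
			    struct fscrypt_dummy_policy *dummy)
{
	if (strcmp(param->key, "test_dummy_encryption") != 0)
		return -ENOPARAM;
	return fscrypt_parse_test_dummy_encryption(param, dummy);
}

/* On reconfigure/remount, reject any change of the dummy policy. */
static int demo_reconfigure(const struct fscrypt_dummy_policy *old,
			    const struct fscrypt_dummy_policy *new)
{
	return fscrypt_dummy_policies_equal(old, new) ? 0 : -EINVAL;
}
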
/**
* fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption'
diff --git a/fs/d_path.c b/fs/d_path.c
index e4e0ebad1f15..56a6ee4c6331 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -34,7 +34,7 @@ static bool prepend_char(struct prepend_buffer *p, unsigned char c)
}
/*
- * The source of the prepend data can be an optimistoc load
+ * The source of the prepend data can be an optimistic load
* of a dentry name and length. And because we don't hold any
* locks, the length and the pointer to the name may not be
* in sync if a concurrent rename happens, and the kernel
@@ -297,8 +297,7 @@ EXPORT_SYMBOL(d_path);
/*
* Helper function for dentry_operations.d_dname() members
*/
-char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
- const char *fmt, ...)
+char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...)
{
va_list args;
char temp[64];
diff --git a/fs/dax.c b/fs/dax.c
index 67a08a32fccb..1c6867810cbd 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,6 +24,7 @@
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
+#include <linux/rmap.h>
#include <asm/pgalloc.h>
#define CREATE_TRACE_POINTS
@@ -333,13 +334,35 @@ static unsigned long dax_end_pfn(void *entry)
for (pfn = dax_to_pfn(entry); \
pfn < dax_end_pfn(entry); pfn++)
+static inline bool dax_mapping_is_cow(struct address_space *mapping)
+{
+ return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
+}
+
+/*
+ * Set page->mapping to the PAGE_MAPPING_DAX_COW flag and increase the refcount.
+ */
+static inline void dax_mapping_set_cow(struct page *page)
+{
+ if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
+ /*
+ * Reset the index if the page was already mapped
+ * regularly before.
+ */
+ if (page->mapping)
+ page->index = 1;
+ page->mapping = (void *)PAGE_MAPPING_DAX_COW;
+ }
+ page->index++;
+}
+
/*
- * TODO: for reflink+dax we need a way to associate a single page with
- * multiple address_space instances at different linear_page_index()
- * offsets.
+ * When called from dax_insert_entry(), the cow flag indicates whether this
+ * entry is shared by multiple files. If so, set page->mapping to
+ * PAGE_MAPPING_DAX_COW, and use page->index as the refcount.
*/
static void dax_associate_entry(void *entry, struct address_space *mapping,
- struct vm_area_struct *vma, unsigned long address)
+ struct vm_area_struct *vma, unsigned long address, bool cow)
{
unsigned long size = dax_entry_size(entry), pfn, index;
int i = 0;
@@ -351,9 +374,13 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
- WARN_ON_ONCE(page->mapping);
- page->mapping = mapping;
- page->index = index + i++;
+ if (cow) {
+ dax_mapping_set_cow(page);
+ } else {
+ WARN_ON_ONCE(page->mapping);
+ page->mapping = mapping;
+ page->index = index + i++;
+ }
}
}
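
An illustrative trace of the share accounting these helpers implement, following the code as written (note the post-decrement in dax_disassociate_entry() in the next hunk):

/* Trace for one fresh DAX page shared by two files (illustrative):
 *
 *	dax_mapping_set_cow()     mapping = PAGE_MAPPING_DAX_COW, index 0 -> 1
 *	dax_mapping_set_cow()     mapping unchanged,              index 1 -> 2
 *	dax_disassociate_entry()  index-- (2 -> 1) was > 0, CoW flag kept
 *	dax_disassociate_entry()  index-- (1 -> 0) was > 0, CoW flag kept
 *	dax_disassociate_entry()  index-- was 0: mapping and index cleared
 */
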
@@ -369,7 +396,12 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
struct page *page = pfn_to_page(pfn);
WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
- WARN_ON_ONCE(page->mapping && page->mapping != mapping);
+ if (dax_mapping_is_cow(page->mapping)) {
+ /* keep the CoW flag if this page is still shared */
+ if (page->index-- > 0)
+ continue;
+ } else
+ WARN_ON_ONCE(page->mapping && page->mapping != mapping);
page->mapping = NULL;
page->index = 0;
}
@@ -455,6 +487,69 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie)
}
/*
+ * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
+ * @mapping: the file's mapping whose entry we want to lock
+ * @index: the offset within this file
+ * @page: output the dax page corresponding to this dax entry
+ *
+ * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
+ * could not be locked.
+ */
+dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
+ struct page **page)
+{
+ XA_STATE(xas, NULL, 0);
+ void *entry;
+
+ rcu_read_lock();
+ for (;;) {
+ entry = NULL;
+ if (!dax_mapping(mapping))
+ break;
+
+ xas.xa = &mapping->i_pages;
+ xas_lock_irq(&xas);
+ xas_set(&xas, index);
+ entry = xas_load(&xas);
+ if (dax_is_locked(entry)) {
+ rcu_read_unlock();
+ wait_entry_unlocked(&xas, entry);
+ rcu_read_lock();
+ continue;
+ }
+ if (!entry ||
+ dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ /*
+			 * Because we look the entry up via the file's mapping
+			 * and index, it may not have been inserted yet, or it
+			 * may be a zero/empty entry. We don't treat that as an
+			 * error, so return a special value and do not output
+			 * @page.
+ */
+ entry = (void *)~0UL;
+ } else {
+ *page = pfn_to_page(dax_to_pfn(entry));
+ dax_lock_entry(&xas, entry);
+ }
+ xas_unlock_irq(&xas);
+ break;
+ }
+ rcu_read_unlock();
+ return (dax_entry_t)entry;
+}
+
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
+ dax_entry_t cookie)
+{
+ XA_STATE(xas, &mapping->i_pages, index);
+
+ if (cookie == ~0UL)
+ return;
+
+ dax_unlock_entry(&xas, (void *)cookie);
+}
+
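
A hypothetical caller of the new pair, mirroring how dax_lock_page() is used by the memory-failure path (demo_probe_entry() and its policy are illustrative):

/* Hypothetical: pin and inspect the DAX entry backing (mapping, index). */
static int demo_probe_entry(struct address_space *mapping, pgoff_t index)
{
	struct page *page = NULL;
	dax_entry_t cookie;

	cookie = dax_lock_mapping_entry(mapping, index, &page);
	if (!cookie)
		return -EBUSY;	/* not a DAX mapping */
	if (page)
		pr_info("backing pfn %lx\n", page_to_pfn(page));
	/* cookie may be the special ~0UL value; unlock handles that. */
	dax_unlock_mapping_entry(mapping, index, cookie);
	return 0;
}
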
+/*
* Find page cache entry at given index. If it is a DAX entry, return it
* with the entry locked. If the page cache doesn't contain an entry at
* that index, add a locked empty entry.
@@ -721,7 +816,8 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter
int id;
id = dax_read_lock();
- rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL);
+ rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
+ &kaddr, NULL);
if (rc < 0) {
dax_read_unlock(id);
return rc;
@@ -734,22 +830,42 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter
}
/*
+ * MAP_SYNC on a dax mapping guarantees dirty metadata is
+ * flushed on write-faults (non-cow), but not read-faults.
+ */
+static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
+ struct vm_area_struct *vma)
+{
+ return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
+ (iter->iomap.flags & IOMAP_F_DIRTY);
+}
+
+static bool dax_fault_is_cow(const struct iomap_iter *iter)
+{
+ return (iter->flags & IOMAP_WRITE) &&
+ (iter->iomap.flags & IOMAP_F_SHARED);
+}
+
+/*
* By this point grab_mapping_entry() has ensured that we have a locked entry
* of the appropriate size so we don't have to worry about downgrading PMDs to
* PTEs. If we happen to be trying to insert a PTE and there is a PMD
* already in the tree, we will skip the insertion and just dirty the PMD as
* appropriate.
*/
-static void *dax_insert_entry(struct xa_state *xas,
- struct address_space *mapping, struct vm_fault *vmf,
- void *entry, pfn_t pfn, unsigned long flags, bool dirty)
+static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
+ const struct iomap_iter *iter, void *entry, pfn_t pfn,
+ unsigned long flags)
{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
void *new_entry = dax_make_entry(pfn, flags);
+ bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
+ bool cow = dax_fault_is_cow(iter);
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+ if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
@@ -761,11 +877,12 @@ static void *dax_insert_entry(struct xa_state *xas,
xas_reset(xas);
xas_lock_irq(xas);
- if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
void *old;
dax_disassociate_entry(entry, mapping, false);
- dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
+ dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
+ cow);
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
@@ -785,99 +902,19 @@ static void *dax_insert_entry(struct xa_state *xas,
if (dirty)
xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
+ if (cow)
+ xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
+
xas_unlock_irq(xas);
return entry;
}
-static inline
-unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
-{
- unsigned long address;
-
- address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
- return address;
-}
-
-/* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
- unsigned long pfn)
-{
- struct vm_area_struct *vma;
- pte_t pte, *ptep = NULL;
- pmd_t *pmdp = NULL;
- spinlock_t *ptl;
-
- i_mmap_lock_read(mapping);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
- struct mmu_notifier_range range;
- unsigned long address;
-
- cond_resched();
-
- if (!(vma->vm_flags & VM_SHARED))
- continue;
-
- address = pgoff_address(index, vma);
-
- /*
- * follow_invalidate_pte() will use the range to call
- * mmu_notifier_invalidate_range_start() on our behalf before
- * taking any lock.
- */
- if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
- &pmdp, &ptl))
- continue;
-
- /*
- * No need to call mmu_notifier_invalidate_range() as we are
- * downgrading page table protection not changing it to point
- * to a new page.
- *
- * See Documentation/vm/mmu_notifier.rst
- */
- if (pmdp) {
-#ifdef CONFIG_FS_DAX_PMD
- pmd_t pmd;
-
- if (pfn != pmd_pfn(*pmdp))
- goto unlock_pmd;
- if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
- goto unlock_pmd;
-
- flush_cache_page(vma, address, pfn);
- pmd = pmdp_invalidate(vma, address, pmdp);
- pmd = pmd_wrprotect(pmd);
- pmd = pmd_mkclean(pmd);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-unlock_pmd:
-#endif
- spin_unlock(ptl);
- } else {
- if (pfn != pte_pfn(*ptep))
- goto unlock_pte;
- if (!pte_dirty(*ptep) && !pte_write(*ptep))
- goto unlock_pte;
-
- flush_cache_page(vma, address, pfn);
- pte = ptep_clear_flush(vma, address, ptep);
- pte = pte_wrprotect(pte);
- pte = pte_mkclean(pte);
- set_pte_at(vma->vm_mm, address, ptep, pte);
-unlock_pte:
- pte_unmap_unlock(ptep, ptl);
- }
-
- mmu_notifier_invalidate_range_end(&range);
- }
- i_mmap_unlock_read(mapping);
-}
-
static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
struct address_space *mapping, void *entry)
{
- unsigned long pfn, index, count;
+ unsigned long pfn, index, count, end;
long ret = 0;
+ struct vm_area_struct *vma;
/*
* A page got tagged dirty in DAX mapping? Something is seriously
@@ -935,8 +972,16 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
pfn = dax_to_pfn(entry);
count = 1UL << dax_entry_order(entry);
index = xas->xa_index & ~(count - 1);
+ end = index + count - 1;
+
+ /* Walk all mappings of a given index of a file and writeprotect them */
+ i_mmap_lock_read(mapping);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
+ pfn_mkclean_range(pfn, count, index, vma);
+ cond_resched();
+ }
+ i_mmap_unlock_read(mapping);
- dax_entry_mkclean(mapping, index, pfn);
dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
/*
* After we have flushed the cache, we can clear the dirty tag. There
@@ -1004,20 +1049,22 @@ int dax_writeback_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
-static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
- pfn_t *pfnp)
+static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
+ size_t size, void **kaddr, pfn_t *pfnp)
{
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
- int id, rc;
+ int id, rc = 0;
long length;
id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
- NULL, pfnp);
+ DAX_ACCESS, kaddr, pfnp);
if (length < 0) {
rc = length;
goto out;
}
+ if (!pfnp)
+ goto out_check_addr;
rc = -EINVAL;
if (PFN_PHYS(length) < size)
goto out;
@@ -1027,11 +1074,71 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
if (length > 1 && !pfn_t_devmap(*pfnp))
goto out;
rc = 0;
+
+out_check_addr:
+ if (!kaddr)
+ goto out;
+ if (!*kaddr)
+ rc = -EFAULT;
out:
dax_read_unlock(id);
return rc;
}
+/**
+ * dax_iomap_cow_copy - Copy the data from source to destination before write
+ * @pos: address to do copy from.
+ * @length: size of copy operation.
+ * @align_size: alignment granularity of the copy (either PMD_SIZE or PAGE_SIZE)
+ * @srcmap: iomap srcmap
+ * @daddr: destination address to copy to.
+ *
+ * This can be called from two places: during a DAX write fault (page
+ * aligned), to copy @length bytes of data to @daddr; or during a normal DAX
+ * write, where dax_iomap_iter() calls it to copy an unaligned head or tail
+ * of the range. In the latter case, dax_iomap_iter() itself takes care of
+ * copying the aligned parts of the range.
+ */
+static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
+ const struct iomap *srcmap, void *daddr)
+{
+ loff_t head_off = pos & (align_size - 1);
+ size_t size = ALIGN(head_off + length, align_size);
+ loff_t end = pos + length;
+ loff_t pg_end = round_up(end, align_size);
+ bool copy_all = head_off == 0 && end == pg_end;
+	void *saddr = NULL;
+ int ret = 0;
+
+ ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
+ if (ret)
+ return ret;
+
+ if (copy_all) {
+ ret = copy_mc_to_kernel(daddr, saddr, length);
+ return ret ? -EIO : 0;
+ }
+
+ /* Copy the head part of the range */
+ if (head_off) {
+ ret = copy_mc_to_kernel(daddr, saddr, head_off);
+ if (ret)
+ return -EIO;
+ }
+
+ /* Copy the tail part of the range */
+ if (end < pg_end) {
+ loff_t tail_off = head_off + length;
+ loff_t tail_len = pg_end - end;
+
+ ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
+ tail_len);
+ if (ret)
+ return -EIO;
+ }
+ return 0;
+}
+
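
A worked instance of the head/tail arithmetic above, with illustrative values:

/* Example: pos = 0x2600, length = 0x400, align_size = PAGE_SIZE (0x1000):
 *
 *	head_off = pos & (align_size - 1)          = 0x600
 *	size     = ALIGN(head_off + length, align) = 0x1000
 *	end      = pos + length                    = 0x2a00
 *	pg_end   = round_up(end, align_size)       = 0x3000
 *
 * copy_all is false, so 0x600 head bytes and 0x600 tail bytes are copied
 * from the source mapping around the 0x400 bytes the caller will write.
 */
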
/*
* The user has performed a load from a hole in the file. Allocating a new
* page in the file would cause excessive storage usage for workloads with
@@ -1039,17 +1146,15 @@ out:
* If this page is ever written to we will re-fault and change the mapping to
* point to real DAX storage instead.
*/
-static vm_fault_t dax_load_hole(struct xa_state *xas,
- struct address_space *mapping, void **entry,
- struct vm_fault *vmf)
+static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+ const struct iomap_iter *iter, void **entry)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = iter->inode;
unsigned long vaddr = vmf->address;
pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
vm_fault_t ret;
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_ZERO_PAGE, false);
+ *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
trace_dax_load_hole(inode, vmf, ret);
@@ -1058,7 +1163,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
#ifdef CONFIG_FS_DAX_PMD
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
- const struct iomap *iomap, void **entry)
+ const struct iomap_iter *iter, void **entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -1076,8 +1181,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
goto fallback;
pfn = page_to_pfn_t(zero_page);
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_PMD | DAX_ZERO_PAGE, false);
+ *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
+ DAX_PMD | DAX_ZERO_PAGE);
if (arch_needs_pgtable_deposit()) {
pgtable = pte_alloc_one(vma->vm_mm);
@@ -1110,23 +1215,34 @@ fallback:
}
#else
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
- const struct iomap *iomap, void **entry)
+ const struct iomap_iter *iter, void **entry)
{
return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */
-static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff,
- unsigned int offset, size_t size)
+static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
+ const struct iomap *iomap = &iter->iomap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ unsigned offset = offset_in_page(pos);
+ pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
void *kaddr;
long ret;
- ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
- if (ret > 0) {
- memset(kaddr + offset, 0, size);
- dax_flush(dax_dev, kaddr + offset, size);
- }
+ ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
+ NULL);
+ if (ret < 0)
+ return ret;
+ memset(kaddr + offset, 0, size);
+ if (srcmap->addr != iomap->addr) {
+ ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap,
+ kaddr);
+ if (ret < 0)
+ return ret;
+ dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE);
+ } else
+ dax_flush(iomap->dax_dev, kaddr + offset, size);
return ret;
}
@@ -1153,7 +1269,7 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
else
- rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
+ rc = dax_memzero(iter, pos, size);
dax_read_unlock(id);
if (rc < 0)
@@ -1161,10 +1277,10 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
pos += size;
length -= size;
written += size;
- if (did_zero)
- *did_zero = true;
} while (length > 0);
+ if (did_zero)
+ *did_zero = true;
return written;
}
@@ -1202,15 +1318,17 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
struct iov_iter *iter)
{
const struct iomap *iomap = &iomi->iomap;
+ const struct iomap *srcmap = &iomi->srcmap;
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
struct dax_device *dax_dev = iomap->dax_dev;
loff_t end = pos + length, done = 0;
+ bool write = iov_iter_rw(iter) == WRITE;
ssize_t ret = 0;
size_t xfer;
int id;
- if (iov_iter_rw(iter) == READ) {
+ if (!write) {
end = min(end, i_size_read(iomi->inode));
if (pos >= end)
return 0;
@@ -1219,7 +1337,12 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
return iov_iter_zero(min(length, end - pos), iter);
}
- if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+ /*
+ * In DAX mode, enforce either pure overwrites of written extents, or
+ * writes to unwritten extents as part of a copy-on-write operation.
+ */
+ if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
+ !(iomap->flags & IOMAP_F_SHARED)))
return -EIO;
/*
@@ -1239,6 +1362,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
const size_t size = ALIGN(length + offset, PAGE_SIZE);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
ssize_t map_len;
+ bool recovery = false;
void *kaddr;
if (fatal_signal_pending(current)) {
@@ -1247,19 +1371,37 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
}
map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
- &kaddr, NULL);
+ DAX_ACCESS, &kaddr, NULL);
+ if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
+ map_len = dax_direct_access(dax_dev, pgoff,
+ PHYS_PFN(size), DAX_RECOVERY_WRITE,
+ &kaddr, NULL);
+ if (map_len > 0)
+ recovery = true;
+ }
if (map_len < 0) {
ret = map_len;
break;
}
+ if (write &&
+ srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
+ ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
+ kaddr);
+ if (ret)
+ break;
+ }
+
map_len = PFN_PHYS(map_len);
kaddr += offset;
map_len -= offset;
if (map_len > end - pos)
map_len = end - pos;
- if (iov_iter_rw(iter) == WRITE)
+ if (recovery)
+ xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
+ map_len, iter);
+ else if (write)
xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else
@@ -1303,6 +1445,9 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
loff_t done = 0;
int ret;
+ if (!iomi.len)
+ return 0;
+
if (iov_iter_rw(iter) == WRITE) {
lockdep_assert_held_write(&iomi.inode->i_rwsem);
iomi.flags |= IOMAP_WRITE;
@@ -1330,17 +1475,6 @@ static vm_fault_t dax_fault_return(int error)
}
/*
- * MAP_SYNC on a dax mapping guarantees dirty metadata is
- * flushed on write-faults (non-cow), but not read-faults.
- */
-static bool dax_fault_is_synchronous(unsigned long flags,
- struct vm_area_struct *vma, const struct iomap *iomap)
-{
- return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
- && (iomap->flags & IOMAP_F_DIRTY);
-}
-
-/*
 * When handling a synchronous page fault and the inode needs an fsync, we can
* insert the PTE/PMD into page tables only after that fsync happened. Skip
* insertion for now and return the pfn so that caller can insert it after the
@@ -1397,15 +1531,15 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
const struct iomap_iter *iter, pfn_t *pfnp,
struct xa_state *xas, void **entry, bool pmd)
{
- struct address_space *mapping = vmf->vma->vm_file->f_mapping;
const struct iomap *iomap = &iter->iomap;
+ const struct iomap *srcmap = &iter->srcmap;
size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
- bool write = vmf->flags & FAULT_FLAG_WRITE;
- bool sync = dax_fault_is_synchronous(iter->flags, vmf->vma, iomap);
+ bool write = iter->flags & IOMAP_WRITE;
unsigned long entry_flags = pmd ? DAX_PMD : 0;
int err = 0;
pfn_t pfn;
+ void *kaddr;
if (!pmd && vmf->cow_page)
return dax_fault_cow_page(vmf, iter);
@@ -1414,23 +1548,29 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
if (!write &&
(iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
if (!pmd)
- return dax_load_hole(xas, mapping, entry, vmf);
- return dax_pmd_load_hole(xas, vmf, iomap, entry);
+ return dax_load_hole(xas, vmf, iter, entry);
+ return dax_pmd_load_hole(xas, vmf, iter, entry);
}
- if (iomap->type != IOMAP_MAPPED) {
+ if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
WARN_ON_ONCE(1);
return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
}
- err = dax_iomap_pfn(&iter->iomap, pos, size, &pfn);
+ err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
if (err)
return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
- *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags,
- write && !sync);
+ *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
+
+ if (write &&
+ srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
+ err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
+ if (err)
+ return dax_fault_return(err);
+ }
- if (sync)
+ if (dax_fault_is_synchronous(iter, vmf->vma))
return dax_fault_synchronous_pfnp(pfnp, pfn);
/* insert PMD pfn */
@@ -1736,3 +1876,85 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
+
+static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
+ struct iomap_iter *it_dest, u64 len, bool *same)
+{
+ const struct iomap *smap = &it_src->iomap;
+ const struct iomap *dmap = &it_dest->iomap;
+ loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
+ void *saddr, *daddr;
+ int id, ret;
+
+ len = min(len, min(smap->length, dmap->length));
+
+ if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
+ *same = true;
+ return len;
+ }
+
+ if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
+ *same = false;
+ return 0;
+ }
+
+ id = dax_read_lock();
+ ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
+ &saddr, NULL);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
+ &daddr, NULL);
+ if (ret < 0)
+ goto out_unlock;
+
+ *same = !memcmp(saddr, daddr, len);
+ if (!*same)
+ len = 0;
+ dax_read_unlock(id);
+ return len;
+
+out_unlock:
+ dax_read_unlock(id);
+ return -EIO;
+}
+
+int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+ struct inode *dst, loff_t dstoff, loff_t len, bool *same,
+ const struct iomap_ops *ops)
+{
+ struct iomap_iter src_iter = {
+ .inode = src,
+ .pos = srcoff,
+ .len = len,
+ .flags = IOMAP_DAX,
+ };
+ struct iomap_iter dst_iter = {
+ .inode = dst,
+ .pos = dstoff,
+ .len = len,
+ .flags = IOMAP_DAX,
+ };
+ int ret;
+
+ while ((ret = iomap_iter(&src_iter, ops)) > 0) {
+ while ((ret = iomap_iter(&dst_iter, ops)) > 0) {
+ dst_iter.processed = dax_range_compare_iter(&src_iter,
+ &dst_iter, len, same);
+ }
+ if (ret <= 0)
+ src_iter.processed = ret;
+ }
+ return ret;
+}
+
+int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags,
+ const struct iomap_ops *ops)
+{
+ return __generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, ops);
+}
+EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
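
How a DAX-capable filesystem might wire these helpers into its ->remap_file_range() preparation; a sketch in which demo_iomap_ops and demo_remap_prep() are assumed names:

/* Hypothetical glue for an FS adopting the new dedupe/remap helpers. */
extern const struct iomap_ops demo_iomap_ops;	/* assumed to exist */

static int demo_remap_prep(struct file *in, loff_t pos_in,
			   struct file *out, loff_t pos_out,
			   loff_t *len, unsigned int remap_flags)
{
	if (IS_DAX(file_inode(in)))
		return dax_remap_file_range_prep(in, pos_in, out, pos_out,
						 len, remap_flags,
						 &demo_iomap_ops);
	return generic_remap_file_range_prep(in, pos_in, out, pos_out,
					     len, remap_flags);
}
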
diff --git a/fs/dcache.c b/fs/dcache.c
index 93f4f5ee07bf..bb0c4d0038db 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2240,6 +2240,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
}
res = d_splice_alias(inode, found);
if (res) {
+ d_lookup_done(found);
dput(found);
return res;
}
@@ -2247,10 +2248,16 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
}
EXPORT_SYMBOL(d_add_ci);
-
-static inline bool d_same_name(const struct dentry *dentry,
- const struct dentry *parent,
- const struct qstr *name)
+/**
+ * d_same_name - compare dentry name with case-exact name
+ * @dentry: the negative dentry that was passed to the parent's lookup func
+ * @parent: parent dentry
+ * @name: the case-exact name to be associated with the returned dentry
+ *
+ * Return: true if the names are the same, false otherwise
+ */
+bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
+ const struct qstr *name)
{
if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) {
if (dentry->d_name.len != name->len)
@@ -2261,6 +2268,49 @@ static inline bool d_same_name(const struct dentry *dentry,
dentry->d_name.len, dentry->d_name.name,
name) == 0;
}
+EXPORT_SYMBOL_GPL(d_same_name);
+
+/*
+ * This is __d_lookup_rcu() when the parent dentry has
+ * DCACHE_OP_COMPARE, which makes things much nastier.
+ */
+static noinline struct dentry *__d_lookup_rcu_op_compare(
+ const struct dentry *parent,
+ const struct qstr *name,
+ unsigned *seqp)
+{
+ u64 hashlen = name->hash_len;
+ struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
+ struct hlist_bl_node *node;
+ struct dentry *dentry;
+
+ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
+ int tlen;
+ const char *tname;
+ unsigned seq;
+
+seqretry:
+ seq = raw_seqcount_begin(&dentry->d_seq);
+ if (dentry->d_parent != parent)
+ continue;
+ if (d_unhashed(dentry))
+ continue;
+ if (dentry->d_name.hash != hashlen_hash(hashlen))
+ continue;
+ tlen = dentry->d_name.len;
+ tname = dentry->d_name.name;
+ /* we want a consistent (name,len) pair */
+ if (read_seqcount_retry(&dentry->d_seq, seq)) {
+ cpu_relax();
+ goto seqretry;
+ }
+ if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0)
+ continue;
+ *seqp = seq;
+ return dentry;
+ }
+ return NULL;
+}
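
The slow path above only runs when the parent's d_op implements ->d_compare(). For reference, a minimal comparator of the kind that triggers it; illustrative, not part of this patch:

/* Illustrative ->d_compare(): ASCII case-insensitive match. */
static int demo_ci_compare(const struct dentry *dentry, unsigned int len,
			   const char *str, const struct qstr *name)
{
	if (len != name->len)
		return 1;
	return strncasecmp(str, name->name, len);	/* 0 == match */
}
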
/**
* __d_lookup_rcu - search for a dentry (racy, store-free)
@@ -2308,6 +2358,9 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
* Keep the two functions in sync.
*/
+ if (unlikely(parent->d_flags & DCACHE_OP_COMPARE))
+ return __d_lookup_rcu_op_compare(parent, name, seqp);
+
/*
* The hash list is protected using RCU.
*
@@ -2324,7 +2377,6 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
unsigned seq;
-seqretry:
/*
* The dentry sequence count protects us from concurrent
* renames, and thus protects parent and name fields.
@@ -2347,28 +2399,10 @@ seqretry:
continue;
if (d_unhashed(dentry))
continue;
-
- if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
- int tlen;
- const char *tname;
- if (dentry->d_name.hash != hashlen_hash(hashlen))
- continue;
- tlen = dentry->d_name.len;
- tname = dentry->d_name.name;
- /* we want a consistent (name,len) pair */
- if (read_seqcount_retry(&dentry->d_seq, seq)) {
- cpu_relax();
- goto seqretry;
- }
- if (parent->d_op->d_compare(dentry,
- tlen, tname, name) != 0)
- continue;
- } else {
- if (dentry->d_name.hash_len != hashlen)
- continue;
- if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
- continue;
- }
+ if (dentry->d_name.hash_len != hashlen)
+ continue;
+ if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
+ continue;
*seqp = seq;
return dentry;
}
@@ -2563,7 +2597,15 @@ EXPORT_SYMBOL(d_rehash);
static inline unsigned start_dir_add(struct inode *dir)
{
-
+ /*
+ * The caller holds a spinlock (dentry::d_lock). On !PREEMPT_RT
+ * kernels spin_lock() implicitly disables preemption, but not on
+ * PREEMPT_RT. So for RT it has to be done explicitly to protect
+ * the sequence count write side critical section against a reader
+ * or another writer preempting, which would result in a live lock.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
for (;;) {
unsigned n = dir->i_dir_seq;
if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
@@ -2572,9 +2614,13 @@ static inline unsigned start_dir_add(struct inode *dir)
}
}
-static inline void end_dir_add(struct inode *dir, unsigned n)
+static inline void end_dir_add(struct inode *dir, unsigned int n,
+ wait_queue_head_t *d_wait)
{
smp_store_release(&dir->i_dir_seq, n + 2);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+ wake_up_all(d_wait);
}
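
For contrast with the writer above, the reader side (the pattern used by d_alloc_parallel()) samples i_dir_seq and spins until it observes an even, unchanged value; a schematic sketch, not the literal kernel code:

/* Schematic i_dir_seq reader; demo_* is illustrative only. */
static bool demo_dir_seq_stable(struct inode *dir)
{
	unsigned int seq;

retry:
	seq = smp_load_acquire(&dir->i_dir_seq) & ~1U;	/* wait for even */
	/* ... RCU walk of the in-lookup hash would go here ... */
	if (READ_ONCE(dir->i_dir_seq) != seq) {
		cpu_relax();
		goto retry;
	}
	return true;
}

A writer preempted inside the odd-seq window would trap such readers in this loop, which is exactly the PREEMPT_RT live lock the comment above describes.
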
static void d_wait_lookup(struct dentry *dentry)
@@ -2701,32 +2747,50 @@ mismatch:
}
EXPORT_SYMBOL(d_alloc_parallel);
-void __d_lookup_done(struct dentry *dentry)
+/*
+ * - Unhash the dentry
+ * - Retrieve and clear the waitqueue head in dentry
+ * - Return the waitqueue head
+ */
+static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry)
{
- struct hlist_bl_head *b = in_lookup_hash(dentry->d_parent,
- dentry->d_name.hash);
+ wait_queue_head_t *d_wait;
+ struct hlist_bl_head *b;
+
+ lockdep_assert_held(&dentry->d_lock);
+
+ b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash);
hlist_bl_lock(b);
dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
__hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
- wake_up_all(dentry->d_wait);
+ d_wait = dentry->d_wait;
dentry->d_wait = NULL;
hlist_bl_unlock(b);
INIT_HLIST_NODE(&dentry->d_u.d_alias);
INIT_LIST_HEAD(&dentry->d_lru);
+ return d_wait;
+}
+
+void __d_lookup_unhash_wake(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ wake_up_all(__d_lookup_unhash(dentry));
+ spin_unlock(&dentry->d_lock);
}
-EXPORT_SYMBOL(__d_lookup_done);
+EXPORT_SYMBOL(__d_lookup_unhash_wake);
/* inode->i_lock held if inode is non-NULL */
static inline void __d_add(struct dentry *dentry, struct inode *inode)
{
+ wait_queue_head_t *d_wait;
struct inode *dir = NULL;
unsigned n;
spin_lock(&dentry->d_lock);
if (unlikely(d_in_lookup(dentry))) {
dir = dentry->d_parent->d_inode;
n = start_dir_add(dir);
- __d_lookup_done(dentry);
+ d_wait = __d_lookup_unhash(dentry);
}
if (inode) {
unsigned add_flags = d_flags_for_inode(inode);
@@ -2738,7 +2802,7 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
}
__d_rehash(dentry);
if (dir)
- end_dir_add(dir, n);
+ end_dir_add(dir, n, d_wait);
spin_unlock(&dentry->d_lock);
if (inode)
spin_unlock(&inode->i_lock);
@@ -2885,6 +2949,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
bool exchange)
{
struct dentry *old_parent, *p;
+ wait_queue_head_t *d_wait;
struct inode *dir = NULL;
unsigned n;
@@ -2915,7 +2980,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
if (unlikely(d_in_lookup(target))) {
dir = target->d_parent->d_inode;
n = start_dir_add(dir);
- __d_lookup_done(target);
+ d_wait = __d_lookup_unhash(target);
}
write_seqcount_begin(&dentry->d_seq);
@@ -2951,7 +3016,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
write_seqcount_end(&dentry->d_seq);
if (dir)
- end_dir_add(dir, n);
+ end_dir_add(dir, n, d_wait);
if (dentry->d_parent != old_parent)
spin_unlock(&dentry->d_parent->d_lock);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 950c63fa4d0b..ddb3fc258df9 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -1121,7 +1121,7 @@ void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
}
EXPORT_SYMBOL_GPL(debugfs_print_regs32);
-static int debugfs_show_regset32(struct seq_file *s, void *data)
+static int debugfs_regset32_show(struct seq_file *s, void *data)
{
struct debugfs_regset32 *regset = s->private;
@@ -1136,17 +1136,7 @@ static int debugfs_show_regset32(struct seq_file *s, void *data)
return 0;
}
-static int debugfs_open_regset32(struct inode *inode, struct file *file)
-{
- return single_open(file, debugfs_show_regset32, inode->i_private);
-}
-
-static const struct file_operations fops_regset32 = {
- .open = debugfs_open_regset32,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(debugfs_regset32);
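
For readers unfamiliar with the macro: DEFINE_SHOW_ATTRIBUTE(debugfs_regset32) (from linux/seq_file.h) expands to approximately the boilerplate removed above, the only functional delta being the added .owner field:

static int debugfs_regset32_open(struct inode *inode, struct file *file)
{
	return single_open(file, debugfs_regset32_show, inode->i_private);
}

static const struct file_operations debugfs_regset32_fops = {
	.owner		= THIS_MODULE,
	.open		= debugfs_regset32_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
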
/**
* debugfs_create_regset32 - create a debugfs file that returns register values
@@ -1167,7 +1157,7 @@ void debugfs_create_regset32(const char *name, umode_t mode,
struct dentry *parent,
struct debugfs_regset32 *regset)
{
- debugfs_create_file(name, mode, parent, regset, &fops_regset32);
+ debugfs_create_file(name, mode, parent, regset, &debugfs_regset32_fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_regset32);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3dcf0b8b4e93..2e8e112b1993 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -82,6 +82,8 @@ struct debugfs_mount_opts {
kuid_t uid;
kgid_t gid;
umode_t mode;
+ /* Opt_* bitfield. */
+ unsigned int opts;
};
enum {
@@ -111,6 +113,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
kgid_t gid;
char *p;
+ opts->opts = 0;
opts->mode = DEBUGFS_DEFAULT_MODE;
while ((p = strsep(&data, ",")) != NULL) {
@@ -145,24 +148,44 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
* but traditionally debugfs has ignored all mount options
*/
}
+
+ opts->opts |= BIT(token);
}
return 0;
}
-static int debugfs_apply_options(struct super_block *sb)
+static void _debugfs_apply_options(struct super_block *sb, bool remount)
{
struct debugfs_fs_info *fsi = sb->s_fs_info;
struct inode *inode = d_inode(sb->s_root);
struct debugfs_mount_opts *opts = &fsi->mount_opts;
- inode->i_mode &= ~S_IALLUGO;
- inode->i_mode |= opts->mode;
+ /*
+ * On remount, only reset mode/uid/gid if they were provided as mount
+ * options.
+ */
+
+ if (!remount || opts->opts & BIT(Opt_mode)) {
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= opts->mode;
+ }
- inode->i_uid = opts->uid;
- inode->i_gid = opts->gid;
+ if (!remount || opts->opts & BIT(Opt_uid))
+ inode->i_uid = opts->uid;
- return 0;
+ if (!remount || opts->opts & BIT(Opt_gid))
+ inode->i_gid = opts->gid;
+}
+
+static void debugfs_apply_options(struct super_block *sb)
+{
+ _debugfs_apply_options(sb, false);
+}
+
+static void debugfs_apply_options_remount(struct super_block *sb)
+{
+ _debugfs_apply_options(sb, true);
}
static int debugfs_remount(struct super_block *sb, int *flags, char *data)
@@ -175,7 +198,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data)
if (err)
goto fail;
- debugfs_apply_options(sb);
+ debugfs_apply_options_remount(sb);
fail:
return err;
@@ -745,6 +768,28 @@ void debugfs_remove(struct dentry *dentry)
EXPORT_SYMBOL_GPL(debugfs_remove);
/**
+ * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it
+ * @name: a pointer to a string containing the name of the item to look up.
+ * @parent: a pointer to the parent dentry of the item.
+ *
+ * This is the equivalent of doing something like
+ * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting
+ * handled for the directory being looked up.
+ */
+void debugfs_lookup_and_remove(const char *name, struct dentry *parent)
+{
+ struct dentry *dentry;
+
+ dentry = debugfs_lookup(name, parent);
+ if (!dentry)
+ return;
+
+ debugfs_remove(dentry);
+ dput(dentry);
+}
+EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove);
+
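
The usage change this enables, sketched with a hypothetical demo_dir: debugfs_lookup() takes a dentry reference that plain debugfs_remove() never drops, so the old one-liner leaked it:

/* Hypothetical teardown using the new helper. */
static void demo_teardown(struct dentry *demo_dir)
{
	/*
	 * Old idiom, which leaked the reference debugfs_lookup() took:
	 *	debugfs_remove(debugfs_lookup("state", demo_dir));
	 */
	debugfs_lookup_and_remove("state", demo_dir);
}
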
+/**
* debugfs_rename - rename a file/directory in the debugfs filesystem
* @old_dir: a pointer to the parent dentry for the renamed object. This
* should be a directory dentry.
diff --git a/fs/direct-io.c b/fs/direct-io.c
index aef06e607b40..03d381377ae1 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -117,8 +117,7 @@ struct dio_submit {
/* dio_state communicated between submission path and end_io */
struct dio {
int flags; /* doesn't change */
- int op;
- int op_flags;
+ blk_opf_t opf; /* request operation type and flags */
struct gendisk *bio_disk;
struct inode *inode;
loff_t i_size; /* i_size when submitted */
@@ -167,12 +166,13 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio)
*/
static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
{
+ const enum req_op dio_op = dio->opf & REQ_OP_MASK;
ssize_t ret;
- ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
+ ret = iov_iter_get_pages2(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
&sdio->from);
- if (ret < 0 && sdio->blocks_available && (dio->op == REQ_OP_WRITE)) {
+ if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) {
struct page *page = ZERO_PAGE(0);
/*
* A memory fault, but the filesystem has some outstanding
@@ -191,7 +191,6 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
}
if (ret >= 0) {
- iov_iter_advance(sdio->iter, ret);
ret += sdio->from;
sdio->head = 0;
sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
@@ -234,6 +233,7 @@ static inline struct page *dio_get_page(struct dio *dio,
*/
static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
{
+ const enum req_op dio_op = dio->opf & REQ_OP_MASK;
loff_t offset = dio->iocb->ki_pos;
ssize_t transferred = 0;
int err;
@@ -251,7 +251,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
transferred = dio->result;
/* Check for short read case */
- if ((dio->op == REQ_OP_READ) &&
+ if (dio_op == REQ_OP_READ &&
((offset + transferred) > dio->i_size))
transferred = dio->i_size - offset;
/* ignore EFAULT if some IO has been done */
@@ -286,7 +286,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
* zeros from unwritten extents.
*/
if (flags & DIO_COMPLETE_INVALIDATE &&
- ret > 0 && dio->op == REQ_OP_WRITE &&
+ ret > 0 && dio_op == REQ_OP_WRITE &&
dio->inode->i_mapping->nrpages) {
err = invalidate_inode_pages2_range(dio->inode->i_mapping,
offset >> PAGE_SHIFT,
@@ -305,7 +305,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
*/
dio->iocb->ki_pos += transferred;
- if (ret > 0 && dio->op == REQ_OP_WRITE)
+ if (ret > 0 && dio_op == REQ_OP_WRITE)
ret = generic_write_sync(dio->iocb, ret);
dio->iocb->ki_complete(dio->iocb, ret);
}
@@ -329,6 +329,7 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);
static void dio_bio_end_aio(struct bio *bio)
{
struct dio *dio = bio->bi_private;
+ const enum req_op dio_op = dio->opf & REQ_OP_MASK;
unsigned long remaining;
unsigned long flags;
bool defer_completion = false;
@@ -353,7 +354,7 @@ static void dio_bio_end_aio(struct bio *bio)
*/
if (dio->result)
defer_completion = dio->defer_completion ||
- (dio->op == REQ_OP_WRITE &&
+ (dio_op == REQ_OP_WRITE &&
dio->inode->i_mapping->nrpages);
if (defer_completion) {
INIT_WORK(&dio->complete_work, dio_aio_complete_work);
@@ -396,7 +397,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
* bio_alloc() is guaranteed to return a bio when allowed to sleep and
* we request a valid number of vectors.
*/
- bio = bio_alloc(bdev, nr_vecs, dio->op | dio->op_flags, GFP_KERNEL);
+ bio = bio_alloc(bdev, nr_vecs, dio->opf, GFP_KERNEL);
bio->bi_iter.bi_sector = first_sector;
if (dio->is_async)
bio->bi_end_io = dio_bio_end_aio;
@@ -415,18 +416,17 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
*/
static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
{
+ const enum req_op dio_op = dio->opf & REQ_OP_MASK;
struct bio *bio = sdio->bio;
unsigned long flags;
bio->bi_private = dio;
- /* don't account direct I/O as memory stall */
- bio_clear_flag(bio, BIO_WORKINGSET);
spin_lock_irqsave(&dio->bio_lock, flags);
dio->refcount++;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty)
+ if (dio->is_async && dio_op == REQ_OP_READ && dio->should_dirty)
bio_set_pages_dirty(bio);
dio->bio_disk = bio->bi_bdev->bd_disk;
@@ -492,7 +492,8 @@ static struct bio *dio_await_one(struct dio *dio)
static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
{
blk_status_t err = bio->bi_status;
- bool should_dirty = dio->op == REQ_OP_READ && dio->should_dirty;
+ const enum req_op dio_op = dio->opf & REQ_OP_MASK;
+ bool should_dirty = dio_op == REQ_OP_READ && dio->should_dirty;
if (err) {
if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT))
@@ -619,6 +620,7 @@ static int dio_set_defer_completion(struct dio *dio)
static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
struct buffer_head *map_bh)
{
+ const enum req_op dio_op = dio->opf & REQ_OP_MASK;
int ret;
sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
sector_t fs_endblk; /* Into file, in filesystem-sized blocks */
@@ -653,7 +655,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
* which may decide to handle it or also return an unmapped
* buffer head.
*/
- create = dio->op == REQ_OP_WRITE;
+ create = dio_op == REQ_OP_WRITE;
if (dio->flags & DIO_SKIP_HOLES) {
i_size = i_size_read(dio->inode);
if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits)
@@ -801,10 +803,11 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
unsigned offset, unsigned len, sector_t blocknr,
struct buffer_head *map_bh)
{
+ const enum req_op dio_op = dio->opf & REQ_OP_MASK;
int ret = 0;
int boundary = sdio->boundary; /* dio_send_cur_page may clear it */
- if (dio->op == REQ_OP_WRITE) {
+ if (dio_op == REQ_OP_WRITE) {
/*
* Read accounting is performed in submit_bio()
*/
@@ -917,6 +920,7 @@ static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio,
static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
struct buffer_head *map_bh)
{
+ const enum req_op dio_op = dio->opf & REQ_OP_MASK;
const unsigned blkbits = sdio->blkbits;
const unsigned i_blkbits = blkbits + sdio->blkfactor;
int ret = 0;
@@ -992,7 +996,7 @@ do_holes:
loff_t i_size_aligned;
/* AKPM: eargh, -ENOTBLK is a hack */
- if (dio->op == REQ_OP_WRITE) {
+ if (dio_op == REQ_OP_WRITE) {
put_page(page);
return -ENOTBLK;
}
@@ -1115,11 +1119,10 @@ static inline int drop_refcount(struct dio *dio)
* individual fields and will generate much worse code. This is important
* for the whole file.
*/
-static inline ssize_t
-do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
- struct block_device *bdev, struct iov_iter *iter,
- get_block_t get_block, dio_iodone_t end_io,
- dio_submit_t submit_io, int flags)
+ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
+ struct block_device *bdev, struct iov_iter *iter,
+ get_block_t get_block, dio_iodone_t end_io,
+ dio_submit_t submit_io, int flags)
{
unsigned i_blkbits = READ_ONCE(inode->i_blkbits);
unsigned blkbits = i_blkbits;
@@ -1197,12 +1200,11 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
dio->inode = inode;
if (iov_iter_rw(iter) == WRITE) {
- dio->op = REQ_OP_WRITE;
- dio->op_flags = REQ_SYNC | REQ_IDLE;
+ dio->opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
if (iocb->ki_flags & IOCB_NOWAIT)
- dio->op_flags |= REQ_NOWAIT;
+ dio->opf |= REQ_NOWAIT;
} else {
- dio->op = REQ_OP_READ;
+ dio->opf = REQ_OP_READ;
}
/*
@@ -1211,7 +1213,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
*/
if (dio->is_async && iov_iter_rw(iter) == WRITE) {
retval = 0;
- if (iocb->ki_flags & IOCB_DSYNC)
+ if (iocb_is_dsync(iocb))
retval = dio_set_defer_completion(dio);
else if (!dio->inode->i_sb->s_dio_done_wq) {
/*
@@ -1246,7 +1248,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
spin_lock_init(&dio->bio_lock);
dio->refcount = 1;
- dio->should_dirty = iter_is_iovec(iter) && iov_iter_rw(iter) == READ;
+ dio->should_dirty = user_backed_iter(iter) && iov_iter_rw(iter) == READ;
sdio.iter = iter;
sdio.final_block_in_request = end >> blkbits;
@@ -1334,29 +1336,6 @@ fail_dio:
kmem_cache_free(dio_cache, dio);
return retval;
}
-
-ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
- struct block_device *bdev, struct iov_iter *iter,
- get_block_t get_block,
- dio_iodone_t end_io, dio_submit_t submit_io,
- int flags)
-{
- /*
- * The block device state is needed in the end to finally
- * submit everything. Since it's likely to be cache cold
- * prefetch it here as first thing to hide some of the
- * latency.
- *
- * Attempt to prefetch the pieces we likely need later.
- */
- prefetch(&bdev->bd_disk->part_tbl);
- prefetch(bdev->bd_disk->queue);
- prefetch((char *)bdev->bd_disk->queue + SMP_CACHE_BYTES);
-
- return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block,
- end_io, submit_io, flags);
-}
-
EXPORT_SYMBOL(__blockdev_direct_IO);
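
The conversion pattern repeated throughout this file, packing the operation and its flags into a single blk_opf_t at setup time and masking the operation back out at each use site, shown in isolation (the demo_* names are illustrative):

/* Packing, as done in __blockdev_direct_IO() above: */
static void demo_set_opf(struct dio *dio, bool is_write, bool nowait)
{
	if (is_write) {
		dio->opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
		if (nowait)
			dio->opf |= REQ_NOWAIT;
	} else {
		dio->opf = REQ_OP_READ;
	}
}

/* Unpacking, as done at the start of each converted helper: */
static bool demo_opf_is_write(const struct dio *dio)
{
	const enum req_op dio_op = dio->opf & REQ_OP_MASK;

	return dio_op == REQ_OP_WRITE;
}
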
static __init int dio_init(void)
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index ee92634196a8..1105ce3c80cb 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -9,6 +9,15 @@ menuconfig DLM
A general purpose distributed lock manager for kernel or userspace
applications.
+config DLM_DEPRECATED_API
+ bool "DLM deprecated API"
+ depends on DLM
+ help
+ Enables deprecated DLM timeout features that will be removed in
+ later Linux kernel releases.
+
+ If you are unsure, say N.
+
config DLM_DEBUG
bool "DLM debugging"
depends on DLM
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
index 3545fdafc6fb..71dab733cf9a 100644
--- a/fs/dlm/Makefile
+++ b/fs/dlm/Makefile
@@ -9,7 +9,6 @@ dlm-y := ast.o \
member.o \
memory.o \
midcomms.o \
- netlink.o \
lowcomms.o \
plock.o \
rcom.o \
@@ -18,5 +17,6 @@ dlm-y := ast.o \
requestqueue.o \
user.o \
util.o
+dlm-$(CONFIG_DLM_DEPRECATED_API) += netlink.o
dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index bfac462dd3e8..d60a8d8f109d 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -200,13 +200,13 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
if (!prev_seq) {
kref_get(&lkb->lkb_ref);
+ mutex_lock(&ls->ls_cb_mutex);
if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) {
- mutex_lock(&ls->ls_cb_mutex);
list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay);
- mutex_unlock(&ls->ls_cb_mutex);
} else {
queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
}
+ mutex_unlock(&ls->ls_cb_mutex);
}
out:
mutex_unlock(&lkb->lkb_cb_mutex);
@@ -255,13 +255,13 @@ void dlm_callback_work(struct work_struct *work)
if (callbacks[i].flags & DLM_CB_SKIP) {
continue;
} else if (callbacks[i].flags & DLM_CB_BAST) {
- bastfn(lkb->lkb_astparam, callbacks[i].mode);
trace_dlm_bast(ls, lkb, callbacks[i].mode);
+ bastfn(lkb->lkb_astparam, callbacks[i].mode);
} else if (callbacks[i].flags & DLM_CB_CAST) {
lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
+ trace_dlm_ast(ls, lkb);
castfn(lkb->lkb_astparam);
- trace_dlm_ast(ls, lkb, lkb->lkb_lksb);
}
}
@@ -288,10 +288,13 @@ void dlm_callback_stop(struct dlm_ls *ls)
void dlm_callback_suspend(struct dlm_ls *ls)
{
- set_bit(LSFL_CB_DELAY, &ls->ls_flags);
+ if (ls->ls_callback_wq) {
+ mutex_lock(&ls->ls_cb_mutex);
+ set_bit(LSFL_CB_DELAY, &ls->ls_flags);
+ mutex_unlock(&ls->ls_cb_mutex);
- if (ls->ls_callback_wq)
flush_workqueue(ls->ls_callback_wq);
+ }
}
#define MAX_CB_QUEUE 25
@@ -302,11 +305,11 @@ void dlm_callback_resume(struct dlm_ls *ls)
int count = 0, sum = 0;
bool empty;
- clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
-
if (!ls->ls_callback_wq)
return;
+ clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
+
more:
mutex_lock(&ls->ls_cb_mutex);
list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) {
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 181ad7d20c4d..e5e05fcc5813 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -11,7 +11,6 @@
#ifndef __ASTD_DOT_H__
#define __ASTD_DOT_H__
-void dlm_del_ast(struct dlm_lkb *lkb);
int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
int status, uint32_t sbflags, uint64_t seq);
int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 42eee2783756..ac8b62106ce0 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -75,8 +75,9 @@ struct dlm_cluster {
unsigned int cl_log_info;
unsigned int cl_protocol;
unsigned int cl_mark;
+#ifdef CONFIG_DLM_DEPRECATED_API
unsigned int cl_timewarn_cs;
- unsigned int cl_waitwarn_us;
+#endif
unsigned int cl_new_rsb_count;
unsigned int cl_recover_callbacks;
char cl_cluster_name[DLM_LOCKSPACE_LEN];
@@ -102,8 +103,9 @@ enum {
CLUSTER_ATTR_LOG_INFO,
CLUSTER_ATTR_PROTOCOL,
CLUSTER_ATTR_MARK,
+#ifdef CONFIG_DLM_DEPRECATED_API
CLUSTER_ATTR_TIMEWARN_CS,
- CLUSTER_ATTR_WAITWARN_US,
+#endif
CLUSTER_ATTR_NEW_RSB_COUNT,
CLUSTER_ATTR_RECOVER_CALLBACKS,
CLUSTER_ATTR_CLUSTER_NAME,
@@ -224,8 +226,9 @@ CLUSTER_ATTR(log_debug, NULL);
CLUSTER_ATTR(log_info, NULL);
CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running);
CLUSTER_ATTR(mark, NULL);
+#ifdef CONFIG_DLM_DEPRECATED_API
CLUSTER_ATTR(timewarn_cs, dlm_check_zero);
-CLUSTER_ATTR(waitwarn_us, NULL);
+#endif
CLUSTER_ATTR(new_rsb_count, NULL);
CLUSTER_ATTR(recover_callbacks, NULL);
@@ -240,8 +243,9 @@ static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info,
[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol,
[CLUSTER_ATTR_MARK] = &cluster_attr_mark,
+#ifdef CONFIG_DLM_DEPRECATED_API
[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs,
- [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us,
+#endif
[CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count,
[CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks,
[CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name,
@@ -432,8 +436,9 @@ static struct config_group *make_cluster(struct config_group *g,
cl->cl_log_debug = dlm_config.ci_log_debug;
cl->cl_log_info = dlm_config.ci_log_info;
cl->cl_protocol = dlm_config.ci_protocol;
+#ifdef CONFIG_DLM_DEPRECATED_API
cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
- cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
+#endif
cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count;
cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks;
memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name,
@@ -954,8 +959,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
#define DEFAULT_LOG_INFO 1
#define DEFAULT_PROTOCOL DLM_PROTO_TCP
#define DEFAULT_MARK 0
+#ifdef CONFIG_DLM_DEPRECATED_API
#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
-#define DEFAULT_WAITWARN_US 0
+#endif
#define DEFAULT_NEW_RSB_COUNT 128
#define DEFAULT_RECOVER_CALLBACKS 0
#define DEFAULT_CLUSTER_NAME ""
@@ -971,8 +977,9 @@ struct dlm_config_info dlm_config = {
.ci_log_info = DEFAULT_LOG_INFO,
.ci_protocol = DEFAULT_PROTOCOL,
.ci_mark = DEFAULT_MARK,
+#ifdef CONFIG_DLM_DEPRECATED_API
.ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
- .ci_waitwarn_us = DEFAULT_WAITWARN_US,
+#endif
.ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT,
.ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS,
.ci_cluster_name = DEFAULT_CLUSTER_NAME
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index df92b0a07fc6..55c5f2c13ebd 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -37,8 +37,9 @@ struct dlm_config_info {
int ci_log_info;
int ci_protocol;
int ci_mark;
+#ifdef CONFIG_DLM_DEPRECATED_API
int ci_timewarn_cs;
- int ci_waitwarn_us;
+#endif
int ci_new_rsb_count;
int ci_recover_callbacks;
char ci_cluster_name[DLM_LOCKSPACE_LEN];
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index b6692f81ec83..fb1981654bb2 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -101,7 +101,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
*/
b = ls->ls_recover_buf->rc_buf;
- left = ls->ls_recover_buf->rc_header.h_length;
+ left = le16_to_cpu(ls->ls_recover_buf->rc_header.h_length);
left -= sizeof(struct dlm_rcom);
for (;;) {
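With h_length now __le16, the receive side converts exactly once at the point of use and the wire format is identical on all hosts. A hedged sketch of how a variable-length payload is sized behind such a header (this mirrors the computation above and receive_extralen() further down; not new API):

        /* bytes of rc_buf[] payload following the fixed struct */
        static int rcom_payload_len(const struct dlm_rcom *rc)
        {
                return le16_to_cpu(rc->rc_header.h_length) -
                       sizeof(struct dlm_rcom);
        }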
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 74a9590a4dd5..e34c3d2639a5 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -145,7 +145,9 @@ struct dlm_args {
void (*bastfn) (void *astparam, int mode);
int mode;
struct dlm_lksb *lksb;
+#ifdef CONFIG_DLM_DEPRECATED_API
unsigned long timeout;
+#endif
};
@@ -203,10 +205,20 @@ struct dlm_args {
#define DLM_IFL_OVERLAP_UNLOCK 0x00080000
#define DLM_IFL_OVERLAP_CANCEL 0x00100000
#define DLM_IFL_ENDOFLIFE 0x00200000
+#ifdef CONFIG_DLM_DEPRECATED_API
#define DLM_IFL_WATCH_TIMEWARN 0x00400000
#define DLM_IFL_TIMEOUT_CANCEL 0x00800000
+#endif
#define DLM_IFL_DEADLOCK_CANCEL 0x01000000
#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */
+/* Only the least significant 2 bytes are exchanged via messages: the
+ * flags are transmitted in full, but the receive side applies the
+ * 2 LSB bytes only.
+ *
+ * Even the wireshark dlm dissector evaluates just the lower bytes, and
+ * since the higher bytes are never parsed on the receiver side we
+ * assume they are for internal use or reserved.
+ */
#define DLM_IFL_USER 0x00000001
#define DLM_IFL_ORPHAN 0x00000002
@@ -249,10 +261,12 @@ struct dlm_lkb {
struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
struct list_head lkb_wait_reply; /* waiting for remote reply */
struct list_head lkb_ownqueue; /* list of locks for a process */
- struct list_head lkb_time_list;
ktime_t lkb_timestamp;
- ktime_t lkb_wait_time;
+
+#ifdef CONFIG_DLM_DEPRECATED_API
+ struct list_head lkb_time_list;
unsigned long lkb_timeout_cs;
+#endif
struct mutex lkb_cb_mutex;
struct work_struct lkb_cb_work;
@@ -379,15 +393,15 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
#define DLM_FIN 5
struct dlm_header {
- uint32_t h_version;
+ __le32 h_version;
union {
/* for DLM_MSG and DLM_RCOM */
- uint32_t h_lockspace;
+ __le32 h_lockspace;
/* for DLM_ACK and DLM_OPTS */
- uint32_t h_seq;
+ __le32 h_seq;
} u;
- uint32_t h_nodeid; /* nodeid of sender */
- uint16_t h_length;
+ __le32 h_nodeid; /* nodeid of sender */
+ __le16 h_length;
uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
uint8_t h_pad;
};
@@ -409,24 +423,24 @@ struct dlm_header {
struct dlm_message {
struct dlm_header m_header;
- uint32_t m_type; /* DLM_MSG_ */
- uint32_t m_nodeid;
- uint32_t m_pid;
- uint32_t m_lkid; /* lkid on sender */
- uint32_t m_remid; /* lkid on receiver */
- uint32_t m_parent_lkid;
- uint32_t m_parent_remid;
- uint32_t m_exflags;
- uint32_t m_sbflags;
- uint32_t m_flags;
- uint32_t m_lvbseq;
- uint32_t m_hash;
- int m_status;
- int m_grmode;
- int m_rqmode;
- int m_bastmode;
- int m_asts;
- int m_result; /* 0 or -EXXX */
+ __le32 m_type; /* DLM_MSG_ */
+ __le32 m_nodeid;
+ __le32 m_pid;
+ __le32 m_lkid; /* lkid on sender */
+ __le32 m_remid; /* lkid on receiver */
+ __le32 m_parent_lkid;
+ __le32 m_parent_remid;
+ __le32 m_exflags;
+ __le32 m_sbflags;
+ __le32 m_flags;
+ __le32 m_lvbseq;
+ __le32 m_hash;
+ __le32 m_status;
+ __le32 m_grmode;
+ __le32 m_rqmode;
+ __le32 m_bastmode;
+ __le32 m_asts;
+ __le32 m_result; /* 0 or -EXXX */
char m_extra[]; /* name or lvb */
};
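Typing every on-wire field as __le16/__le32/__le64 is presumably done so sparse can catch missed conversions at build time instead of leaving big-endian-only bugs. A minimal illustration with a hypothetical struct (w_type/w_lkid are made-up names):

        struct wire_msg {
                __le32 w_type;
                __le32 w_lkid;
        };

        static void wire_msg_fill(struct wire_msg *m, u32 type, u32 lkid)
        {
                m->w_type = cpu_to_le32(type);  /* ok */
                m->w_lkid = lkid;               /* sparse: restricted __le32
                                                 * assigned from u32 */
        }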
@@ -451,18 +465,18 @@ struct dlm_message {
struct dlm_rcom {
struct dlm_header rc_header;
- uint32_t rc_type; /* DLM_RCOM_ */
- int rc_result; /* multi-purpose */
- uint64_t rc_id; /* match reply with request */
- uint64_t rc_seq; /* sender's ls_recover_seq */
- uint64_t rc_seq_reply; /* remote ls_recover_seq */
+ __le32 rc_type; /* DLM_RCOM_ */
+ __le32 rc_result; /* multi-purpose */
+ __le64 rc_id; /* match reply with request */
+ __le64 rc_seq; /* sender's ls_recover_seq */
+ __le64 rc_seq_reply; /* remote ls_recover_seq */
char rc_buf[];
};
struct dlm_opt_header {
- uint16_t t_type;
- uint16_t t_length;
- uint32_t t_pad;
+ __le16 t_type;
+ __le16 t_length;
+ __le32 t_pad;
/* need to be 8 byte aligned */
char t_value[];
};
@@ -472,8 +486,8 @@ struct dlm_opts {
struct dlm_header o_header;
uint8_t o_nextcmd;
uint8_t o_pad;
- uint16_t o_optlen;
- uint32_t o_pad2;
+ __le16 o_optlen;
+ __le32 o_pad2;
char o_opts[];
};
@@ -568,8 +582,10 @@ struct dlm_ls {
struct mutex ls_orphans_mutex;
struct list_head ls_orphans;
+#ifdef CONFIG_DLM_DEPRECATED_API
struct mutex ls_timeout_mutex;
struct list_head ls_timeout;
+#endif
spinlock_t ls_new_rsb_spin;
int ls_new_rsb_count;
@@ -606,8 +622,8 @@ struct dlm_ls {
wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
int ls_uevent_result;
- struct completion ls_members_done;
- int ls_members_result;
+ struct completion ls_recovery_done;
+ int ls_recovery_result;
struct miscdevice ls_device;
@@ -645,7 +661,7 @@ struct dlm_ls {
spinlock_t ls_recover_idr_lock;
wait_queue_head_t ls_wait_general;
wait_queue_head_t ls_recover_lock_wait;
- struct mutex ls_clear_proc_locks;
+ spinlock_t ls_clear_proc_locks;
struct list_head ls_root_list; /* root resources */
struct rw_semaphore ls_root_sem; /* protect root_list */
@@ -688,7 +704,9 @@ struct dlm_ls {
#define LSFL_RCOM_READY 5
#define LSFL_RCOM_WAIT 6
#define LSFL_UEVENT_WAIT 7
+#ifdef CONFIG_DLM_DEPRECATED_API
#define LSFL_TIMEWARN 8
+#endif
#define LSFL_CB_DELAY 9
#define LSFL_NODIR 10
@@ -741,9 +759,15 @@ static inline int dlm_no_directory(struct dlm_ls *ls)
return test_bit(LSFL_NODIR, &ls->ls_flags);
}
+#ifdef CONFIG_DLM_DEPRECATED_API
int dlm_netlink_init(void);
void dlm_netlink_exit(void);
void dlm_timeout_warn(struct dlm_lkb *lkb);
+#else
+static inline int dlm_netlink_init(void) { return 0; }
+static inline void dlm_netlink_exit(void) { }
+static inline void dlm_timeout_warn(struct dlm_lkb *lkb) { }
+#endif
int dlm_plock_init(void);
void dlm_plock_exit(void);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index bdb51d209ba2..94a72ede5764 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -296,12 +296,14 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
+#ifdef CONFIG_DLM_DEPRECATED_API
/* if the operation was a cancel, then return -DLM_ECANCEL, if a
timeout caused the cancel then return -ETIMEDOUT */
if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) {
lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL;
rv = -ETIMEDOUT;
}
+#endif
if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) {
lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL;
@@ -350,10 +352,12 @@ static void put_rsb(struct dlm_rsb *r)
{
struct dlm_ls *ls = r->res_ls;
uint32_t bucket = r->res_bucket;
+ int rv;
- spin_lock(&ls->ls_rsbtbl[bucket].lock);
- kref_put(&r->res_ref, toss_rsb);
- spin_unlock(&ls->ls_rsbtbl[bucket].lock);
+ rv = kref_put_lock(&r->res_ref, toss_rsb,
+ &ls->ls_rsbtbl[bucket].lock);
+ if (rv)
+ spin_unlock(&ls->ls_rsbtbl[bucket].lock);
}
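kref_put_lock() is built on refcount_dec_and_lock(), so the bucket spinlock is taken only when the count actually drops to zero; the release function then runs with the lock held and the caller unlocks afterwards, which is what the return-value check above is for. The contract, roughly:

        rv = kref_put_lock(&r->res_ref, toss_rsb,
                           &ls->ls_rsbtbl[bucket].lock);
        if (rv)         /* hit zero: toss_rsb() ran under the lock */
                spin_unlock(&ls->ls_rsbtbl[bucket].lock);
        /* rv == 0: the lock was never taken */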
void dlm_put_rsb(struct dlm_rsb *r)
@@ -397,7 +401,7 @@ static int pre_rsb_struct(struct dlm_ls *ls)
unlock any spinlocks, go back and call pre_rsb_struct again.
Otherwise, take an rsb off the list and return it. */
-static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
+static int get_rsb_struct(struct dlm_ls *ls, const void *name, int len,
struct dlm_rsb **r_ret)
{
struct dlm_rsb *r;
@@ -408,7 +412,8 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
count = ls->ls_new_rsb_count;
spin_unlock(&ls->ls_new_rsb_spin);
log_debug(ls, "find_rsb retry %d %d %s",
- count, dlm_config.ci_new_rsb_count, name);
+ count, dlm_config.ci_new_rsb_count,
+ (const char *)name);
return -EAGAIN;
}
@@ -444,7 +449,7 @@ static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen)
return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN);
}
-int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len,
+int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len,
struct dlm_rsb **r_ret)
{
struct rb_node *node = tree->rb_node;
@@ -542,7 +547,7 @@ static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree)
* while that rsb has a potentially stale master.)
*/
-static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
+static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
uint32_t hash, uint32_t b,
int dir_nodeid, int from_nodeid,
unsigned int flags, struct dlm_rsb **r_ret)
@@ -602,7 +607,6 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
*/
kref_get(&r->res_ref);
- error = 0;
goto out_unlock;
@@ -721,7 +725,7 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
dlm_recover_locks) before we've made ourself master (in
dlm_recover_masters). */
-static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len,
+static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len,
uint32_t hash, uint32_t b,
int dir_nodeid, int from_nodeid,
unsigned int flags, struct dlm_rsb **r_ret)
@@ -815,8 +819,9 @@ static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len,
return error;
}
-static int find_rsb(struct dlm_ls *ls, char *name, int len, int from_nodeid,
- unsigned int flags, struct dlm_rsb **r_ret)
+static int find_rsb(struct dlm_ls *ls, const void *name, int len,
+ int from_nodeid, unsigned int flags,
+ struct dlm_rsb **r_ret)
{
uint32_t hash, b;
int dir_nodeid;
@@ -880,6 +885,88 @@ static int validate_master_nodeid(struct dlm_ls *ls, struct dlm_rsb *r,
}
}
+static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_nodeid,
+ int from_nodeid, bool toss_list, unsigned int flags,
+ int *r_nodeid, int *result)
+{
+ int fix_master = (flags & DLM_LU_RECOVER_MASTER);
+ int from_master = (flags & DLM_LU_RECOVER_DIR);
+
+ if (r->res_dir_nodeid != our_nodeid) {
+ /* should not happen, but may as well fix it and carry on */
+ log_error(ls, "%s res_dir %d our %d %s", __func__,
+ r->res_dir_nodeid, our_nodeid, r->res_name);
+ r->res_dir_nodeid = our_nodeid;
+ }
+
+ if (fix_master && dlm_is_removed(ls, r->res_master_nodeid)) {
+ /* Recovery uses this function to set a new master when
+ * the previous master failed. Setting NEW_MASTER will
+ * force dlm_recover_masters to call recover_master on this
+ * rsb even though the res_nodeid is no longer removed.
+ */
+
+ r->res_master_nodeid = from_nodeid;
+ r->res_nodeid = from_nodeid;
+ rsb_set_flag(r, RSB_NEW_MASTER);
+
+ if (toss_list) {
+ /* I don't think we should ever find it on toss list. */
+ log_error(ls, "%s fix_master on toss", __func__);
+ dlm_dump_rsb(r);
+ }
+ }
+
+ if (from_master && (r->res_master_nodeid != from_nodeid)) {
+ /* this will happen if from_nodeid became master during
+ * a previous recovery cycle, and we aborted the previous
+ * cycle before recovering this master value
+ */
+
+ log_limit(ls, "%s from_master %d master_nodeid %d res_nodeid %d first %x %s",
+ __func__, from_nodeid, r->res_master_nodeid,
+ r->res_nodeid, r->res_first_lkid, r->res_name);
+
+ if (r->res_master_nodeid == our_nodeid) {
+ log_error(ls, "from_master %d our_master", from_nodeid);
+ dlm_dump_rsb(r);
+ goto ret_assign;
+ }
+
+ r->res_master_nodeid = from_nodeid;
+ r->res_nodeid = from_nodeid;
+ rsb_set_flag(r, RSB_NEW_MASTER);
+ }
+
+ if (!r->res_master_nodeid) {
+ /* this will happen if recovery happens while we're looking
+ * up the master for this rsb
+ */
+
+ log_debug(ls, "%s master 0 to %d first %x %s", __func__,
+ from_nodeid, r->res_first_lkid, r->res_name);
+ r->res_master_nodeid = from_nodeid;
+ r->res_nodeid = from_nodeid;
+ }
+
+ if (!from_master && !fix_master &&
+ (r->res_master_nodeid == from_nodeid)) {
+ /* this can happen when the master sends remove, the dir node
+ * finds the rsb on the keep list and ignores the remove,
+ * and the former master sends a lookup
+ */
+
+ log_limit(ls, "%s from master %d flags %x first %x %s",
+ __func__, from_nodeid, flags, r->res_first_lkid,
+ r->res_name);
+ }
+
+ ret_assign:
+ *r_nodeid = r->res_master_nodeid;
+ if (result)
+ *result = DLM_LU_MATCH;
+}
+
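Factoring the fixup logic into __dlm_master_lookup() lets the two lookup paths share it while keeping their different locking, as the rewritten caller below shows: the keep-list path holds and locks the refcounted rsb (toss_list = false), while the toss-list path stays under the rsbtbl spinlock (toss_list = true). In outline:

        /* active rsb (keep list): refcounted and locked */
        __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, false,
                            flags, r_nodeid, result);
        unlock_rsb(r);
        put_rsb(r);

        /* inactive rsb (toss list): rsbtbl lock only */
        __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, true,
                            flags, r_nodeid, result);
        r->res_toss_time = jiffies;
        spin_unlock(&ls->ls_rsbtbl[b].lock);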
/*
* We're the dir node for this res and another node wants to know the
* master nodeid. During normal operation (non recovery) this is only
@@ -914,10 +1001,8 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
{
struct dlm_rsb *r = NULL;
uint32_t hash, b;
- int from_master = (flags & DLM_LU_RECOVER_DIR);
- int fix_master = (flags & DLM_LU_RECOVER_MASTER);
int our_nodeid = dlm_our_nodeid();
- int dir_nodeid, error, toss_list = 0;
+ int dir_nodeid, error;
if (len > DLM_RESNAME_MAXLEN)
return -EINVAL;
@@ -949,12 +1034,21 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
if (!error) {
/* because the rsb is active, we need to lock_rsb before
- checking/changing re_master_nodeid */
+ * checking/changing res_master_nodeid
+ */
hold_rsb(r);
spin_unlock(&ls->ls_rsbtbl[b].lock);
lock_rsb(r);
- goto found;
+
+ __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, false,
+ flags, r_nodeid, result);
+
+ /* the rsb was active */
+ unlock_rsb(r);
+ put_rsb(r);
+
+ return 0;
}
error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
@@ -962,90 +1056,16 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
goto not_found;
/* because the rsb is inactive (on toss list), it's not refcounted
- and lock_rsb is not used, but is protected by the rsbtbl lock */
-
- toss_list = 1;
- found:
- if (r->res_dir_nodeid != our_nodeid) {
- /* should not happen, but may as well fix it and carry on */
- log_error(ls, "dlm_master_lookup res_dir %d our %d %s",
- r->res_dir_nodeid, our_nodeid, r->res_name);
- r->res_dir_nodeid = our_nodeid;
- }
-
- if (fix_master && dlm_is_removed(ls, r->res_master_nodeid)) {
- /* Recovery uses this function to set a new master when
- the previous master failed. Setting NEW_MASTER will
- force dlm_recover_masters to call recover_master on this
- rsb even though the res_nodeid is no longer removed. */
-
- r->res_master_nodeid = from_nodeid;
- r->res_nodeid = from_nodeid;
- rsb_set_flag(r, RSB_NEW_MASTER);
-
- if (toss_list) {
- /* I don't think we should ever find it on toss list. */
- log_error(ls, "dlm_master_lookup fix_master on toss");
- dlm_dump_rsb(r);
- }
- }
-
- if (from_master && (r->res_master_nodeid != from_nodeid)) {
- /* this will happen if from_nodeid became master during
- a previous recovery cycle, and we aborted the previous
- cycle before recovering this master value */
-
- log_limit(ls, "dlm_master_lookup from_master %d "
- "master_nodeid %d res_nodeid %d first %x %s",
- from_nodeid, r->res_master_nodeid, r->res_nodeid,
- r->res_first_lkid, r->res_name);
-
- if (r->res_master_nodeid == our_nodeid) {
- log_error(ls, "from_master %d our_master", from_nodeid);
- dlm_dump_rsb(r);
- goto out_found;
- }
-
- r->res_master_nodeid = from_nodeid;
- r->res_nodeid = from_nodeid;
- rsb_set_flag(r, RSB_NEW_MASTER);
- }
-
- if (!r->res_master_nodeid) {
- /* this will happen if recovery happens while we're looking
- up the master for this rsb */
-
- log_debug(ls, "dlm_master_lookup master 0 to %d first %x %s",
- from_nodeid, r->res_first_lkid, r->res_name);
- r->res_master_nodeid = from_nodeid;
- r->res_nodeid = from_nodeid;
- }
-
- if (!from_master && !fix_master &&
- (r->res_master_nodeid == from_nodeid)) {
- /* this can happen when the master sends remove, the dir node
- finds the rsb on the keep list and ignores the remove,
- and the former master sends a lookup */
+ * and lock_rsb is not used, but is protected by the rsbtbl lock
+ */
- log_limit(ls, "dlm_master_lookup from master %d flags %x "
- "first %x %s", from_nodeid, flags,
- r->res_first_lkid, r->res_name);
- }
+ __dlm_master_lookup(ls, r, our_nodeid, from_nodeid, true, flags,
+ r_nodeid, result);
- out_found:
- *r_nodeid = r->res_master_nodeid;
- if (result)
- *result = DLM_LU_MATCH;
+ r->res_toss_time = jiffies;
+ /* the rsb was inactive (on toss list) */
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
- if (toss_list) {
- r->res_toss_time = jiffies;
- /* the rsb was inactive (on toss list) */
- spin_unlock(&ls->ls_rsbtbl[b].lock);
- } else {
- /* the rsb was active */
- unlock_rsb(r);
- put_rsb(r);
- }
return 0;
not_found:
@@ -1076,7 +1096,6 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
if (result)
*result = DLM_LU_ADD;
*r_nodeid = from_nodeid;
- error = 0;
out_unlock:
spin_unlock(&ls->ls_rsbtbl[b].lock);
return error;
@@ -1195,7 +1214,9 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
kref_init(&lkb->lkb_ref);
INIT_LIST_HEAD(&lkb->lkb_ownqueue);
INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
+#ifdef CONFIG_DLM_DEPRECATED_API
INIT_LIST_HEAD(&lkb->lkb_time_list);
+#endif
INIT_LIST_HEAD(&lkb->lkb_cb_list);
mutex_init(&lkb->lkb_cb_mutex);
INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work);
@@ -1253,9 +1274,11 @@ static void kill_lkb(struct kref *kref)
static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
uint32_t lkid = lkb->lkb_id;
+ int rv;
- spin_lock(&ls->ls_lkbidr_spin);
- if (kref_put(&lkb->lkb_ref, kill_lkb)) {
+ rv = kref_put_lock(&lkb->lkb_ref, kill_lkb,
+ &ls->ls_lkbidr_spin);
+ if (rv) {
idr_remove(&ls->ls_lkbidr, lkid);
spin_unlock(&ls->ls_lkbidr_spin);
@@ -1265,11 +1288,9 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
if (lkb->lkb_lvbptr && is_master_copy(lkb))
dlm_free_lvb(lkb->lkb_lvbptr);
dlm_free_lkb(lkb);
- return 1;
- } else {
- spin_unlock(&ls->ls_lkbidr_spin);
- return 0;
}
+
+ return rv;
}
int dlm_put_lkb(struct dlm_lkb *lkb)
@@ -1291,6 +1312,13 @@ static inline void hold_lkb(struct dlm_lkb *lkb)
kref_get(&lkb->lkb_ref);
}
+static void unhold_lkb_assert(struct kref *kref)
+{
+ struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
+
+ DLM_ASSERT(false, dlm_print_lkb(lkb););
+}
+
/* This is called when we need to remove a reference and are certain
it's not the last ref. e.g. del_lkb is always called between a
find_lkb/put_lkb and is always the inverse of a previous add_lkb.
@@ -1298,21 +1326,23 @@ static inline void hold_lkb(struct dlm_lkb *lkb)
static inline void unhold_lkb(struct dlm_lkb *lkb)
{
- int rv;
- rv = kref_put(&lkb->lkb_ref, kill_lkb);
- DLM_ASSERT(!rv, dlm_print_lkb(lkb););
+ kref_put(&lkb->lkb_ref, unhold_lkb_assert);
}
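Passing an assert-only release to kref_put() states the invariant directly: callers of unhold_lkb() promise this is never the last reference, so the release callback should be unreachable. The same pattern in isolation (obj/ref are hypothetical):

        static void must_not_release(struct kref *kref)
        {
                /* reached only if a caller's "not the last ref"
                 * promise was broken */
                WARN_ON(1);
        }

        kref_put(&obj->ref, must_not_release);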
static void lkb_add_ordered(struct list_head *new, struct list_head *head,
int mode)
{
- struct dlm_lkb *lkb = NULL;
+ struct dlm_lkb *lkb = NULL, *iter;
- list_for_each_entry(lkb, head, lkb_statequeue)
- if (lkb->lkb_rqmode < mode)
+ list_for_each_entry(iter, head, lkb_statequeue)
+ if (iter->lkb_rqmode < mode) {
+ lkb = iter;
+ list_add_tail(new, &iter->lkb_statequeue);
break;
+ }
- __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
+ if (!lkb)
+ list_add_tail(new, head);
}
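The rewrite stops using the list_for_each_entry() cursor after the loop: when the list runs out without a break, the cursor is a bogus container_of() of the list head, so the old __list_add() against lkb->lkb_statequeue only worked by accident of layout. A separate found pointer makes the tail-append case explicit; the same idiom with a hypothetical item type:

        struct item {
                int key;
                struct list_head node;
        };

        static void insert_desc(struct list_head *new,
                                struct list_head *head, int key)
        {
                struct item *found = NULL, *iter;

                list_for_each_entry(iter, head, node)
                        if (iter->key < key) {
                                found = iter;
                                /* link new just before iter */
                                list_add_tail(new, &iter->node);
                                break;
                        }

                if (!found)
                        list_add_tail(new, head);
        }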
/* add/remove lkb to rsb's grant/convert/wait queue */
@@ -1383,75 +1413,6 @@ static int msg_reply_type(int mstype)
return -1;
}
-static int nodeid_warned(int nodeid, int num_nodes, int *warned)
-{
- int i;
-
- for (i = 0; i < num_nodes; i++) {
- if (!warned[i]) {
- warned[i] = nodeid;
- return 0;
- }
- if (warned[i] == nodeid)
- return 1;
- }
- return 0;
-}
-
-void dlm_scan_waiters(struct dlm_ls *ls)
-{
- struct dlm_lkb *lkb;
- s64 us;
- s64 debug_maxus = 0;
- u32 debug_scanned = 0;
- u32 debug_expired = 0;
- int num_nodes = 0;
- int *warned = NULL;
-
- if (!dlm_config.ci_waitwarn_us)
- return;
-
- mutex_lock(&ls->ls_waiters_mutex);
-
- list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
- if (!lkb->lkb_wait_time)
- continue;
-
- debug_scanned++;
-
- us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
-
- if (us < dlm_config.ci_waitwarn_us)
- continue;
-
- lkb->lkb_wait_time = 0;
-
- debug_expired++;
- if (us > debug_maxus)
- debug_maxus = us;
-
- if (!num_nodes) {
- num_nodes = ls->ls_num_nodes;
- warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL);
- }
- if (!warned)
- continue;
- if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
- continue;
-
- log_error(ls, "waitwarn %x %lld %d us check connection to "
- "node %d", lkb->lkb_id, (long long)us,
- dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
- }
- mutex_unlock(&ls->ls_waiters_mutex);
- kfree(warned);
-
- if (debug_expired)
- log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
- debug_scanned, debug_expired,
- dlm_config.ci_waitwarn_us, (long long)debug_maxus);
-}
-
/* add/remove lkb from global waiters list of lkb's waiting for
a reply from a remote node */
@@ -1495,7 +1456,6 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
lkb->lkb_wait_count++;
lkb->lkb_wait_type = mstype;
- lkb->lkb_wait_time = ktime_get();
lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
hold_lkb(lkb);
list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
@@ -1559,6 +1519,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
lkb->lkb_wait_type = 0;
lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
lkb->lkb_wait_count--;
+ unhold_lkb(lkb);
goto out_del;
}
@@ -1571,8 +1532,8 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
}
log_error(ls, "remwait error %x remote %d %x msg %d flags %x no wait",
- lkb->lkb_id, ms ? ms->m_header.h_nodeid : 0, lkb->lkb_remid,
- mstype, lkb->lkb_flags);
+ lkb->lkb_id, ms ? le32_to_cpu(ms->m_header.h_nodeid) : 0,
+ lkb->lkb_remid, mstype, lkb->lkb_flags);
return -1;
out_del:
@@ -1585,6 +1546,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
log_error(ls, "remwait error %x reply %d wait_type %d overlap",
lkb->lkb_id, mstype, lkb->lkb_wait_type);
lkb->lkb_wait_count--;
+ unhold_lkb(lkb);
lkb->lkb_wait_type = 0;
}
@@ -1617,10 +1579,10 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
int error;
- if (ms->m_flags != DLM_IFL_STUB_MS)
+ if (ms->m_flags != cpu_to_le32(DLM_IFL_STUB_MS))
mutex_lock(&ls->ls_waiters_mutex);
- error = _remove_from_waiters(lkb, ms->m_type, ms);
- if (ms->m_flags != DLM_IFL_STUB_MS)
+ error = _remove_from_waiters(lkb, le32_to_cpu(ms->m_type), ms);
+ if (ms->m_flags != cpu_to_le32(DLM_IFL_STUB_MS))
mutex_unlock(&ls->ls_waiters_mutex);
return error;
}
@@ -1795,7 +1757,6 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
spin_unlock(&ls->ls_remove_spin);
spin_unlock(&ls->ls_rsbtbl[b].lock);
- wake_up(&ls->ls_remove_wait);
send_remove(r);
@@ -1804,6 +1765,7 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
ls->ls_remove_len = 0;
memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
spin_unlock(&ls->ls_remove_spin);
+ wake_up(&ls->ls_remove_wait);
dlm_free_rsb(r);
}
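Waking ls_remove_wait only after ls_remove_len and ls_remove_name are cleared closes a lost-wakeup window: a waiter that rechecks the published name under ls_remove_spin can no longer see stale state after the wakeup already fired. The usual pairing, schematically (assuming the waiter's condition is that no remove name is published):

        /* waiter */
        wait_event(ls->ls_remove_wait, !ls->ls_remove_len);

        /* waker: publish final state first, then wake */
        spin_lock(&ls->ls_remove_spin);
        ls->ls_remove_len = 0;
        memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
        spin_unlock(&ls->ls_remove_spin);
        wake_up(&ls->ls_remove_wait);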
@@ -1821,6 +1783,7 @@ void dlm_scan_rsbs(struct dlm_ls *ls)
}
}
+#ifdef CONFIG_DLM_DEPRECATED_API
static void add_timeout(struct dlm_lkb *lkb)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
@@ -1866,7 +1829,7 @@ static void del_timeout(struct dlm_lkb *lkb)
void dlm_scan_timeout(struct dlm_ls *ls)
{
struct dlm_rsb *r;
- struct dlm_lkb *lkb;
+ struct dlm_lkb *lkb = NULL, *iter;
int do_cancel, do_warn;
s64 wait_us;
@@ -1877,27 +1840,28 @@ void dlm_scan_timeout(struct dlm_ls *ls)
do_cancel = 0;
do_warn = 0;
mutex_lock(&ls->ls_timeout_mutex);
- list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
+ list_for_each_entry(iter, &ls->ls_timeout, lkb_time_list) {
wait_us = ktime_to_us(ktime_sub(ktime_get(),
- lkb->lkb_timestamp));
+ iter->lkb_timestamp));
- if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
- wait_us >= (lkb->lkb_timeout_cs * 10000))
+ if ((iter->lkb_exflags & DLM_LKF_TIMEOUT) &&
+ wait_us >= (iter->lkb_timeout_cs * 10000))
do_cancel = 1;
- if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
+ if ((iter->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
wait_us >= dlm_config.ci_timewarn_cs * 10000)
do_warn = 1;
if (!do_cancel && !do_warn)
continue;
- hold_lkb(lkb);
+ hold_lkb(iter);
+ lkb = iter;
break;
}
mutex_unlock(&ls->ls_timeout_mutex);
- if (!do_cancel && !do_warn)
+ if (!lkb)
break;
r = lkb->lkb_resource;
@@ -1940,17 +1904,11 @@ void dlm_adjust_timeouts(struct dlm_ls *ls)
list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
mutex_unlock(&ls->ls_timeout_mutex);
-
- if (!dlm_config.ci_waitwarn_us)
- return;
-
- mutex_lock(&ls->ls_waiters_mutex);
- list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
- if (ktime_to_us(lkb->lkb_wait_time))
- lkb->lkb_wait_time = ktime_get();
- }
- mutex_unlock(&ls->ls_waiters_mutex);
}
+#else
+static void add_timeout(struct dlm_lkb *lkb) { }
+static void del_timeout(struct dlm_lkb *lkb) { }
+#endif
/* lkb is master or local copy */
@@ -2051,7 +2009,7 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
if (len > r->res_ls->ls_lvblen)
len = r->res_ls->ls_lvblen;
memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
- lkb->lkb_lvbseq = ms->m_lvbseq;
+ lkb->lkb_lvbseq = le32_to_cpu(ms->m_lvbseq);
}
}
@@ -2182,10 +2140,10 @@ static void munge_demoted(struct dlm_lkb *lkb)
static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms)
{
- if (ms->m_type != DLM_MSG_REQUEST_REPLY &&
- ms->m_type != DLM_MSG_GRANT) {
+ if (ms->m_type != cpu_to_le32(DLM_MSG_REQUEST_REPLY) &&
+ ms->m_type != cpu_to_le32(DLM_MSG_GRANT)) {
log_print("munge_altmode %x invalid reply type %d",
- lkb->lkb_id, ms->m_type);
+ lkb->lkb_id, le32_to_cpu(ms->m_type));
return;
}
@@ -2815,12 +2773,20 @@ static void confirm_master(struct dlm_rsb *r, int error)
}
}
+#ifdef CONFIG_DLM_DEPRECATED_API
static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
int namelen, unsigned long timeout_cs,
void (*ast) (void *astparam),
void *astparam,
void (*bast) (void *astparam, int mode),
struct dlm_args *args)
+#else
+static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
+ int namelen, void (*ast)(void *astparam),
+ void *astparam,
+ void (*bast)(void *astparam, int mode),
+ struct dlm_args *args)
+#endif
{
int rv = -EINVAL;
@@ -2873,7 +2839,9 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
args->astfn = ast;
args->astparam = astparam;
args->bastfn = bast;
+#ifdef CONFIG_DLM_DEPRECATED_API
args->timeout = timeout_cs;
+#endif
args->mode = mode;
args->lksb = lksb;
rv = 0;
@@ -2898,24 +2866,25 @@ static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_args *args)
{
- int rv = -EINVAL;
+ int rv = -EBUSY;
if (args->flags & DLM_LKF_CONVERT) {
- if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+ if (lkb->lkb_status != DLM_LKSTS_GRANTED)
goto out;
- if (args->flags & DLM_LKF_QUECVT &&
- !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
+ /* lock not allowed if there's any op in progress */
+ if (lkb->lkb_wait_type || lkb->lkb_wait_count)
goto out;
- rv = -EBUSY;
- if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+ if (is_overlap(lkb))
goto out;
- if (lkb->lkb_wait_type)
+ rv = -EINVAL;
+ if (lkb->lkb_flags & DLM_IFL_MSTCPY)
goto out;
- if (is_overlap(lkb))
+ if (args->flags & DLM_LKF_QUECVT &&
+ !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
goto out;
}
@@ -2928,14 +2897,30 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
lkb->lkb_lksb = args->lksb;
lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
lkb->lkb_ownpid = (int) current->pid;
+#ifdef CONFIG_DLM_DEPRECATED_API
lkb->lkb_timeout_cs = args->timeout;
+#endif
rv = 0;
out:
- if (rv)
- log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s",
+ switch (rv) {
+ case 0:
+ break;
+ case -EINVAL:
+ /* annoy the user because dlm usage is wrong */
+ WARN_ON(1);
+ log_error(ls, "%s %d %x %x %x %d %d %s", __func__,
+ rv, lkb->lkb_id, lkb->lkb_flags, args->flags,
+ lkb->lkb_status, lkb->lkb_wait_type,
+ lkb->lkb_resource->res_name);
+ break;
+ default:
+ log_debug(ls, "%s %d %x %x %x %d %d %s", __func__,
rv, lkb->lkb_id, lkb->lkb_flags, args->flags,
lkb->lkb_status, lkb->lkb_wait_type,
lkb->lkb_resource->res_name);
+ break;
+ }
+
return rv;
}
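The split logging makes API misuse loud and contention quiet: -EINVAL means the caller broke the rules (WARN_ON plus log_error), while -EBUSY and other transient results stay at log_debug since retrying is legitimate. Schematically, with the log arguments trimmed:

        switch (rv) {
        case 0:
                break;
        case -EINVAL:
                WARN_ON(1);     /* caller bug: be loud */
                log_error(ls, "%s %d", __func__, rv);
                break;
        default:                /* e.g. -EBUSY: transient */
                log_debug(ls, "%s %d", __func__, rv);
                break;
        }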
@@ -2949,23 +2934,12 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
- int rv = -EINVAL;
-
- if (lkb->lkb_flags & DLM_IFL_MSTCPY) {
- log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
- dlm_print_lkb(lkb);
- goto out;
- }
+ int rv = -EBUSY;
- /* an lkb may still exist even though the lock is EOL'ed due to a
- cancel, unlock or failed noqueue request; an app can't use these
- locks; return same error as if the lkid had not been found at all */
-
- if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
- log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
- rv = -ENOENT;
+ /* normal unlock not allowed if there's any op in progress */
+ if (!(args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) &&
+ (lkb->lkb_wait_type || lkb->lkb_wait_count))
goto out;
- }
/* an lkb may be waiting for an rsb lookup to complete where the
lookup was initiated by another lock */
@@ -2980,7 +2954,24 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
unhold_lkb(lkb); /* undoes create_lkb() */
}
/* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
- rv = -EBUSY;
+ goto out;
+ }
+
+ rv = -EINVAL;
+ if (lkb->lkb_flags & DLM_IFL_MSTCPY) {
+ log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
+ dlm_print_lkb(lkb);
+ goto out;
+ }
+
+ /* an lkb may still exist even though the lock is EOL'ed due to a
+ * cancel, unlock or failed noqueue request; an app can't use these
+ * locks; return same error as if the lkid had not been found at all
+ */
+
+ if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
+ log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
+ rv = -ENOENT;
goto out;
}
@@ -3053,14 +3044,8 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
goto out;
}
/* add_to_waiters() will set OVERLAP_UNLOCK */
- goto out_ok;
}
- /* normal unlock not allowed if there's any op in progress */
- rv = -EBUSY;
- if (lkb->lkb_wait_type || lkb->lkb_wait_count)
- goto out;
-
out_ok:
/* an overlapping op shouldn't blow away exflags from other op */
lkb->lkb_exflags |= args->flags;
@@ -3068,11 +3053,25 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
lkb->lkb_astparam = args->astparam;
rv = 0;
out:
- if (rv)
- log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv,
+ switch (rv) {
+ case 0:
+ break;
+ case -EINVAL:
+ /* annoy the user because dlm usage is wrong */
+ WARN_ON(1);
+ log_error(ls, "%s %d %x %x %x %x %d %s", __func__, rv,
lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
args->flags, lkb->lkb_wait_type,
lkb->lkb_resource->res_name);
+ break;
+ default:
+ log_debug(ls, "%s %d %x %x %x %x %d %s", __func__, rv,
+ lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
+ args->flags, lkb->lkb_wait_type,
+ lkb->lkb_resource->res_name);
+ break;
+ }
+
return rv;
}
@@ -3323,8 +3322,9 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
* request_lock(), convert_lock(), unlock_lock(), cancel_lock()
*/
-static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
- int len, struct dlm_args *args)
+static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ const void *name, int len,
+ struct dlm_args *args)
{
struct dlm_rsb *r;
int error;
@@ -3423,7 +3423,7 @@ int dlm_lock(dlm_lockspace_t *lockspace,
int mode,
struct dlm_lksb *lksb,
uint32_t flags,
- void *name,
+ const void *name,
unsigned int namelen,
uint32_t parent_lkid,
void (*ast) (void *astarg),
@@ -3449,10 +3449,15 @@ int dlm_lock(dlm_lockspace_t *lockspace,
if (error)
goto out;
- trace_dlm_lock_start(ls, lkb, mode, flags);
+ trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags);
+#ifdef CONFIG_DLM_DEPRECATED_API
error = set_lock_args(mode, lksb, flags, namelen, 0, ast,
astarg, bast, &args);
+#else
+ error = set_lock_args(mode, lksb, flags, namelen, ast, astarg, bast,
+ &args);
+#endif
if (error)
goto out_put;
@@ -3464,7 +3469,7 @@ int dlm_lock(dlm_lockspace_t *lockspace,
if (error == -EINPROGRESS)
error = 0;
out_put:
- trace_dlm_lock_end(ls, lkb, mode, flags, error);
+ trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, true);
if (convert || error)
__put_lkb(ls, lkb);
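With name now const void *, a caller may pass a string literal or other read-only buffer directly. A hedged usage sketch (hypothetical caller; assumes a lockspace from dlm_new_lockspace() and elides error handling):

        static struct dlm_lksb my_lksb;

        static void my_ast(void *arg)
        {
                /* my_lksb.sb_status now holds the result */
        }

        static int take_lock(dlm_lockspace_t *ls)
        {
                return dlm_lock(ls, DLM_LOCK_EX, &my_lksb, DLM_LKF_NOQUEUE,
                                "my_resource", strlen("my_resource"), 0,
                                my_ast, &my_lksb, NULL);
        }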
@@ -3563,13 +3568,13 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
ms = (struct dlm_message *) mb;
- ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- ms->m_header.u.h_lockspace = ls->ls_global_id;
- ms->m_header.h_nodeid = dlm_our_nodeid();
- ms->m_header.h_length = mb_len;
+ ms->m_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ ms->m_header.u.h_lockspace = cpu_to_le32(ls->ls_global_id);
+ ms->m_header.h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ ms->m_header.h_length = cpu_to_le16(mb_len);
ms->m_header.h_cmd = DLM_MSG;
- ms->m_type = mstype;
+ ms->m_type = cpu_to_le32(mstype);
*mh_ret = mh;
*ms_ret = ms;
@@ -3608,7 +3613,6 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
{
- dlm_message_out(ms);
dlm_midcomms_commit_mhandle(mh);
return 0;
}
@@ -3616,41 +3620,41 @@ static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
- ms->m_nodeid = lkb->lkb_nodeid;
- ms->m_pid = lkb->lkb_ownpid;
- ms->m_lkid = lkb->lkb_id;
- ms->m_remid = lkb->lkb_remid;
- ms->m_exflags = lkb->lkb_exflags;
- ms->m_sbflags = lkb->lkb_sbflags;
- ms->m_flags = lkb->lkb_flags;
- ms->m_lvbseq = lkb->lkb_lvbseq;
- ms->m_status = lkb->lkb_status;
- ms->m_grmode = lkb->lkb_grmode;
- ms->m_rqmode = lkb->lkb_rqmode;
- ms->m_hash = r->res_hash;
+ ms->m_nodeid = cpu_to_le32(lkb->lkb_nodeid);
+ ms->m_pid = cpu_to_le32(lkb->lkb_ownpid);
+ ms->m_lkid = cpu_to_le32(lkb->lkb_id);
+ ms->m_remid = cpu_to_le32(lkb->lkb_remid);
+ ms->m_exflags = cpu_to_le32(lkb->lkb_exflags);
+ ms->m_sbflags = cpu_to_le32(lkb->lkb_sbflags);
+ ms->m_flags = cpu_to_le32(lkb->lkb_flags);
+ ms->m_lvbseq = cpu_to_le32(lkb->lkb_lvbseq);
+ ms->m_status = cpu_to_le32(lkb->lkb_status);
+ ms->m_grmode = cpu_to_le32(lkb->lkb_grmode);
+ ms->m_rqmode = cpu_to_le32(lkb->lkb_rqmode);
+ ms->m_hash = cpu_to_le32(r->res_hash);
/* m_result and m_bastmode are set from function args,
not from lkb fields */
if (lkb->lkb_bastfn)
- ms->m_asts |= DLM_CB_BAST;
+ ms->m_asts |= cpu_to_le32(DLM_CB_BAST);
if (lkb->lkb_astfn)
- ms->m_asts |= DLM_CB_CAST;
+ ms->m_asts |= cpu_to_le32(DLM_CB_CAST);
/* compare with switch in create_message; send_remove() doesn't
use send_args() */
switch (ms->m_type) {
- case DLM_MSG_REQUEST:
- case DLM_MSG_LOOKUP:
+ case cpu_to_le32(DLM_MSG_REQUEST):
+ case cpu_to_le32(DLM_MSG_LOOKUP):
memcpy(ms->m_extra, r->res_name, r->res_length);
break;
- case DLM_MSG_CONVERT:
- case DLM_MSG_UNLOCK:
- case DLM_MSG_REQUEST_REPLY:
- case DLM_MSG_CONVERT_REPLY:
- case DLM_MSG_GRANT:
- if (!lkb->lkb_lvbptr)
+ case cpu_to_le32(DLM_MSG_CONVERT):
+ case cpu_to_le32(DLM_MSG_UNLOCK):
+ case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
+ case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
+ case cpu_to_le32(DLM_MSG_GRANT):
+ if (!lkb->lkb_lvbptr || !(lkb->lkb_exflags & DLM_LKF_VALBLK))
break;
memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
break;
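Since m_type is __le32, the switch compares little-endian values against cpu_to_le32(DLM_MSG_*) case labels; cpu_to_le32() of a constant folds at compile time, so it is a valid case label and no per-message byte swap happens just to branch:

        switch (ms->m_type) {
        case cpu_to_le32(DLM_MSG_REQUEST):
                /* constant-folded label, le32 compared as-is */
                break;
        }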
@@ -3699,8 +3703,8 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
/* down conversions go without a reply from the master */
if (!error && down_conversion(lkb)) {
remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
- r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS;
- r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
+ r->res_ls->ls_stub_ms.m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
+ r->res_ls->ls_stub_ms.m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
r->res_ls->ls_stub_ms.m_result = 0;
__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
}
@@ -3757,7 +3761,7 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
send_args(r, lkb, ms);
- ms->m_bastmode = mode;
+ ms->m_bastmode = cpu_to_le32(mode);
error = send_message(mh, ms);
out:
@@ -3805,7 +3809,7 @@ static int send_remove(struct dlm_rsb *r)
goto out;
memcpy(ms->m_extra, r->res_name, r->res_length);
- ms->m_hash = r->res_hash;
+ ms->m_hash = cpu_to_le32(r->res_hash);
error = send_message(mh, ms);
out:
@@ -3827,7 +3831,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
send_args(r, lkb, ms);
- ms->m_result = rv;
+ ms->m_result = cpu_to_le32(to_dlm_errno(rv));
error = send_message(mh, ms);
out:
@@ -3860,15 +3864,15 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
struct dlm_rsb *r = &ls->ls_stub_rsb;
struct dlm_message *ms;
struct dlm_mhandle *mh;
- int error, nodeid = ms_in->m_header.h_nodeid;
+ int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid);
error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
if (error)
goto out;
ms->m_lkid = ms_in->m_lkid;
- ms->m_result = rv;
- ms->m_nodeid = ret_nodeid;
+ ms->m_result = cpu_to_le32(to_dlm_errno(rv));
+ ms->m_nodeid = cpu_to_le32(ret_nodeid);
error = send_message(mh, ms);
out:
@@ -3881,25 +3885,26 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
{
- lkb->lkb_exflags = ms->m_exflags;
- lkb->lkb_sbflags = ms->m_sbflags;
+ lkb->lkb_exflags = le32_to_cpu(ms->m_exflags);
+ lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags);
lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
- (ms->m_flags & 0x0000FFFF);
+ (le32_to_cpu(ms->m_flags) & 0x0000FFFF);
}
static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
- if (ms->m_flags == DLM_IFL_STUB_MS)
+ if (ms->m_flags == cpu_to_le32(DLM_IFL_STUB_MS))
return;
- lkb->lkb_sbflags = ms->m_sbflags;
+ lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags);
lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
- (ms->m_flags & 0x0000FFFF);
+ (le32_to_cpu(ms->m_flags) & 0x0000FFFF);
}
static int receive_extralen(struct dlm_message *ms)
{
- return (ms->m_header.h_length - sizeof(struct dlm_message));
+ return (le16_to_cpu(ms->m_header.h_length) -
+ sizeof(struct dlm_message));
}
static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
@@ -3933,14 +3938,14 @@ static void fake_astfn(void *astparam)
static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
- lkb->lkb_nodeid = ms->m_header.h_nodeid;
- lkb->lkb_ownpid = ms->m_pid;
- lkb->lkb_remid = ms->m_lkid;
+ lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
+ lkb->lkb_ownpid = le32_to_cpu(ms->m_pid);
+ lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
lkb->lkb_grmode = DLM_LOCK_IV;
- lkb->lkb_rqmode = ms->m_rqmode;
+ lkb->lkb_rqmode = le32_to_cpu(ms->m_rqmode);
- lkb->lkb_bastfn = (ms->m_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
- lkb->lkb_astfn = (ms->m_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
+ lkb->lkb_bastfn = (ms->m_asts & cpu_to_le32(DLM_CB_BAST)) ? &fake_bastfn : NULL;
+ lkb->lkb_astfn = (ms->m_asts & cpu_to_le32(DLM_CB_CAST)) ? &fake_astfn : NULL;
if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
/* lkb was just created so there won't be an lvb yet */
@@ -3961,8 +3966,8 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
if (receive_lvb(ls, lkb, ms))
return -ENOMEM;
- lkb->lkb_rqmode = ms->m_rqmode;
- lkb->lkb_lvbseq = ms->m_lvbseq;
+ lkb->lkb_rqmode = le32_to_cpu(ms->m_rqmode);
+ lkb->lkb_lvbseq = le32_to_cpu(ms->m_lvbseq);
return 0;
}
@@ -3981,8 +3986,8 @@ static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb = &ls->ls_stub_lkb;
- lkb->lkb_nodeid = ms->m_header.h_nodeid;
- lkb->lkb_remid = ms->m_lkid;
+ lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
+ lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
}
/* This is called after the rsb is locked so that we can safely inspect
@@ -3990,11 +3995,12 @@ static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
{
- int from = ms->m_header.h_nodeid;
+ int from = le32_to_cpu(ms->m_header.h_nodeid);
int error = 0;
/* currently mixing of user/kernel locks are not supported */
- if (ms->m_flags & DLM_IFL_USER && ~lkb->lkb_flags & DLM_IFL_USER) {
+ if (ms->m_flags & cpu_to_le32(DLM_IFL_USER) &&
+ ~lkb->lkb_flags & DLM_IFL_USER) {
log_error(lkb->lkb_resource->res_ls,
"got user dlm message for a kernel lock");
error = -EINVAL;
@@ -4002,23 +4008,23 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
}
switch (ms->m_type) {
- case DLM_MSG_CONVERT:
- case DLM_MSG_UNLOCK:
- case DLM_MSG_CANCEL:
+ case cpu_to_le32(DLM_MSG_CONVERT):
+ case cpu_to_le32(DLM_MSG_UNLOCK):
+ case cpu_to_le32(DLM_MSG_CANCEL):
if (!is_master_copy(lkb) || lkb->lkb_nodeid != from)
error = -EINVAL;
break;
- case DLM_MSG_CONVERT_REPLY:
- case DLM_MSG_UNLOCK_REPLY:
- case DLM_MSG_CANCEL_REPLY:
- case DLM_MSG_GRANT:
- case DLM_MSG_BAST:
+ case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
+ case cpu_to_le32(DLM_MSG_UNLOCK_REPLY):
+ case cpu_to_le32(DLM_MSG_CANCEL_REPLY):
+ case cpu_to_le32(DLM_MSG_GRANT):
+ case cpu_to_le32(DLM_MSG_BAST):
if (!is_process_copy(lkb) || lkb->lkb_nodeid != from)
error = -EINVAL;
break;
- case DLM_MSG_REQUEST_REPLY:
+ case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
if (!is_process_copy(lkb))
error = -EINVAL;
else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from)
@@ -4033,8 +4039,8 @@ out:
if (error)
log_error(lkb->lkb_resource->res_ls,
"ignore invalid message %d from %d %x %x %x %d",
- ms->m_type, from, lkb->lkb_id, lkb->lkb_remid,
- lkb->lkb_flags, lkb->lkb_nodeid);
+ le32_to_cpu(ms->m_type), from, lkb->lkb_id,
+ lkb->lkb_remid, lkb->lkb_flags, lkb->lkb_nodeid);
return error;
}
@@ -4079,22 +4085,23 @@ static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len)
memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
spin_unlock(&ls->ls_remove_spin);
spin_unlock(&ls->ls_rsbtbl[b].lock);
- wake_up(&ls->ls_remove_wait);
rv = _create_message(ls, sizeof(struct dlm_message) + len,
dir_nodeid, DLM_MSG_REMOVE, &ms, &mh);
if (rv)
- return;
+ goto out;
memcpy(ms->m_extra, name, len);
- ms->m_hash = hash;
+ ms->m_hash = cpu_to_le32(hash);
send_message(mh, ms);
+out:
spin_lock(&ls->ls_remove_spin);
ls->ls_remove_len = 0;
memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
spin_unlock(&ls->ls_remove_spin);
+ wake_up(&ls->ls_remove_wait);
}
static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
@@ -4104,7 +4111,7 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
int from_nodeid;
int error, namelen = 0;
- from_nodeid = ms->m_header.h_nodeid;
+ from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
error = create_lkb(ls, &lkb);
if (error)
@@ -4177,7 +4184,7 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
if (error != -ENOTBLK) {
log_limit(ls, "receive_request %x from %d %d",
- ms->m_lkid, from_nodeid, error);
+ le32_to_cpu(ms->m_lkid), from_nodeid, error);
}
if (namelen && error == -EBADR) {
@@ -4196,15 +4203,16 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_rsb *r;
int error, reply = 1;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
goto fail;
- if (lkb->lkb_remid != ms->m_lkid) {
+ if (lkb->lkb_remid != le32_to_cpu(ms->m_lkid)) {
log_error(ls, "receive_convert %x remid %x recover_seq %llu "
"remote %d %x", lkb->lkb_id, lkb->lkb_remid,
(unsigned long long)lkb->lkb_recover_seq,
- ms->m_header.h_nodeid, ms->m_lkid);
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid));
error = -ENOENT;
dlm_put_lkb(lkb);
goto fail;
@@ -4251,14 +4259,15 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_rsb *r;
int error;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
goto fail;
- if (lkb->lkb_remid != ms->m_lkid) {
+ if (lkb->lkb_remid != le32_to_cpu(ms->m_lkid)) {
log_error(ls, "receive_unlock %x remid %x remote %d %x",
lkb->lkb_id, lkb->lkb_remid,
- ms->m_header.h_nodeid, ms->m_lkid);
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid));
error = -ENOENT;
dlm_put_lkb(lkb);
goto fail;
@@ -4302,7 +4311,7 @@ static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_rsb *r;
int error;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
goto fail;
@@ -4338,7 +4347,7 @@ static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_rsb *r;
int error;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
return error;
@@ -4369,7 +4378,7 @@ static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_rsb *r;
int error;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
return error;
@@ -4382,8 +4391,8 @@ static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
if (error)
goto out;
- queue_bast(r, lkb, ms->m_bastmode);
- lkb->lkb_highbast = ms->m_bastmode;
+ queue_bast(r, lkb, le32_to_cpu(ms->m_bastmode));
+ lkb->lkb_highbast = le32_to_cpu(ms->m_bastmode);
out:
unlock_rsb(r);
put_rsb(r);
@@ -4395,7 +4404,7 @@ static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
{
int len, error, ret_nodeid, from_nodeid, our_nodeid;
- from_nodeid = ms->m_header.h_nodeid;
+ from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
our_nodeid = dlm_our_nodeid();
len = receive_extralen(ms);
@@ -4418,7 +4427,7 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
uint32_t hash, b;
int rv, len, dir_nodeid, from_nodeid;
- from_nodeid = ms->m_header.h_nodeid;
+ from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
len = receive_extralen(ms);
@@ -4428,7 +4437,7 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
return;
}
- dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
+ dir_nodeid = dlm_hash2nodeid(ls, le32_to_cpu(ms->m_hash));
if (dir_nodeid != dlm_our_nodeid()) {
log_error(ls, "receive_remove from %d bad nodeid %d",
from_nodeid, dir_nodeid);
@@ -4501,7 +4510,7 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms)
{
- do_purge(ls, ms->m_nodeid, ms->m_pid);
+ do_purge(ls, le32_to_cpu(ms->m_nodeid), le32_to_cpu(ms->m_pid));
}
static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
@@ -4509,9 +4518,9 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_lkb *lkb;
struct dlm_rsb *r;
int error, mstype, result;
- int from_nodeid = ms->m_header.h_nodeid;
+ int from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
return error;
@@ -4527,7 +4536,8 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
if (error) {
log_error(ls, "receive_request_reply %x remote %d %x result %d",
- lkb->lkb_id, from_nodeid, ms->m_lkid, ms->m_result);
+ lkb->lkb_id, from_nodeid, le32_to_cpu(ms->m_lkid),
+ from_dlm_errno(le32_to_cpu(ms->m_result)));
dlm_dump_rsb(r);
goto out;
}
@@ -4541,7 +4551,7 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
}
/* this is the value returned from do_request() on the master */
- result = ms->m_result;
+ result = from_dlm_errno(le32_to_cpu(ms->m_result));
switch (result) {
case -EAGAIN:
@@ -4555,7 +4565,7 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
case 0:
/* request was queued or granted on remote master */
receive_flags_reply(lkb, ms);
- lkb->lkb_remid = ms->m_lkid;
+ lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
if (is_altmode(lkb))
munge_altmode(lkb, ms);
if (result) {
@@ -4628,7 +4638,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
struct dlm_message *ms)
{
/* this is the value returned from do_convert() on the master */
- switch (ms->m_result) {
+ switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
case -EAGAIN:
/* convert would block (be queued) on remote master */
queue_cast(r, lkb, -EAGAIN);
@@ -4661,8 +4671,9 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
default:
log_error(r->res_ls, "receive_convert_reply %x remote %d %x %d",
- lkb->lkb_id, ms->m_header.h_nodeid, ms->m_lkid,
- ms->m_result);
+ lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid),
+ from_dlm_errno(le32_to_cpu(ms->m_result)));
dlm_print_rsb(r);
dlm_print_lkb(lkb);
}
@@ -4696,7 +4707,7 @@ static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_lkb *lkb;
int error;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
return error;
@@ -4724,7 +4735,7 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
/* this is the value returned from do_unlock() on the master */
- switch (ms->m_result) {
+ switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
case -DLM_EUNLOCK:
receive_flags_reply(lkb, ms);
remove_lock_pc(r, lkb);
@@ -4734,7 +4745,7 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
break;
default:
log_error(r->res_ls, "receive_unlock_reply %x error %d",
- lkb->lkb_id, ms->m_result);
+ lkb->lkb_id, from_dlm_errno(le32_to_cpu(ms->m_result)));
}
out:
unlock_rsb(r);
@@ -4746,7 +4757,7 @@ static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_lkb *lkb;
int error;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
return error;
@@ -4774,7 +4785,7 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
/* this is the value returned from do_cancel() on the master */
- switch (ms->m_result) {
+ switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
case -DLM_ECANCEL:
receive_flags_reply(lkb, ms);
revert_lock_pc(r, lkb);
@@ -4784,7 +4795,8 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
break;
default:
log_error(r->res_ls, "receive_cancel_reply %x error %d",
- lkb->lkb_id, ms->m_result);
+ lkb->lkb_id,
+ from_dlm_errno(le32_to_cpu(ms->m_result)));
}
out:
unlock_rsb(r);
@@ -4796,7 +4808,7 @@ static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
struct dlm_lkb *lkb;
int error;
- error = find_lkb(ls, ms->m_remid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
if (error)
return error;
@@ -4812,9 +4824,10 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
int error, ret_nodeid;
int do_lookup_list = 0;
- error = find_lkb(ls, ms->m_lkid, &lkb);
+ error = find_lkb(ls, le32_to_cpu(ms->m_lkid), &lkb);
if (error) {
- log_error(ls, "receive_lookup_reply no lkid %x", ms->m_lkid);
+ log_error(ls, "%s no lkid %x", __func__,
+ le32_to_cpu(ms->m_lkid));
return;
}
@@ -4829,7 +4842,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
if (error)
goto out;
- ret_nodeid = ms->m_nodeid;
+ ret_nodeid = le32_to_cpu(ms->m_nodeid);
/* We sometimes receive a request from the dir node for this
rsb before we've received the dir node's lookup_reply for it.
@@ -4841,8 +4854,8 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
/* This should never happen */
log_error(ls, "receive_lookup_reply %x from %d ret %d "
"master %d dir %d our %d first %x %s",
- lkb->lkb_id, ms->m_header.h_nodeid, ret_nodeid,
- r->res_master_nodeid, r->res_dir_nodeid,
+ lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid),
+ ret_nodeid, r->res_master_nodeid, r->res_dir_nodeid,
dlm_our_nodeid(), r->res_first_lkid, r->res_name);
}
@@ -4854,7 +4867,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
} else if (ret_nodeid == -1) {
/* the remote node doesn't believe it's the dir node */
log_error(ls, "receive_lookup_reply %x from %d bad ret_nodeid",
- lkb->lkb_id, ms->m_header.h_nodeid);
+ lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid));
r->res_master_nodeid = 0;
r->res_nodeid = -1;
lkb->lkb_nodeid = -1;
@@ -4888,10 +4901,12 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms,
{
int error = 0, noent = 0;
- if (!dlm_is_member(ls, ms->m_header.h_nodeid)) {
+ if (!dlm_is_member(ls, le32_to_cpu(ms->m_header.h_nodeid))) {
log_limit(ls, "receive %d from non-member %d %x %x %d",
- ms->m_type, ms->m_header.h_nodeid, ms->m_lkid,
- ms->m_remid, ms->m_result);
+ le32_to_cpu(ms->m_type),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid),
+ from_dlm_errno(le32_to_cpu(ms->m_result)));
return;
}
@@ -4899,77 +4914,78 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms,
/* messages sent to a master node */
- case DLM_MSG_REQUEST:
+ case cpu_to_le32(DLM_MSG_REQUEST):
error = receive_request(ls, ms);
break;
- case DLM_MSG_CONVERT:
+ case cpu_to_le32(DLM_MSG_CONVERT):
error = receive_convert(ls, ms);
break;
- case DLM_MSG_UNLOCK:
+ case cpu_to_le32(DLM_MSG_UNLOCK):
error = receive_unlock(ls, ms);
break;
- case DLM_MSG_CANCEL:
+ case cpu_to_le32(DLM_MSG_CANCEL):
noent = 1;
error = receive_cancel(ls, ms);
break;
/* messages sent from a master node (replies to above) */
- case DLM_MSG_REQUEST_REPLY:
+ case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
error = receive_request_reply(ls, ms);
break;
- case DLM_MSG_CONVERT_REPLY:
+ case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
error = receive_convert_reply(ls, ms);
break;
- case DLM_MSG_UNLOCK_REPLY:
+ case cpu_to_le32(DLM_MSG_UNLOCK_REPLY):
error = receive_unlock_reply(ls, ms);
break;
- case DLM_MSG_CANCEL_REPLY:
+ case cpu_to_le32(DLM_MSG_CANCEL_REPLY):
error = receive_cancel_reply(ls, ms);
break;
/* messages sent from a master node (only two types of async msg) */
- case DLM_MSG_GRANT:
+ case cpu_to_le32(DLM_MSG_GRANT):
noent = 1;
error = receive_grant(ls, ms);
break;
- case DLM_MSG_BAST:
+ case cpu_to_le32(DLM_MSG_BAST):
noent = 1;
error = receive_bast(ls, ms);
break;
/* messages sent to a dir node */
- case DLM_MSG_LOOKUP:
+ case cpu_to_le32(DLM_MSG_LOOKUP):
receive_lookup(ls, ms);
break;
- case DLM_MSG_REMOVE:
+ case cpu_to_le32(DLM_MSG_REMOVE):
receive_remove(ls, ms);
break;
/* messages sent from a dir node (remove has no reply) */
- case DLM_MSG_LOOKUP_REPLY:
+ case cpu_to_le32(DLM_MSG_LOOKUP_REPLY):
receive_lookup_reply(ls, ms);
break;
/* other messages */
- case DLM_MSG_PURGE:
+ case cpu_to_le32(DLM_MSG_PURGE):
receive_purge(ls, ms);
break;
default:
- log_error(ls, "unknown message type %d", ms->m_type);
+ log_error(ls, "unknown message type %d",
+ le32_to_cpu(ms->m_type));
}
/*
@@ -4985,22 +5001,26 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms,
if (error == -ENOENT && noent) {
log_debug(ls, "receive %d no %x remote %d %x saved_seq %u",
- ms->m_type, ms->m_remid, ms->m_header.h_nodeid,
- ms->m_lkid, saved_seq);
+ le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_remid),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), saved_seq);
} else if (error == -ENOENT) {
log_error(ls, "receive %d no %x remote %d %x saved_seq %u",
- ms->m_type, ms->m_remid, ms->m_header.h_nodeid,
- ms->m_lkid, saved_seq);
+ le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_remid),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), saved_seq);
- if (ms->m_type == DLM_MSG_CONVERT)
- dlm_dump_rsb_hash(ls, ms->m_hash);
+ if (ms->m_type == cpu_to_le32(DLM_MSG_CONVERT))
+ dlm_dump_rsb_hash(ls, le32_to_cpu(ms->m_hash));
}
if (error == -EINVAL) {
log_error(ls, "receive %d inval from %d lkid %x remid %x "
"saved_seq %u",
- ms->m_type, ms->m_header.h_nodeid,
- ms->m_lkid, ms->m_remid, saved_seq);
+ le32_to_cpu(ms->m_type),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid),
+ saved_seq);
}
}
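
The hunks above switch _receive_message() from byte-swapping every incoming field to comparing the on-wire __le32 values directly against cpu_to_le32() constants, which the compiler folds at build time. Below is a minimal userspace sketch of that comparison style; the types and helpers are illustrative stand-ins, not the kernel's (an if/else chain is used because this userspace cpu_to_le32() is a function, not a constant expression usable in case labels, whereas the kernel's version folds to a constant):

#include <stdint.h>
#include <stdio.h>

/* illustrative stand-in for the kernel's cpu_to_le32() */
static inline uint32_t cpu_to_le32(uint32_t x)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	return x;
#else
	return __builtin_bswap32(x);
#endif
}

enum { MSG_REQUEST = 1, MSG_CONVERT = 2 };

/* m_type stays in wire (little-endian) byte order; each constant is
 * converted on the comparison side instead of swapping the field */
static void dispatch(uint32_t m_type)
{
	if (m_type == cpu_to_le32(MSG_REQUEST))
		puts("request");
	else if (m_type == cpu_to_le32(MSG_CONVERT))
		puts("convert");
	else
		puts("unknown message type");
}

int main(void)
{
	dispatch(cpu_to_le32(MSG_CONVERT));
	return 0;
}
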
@@ -5021,7 +5041,7 @@ static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms,
lockspace generation before we left. */
if (!ls->ls_generation) {
log_limit(ls, "receive %d from %d ignore old gen",
- ms->m_type, nodeid);
+ le32_to_cpu(ms->m_type), nodeid);
return;
}
@@ -5054,30 +5074,30 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
switch (hd->h_cmd) {
case DLM_MSG:
- dlm_message_in(&p->message);
- type = p->message.m_type;
+ type = le32_to_cpu(p->message.m_type);
break;
case DLM_RCOM:
- dlm_rcom_in(&p->rcom);
- type = p->rcom.rc_type;
+ type = le32_to_cpu(p->rcom.rc_type);
break;
default:
log_print("invalid h_cmd %d from %u", hd->h_cmd, nodeid);
return;
}
- if (hd->h_nodeid != nodeid) {
+ if (le32_to_cpu(hd->h_nodeid) != nodeid) {
log_print("invalid h_nodeid %d from %d lockspace %x",
- hd->h_nodeid, nodeid, hd->u.h_lockspace);
+ le32_to_cpu(hd->h_nodeid), nodeid,
+ le32_to_cpu(hd->u.h_lockspace));
return;
}
- ls = dlm_find_lockspace_global(hd->u.h_lockspace);
+ ls = dlm_find_lockspace_global(le32_to_cpu(hd->u.h_lockspace));
if (!ls) {
if (dlm_config.ci_log_debug) {
printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace "
"%u from %d cmd %d type %d\n",
- hd->u.h_lockspace, nodeid, hd->h_cmd, type);
+ le32_to_cpu(hd->u.h_lockspace), nodeid,
+ hd->h_cmd, type);
}
if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
@@ -5091,8 +5111,11 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
down_read(&ls->ls_recv_active);
if (hd->h_cmd == DLM_MSG)
dlm_receive_message(ls, &p->message, nodeid);
- else
+ else if (hd->h_cmd == DLM_RCOM)
dlm_receive_rcom(ls, &p->rcom, nodeid);
+ else
+ log_error(ls, "invalid h_cmd %d from %d lockspace %x",
+ hd->h_cmd, nodeid, le32_to_cpu(hd->u.h_lockspace));
up_read(&ls->ls_recv_active);
dlm_put_lockspace(ls);
@@ -5104,10 +5127,10 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
if (middle_conversion(lkb)) {
hold_lkb(lkb);
memset(ms_stub, 0, sizeof(struct dlm_message));
- ms_stub->m_flags = DLM_IFL_STUB_MS;
- ms_stub->m_type = DLM_MSG_CONVERT_REPLY;
- ms_stub->m_result = -EINPROGRESS;
- ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+ ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
+ ms_stub->m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
+ ms_stub->m_result = cpu_to_le32(to_dlm_errno(-EINPROGRESS));
+ ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
_receive_convert_reply(lkb, ms_stub);
/* Same special case as in receive_rcom_lock_args() */
@@ -5226,10 +5249,10 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
case DLM_MSG_UNLOCK:
hold_lkb(lkb);
memset(ms_stub, 0, sizeof(struct dlm_message));
- ms_stub->m_flags = DLM_IFL_STUB_MS;
- ms_stub->m_type = DLM_MSG_UNLOCK_REPLY;
- ms_stub->m_result = stub_unlock_result;
- ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+ ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
+ ms_stub->m_type = cpu_to_le32(DLM_MSG_UNLOCK_REPLY);
+ ms_stub->m_result = cpu_to_le32(to_dlm_errno(stub_unlock_result));
+ ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
_receive_unlock_reply(lkb, ms_stub);
dlm_put_lkb(lkb);
break;
@@ -5237,10 +5260,10 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
case DLM_MSG_CANCEL:
hold_lkb(lkb);
memset(ms_stub, 0, sizeof(struct dlm_message));
- ms_stub->m_flags = DLM_IFL_STUB_MS;
- ms_stub->m_type = DLM_MSG_CANCEL_REPLY;
- ms_stub->m_result = stub_cancel_result;
- ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+ ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
+ ms_stub->m_type = cpu_to_le32(DLM_MSG_CANCEL_REPLY);
+ ms_stub->m_result = cpu_to_le32(to_dlm_errno(stub_cancel_result));
+ ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
_receive_cancel_reply(lkb, ms_stub);
dlm_put_lkb(lkb);
break;
@@ -5257,21 +5280,18 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
{
- struct dlm_lkb *lkb;
- int found = 0;
+ struct dlm_lkb *lkb = NULL, *iter;
mutex_lock(&ls->ls_waiters_mutex);
- list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
- if (lkb->lkb_flags & DLM_IFL_RESEND) {
- hold_lkb(lkb);
- found = 1;
+ list_for_each_entry(iter, &ls->ls_waiters, lkb_wait_reply) {
+ if (iter->lkb_flags & DLM_IFL_RESEND) {
+ hold_lkb(iter);
+ lkb = iter;
break;
}
}
mutex_unlock(&ls->ls_waiters_mutex);
- if (!found)
- lkb = NULL;
return lkb;
}
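
The find_resend_waiter() rework above is the common kernel cleanup for list_for_each_entry(): the loop cursor is not a valid entry once the loop terminates without a break, so matches are published through a second pointer. A standalone sketch of the idiom with a plain singly linked list (illustrative types, not the kernel list API):

#include <stdio.h>

struct lkb {
	int id;
	unsigned int flags;
	struct lkb *next;
};

#define IFL_RESEND 0x1	/* illustrative flag value */

static struct lkb *find_resend_waiter(struct lkb *head)
{
	struct lkb *lkb = NULL;	/* result, set only on a real match */

	for (struct lkb *iter = head; iter; iter = iter->next) {
		if (iter->flags & IFL_RESEND) {
			lkb = iter;	/* publish the match */
			break;
		}
	}
	/* the cursor is dead here; lkb is NULL or a matched entry */
	return lkb;
}

int main(void)
{
	struct lkb b = { 2, IFL_RESEND, NULL };
	struct lkb a = { 1, 0, &b };

	struct lkb *hit = find_resend_waiter(&a);
	printf("found %d\n", hit ? hit->id : -1);
	return 0;
}
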
@@ -5331,11 +5351,16 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
lkb->lkb_wait_type = 0;
- lkb->lkb_wait_count = 0;
+			/* drop all wait_count references; we still
+ * hold a reference for this iteration.
+ */
+ while (lkb->lkb_wait_count) {
+ lkb->lkb_wait_count--;
+ unhold_lkb(lkb);
+ }
mutex_lock(&ls->ls_waiters_mutex);
list_del_init(&lkb->lkb_wait_reply);
mutex_unlock(&ls->ls_waiters_mutex);
- unhold_lkb(lkb); /* for waiters list */
if (oc || ou) {
/* do an unlock or cancel instead of resending */
@@ -5605,7 +5630,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
{
struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
- lkb->lkb_nodeid = rc->rc_header.h_nodeid;
+ lkb->lkb_nodeid = le32_to_cpu(rc->rc_header.h_nodeid);
lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid);
lkb->lkb_remid = le32_to_cpu(rl->rl_lkid);
lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags);
@@ -5620,8 +5645,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
- int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
- sizeof(struct rcom_lock);
+ int lvblen = le16_to_cpu(rc->rc_header.h_length) -
+ sizeof(struct dlm_rcom) - sizeof(struct rcom_lock);
if (lvblen > ls->ls_lvblen)
return -EINVAL;
lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
@@ -5657,7 +5682,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
struct dlm_rsb *r;
struct dlm_lkb *lkb;
uint32_t remid = 0;
- int from_nodeid = rc->rc_header.h_nodeid;
+ int from_nodeid = le32_to_cpu(rc->rc_header.h_nodeid);
int error;
if (rl->rl_parent_lkid) {
@@ -5707,7 +5732,6 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
attach_lkb(r, lkb);
add_lkb(r, lkb, rl->rl_status);
- error = 0;
ls->ls_recover_locks_in++;
if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
@@ -5747,7 +5771,8 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
error = find_lkb(ls, lkid, &lkb);
if (error) {
log_error(ls, "dlm_recover_process_copy no %x remote %d %x %d",
- lkid, rc->rc_header.h_nodeid, remid, result);
+ lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
+ result);
return error;
}
@@ -5757,7 +5782,8 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
if (!is_process_copy(lkb)) {
log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d",
- lkid, rc->rc_header.h_nodeid, remid, result);
+ lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
+ result);
dlm_dump_rsb(r);
unlock_rsb(r);
put_rsb(r);
@@ -5772,7 +5798,8 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
a barrier between recover_masters and recover_locks. */
log_debug(ls, "dlm_recover_process_copy %x remote %d %x %d",
- lkid, rc->rc_header.h_nodeid, remid, result);
+ lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
+ result);
dlm_send_rcom_lock(r, lkb);
goto out;
@@ -5782,7 +5809,8 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
break;
default:
log_error(ls, "dlm_recover_process_copy %x remote %d %x %d unk",
- lkid, rc->rc_header.h_nodeid, remid, result);
+ lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
+ result);
}
/* an ack for dlm_recover_locks() which waits for replies from
@@ -5796,12 +5824,18 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
return 0;
}
+#ifdef CONFIG_DLM_DEPRECATED_API
int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
int mode, uint32_t flags, void *name, unsigned int namelen,
unsigned long timeout_cs)
+#else
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
+ int mode, uint32_t flags, void *name, unsigned int namelen)
+#endif
{
struct dlm_lkb *lkb;
struct dlm_args args;
+ bool do_put = true;
int error;
dlm_lock_recovery(ls);
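
The CONFIG_DLM_DEPRECATED_API blocks above keep two alternative function signatures behind one preprocessor switch, so the legacy timeout argument only exists when the deprecated API is built in. A compact model of that arrangement; the names and config macro here are illustrative, not the kernel's:

#include <stdio.h>

/* define this to model CONFIG_DLM_DEPRECATED_API=y */
/* #define DEPRECATED_API 1 */

#ifdef DEPRECATED_API
static int user_request(int mode, unsigned long timeout_cs)
{
	printf("mode %d, legacy timeout %lu\n", mode, timeout_cs);
	return 0;
}
#else
static int user_request(int mode)
{
	printf("mode %d (timeouts removed)\n", mode);
	return 0;
}
#endif

int main(void)
{
#ifdef DEPRECATED_API
	return user_request(1, 500);
#else
	return user_request(1);
#endif
}
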
@@ -5812,23 +5846,28 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
goto out;
}
+ trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags);
+
if (flags & DLM_LKF_VALBLK) {
ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
if (!ua->lksb.sb_lvbptr) {
kfree(ua);
- __put_lkb(ls, lkb);
error = -ENOMEM;
- goto out;
+ goto out_put;
}
}
+#ifdef CONFIG_DLM_DEPRECATED_API
error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
fake_astfn, ua, fake_bastfn, &args);
+#else
+ error = set_lock_args(mode, &ua->lksb, flags, namelen, fake_astfn, ua,
+ fake_bastfn, &args);
+#endif
if (error) {
kfree(ua->lksb.sb_lvbptr);
ua->lksb.sb_lvbptr = NULL;
kfree(ua);
- __put_lkb(ls, lkb);
- goto out;
+ goto out_put;
}
/* After ua is attached to lkb it will be freed by dlm_free_lkb().
@@ -5847,8 +5886,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
error = 0;
fallthrough;
default:
- __put_lkb(ls, lkb);
- goto out;
+ goto out_put;
}
/* add this new lkb to the per-process list of locks */
@@ -5856,14 +5894,24 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
hold_lkb(lkb);
list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
spin_unlock(&ua->proc->locks_spin);
+ do_put = false;
+ out_put:
+ trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, false);
+ if (do_put)
+ __put_lkb(ls, lkb);
out:
dlm_unlock_recovery(ls);
return error;
}
+#ifdef CONFIG_DLM_DEPRECATED_API
int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
unsigned long timeout_cs)
+#else
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
+#endif
{
struct dlm_lkb *lkb;
struct dlm_args args;
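
The do_put rework in dlm_user_request() above funnels all paths through one exit so the trace hook always fires, while a bool records whether the error-path reference drop is still owed. A small userspace model of the same shape, with illustrative names and a plain refcount standing in for the lkb reference:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	int refs;
};

static void put_obj(struct obj *o)
{
	if (--o->refs == 0)
		free(o);
}

static int request(bool fail_midway)
{
	bool do_put = true;
	struct obj *o;
	int error = 0;

	o = calloc(1, sizeof(*o));
	if (!o)
		return -ENOMEM;
	o->refs = 1;

	if (fail_midway) {
		error = -EINVAL;
		goto out_put;	/* all failure paths share one exit */
	}

	/* success: ownership handed off elsewhere, keep the reference */
	do_put = false;
out_put:
	printf("trace: request end, error=%d\n", error);	/* always runs */
	if (do_put)
		put_obj(o);	/* dropped only while we still own it */
	return error;
}

int main(void)
{
	return request(true) == -EINVAL ? 0 : 1;
}
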
@@ -5876,6 +5924,8 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
if (error)
goto out;
+ trace_dlm_lock_start(ls, lkb, NULL, 0, mode, flags);
+
/* user can change the params on its lock when it converts it, or
add an lvb that didn't exist before */
@@ -5898,8 +5948,13 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
ua->bastaddr = ua_tmp->bastaddr;
ua->user_lksb = ua_tmp->user_lksb;
+#ifdef CONFIG_DLM_DEPRECATED_API
error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs,
fake_astfn, ua, fake_bastfn, &args);
+#else
+ error = set_lock_args(mode, &ua->lksb, flags, 0, fake_astfn, ua,
+ fake_bastfn, &args);
+#endif
if (error)
goto out_put;
@@ -5908,6 +5963,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK)
error = 0;
out_put:
+ trace_dlm_lock_end(ls, lkb, NULL, 0, mode, flags, error, false);
dlm_put_lkb(lkb);
out:
dlm_unlock_recovery(ls);
@@ -5923,39 +5979,38 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, void *name, unsigned int namelen,
- unsigned long timeout_cs, uint32_t *lkid)
+ uint32_t *lkid)
{
- struct dlm_lkb *lkb;
+ struct dlm_lkb *lkb = NULL, *iter;
struct dlm_user_args *ua;
int found_other_mode = 0;
- int found = 0;
int rv = 0;
mutex_lock(&ls->ls_orphans_mutex);
- list_for_each_entry(lkb, &ls->ls_orphans, lkb_ownqueue) {
- if (lkb->lkb_resource->res_length != namelen)
+ list_for_each_entry(iter, &ls->ls_orphans, lkb_ownqueue) {
+ if (iter->lkb_resource->res_length != namelen)
continue;
- if (memcmp(lkb->lkb_resource->res_name, name, namelen))
+ if (memcmp(iter->lkb_resource->res_name, name, namelen))
continue;
- if (lkb->lkb_grmode != mode) {
+ if (iter->lkb_grmode != mode) {
found_other_mode = 1;
continue;
}
- found = 1;
- list_del_init(&lkb->lkb_ownqueue);
- lkb->lkb_flags &= ~DLM_IFL_ORPHAN;
- *lkid = lkb->lkb_id;
+ lkb = iter;
+ list_del_init(&iter->lkb_ownqueue);
+ iter->lkb_flags &= ~DLM_IFL_ORPHAN;
+ *lkid = iter->lkb_id;
break;
}
mutex_unlock(&ls->ls_orphans_mutex);
- if (!found && found_other_mode) {
+ if (!lkb && found_other_mode) {
rv = -EAGAIN;
goto out;
}
- if (!found) {
+ if (!lkb) {
rv = -ENOENT;
goto out;
}
@@ -6001,6 +6056,8 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
if (error)
goto out;
+ trace_dlm_unlock_start(ls, lkb, flags);
+
ua = lkb->lkb_ua;
if (lvb_in && ua->lksb.sb_lvbptr)
@@ -6029,6 +6086,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
spin_unlock(&ua->proc->locks_spin);
out_put:
+ trace_dlm_unlock_end(ls, lkb, flags, error);
dlm_put_lkb(lkb);
out:
dlm_unlock_recovery(ls);
@@ -6050,6 +6108,8 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
if (error)
goto out;
+ trace_dlm_unlock_start(ls, lkb, flags);
+
ua = lkb->lkb_ua;
if (ua_tmp->castparam)
ua->castparam = ua_tmp->castparam;
@@ -6067,6 +6127,7 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
if (error == -EBUSY)
error = 0;
out_put:
+ trace_dlm_unlock_end(ls, lkb, flags, error);
dlm_put_lkb(lkb);
out:
dlm_unlock_recovery(ls);
@@ -6088,6 +6149,8 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid)
if (error)
goto out;
+ trace_dlm_unlock_start(ls, lkb, flags);
+
ua = lkb->lkb_ua;
error = set_unlock_args(flags, ua, &args);
@@ -6116,6 +6179,7 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid)
if (error == -EBUSY)
error = 0;
out_put:
+ trace_dlm_unlock_end(ls, lkb, flags, error);
dlm_put_lkb(lkb);
out:
dlm_unlock_recovery(ls);
@@ -6171,7 +6235,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
{
struct dlm_lkb *lkb = NULL;
- mutex_lock(&ls->ls_clear_proc_locks);
+ spin_lock(&ls->ls_clear_proc_locks);
if (list_empty(&proc->locks))
goto out;
@@ -6183,7 +6247,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
else
lkb->lkb_flags |= DLM_IFL_DEAD;
out:
- mutex_unlock(&ls->ls_clear_proc_locks);
+ spin_unlock(&ls->ls_clear_proc_locks);
return lkb;
}
@@ -6220,7 +6284,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
dlm_put_lkb(lkb);
}
- mutex_lock(&ls->ls_clear_proc_locks);
+ spin_lock(&ls->ls_clear_proc_locks);
/* in-progress unlocks */
list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
@@ -6236,7 +6300,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
dlm_put_lkb(lkb);
}
- mutex_unlock(&ls->ls_clear_proc_locks);
+ spin_unlock(&ls->ls_clear_proc_locks);
dlm_unlock_recovery(ls);
}
@@ -6307,8 +6371,8 @@ static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
DLM_MSG_PURGE, &ms, &mh);
if (error)
return error;
- ms->m_nodeid = nodeid;
- ms->m_pid = pid;
+ ms->m_nodeid = cpu_to_le32(nodeid);
+ ms->m_pid = cpu_to_le32(pid);
return send_message(mh, ms);
}
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 252a5898f908..40c76b5544da 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -24,13 +24,19 @@ int dlm_put_lkb(struct dlm_lkb *lkb);
void dlm_scan_rsbs(struct dlm_ls *ls);
int dlm_lock_recovery_try(struct dlm_ls *ls);
void dlm_unlock_recovery(struct dlm_ls *ls);
-void dlm_scan_waiters(struct dlm_ls *ls);
+
+#ifdef CONFIG_DLM_DEPRECATED_API
void dlm_scan_timeout(struct dlm_ls *ls);
void dlm_adjust_timeouts(struct dlm_ls *ls);
+#else
+static inline void dlm_scan_timeout(struct dlm_ls *ls) { }
+static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { }
+#endif
+
int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len,
unsigned int flags, int *r_nodeid, int *result);
-int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len,
+int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len,
struct dlm_rsb **r_ret);
void dlm_recover_purge(struct dlm_ls *ls);
@@ -41,15 +47,22 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls);
int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+#ifdef CONFIG_DLM_DEPRECATED_API
int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
uint32_t flags, void *name, unsigned int namelen,
unsigned long timeout_cs);
int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
unsigned long timeout_cs);
+#else
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
+ uint32_t flags, void *name, unsigned int namelen);
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
+#endif
int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, void *name, unsigned int namelen,
- unsigned long timeout_cs, uint32_t *lkid);
+ uint32_t *lkid);
int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
uint32_t flags, uint32_t lkid, char *lvb_in);
int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 0d3833a124a3..bae050df7abf 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -275,7 +275,6 @@ static int dlm_scand(void *data)
ls->ls_scan_time = jiffies;
dlm_scan_rsbs(ls);
dlm_scan_timeout(ls);
- dlm_scan_waiters(ls);
dlm_unlock_recovery(ls);
} else {
ls->ls_scan_time += HZ;
@@ -417,7 +416,7 @@ static int new_lockspace(const char *name, const char *cluster,
if (namelen > DLM_LOCKSPACE_LEN || namelen == 0)
return -EINVAL;
- if (!lvblen || (lvblen % 8))
+ if (lvblen % 8)
return -EINVAL;
if (!try_module_get(THIS_MODULE))
@@ -490,13 +489,28 @@ static int new_lockspace(const char *name, const char *cluster,
ls->ls_ops_arg = ops_arg;
}
- if (flags & DLM_LSFL_TIMEWARN)
+#ifdef CONFIG_DLM_DEPRECATED_API
+ if (flags & DLM_LSFL_TIMEWARN) {
+ pr_warn_once("===============================================================\n"
+ "WARNING: the dlm DLM_LSFL_TIMEWARN flag is being deprecated and\n"
+ " will be removed in v6.2!\n"
+ " Inclusive DLM_LSFL_TIMEWARN define in UAPI header!\n"
+ "===============================================================\n");
+
set_bit(LSFL_TIMEWARN, &ls->ls_flags);
+ }
/* ls_exflags are forced to match among nodes, and we don't
- need to require all nodes to have some flags set */
+ * need to require all nodes to have some flags set
+ */
ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
DLM_LSFL_NEWEXCL));
+#else
+ /* ls_exflags are forced to match among nodes, and we don't
+ * need to require all nodes to have some flags set
+ */
+ ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL));
+#endif
size = READ_ONCE(dlm_config.ci_rsbtbl_size);
ls->ls_rsbtbl_size = size;
@@ -527,8 +541,10 @@ static int new_lockspace(const char *name, const char *cluster,
mutex_init(&ls->ls_waiters_mutex);
INIT_LIST_HEAD(&ls->ls_orphans);
mutex_init(&ls->ls_orphans_mutex);
+#ifdef CONFIG_DLM_DEPRECATED_API
INIT_LIST_HEAD(&ls->ls_timeout);
mutex_init(&ls->ls_timeout_mutex);
+#endif
INIT_LIST_HEAD(&ls->ls_new_rsb);
spin_lock_init(&ls->ls_new_rsb_spin);
@@ -548,8 +564,8 @@ static int new_lockspace(const char *name, const char *cluster,
init_waitqueue_head(&ls->ls_uevent_wait);
ls->ls_uevent_result = 0;
- init_completion(&ls->ls_members_done);
- ls->ls_members_result = -1;
+ init_completion(&ls->ls_recovery_done);
+ ls->ls_recovery_result = -1;
mutex_init(&ls->ls_cb_mutex);
INIT_LIST_HEAD(&ls->ls_cb_delay);
@@ -568,7 +584,7 @@ static int new_lockspace(const char *name, const char *cluster,
atomic_set(&ls->ls_requestqueue_cnt, 0);
init_waitqueue_head(&ls->ls_requestqueue_wait);
mutex_init(&ls->ls_requestqueue_mutex);
- mutex_init(&ls->ls_clear_proc_locks);
+ spin_lock_init(&ls->ls_clear_proc_locks);
	/* Due to backwards compatibility with 3.1 we need to use the maximum
* possible dlm message size to be sure the message will fit and
@@ -645,8 +661,9 @@ static int new_lockspace(const char *name, const char *cluster,
if (error)
goto out_recoverd;
- wait_for_completion(&ls->ls_members_done);
- error = ls->ls_members_result;
+	/* wait until recovery has succeeded or failed */
+ wait_for_completion(&ls->ls_recovery_done);
+ error = ls->ls_recovery_result;
if (error)
goto out_members;
@@ -686,10 +703,11 @@ static int new_lockspace(const char *name, const char *cluster,
return error;
}
-int dlm_new_lockspace(const char *name, const char *cluster,
- uint32_t flags, int lvblen,
- const struct dlm_lockspace_ops *ops, void *ops_arg,
- int *ops_result, dlm_lockspace_t **lockspace)
+static int __dlm_new_lockspace(const char *name, const char *cluster,
+ uint32_t flags, int lvblen,
+ const struct dlm_lockspace_ops *ops,
+ void *ops_arg, int *ops_result,
+ dlm_lockspace_t **lockspace)
{
int error = 0;
@@ -715,6 +733,25 @@ int dlm_new_lockspace(const char *name, const char *cluster,
return error;
}
+int dlm_new_lockspace(const char *name, const char *cluster, uint32_t flags,
+ int lvblen, const struct dlm_lockspace_ops *ops,
+ void *ops_arg, int *ops_result,
+ dlm_lockspace_t **lockspace)
+{
+ return __dlm_new_lockspace(name, cluster, flags | DLM_LSFL_FS, lvblen,
+ ops, ops_arg, ops_result, lockspace);
+}
+
+int dlm_new_user_lockspace(const char *name, const char *cluster,
+ uint32_t flags, int lvblen,
+ const struct dlm_lockspace_ops *ops,
+ void *ops_arg, int *ops_result,
+ dlm_lockspace_t **lockspace)
+{
+ return __dlm_new_lockspace(name, cluster, flags, lvblen, ops,
+ ops_arg, ops_result, lockspace);
+}
+
static int lkb_idr_is_local(int id, void *p, void *data)
{
struct dlm_lkb *lkb = p;
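
The two wrappers above split kernel and user lockspace creation: dlm_new_lockspace() forces the DLM_LSFL_FS flag on for in-kernel callers, while dlm_new_user_lockspace() leaves the flags to the device interface. A trivial model of that split; the flag value is copied from the lockspace.h hunk, everything else is illustrative:

#include <stdint.h>
#include <stdio.h>

#define LSFL_FS 0x00000004	/* value from the lockspace.h hunk */

static int new_lockspace(const char *name, uint32_t flags)
{
	printf("creating %s, flags %#x\n", name, flags);
	return 0;
}

/* kernel callers: the in-kernel flag is always forced on */
static int new_kernel_lockspace(const char *name, uint32_t flags)
{
	return new_lockspace(name, flags | LSFL_FS);
}

/* device (user) interface: flags pass through untouched */
static int new_user_lockspace(const char *name, uint32_t flags)
{
	return new_lockspace(name, flags);
}

int main(void)
{
	new_kernel_lockspace("fsls", 0);
	new_user_lockspace("userls", 0);
	return 0;
}
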
@@ -922,3 +959,15 @@ void dlm_stop_lockspaces(void)
log_print("dlm user daemon left %d lockspaces", count);
}
+void dlm_stop_lockspaces_check(void)
+{
+ struct dlm_ls *ls;
+
+ spin_lock(&lslist_lock);
+ list_for_each_entry(ls, &lslist, ls_list) {
+ if (WARN_ON(!rwsem_is_locked(&ls->ls_in_recovery) ||
+ !dlm_locking_stopped(ls)))
+ break;
+ }
+ spin_unlock(&lslist_lock);
+}
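
dlm_stop_lockspaces_check() above scans the global lockspace list under its lock and warns if any lockspace is not stopped while a node is being closed. A standalone model of that invariant check, with pthreads standing in for the kernel lock and WARN_ON() (illustrative types):

#include <pthread.h>
#include <stdio.h>

struct ls {
	int locking_stopped;
	struct ls *next;
};

static pthread_mutex_t lslist_lock = PTHREAD_MUTEX_INITIALIZER;
static struct ls *lslist;

static void stop_lockspaces_check(void)
{
	pthread_mutex_lock(&lslist_lock);
	for (struct ls *ls = lslist; ls; ls = ls->next) {
		if (!ls->locking_stopped) {
			/* models WARN_ON(): report and stop scanning */
			fprintf(stderr, "warning: lockspace not stopped\n");
			break;
		}
	}
	pthread_mutex_unlock(&lslist_lock);
}

int main(void)
{
	struct ls a = { 1, NULL };

	lslist = &a;
	stop_lockspaces_check();	/* silent: the invariant holds */
	return 0;
}
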
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
index a78d853b9342..03f4a4a3a871 100644
--- a/fs/dlm/lockspace.h
+++ b/fs/dlm/lockspace.h
@@ -12,6 +12,14 @@
#ifndef __LOCKSPACE_DOT_H__
#define __LOCKSPACE_DOT_H__
+/* DLM_LSFL_FS
+ * The lockspace user is in the kernel (i.e. filesystem). Enables
+ * direct bast/cast callbacks.
+ *
+ * internal lockspace flag - will be removed in the future
+ */
+#define DLM_LSFL_FS 0x00000004
+
int dlm_lockspace_init(void);
void dlm_lockspace_exit(void);
struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
@@ -19,6 +27,12 @@ struct dlm_ls *dlm_find_lockspace_local(void *id);
struct dlm_ls *dlm_find_lockspace_device(int minor);
void dlm_put_lockspace(struct dlm_ls *ls);
void dlm_stop_lockspaces(void);
+void dlm_stop_lockspaces_check(void);
+int dlm_new_user_lockspace(const char *name, const char *cluster,
+ uint32_t flags, int lvblen,
+ const struct dlm_lockspace_ops *ops,
+ void *ops_arg, int *ops_result,
+ dlm_lockspace_t **lockspace);
#endif /* __LOCKSPACE_DOT_H__ */
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index e284d696c1fd..59f64c596233 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -529,7 +529,7 @@ static void lowcomms_write_space(struct sock *sk)
return;
if (!test_and_set_bit(CF_CONNECTED, &con->flags)) {
- log_print("successful connected to node %d", con->nodeid);
+ log_print("connected to node %d", con->nodeid);
queue_work(send_workqueue, &con->swork);
return;
}
@@ -1303,6 +1303,10 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len,
return msg;
}
+/* avoid a false positive for connections_srcu; the unlock happens in
+ * dlm_lowcomms_commit_msg(), which must be called on success
+ */
+#ifndef __CHECKER__
struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
char **ppc, void (*cb)(void *data),
void *data)
@@ -1332,10 +1336,13 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
return NULL;
}
+ /* for dlm_lowcomms_commit_msg() */
+ kref_get(&msg->ref);
	/* we assume commit must be called on success */
msg->idx = idx;
return msg;
}
+#endif
static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
{
@@ -1362,11 +1369,18 @@ out:
return;
}
+/* avoid a false positive for connections_srcu; the lock was taken in
+ * dlm_lowcomms_new_msg()
+ */
+#ifndef __CHECKER__
void dlm_lowcomms_commit_msg(struct dlm_msg *msg)
{
_dlm_lowcomms_commit_msg(msg);
srcu_read_unlock(&connections_srcu, msg->idx);
+	/* drop the reference taken in dlm_lowcomms_new_msg() */
+ kref_put(&msg->ref, dlm_msg_release);
}
+#endif
void dlm_lowcomms_put_msg(struct dlm_msg *msg)
{
@@ -1789,7 +1803,7 @@ static int dlm_listen_for_all(void)
SOCK_STREAM, dlm_proto_ops->proto, &sock);
if (result < 0) {
log_print("Can't create comms socket: %d", result);
- goto out;
+ return result;
}
sock_set_mark(sock->sk, dlm_config.ci_mark);
@@ -1921,7 +1935,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock,
return ret;
if (!test_and_set_bit(CF_CONNECTED, &con->flags))
- log_print("successful connected to node %d", con->nodeid);
+ log_print("connected to node %d", con->nodeid);
return 0;
}
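
The kref_get()/kref_put() pair added above gives dlm_lowcomms_commit_msg() its own reference on the message, taken in dlm_lowcomms_new_msg(), so the caller's later put cannot free the message out from under the commit. A minimal refcount model of the pairing (a hand-rolled counter instead of struct kref, names illustrative):

#include <stdio.h>
#include <stdlib.h>

struct msg {
	int ref;	/* hand-rolled counter standing in for struct kref */
};

static void msg_put(struct msg *m)
{
	if (--m->ref == 0) {
		puts("message released");
		free(m);
	}
}

static struct msg *new_msg(void)
{
	struct msg *m = calloc(1, sizeof(*m));

	if (!m)
		return NULL;
	m->ref = 1;	/* the caller's reference */
	m->ref++;	/* extra reference owned by commit_msg() */
	return m;
}

static void commit_msg(struct msg *m)
{
	puts("message committed");
	msg_put(m);	/* drop the reference new_msg() took for us */
}

int main(void)
{
	struct msg *m = new_msg();

	if (!m)
		return 1;
	commit_msg(m);
	msg_put(m);	/* caller's put; only now is the message freed */
	return 0;
}
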
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 61f906e705db..2af2ccfe43a9 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -20,7 +20,7 @@
int dlm_slots_version(struct dlm_header *h)
{
- if ((h->h_version & 0x0000FFFF) < DLM_HEADER_SLOTS)
+ if ((le32_to_cpu(h->h_version) & 0x0000FFFF) < DLM_HEADER_SLOTS)
return 0;
return 1;
}
@@ -120,18 +120,13 @@ int dlm_slots_copy_in(struct dlm_ls *ls)
ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
- for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
- ro->ro_nodeid = le32_to_cpu(ro->ro_nodeid);
- ro->ro_slot = le16_to_cpu(ro->ro_slot);
- }
-
log_slots(ls, gen, num_slots, ro0, NULL, 0);
list_for_each_entry(memb, &ls->ls_nodes, list) {
for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
- if (ro->ro_nodeid != memb->nodeid)
+ if (le32_to_cpu(ro->ro_nodeid) != memb->nodeid)
continue;
- memb->slot = ro->ro_slot;
+ memb->slot = le16_to_cpu(ro->ro_slot);
memb->slot_prev = memb->slot;
break;
}
@@ -539,7 +534,11 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
int i, error, neg = 0, low = -1;
/* previously removed members that we've not finished removing need to
- count as a negative change so the "neg" recovery steps will happen */
+ * count as a negative change so the "neg" recovery steps will happen
+ *
+ * This functionality must report all member changes to lsops or
+ * midcomms layer and must never return before.
+ */
list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
log_rinfo(ls, "prev removed member %d", memb->nodeid);
@@ -588,19 +587,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
*neg_out = neg;
error = ping_members(ls);
- /* error -EINTR means that a new recovery action is triggered.
- * We ignore this recovery action and let run the new one which might
- * have new member configuration.
- */
- if (error == -EINTR)
- error = 0;
-
- /* new_lockspace() may be waiting to know if the config
- * is good or bad
- */
- ls->ls_members_result = error;
- complete(&ls->ls_members_done);
-
log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
return error;
}
@@ -680,7 +666,16 @@ int dlm_ls_stop(struct dlm_ls *ls)
if (!ls->ls_recover_begin)
ls->ls_recover_begin = jiffies;
- dlm_lsop_recover_prep(ls);
+ /* call recover_prep ops only once and not multiple times
+ * for each possible dlm_ls_stop() when recovery is already
+ * stopped.
+ *
+ * If we successful was able to clear LSFL_RUNNING bit and
+ * it was set we know it is the first dlm_ls_stop() call.
+ */
+ if (new)
+ dlm_lsop_recover_prep(ls);
+
return 0;
}
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 3635e42b0669..6489bc22ad61 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -135,6 +135,7 @@
#include <net/tcp.h>
#include "dlm_internal.h"
+#include "lockspace.h"
#include "lowcomms.h"
#include "config.h"
#include "memory.h"
@@ -380,13 +381,12 @@ static int dlm_send_ack(int nodeid, uint32_t seq)
m_header = (struct dlm_header *)ppc;
- m_header->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- m_header->h_nodeid = dlm_our_nodeid();
- m_header->h_length = mb_len;
+ m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ m_header->h_length = cpu_to_le16(mb_len);
m_header->h_cmd = DLM_ACK;
- m_header->u.h_seq = seq;
+ m_header->u.h_seq = cpu_to_le32(seq);
- header_out(m_header);
dlm_lowcomms_commit_msg(msg);
dlm_lowcomms_put_msg(msg);
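
The header hunks above store each field in wire byte order at assignment time, which is what lets the old header_out() in-place swapping pass be deleted. A standalone sketch of filling such a header; the layout and helpers are illustrative, not the real struct dlm_header:

#include <stdint.h>
#include <stdio.h>

/* illustrative byte-order helpers, identity on little-endian hosts */
static inline uint32_t cpu_to_le32(uint32_t x)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	return x;
#else
	return __builtin_bswap32(x);
#endif
}

static inline uint16_t cpu_to_le16(uint16_t x)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	return x;
#else
	return (uint16_t)((x >> 8) | (x << 8));
#endif
}

struct hdr {			/* illustrative wire header layout */
	uint32_t h_version;	/* little-endian on the wire */
	uint32_t h_nodeid;	/* little-endian */
	uint16_t h_length;	/* little-endian */
	uint8_t  h_cmd;		/* single byte, no swapping needed */
};

int main(void)
{
	struct hdr h;

	/* every field is converted at assignment time, so the struct
	 * already holds wire bytes and no later fixup pass is needed */
	h.h_version = cpu_to_le32(0x00030000);
	h.h_nodeid = cpu_to_le32(7);
	h.h_length = cpu_to_le16(sizeof(h));
	h.h_cmd = 1;
	printf("stored length field: %#x\n", h.h_length);
	return 0;
}
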
@@ -409,13 +409,11 @@ static int dlm_send_fin(struct midcomms_node *node,
m_header = (struct dlm_header *)ppc;
- m_header->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- m_header->h_nodeid = dlm_our_nodeid();
- m_header->h_length = mb_len;
+ m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ m_header->h_length = cpu_to_le16(mb_len);
m_header->h_cmd = DLM_FIN;
- header_out(m_header);
-
pr_debug("sending fin msg to node %d\n", node->nodeid);
dlm_midcomms_commit_mhandle(mh);
set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
@@ -574,14 +572,14 @@ dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p,
return NULL;
}
- switch (le32_to_cpu(p->rcom.rc_type)) {
- case DLM_RCOM_NAMES:
+ switch (p->rcom.rc_type) {
+ case cpu_to_le32(DLM_RCOM_NAMES):
fallthrough;
- case DLM_RCOM_NAMES_REPLY:
+ case cpu_to_le32(DLM_RCOM_NAMES_REPLY):
fallthrough;
- case DLM_RCOM_STATUS:
+ case cpu_to_le32(DLM_RCOM_STATUS):
fallthrough;
- case DLM_RCOM_STATUS_REPLY:
+ case cpu_to_le32(DLM_RCOM_STATUS_REPLY):
node = nodeid2node(nodeid, 0);
if (node) {
spin_lock(&node->state_lock);
@@ -741,14 +739,14 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid)
*
* length already checked.
*/
- switch (le32_to_cpu(p->rcom.rc_type)) {
- case DLM_RCOM_NAMES:
+ switch (p->rcom.rc_type) {
+ case cpu_to_le32(DLM_RCOM_NAMES):
fallthrough;
- case DLM_RCOM_NAMES_REPLY:
+ case cpu_to_le32(DLM_RCOM_NAMES_REPLY):
fallthrough;
- case DLM_RCOM_STATUS:
+ case cpu_to_le32(DLM_RCOM_STATUS):
fallthrough;
- case DLM_RCOM_STATUS_REPLY:
+ case cpu_to_le32(DLM_RCOM_STATUS_REPLY):
break;
default:
log_print("unsupported rcom type received: %u, will skip this message from node %d",
@@ -1020,11 +1018,10 @@ static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len,
uint32_t seq)
{
opts->o_header.h_cmd = DLM_OPTS;
- opts->o_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- opts->o_header.h_nodeid = dlm_our_nodeid();
- opts->o_header.h_length = DLM_MIDCOMMS_OPT_LEN + inner_len;
- opts->o_header.u.h_seq = seq;
- header_out(&opts->o_header);
+ opts->o_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ opts->o_header.h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ opts->o_header.h_length = cpu_to_le16(DLM_MIDCOMMS_OPT_LEN + inner_len);
+ opts->o_header.u.h_seq = cpu_to_le32(seq);
}
static void midcomms_new_msg_cb(void *data)
@@ -1062,6 +1059,10 @@ static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int node
return msg;
}
+/* avoid a false positive for nodes_srcu; the unlock happens in
+ * dlm_midcomms_commit_mhandle(), which must be called on success
+ */
+#ifndef __CHECKER__
struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
gfp_t allocation, char **ppc)
{
@@ -1127,6 +1128,7 @@ err:
srcu_read_unlock(&nodes_srcu, idx);
return NULL;
}
+#endif
static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh)
{
@@ -1136,6 +1138,10 @@ static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh)
dlm_lowcomms_commit_msg(mh->msg);
}
+/* avoid a false positive for nodes_srcu; the lock was taken in
+ * dlm_midcomms_get_mhandle()
+ */
+#ifndef __CHECKER__
void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
{
switch (mh->node->version) {
@@ -1157,6 +1163,7 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
break;
}
}
+#endif
int dlm_midcomms_start(void)
{
@@ -1406,6 +1413,8 @@ int dlm_midcomms_close(int nodeid)
if (nodeid == dlm_our_nodeid())
return 0;
+ dlm_stop_lockspaces_check();
+
idx = srcu_read_lock(&nodes_srcu);
/* Abort pending close/remove operation */
node = nodeid2node(nodeid, 0);
@@ -1455,7 +1464,7 @@ static void midcomms_new_rawmsg_cb(void *data)
switch (h->h_cmd) {
case DLM_OPTS:
if (!h->u.h_seq)
- h->u.h_seq = rd->node->seq_send++;
+ h->u.h_seq = cpu_to_le32(rd->node->seq_send++);
break;
default:
break;
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 67f68d48d60c..4de4b8651c6c 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -75,6 +75,7 @@ static struct genl_family family __ro_after_init = {
.version = DLM_GENL_VERSION,
.small_ops = dlm_nl_ops,
.n_small_ops = ARRAY_SIZE(dlm_nl_ops),
+ .resv_start_op = DLM_CMD_HELLO + 1,
.module = THIS_MODULE,
};
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index c38b2b8ffd1d..737f185aad8d 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -13,26 +13,28 @@
#include "dlm_internal.h"
#include "lockspace.h"
-static spinlock_t ops_lock;
-static struct list_head send_list;
-static struct list_head recv_list;
-static wait_queue_head_t send_wq;
-static wait_queue_head_t recv_wq;
+static DEFINE_SPINLOCK(ops_lock);
+static LIST_HEAD(send_list);
+static LIST_HEAD(recv_list);
+static DECLARE_WAIT_QUEUE_HEAD(send_wq);
+static DECLARE_WAIT_QUEUE_HEAD(recv_wq);
-struct plock_op {
- struct list_head list;
- int done;
- struct dlm_plock_info info;
-};
-
-struct plock_xop {
- struct plock_op xop;
- int (*callback)(struct file_lock *fl, int result);
+struct plock_async_data {
void *fl;
void *file;
struct file_lock flc;
+ int (*callback)(struct file_lock *fl, int result);
};
+struct plock_op {
+ struct list_head list;
+ int done;
+	/* if the lock op got interrupted while waiting for the dlm_controld reply */
+ bool sigint;
+ struct dlm_plock_info info;
+	/* if set, indicates async handling */
+ struct plock_async_data *data;
+};
static inline void set_version(struct dlm_plock_info *info)
{
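
The plock refactor above drops the plock_xop container (previously reached by casting the embedded plock_op) in favor of an optional plock_async_data pointer, so op->data != NULL now marks async handling and one release helper frees both parts. A userspace model of that shape with illustrative types:

#include <stdio.h>
#include <stdlib.h>

struct plock_async_data {		/* only allocated for async ops */
	void (*callback)(int result);
};

struct plock_op {
	int done;
	struct plock_async_data *data;	/* NULL means synchronous */
};

static void release_plock_op(struct plock_op *op)
{
	free(op->data);	/* free(NULL) is a no-op, like kfree(NULL) */
	free(op);
}

static void complete_op(struct plock_op *op, int result)
{
	if (op->data)
		op->data->callback(result);	/* async: notify the owner */
	else
		op->done = 1;			/* sync: wake the waiter */
}

static void grant_cb(int result)
{
	printf("async grant, result %d\n", result);
}

int main(void)
{
	struct plock_op *op = calloc(1, sizeof(*op));

	if (!op)
		return 1;
	op->data = calloc(1, sizeof(*op->data));
	if (!op->data) {
		release_plock_op(op);
		return 1;
	}
	op->data->callback = grant_cb;
	complete_op(op, 0);
	release_plock_op(op);
	return 0;
}
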
@@ -58,10 +60,15 @@ static int check_version(struct dlm_plock_info *info)
return 0;
}
+static void dlm_release_plock_op(struct plock_op *op)
+{
+ kfree(op->data);
+ kfree(op);
+}
+
static void send_op(struct plock_op *op)
{
set_version(&op->info);
- INIT_LIST_HEAD(&op->list);
spin_lock(&ops_lock);
list_add_tail(&op->list, &send_list);
spin_unlock(&ops_lock);
@@ -74,8 +81,7 @@ static void send_op(struct plock_op *op)
abandoned waiter. So, we have to insert the unlock-close when the
lock call is interrupted. */
-static void do_unlock_close(struct dlm_ls *ls, u64 number,
- struct file *file, struct file_lock *fl)
+static void do_unlock_close(const struct dlm_plock_info *info)
{
struct plock_op *op;
@@ -84,15 +90,12 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number,
return;
op->info.optype = DLM_PLOCK_OP_UNLOCK;
- op->info.pid = fl->fl_pid;
- op->info.fsid = ls->ls_global_id;
- op->info.number = number;
+ op->info.pid = info->pid;
+ op->info.fsid = info->fsid;
+ op->info.number = info->number;
op->info.start = 0;
op->info.end = OFFSET_MAX;
- if (fl->fl_lmops && fl->fl_lmops->lm_grant)
- op->info.owner = (__u64) fl->fl_pid;
- else
- op->info.owner = (__u64)(long) fl->fl_owner;
+ op->info.owner = info->owner;
op->info.flags |= DLM_PLOCK_FL_CLOSE;
send_op(op);
@@ -101,22 +104,21 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number,
int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
int cmd, struct file_lock *fl)
{
+ struct plock_async_data *op_data;
struct dlm_ls *ls;
struct plock_op *op;
- struct plock_xop *xop;
int rv;
ls = dlm_find_lockspace_local(lockspace);
if (!ls)
return -EINVAL;
- xop = kzalloc(sizeof(*xop), GFP_NOFS);
- if (!xop) {
+ op = kzalloc(sizeof(*op), GFP_NOFS);
+ if (!op) {
rv = -ENOMEM;
goto out;
}
- op = &xop->xop;
op->info.optype = DLM_PLOCK_OP_LOCK;
op->info.pid = fl->fl_pid;
op->info.ex = (fl->fl_type == F_WRLCK);
@@ -125,46 +127,57 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
+ /* async handling */
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
+ op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
+ if (!op_data) {
+ dlm_release_plock_op(op);
+ rv = -ENOMEM;
+ goto out;
+ }
+
/* fl_owner is lockd which doesn't distinguish
processes on the nfs client */
op->info.owner = (__u64) fl->fl_pid;
- xop->callback = fl->fl_lmops->lm_grant;
- locks_init_lock(&xop->flc);
- locks_copy_lock(&xop->flc, fl);
- xop->fl = fl;
- xop->file = file;
+ op_data->callback = fl->fl_lmops->lm_grant;
+ locks_init_lock(&op_data->flc);
+ locks_copy_lock(&op_data->flc, fl);
+ op_data->fl = fl;
+ op_data->file = file;
+
+ op->data = op_data;
+
+ send_op(op);
+ rv = FILE_LOCK_DEFERRED;
+ goto out;
} else {
op->info.owner = (__u64)(long) fl->fl_owner;
- xop->callback = NULL;
}
send_op(op);
- if (xop->callback == NULL) {
- rv = wait_event_interruptible(recv_wq, (op->done != 0));
- if (rv == -ERESTARTSYS) {
- log_debug(ls, "dlm_posix_lock: wait killed %llx",
- (unsigned long long)number);
- spin_lock(&ops_lock);
- list_del(&op->list);
+ rv = wait_event_interruptible(recv_wq, (op->done != 0));
+ if (rv == -ERESTARTSYS) {
+ spin_lock(&ops_lock);
+		/* recheck under ops_lock whether done became nonzero;
+		 * if so, this interrupt case should be ignored
+ */
+ if (op->done != 0) {
spin_unlock(&ops_lock);
- kfree(xop);
- do_unlock_close(ls, number, file, fl);
- goto out;
+ goto do_lock_wait;
}
- } else {
- rv = FILE_LOCK_DEFERRED;
+
+ op->sigint = true;
+ spin_unlock(&ops_lock);
+ log_debug(ls, "%s: wait interrupted %x %llx pid %d",
+ __func__, ls->ls_global_id,
+ (unsigned long long)number, op->info.pid);
goto out;
}
- spin_lock(&ops_lock);
- if (!list_empty(&op->list)) {
- log_error(ls, "dlm_posix_lock: op on list %llx",
- (unsigned long long)number);
- list_del(&op->list);
- }
- spin_unlock(&ops_lock);
+do_lock_wait:
+
+ WARN_ON(!list_empty(&op->list));
rv = op->info.rv;
@@ -174,7 +187,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
(unsigned long long)number);
}
- kfree(xop);
+ dlm_release_plock_op(op);
out:
dlm_put_lockspace(ls);
return rv;
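
The interrupted-wait handling above rechecks op->done under ops_lock after an -ERESTARTSYS wakeup: if the reply already arrived, the interrupt is ignored; otherwise the op is flagged sigint so the dev_write() reply path cleans it up later. A small pthreads model of just that decision (simplified, illustrative types):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct op {
	int done;	/* set by the reply path */
	bool sigint;	/* set when an interrupted waiter gave up */
};

static pthread_mutex_t ops_lock = PTHREAD_MUTEX_INITIALIZER;

/* called after an interruptible wait returned -ERESTARTSYS */
static int handle_interrupt(struct op *op)
{
	pthread_mutex_lock(&ops_lock);
	if (op->done) {
		/* the reply raced with the signal: treat as a wakeup */
		pthread_mutex_unlock(&ops_lock);
		return 0;
	}
	op->sigint = true;	/* the reply path will clean this op up */
	pthread_mutex_unlock(&ops_lock);
	return -EINTR;
}

int main(void)
{
	struct op op = { .done = 1, .sigint = false };

	printf("rv=%d sigint=%d\n", handle_interrupt(&op), op.sigint);
	return 0;
}
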
@@ -184,26 +197,20 @@ EXPORT_SYMBOL_GPL(dlm_posix_lock);
/* Returns failure iff a successful lock operation should be canceled */
static int dlm_plock_callback(struct plock_op *op)
{
+ struct plock_async_data *op_data = op->data;
struct file *file;
struct file_lock *fl;
struct file_lock *flc;
int (*notify)(struct file_lock *fl, int result) = NULL;
- struct plock_xop *xop = (struct plock_xop *)op;
int rv = 0;
- spin_lock(&ops_lock);
- if (!list_empty(&op->list)) {
- log_print("dlm_plock_callback: op on list %llx",
- (unsigned long long)op->info.number);
- list_del(&op->list);
- }
- spin_unlock(&ops_lock);
+ WARN_ON(!list_empty(&op->list));
/* check if the following 2 are still valid or make a copy */
- file = xop->file;
- flc = &xop->flc;
- fl = xop->fl;
- notify = xop->callback;
+ file = op_data->file;
+ flc = &op_data->flc;
+ fl = op_data->fl;
+ notify = op_data->callback;
if (op->info.rv) {
notify(fl, op->info.rv);
@@ -234,7 +241,7 @@ static int dlm_plock_callback(struct plock_op *op)
}
out:
- kfree(xop);
+ dlm_release_plock_op(op);
return rv;
}
@@ -290,13 +297,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
send_op(op);
wait_event(recv_wq, (op->done != 0));
- spin_lock(&ops_lock);
- if (!list_empty(&op->list)) {
- log_error(ls, "dlm_posix_unlock: op on list %llx",
- (unsigned long long)number);
- list_del(&op->list);
- }
- spin_unlock(&ops_lock);
+ WARN_ON(!list_empty(&op->list));
rv = op->info.rv;
@@ -304,7 +305,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
rv = 0;
out_free:
- kfree(op);
+ dlm_release_plock_op(op);
out:
dlm_put_lockspace(ls);
fl->fl_flags = fl_flags;
@@ -344,13 +345,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
send_op(op);
wait_event(recv_wq, (op->done != 0));
- spin_lock(&ops_lock);
- if (!list_empty(&op->list)) {
- log_error(ls, "dlm_posix_get: op on list %llx",
- (unsigned long long)number);
- list_del(&op->list);
- }
- spin_unlock(&ops_lock);
+ WARN_ON(!list_empty(&op->list));
/* info.rv from userspace is 1 for conflict, 0 for no-conflict,
-ENOENT if there are no locks on the file */
@@ -370,7 +365,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
rv = 0;
}
- kfree(op);
+ dlm_release_plock_op(op);
out:
dlm_put_lockspace(ls);
return rv;
@@ -389,7 +384,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
spin_lock(&ops_lock);
if (!list_empty(&send_list)) {
- op = list_entry(send_list.next, struct plock_op, list);
+ op = list_first_entry(&send_list, struct plock_op, list);
if (op->info.flags & DLM_PLOCK_FL_CLOSE)
list_del(&op->list);
else
@@ -406,7 +401,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
(the process did not make an unlock call). */
if (op->info.flags & DLM_PLOCK_FL_CLOSE)
- kfree(op);
+ dlm_release_plock_op(op);
if (copy_to_user(u, &info, sizeof(info)))
return -EFAULT;
@@ -418,9 +413,9 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
loff_t *ppos)
{
+ struct plock_op *op = NULL, *iter;
struct dlm_plock_info info;
- struct plock_op *op;
- int found = 0, do_callback = 0;
+ int do_callback = 0;
if (count != sizeof(info))
return -EINVAL;
@@ -432,31 +427,43 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
return -EINVAL;
spin_lock(&ops_lock);
- list_for_each_entry(op, &recv_list, list) {
- if (op->info.fsid == info.fsid &&
- op->info.number == info.number &&
- op->info.owner == info.owner) {
- struct plock_xop *xop = (struct plock_xop *)op;
- list_del_init(&op->list);
- memcpy(&op->info, &info, sizeof(info));
- if (xop->callback)
+ list_for_each_entry(iter, &recv_list, list) {
+ if (iter->info.fsid == info.fsid &&
+ iter->info.number == info.number &&
+ iter->info.owner == info.owner) {
+ if (iter->sigint) {
+ list_del(&iter->list);
+ spin_unlock(&ops_lock);
+
+ pr_debug("%s: sigint cleanup %x %llx pid %d",
+ __func__, iter->info.fsid,
+ (unsigned long long)iter->info.number,
+ iter->info.pid);
+ do_unlock_close(&iter->info);
+ memcpy(&iter->info, &info, sizeof(info));
+ dlm_release_plock_op(iter);
+ return count;
+ }
+ list_del_init(&iter->list);
+ memcpy(&iter->info, &info, sizeof(info));
+ if (iter->data)
do_callback = 1;
else
- op->done = 1;
- found = 1;
+ iter->done = 1;
+ op = iter;
break;
}
}
spin_unlock(&ops_lock);
- if (found) {
+ if (op) {
if (do_callback)
dlm_plock_callback(op);
else
wake_up(&recv_wq);
} else
- log_print("dev_write no op %x %llx", info.fsid,
- (unsigned long long)info.number);
+ log_print("%s: no op %x %llx", __func__,
+ info.fsid, (unsigned long long)info.number);
return count;
}
@@ -492,12 +499,6 @@ int dlm_plock_init(void)
{
int rv;
- spin_lock_init(&ops_lock);
- INIT_LIST_HEAD(&send_list);
- INIT_LIST_HEAD(&recv_list);
- init_waitqueue_head(&send_wq);
- init_waitqueue_head(&recv_wq);
-
rv = misc_register(&plock_dev_misc);
if (rv)
log_print("dlm_plock_init: misc_register failed %d", rv);
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 5821b777a1a7..f19860315043 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -34,16 +34,16 @@ static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
rc = (struct dlm_rcom *) mb;
- rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- rc->rc_header.u.h_lockspace = ls->ls_global_id;
- rc->rc_header.h_nodeid = dlm_our_nodeid();
- rc->rc_header.h_length = mb_len;
+ rc->rc_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ rc->rc_header.u.h_lockspace = cpu_to_le32(ls->ls_global_id);
+ rc->rc_header.h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ rc->rc_header.h_length = cpu_to_le16(mb_len);
rc->rc_header.h_cmd = DLM_RCOM;
- rc->rc_type = type;
+ rc->rc_type = cpu_to_le32(type);
spin_lock(&ls->ls_recover_lock);
- rc->rc_seq = ls->ls_recover_seq;
+ rc->rc_seq = cpu_to_le64(ls->ls_recover_seq);
spin_unlock(&ls->ls_recover_lock);
*rc_ret = rc;
@@ -91,13 +91,11 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type,
static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc)
{
- dlm_rcom_out(rc);
dlm_midcomms_commit_mhandle(mh);
}
static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc)
{
- dlm_rcom_out(rc);
dlm_lowcomms_commit_msg(msg);
dlm_lowcomms_put_msg(msg);
}
@@ -127,10 +125,10 @@ static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
{
struct rcom_config *rf = (struct rcom_config *) rc->rc_buf;
- if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) {
+ if ((le32_to_cpu(rc->rc_header.h_version) & 0xFFFF0000) != DLM_HEADER_MAJOR) {
log_error(ls, "version mismatch: %x nodeid %d: %x",
DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid,
- rc->rc_header.h_version);
+ le32_to_cpu(rc->rc_header.h_version));
return -EPROTO;
}
@@ -145,10 +143,10 @@ static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
return 0;
}
-static void allow_sync_reply(struct dlm_ls *ls, uint64_t *new_seq)
+static void allow_sync_reply(struct dlm_ls *ls, __le64 *new_seq)
{
spin_lock(&ls->ls_rcom_spin);
- *new_seq = ++ls->ls_rcom_seq;
+ *new_seq = cpu_to_le64(++ls->ls_rcom_seq);
set_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
spin_unlock(&ls->ls_rcom_spin);
}
@@ -182,7 +180,7 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
if (nodeid == dlm_our_nodeid()) {
rc = ls->ls_recover_buf;
- rc->rc_result = dlm_recover_status(ls);
+ rc->rc_result = cpu_to_le32(dlm_recover_status(ls));
goto out;
}
@@ -208,7 +206,7 @@ retry:
rc = ls->ls_recover_buf;
- if (rc->rc_result == -ESRCH) {
+ if (rc->rc_result == cpu_to_le32(-ESRCH)) {
/* we pretend the remote lockspace exists with 0 status */
log_debug(ls, "remote node %d not ready", nodeid);
rc->rc_result = 0;
@@ -227,7 +225,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
struct dlm_rcom *rc;
struct rcom_status *rs;
uint32_t status;
- int nodeid = rc_in->rc_header.h_nodeid;
+ int nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid);
int len = sizeof(struct rcom_config);
struct dlm_msg *msg;
int num_slots = 0;
@@ -259,7 +257,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
rc->rc_id = rc_in->rc_id;
rc->rc_seq_reply = rc_in->rc_seq;
- rc->rc_result = status;
+ rc->rc_result = cpu_to_le32(status);
set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots);
@@ -287,14 +285,16 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
spin_lock(&ls->ls_rcom_spin);
if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) ||
- rc_in->rc_id != ls->ls_rcom_seq) {
+ le64_to_cpu(rc_in->rc_id) != ls->ls_rcom_seq) {
log_debug(ls, "reject reply %d from %d seq %llx expect %llx",
- rc_in->rc_type, rc_in->rc_header.h_nodeid,
- (unsigned long long)rc_in->rc_id,
+ le32_to_cpu(rc_in->rc_type),
+ le32_to_cpu(rc_in->rc_header.h_nodeid),
+ (unsigned long long)le64_to_cpu(rc_in->rc_id),
(unsigned long long)ls->ls_rcom_seq);
goto out;
}
- memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
+ memcpy(ls->ls_recover_buf, rc_in,
+ le16_to_cpu(rc_in->rc_header.h_length));
set_bit(LSFL_RCOM_READY, &ls->ls_flags);
clear_bit(LSFL_RCOM_WAIT, &ls->ls_flags);
wake_up(&ls->ls_wait_general);
@@ -336,8 +336,9 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
int error, inlen, outlen, nodeid;
struct dlm_msg *msg;
- nodeid = rc_in->rc_header.h_nodeid;
- inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+ nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid);
+ inlen = le16_to_cpu(rc_in->rc_header.h_length) -
+ sizeof(struct dlm_rcom);
outlen = DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom);
error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen,
@@ -364,7 +365,7 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
if (error)
goto out;
memcpy(rc->rc_buf, r->res_name, r->res_length);
- rc->rc_id = (unsigned long) r->res_id;
+ rc->rc_id = cpu_to_le64(r->res_id);
send_rcom(mh, rc);
out:
@@ -375,11 +376,12 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
- int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
- int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+ int error, ret_nodeid, nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid);
+ int len = le16_to_cpu(rc_in->rc_header.h_length) -
+ sizeof(struct dlm_rcom);
/* Old code would send this special id to trigger a debug dump. */
- if (rc_in->rc_id == 0xFFFFFFFF) {
+ if (rc_in->rc_id == cpu_to_le64(0xFFFFFFFF)) {
log_error(ls, "receive_rcom_lookup dump from %d", nodeid);
dlm_dump_rsb_name(ls, rc_in->rc_buf, len);
return;
@@ -393,7 +395,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
DLM_LU_RECOVER_MASTER, &ret_nodeid, NULL);
if (error)
ret_nodeid = error;
- rc->rc_result = ret_nodeid;
+ rc->rc_result = cpu_to_le32(ret_nodeid);
rc->rc_id = rc_in->rc_id;
rc->rc_seq_reply = rc_in->rc_seq;
@@ -452,7 +454,7 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
rl = (struct rcom_lock *) rc->rc_buf;
pack_rcom_lock(r, lkb, rl);
- rc->rc_id = (unsigned long) r;
+ rc->rc_id = cpu_to_le64((uintptr_t)r);
send_rcom(mh, rc);
out:
@@ -464,7 +466,7 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
- int error, nodeid = rc_in->rc_header.h_nodeid;
+ int error, nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid);
dlm_recover_master_copy(ls, rc_in);
@@ -500,21 +502,20 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
rc = (struct dlm_rcom *) mb;
- rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ rc->rc_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
rc->rc_header.u.h_lockspace = rc_in->rc_header.u.h_lockspace;
- rc->rc_header.h_nodeid = dlm_our_nodeid();
- rc->rc_header.h_length = mb_len;
+ rc->rc_header.h_nodeid = cpu_to_le32(dlm_our_nodeid());
+ rc->rc_header.h_length = cpu_to_le16(mb_len);
rc->rc_header.h_cmd = DLM_RCOM;
- rc->rc_type = DLM_RCOM_STATUS_REPLY;
+ rc->rc_type = cpu_to_le32(DLM_RCOM_STATUS_REPLY);
rc->rc_id = rc_in->rc_id;
rc->rc_seq_reply = rc_in->rc_seq;
- rc->rc_result = -ESRCH;
+ rc->rc_result = cpu_to_le32(-ESRCH);
rf = (struct rcom_config *) rc->rc_buf;
rf->rf_lvblen = cpu_to_le32(~0U);
- dlm_rcom_out(rc);
dlm_midcomms_commit_mhandle(mh);
return 0;
@@ -573,27 +574,27 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
uint64_t seq;
switch (rc->rc_type) {
- case DLM_RCOM_STATUS_REPLY:
+ case cpu_to_le32(DLM_RCOM_STATUS_REPLY):
reply = 1;
break;
- case DLM_RCOM_NAMES:
+ case cpu_to_le32(DLM_RCOM_NAMES):
names = 1;
break;
- case DLM_RCOM_NAMES_REPLY:
+ case cpu_to_le32(DLM_RCOM_NAMES_REPLY):
names = 1;
reply = 1;
break;
- case DLM_RCOM_LOOKUP:
+ case cpu_to_le32(DLM_RCOM_LOOKUP):
lookup = 1;
break;
- case DLM_RCOM_LOOKUP_REPLY:
+ case cpu_to_le32(DLM_RCOM_LOOKUP_REPLY):
lookup = 1;
reply = 1;
break;
- case DLM_RCOM_LOCK:
+ case cpu_to_le32(DLM_RCOM_LOCK):
lock = 1;
break;
- case DLM_RCOM_LOCK_REPLY:
+ case cpu_to_le32(DLM_RCOM_LOCK_REPLY):
lock = 1;
reply = 1;
break;
@@ -605,10 +606,10 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
seq = ls->ls_recover_seq;
spin_unlock(&ls->ls_recover_lock);
- if (stop && (rc->rc_type != DLM_RCOM_STATUS))
+ if (stop && (rc->rc_type != cpu_to_le32(DLM_RCOM_STATUS)))
goto ignore;
- if (reply && (rc->rc_seq_reply != seq))
+ if (reply && (le64_to_cpu(rc->rc_seq_reply) != seq))
goto ignore;
if (!(status & DLM_RS_NODES) && (names || lookup || lock))
@@ -618,59 +619,60 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
goto ignore;
switch (rc->rc_type) {
- case DLM_RCOM_STATUS:
+ case cpu_to_le32(DLM_RCOM_STATUS):
receive_rcom_status(ls, rc);
break;
- case DLM_RCOM_NAMES:
+ case cpu_to_le32(DLM_RCOM_NAMES):
receive_rcom_names(ls, rc);
break;
- case DLM_RCOM_LOOKUP:
+ case cpu_to_le32(DLM_RCOM_LOOKUP):
receive_rcom_lookup(ls, rc);
break;
- case DLM_RCOM_LOCK:
- if (rc->rc_header.h_length < lock_size)
+ case cpu_to_le32(DLM_RCOM_LOCK):
+ if (le16_to_cpu(rc->rc_header.h_length) < lock_size)
goto Eshort;
receive_rcom_lock(ls, rc);
break;
- case DLM_RCOM_STATUS_REPLY:
+ case cpu_to_le32(DLM_RCOM_STATUS_REPLY):
receive_sync_reply(ls, rc);
break;
- case DLM_RCOM_NAMES_REPLY:
+ case cpu_to_le32(DLM_RCOM_NAMES_REPLY):
receive_sync_reply(ls, rc);
break;
- case DLM_RCOM_LOOKUP_REPLY:
+ case cpu_to_le32(DLM_RCOM_LOOKUP_REPLY):
receive_rcom_lookup_reply(ls, rc);
break;
- case DLM_RCOM_LOCK_REPLY:
- if (rc->rc_header.h_length < lock_size)
+ case cpu_to_le32(DLM_RCOM_LOCK_REPLY):
+ if (le16_to_cpu(rc->rc_header.h_length) < lock_size)
goto Eshort;
dlm_recover_process_copy(ls, rc);
break;
default:
- log_error(ls, "receive_rcom bad type %d", rc->rc_type);
+ log_error(ls, "receive_rcom bad type %d",
+ le32_to_cpu(rc->rc_type));
}
return;
ignore:
log_limit(ls, "dlm_receive_rcom ignore msg %d "
"from %d %llu %llu recover seq %llu sts %x gen %u",
- rc->rc_type,
+ le32_to_cpu(rc->rc_type),
nodeid,
- (unsigned long long)rc->rc_seq,
- (unsigned long long)rc->rc_seq_reply,
+ (unsigned long long)le64_to_cpu(rc->rc_seq),
+ (unsigned long long)le64_to_cpu(rc->rc_seq_reply),
(unsigned long long)seq,
status, ls->ls_generation);
return;
Eshort:
log_error(ls, "recovery message %d from %d is too short",
- rc->rc_type, nodeid);
+ le32_to_cpu(rc->rc_type), nodeid);
}
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 8928e99dfd47..ccff1791803f 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -114,7 +114,7 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
if (save_slots)
dlm_slot_save(ls, rc, memb);
- if (rc->rc_result & wait_status)
+ if (le32_to_cpu(rc->rc_result) & wait_status)
break;
if (delay < 1000)
delay += 20;
@@ -141,7 +141,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status,
if (error)
break;
- if (rc->rc_result & wait_status)
+ if (le32_to_cpu(rc->rc_result) & wait_status)
break;
if (delay < 1000)
delay += 20;
@@ -568,14 +568,14 @@ int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
struct dlm_rsb *r;
int ret_nodeid, new_master;
- r = recover_idr_find(ls, rc->rc_id);
+ r = recover_idr_find(ls, le64_to_cpu(rc->rc_id));
if (!r) {
log_error(ls, "dlm_recover_master_reply no id %llx",
- (unsigned long long)rc->rc_id);
+ (unsigned long long)le64_to_cpu(rc->rc_id));
goto out;
}
- ret_nodeid = rc->rc_result;
+ ret_nodeid = le32_to_cpu(rc->rc_result);
if (ret_nodeid == dlm_our_nodeid())
new_master = 0;
@@ -732,10 +732,9 @@ void dlm_recovered_lock(struct dlm_rsb *r)
static void recover_lvb(struct dlm_rsb *r)
{
- struct dlm_lkb *lkb, *high_lkb = NULL;
+ struct dlm_lkb *big_lkb = NULL, *iter, *high_lkb = NULL;
uint32_t high_seq = 0;
int lock_lvb_exists = 0;
- int big_lock_exists = 0;
int lvblen = r->res_ls->ls_lvblen;
if (!rsb_flag(r, RSB_NEW_MASTER2) &&
@@ -751,37 +750,37 @@ static void recover_lvb(struct dlm_rsb *r)
/* we are the new master, so figure out if VALNOTVALID should
be set, and set the rsb lvb from the best lkb available. */
- list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
- if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ list_for_each_entry(iter, &r->res_grantqueue, lkb_statequeue) {
+ if (!(iter->lkb_exflags & DLM_LKF_VALBLK))
continue;
lock_lvb_exists = 1;
- if (lkb->lkb_grmode > DLM_LOCK_CR) {
- big_lock_exists = 1;
+ if (iter->lkb_grmode > DLM_LOCK_CR) {
+ big_lkb = iter;
goto setflag;
}
- if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
- high_lkb = lkb;
- high_seq = lkb->lkb_lvbseq;
+ if (((int)iter->lkb_lvbseq - (int)high_seq) >= 0) {
+ high_lkb = iter;
+ high_seq = iter->lkb_lvbseq;
}
}
- list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
- if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ list_for_each_entry(iter, &r->res_convertqueue, lkb_statequeue) {
+ if (!(iter->lkb_exflags & DLM_LKF_VALBLK))
continue;
lock_lvb_exists = 1;
- if (lkb->lkb_grmode > DLM_LOCK_CR) {
- big_lock_exists = 1;
+ if (iter->lkb_grmode > DLM_LOCK_CR) {
+ big_lkb = iter;
goto setflag;
}
- if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
- high_lkb = lkb;
- high_seq = lkb->lkb_lvbseq;
+ if (((int)iter->lkb_lvbseq - (int)high_seq) >= 0) {
+ high_lkb = iter;
+ high_seq = iter->lkb_lvbseq;
}
}
@@ -790,7 +789,7 @@ static void recover_lvb(struct dlm_rsb *r)
goto out;
/* lvb is invalidated if only NL/CR locks remain */
- if (!big_lock_exists)
+ if (!big_lkb)
rsb_set_flag(r, RSB_VALNOTVALID);
if (!r->res_lvbptr) {
@@ -799,9 +798,9 @@ static void recover_lvb(struct dlm_rsb *r)
goto out;
}
- if (big_lock_exists) {
- r->res_lvbseq = lkb->lkb_lvbseq;
- memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
+ if (big_lkb) {
+ r->res_lvbseq = big_lkb->lkb_lvbseq;
+ memcpy(r->res_lvbptr, big_lkb->lkb_lvbptr, lvblen);
} else if (high_lkb) {
r->res_lvbseq = high_lkb->lkb_lvbseq;
memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
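The recover_lvb() rewrite replaces the big_lock_exists flag with a big_lkb pointer captured during the scan, because the old code later read lkb->lkb_lvbseq through the loop cursor, which is not a valid element once list_for_each_entry() completes. A cut-down sketch of the corrected idiom, using a generic list and hypothetical field names:

struct item { int mode; unsigned int seq; struct item *next; };

static struct item *find_big(struct item *head)
{
	struct item *big = NULL;              /* analogous to big_lkb */

	for (struct item *iter = head; iter; iter = iter->next) {
		if (iter->mode > 3) {         /* cf. grmode > DLM_LOCK_CR */
			big = iter;           /* remember the element itself */
			break;
		}
	}
	return big;     /* safe to dereference; the exhausted cursor is not */
}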
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index a55dfce705dd..e15eb511b04b 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -70,6 +70,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
/*
* Add or remove nodes from the lockspace's ls_nodes list.
+ *
+ * Because we must report all membership changes to lsops or the
+ * midcomms layer, ls_recover() must not be aborted until this is done.
*/
error = dlm_recover_members(ls, rv, &neg);
@@ -239,14 +243,12 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
jiffies_to_msecs(jiffies - start));
mutex_unlock(&ls->ls_recoverd_active);
- dlm_lsop_recover_done(ls);
return 0;
fail:
dlm_release_root_list(ls);
- log_rinfo(ls, "dlm_recover %llu error %d",
- (unsigned long long)rv->seq, error);
mutex_unlock(&ls->ls_recoverd_active);
+
return error;
}
@@ -257,6 +259,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
static void do_ls_recovery(struct dlm_ls *ls)
{
struct dlm_recover *rv = NULL;
+ int error;
spin_lock(&ls->ls_recover_lock);
rv = ls->ls_recover_args;
@@ -266,7 +269,31 @@ static void do_ls_recovery(struct dlm_ls *ls)
spin_unlock(&ls->ls_recover_lock);
if (rv) {
- ls_recover(ls, rv);
+ error = ls_recover(ls, rv);
+ switch (error) {
+ case 0:
+ ls->ls_recovery_result = 0;
+ complete(&ls->ls_recovery_done);
+
+ dlm_lsop_recover_done(ls);
+ break;
+ case -EINTR:
+ /* if recovery was interrupted with -EINTR, wait for the next
+ * ls_recover() iteration until it hopefully succeeds.
+ */
+ log_rinfo(ls, "%s %llu interrupted and should be queued to run again",
+ __func__, (unsigned long long)rv->seq);
+ break;
+ default:
+ log_rinfo(ls, "%s %llu error %d", __func__,
+ (unsigned long long)rv->seq, error);
+
+ /* let new_lockspace() become aware of the critical error */
+ ls->ls_recovery_result = error;
+ complete(&ls->ls_recovery_done);
+ break;
+ }
+
kfree(rv->nodes);
kfree(rv);
}
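do_ls_recovery() now distinguishes three outcomes: success and hard failure both complete ls_recovery_done so dlm_new_lockspace() can read ls_recovery_result, while -EINTR leaves the completion pending for the next recovery attempt. A rough userspace analogue of that result handoff, assuming a pthread-based stand-in for the kernel's struct completion:

#include <pthread.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
	int result;
};

static void complete_result(struct completion *c, int result)
{
	pthread_mutex_lock(&c->lock);
	c->result = result;                   /* cf. ls->ls_recovery_result */
	c->done = 1;
	pthread_cond_broadcast(&c->cond);     /* cf. complete() */
	pthread_mutex_unlock(&c->lock);
}

static int wait_result(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)                      /* cf. wait_for_completion() */
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
	return c->result;
}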
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index ccb5307c21e9..036a9a0078f6 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -14,6 +14,7 @@
#include "dir.h"
#include "config.h"
#include "requestqueue.h"
+#include "util.h"
struct rq_entry {
struct list_head list;
@@ -32,7 +33,8 @@ struct rq_entry {
void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
{
struct rq_entry *e;
- int length = ms->m_header.h_length - sizeof(struct dlm_message);
+ int length = le16_to_cpu(ms->m_header.h_length) -
+ sizeof(struct dlm_message);
e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS);
if (!e) {
@@ -42,7 +44,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF;
e->nodeid = nodeid;
- memcpy(&e->request, ms, ms->m_header.h_length);
+ memcpy(&e->request, ms, le16_to_cpu(ms->m_header.h_length));
atomic_inc(&ls->ls_requestqueue_cnt);
mutex_lock(&ls->ls_requestqueue_mutex);
@@ -82,8 +84,10 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
log_limit(ls, "dlm_process_requestqueue msg %d from %d "
"lkid %x remid %x result %d seq %u",
- ms->m_type, ms->m_header.h_nodeid,
- ms->m_lkid, ms->m_remid, ms->m_result,
+ le32_to_cpu(ms->m_type),
+ le32_to_cpu(ms->m_header.h_nodeid),
+ le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid),
+ from_dlm_errno(le32_to_cpu(ms->m_result)),
e->recover_seq);
dlm_receive_message_saved(ls, &e->request, e->recover_seq);
@@ -124,7 +128,7 @@ void dlm_wait_requestqueue(struct dlm_ls *ls)
static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
{
- uint32_t type = ms->m_type;
+ __le32 type = ms->m_type;
/* the ls is being cleaned up and freed by release_lockspace */
if (!atomic_read(&ls->ls_count))
@@ -136,9 +140,9 @@ static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
/* directory operations are always purged because the directory is
always rebuilt during recovery and the lookups resent */
- if (type == DLM_MSG_REMOVE ||
- type == DLM_MSG_LOOKUP ||
- type == DLM_MSG_LOOKUP_REPLY)
+ if (type == cpu_to_le32(DLM_MSG_REMOVE) ||
+ type == cpu_to_le32(DLM_MSG_LOOKUP) ||
+ type == cpu_to_le32(DLM_MSG_LOOKUP_REPLY))
return 1;
if (!dlm_no_directory(ls))
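purge_request() keeps m_type in its __le32 wire form and compares it against cpu_to_le32(constant): since the argument is a compile-time constant, the byte swap is folded at build time instead of converting the field on every message. The same trick in a userspace sketch (names illustrative, not the real DLM_MSG_* values):

#include <endian.h>
#include <stdint.h>

#define MSG_LOOKUP 11u

static int is_lookup(uint32_t wire_type /* little-endian, as received */)
{
	/* htole32() of a constant folds to a constant at compile time */
	return wire_type == htole32(MSG_LOOKUP);
}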
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index e5cefa90b1ce..c5d27bccc3dc 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -16,6 +16,8 @@
#include <linux/slab.h>
#include <linux/sched/signal.h>
+#include <trace/events/dlm.h>
+
#include "dlm_internal.h"
#include "lockspace.h"
#include "lock.h"
@@ -108,11 +110,11 @@ static void compat_input(struct dlm_write_request *kb,
kb->i.lock.parent = kb32->i.lock.parent;
kb->i.lock.xid = kb32->i.lock.xid;
kb->i.lock.timeout = kb32->i.lock.timeout;
- kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
- kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
- kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
- kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
- kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
+ kb->i.lock.castparam = (__user void *)(long)kb32->i.lock.castparam;
+ kb->i.lock.castaddr = (__user void *)(long)kb32->i.lock.castaddr;
+ kb->i.lock.bastparam = (__user void *)(long)kb32->i.lock.bastparam;
+ kb->i.lock.bastaddr = (__user void *)(long)kb32->i.lock.bastaddr;
+ kb->i.lock.lksb = (__user void *)(long)kb32->i.lock.lksb;
memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
memcpy(kb->i.lock.name, kb32->i.lock.name, namelen);
}
@@ -127,9 +129,9 @@ static void compat_output(struct dlm_lock_result *res,
res32->version[1] = res->version[1];
res32->version[2] = res->version[2];
- res32->user_astaddr = (__u32)(long)res->user_astaddr;
- res32->user_astparam = (__u32)(long)res->user_astparam;
- res32->user_lksb = (__u32)(long)res->user_lksb;
+ res32->user_astaddr = (__u32)(__force long)res->user_astaddr;
+ res32->user_astparam = (__u32)(__force long)res->user_astparam;
+ res32->user_lksb = (__u32)(__force long)res->user_lksb;
res32->bast_mode = res->bast_mode;
res32->lvb_offset = res->lvb_offset;
@@ -184,7 +186,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
return;
ls = lkb->lkb_resource->res_ls;
- mutex_lock(&ls->ls_clear_proc_locks);
+ spin_lock(&ls->ls_clear_proc_locks);
/* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
can't be delivered. For ORPHANs, dlm_clear_proc_locks() freed
@@ -230,7 +232,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
spin_unlock(&proc->locks_spin);
}
out:
- mutex_unlock(&ls->ls_clear_proc_locks);
+ spin_unlock(&ls->ls_clear_proc_locks);
}
static int device_user_lock(struct dlm_user_proc *proc,
@@ -250,6 +252,14 @@ static int device_user_lock(struct dlm_user_proc *proc,
goto out;
}
+#ifdef CONFIG_DLM_DEPRECATED_API
+ if (params->timeout)
+ pr_warn_once("========================================================\n"
+ "WARNING: the lkb timeout feature is being deprecated and\n"
+ " will be removed in v6.2!\n"
+ "========================================================\n");
+#endif
+
ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
if (!ua)
goto out;
@@ -262,23 +272,34 @@ static int device_user_lock(struct dlm_user_proc *proc,
ua->xid = params->xid;
if (params->flags & DLM_LKF_CONVERT) {
+#ifdef CONFIG_DLM_DEPRECATED_API
error = dlm_user_convert(ls, ua,
params->mode, params->flags,
params->lkid, params->lvb,
(unsigned long) params->timeout);
+#else
+ error = dlm_user_convert(ls, ua,
+ params->mode, params->flags,
+ params->lkid, params->lvb);
+#endif
} else if (params->flags & DLM_LKF_ORPHAN) {
error = dlm_user_adopt_orphan(ls, ua,
params->mode, params->flags,
params->name, params->namelen,
- (unsigned long) params->timeout,
&lkid);
if (!error)
error = lkid;
} else {
+#ifdef CONFIG_DLM_DEPRECATED_API
error = dlm_user_request(ls, ua,
params->mode, params->flags,
params->name, params->namelen,
(unsigned long) params->timeout);
+#else
+ error = dlm_user_request(ls, ua,
+ params->mode, params->flags,
+ params->name, params->namelen);
+#endif
if (!error)
error = ua->lksb.sb_lkid;
}
@@ -402,9 +423,9 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- error = dlm_new_lockspace(params->name, dlm_config.ci_cluster_name, params->flags,
- DLM_USER_LVB_LEN, NULL, NULL, NULL,
- &lockspace);
+ error = dlm_new_user_lockspace(params->name, dlm_config.ci_cluster_name,
+ params->flags, DLM_USER_LVB_LEN, NULL,
+ NULL, NULL, &lockspace);
if (error)
return error;
@@ -863,7 +884,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
goto try_another;
}
- if (cb.flags & DLM_CB_CAST) {
+ if (cb.flags & DLM_CB_BAST) {
+ trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb.mode);
+ } else if (cb.flags & DLM_CB_CAST) {
new_mode = cb.mode;
if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr &&
@@ -872,6 +895,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
lkb->lkb_lksb->sb_status = cb.sb_status;
lkb->lkb_lksb->sb_flags = cb.sb_flags;
+ trace_dlm_ast(lkb->lkb_resource->res_ls, lkb);
}
rv = copy_result_to_user(lkb->lkb_ua,
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index 58acbcc2081a..f2bc401f312f 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -20,28 +20,10 @@
#define DLM_ERRNO_ETIMEDOUT 110
#define DLM_ERRNO_EINPROGRESS 115
-void header_out(struct dlm_header *hd)
-{
- hd->h_version = cpu_to_le32(hd->h_version);
- /* does it for others u32 in union as well */
- hd->u.h_lockspace = cpu_to_le32(hd->u.h_lockspace);
- hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
- hd->h_length = cpu_to_le16(hd->h_length);
-}
-
-void header_in(struct dlm_header *hd)
-{
- hd->h_version = le32_to_cpu(hd->h_version);
- /* does it for others u32 in union as well */
- hd->u.h_lockspace = le32_to_cpu(hd->u.h_lockspace);
- hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
- hd->h_length = le16_to_cpu(hd->h_length);
-}
-
/* higher errno values are inconsistent across architectures, so select
one set of values for on the wire */
-static int to_dlm_errno(int err)
+int to_dlm_errno(int err)
{
switch (err) {
case -EDEADLK:
@@ -62,7 +44,7 @@ static int to_dlm_errno(int err)
return err;
}
-static int from_dlm_errno(int err)
+int from_dlm_errno(int err)
{
switch (err) {
case -DLM_ERRNO_EDEADLK:
@@ -82,73 +64,3 @@ static int from_dlm_errno(int err)
}
return err;
}
-
-void dlm_message_out(struct dlm_message *ms)
-{
- header_out(&ms->m_header);
-
- ms->m_type = cpu_to_le32(ms->m_type);
- ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
- ms->m_pid = cpu_to_le32(ms->m_pid);
- ms->m_lkid = cpu_to_le32(ms->m_lkid);
- ms->m_remid = cpu_to_le32(ms->m_remid);
- ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
- ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
- ms->m_exflags = cpu_to_le32(ms->m_exflags);
- ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
- ms->m_flags = cpu_to_le32(ms->m_flags);
- ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
- ms->m_hash = cpu_to_le32(ms->m_hash);
- ms->m_status = cpu_to_le32(ms->m_status);
- ms->m_grmode = cpu_to_le32(ms->m_grmode);
- ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
- ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
- ms->m_asts = cpu_to_le32(ms->m_asts);
- ms->m_result = cpu_to_le32(to_dlm_errno(ms->m_result));
-}
-
-void dlm_message_in(struct dlm_message *ms)
-{
- header_in(&ms->m_header);
-
- ms->m_type = le32_to_cpu(ms->m_type);
- ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
- ms->m_pid = le32_to_cpu(ms->m_pid);
- ms->m_lkid = le32_to_cpu(ms->m_lkid);
- ms->m_remid = le32_to_cpu(ms->m_remid);
- ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
- ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
- ms->m_exflags = le32_to_cpu(ms->m_exflags);
- ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
- ms->m_flags = le32_to_cpu(ms->m_flags);
- ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
- ms->m_hash = le32_to_cpu(ms->m_hash);
- ms->m_status = le32_to_cpu(ms->m_status);
- ms->m_grmode = le32_to_cpu(ms->m_grmode);
- ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
- ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
- ms->m_asts = le32_to_cpu(ms->m_asts);
- ms->m_result = from_dlm_errno(le32_to_cpu(ms->m_result));
-}
-
-void dlm_rcom_out(struct dlm_rcom *rc)
-{
- header_out(&rc->rc_header);
-
- rc->rc_type = cpu_to_le32(rc->rc_type);
- rc->rc_result = cpu_to_le32(rc->rc_result);
- rc->rc_id = cpu_to_le64(rc->rc_id);
- rc->rc_seq = cpu_to_le64(rc->rc_seq);
- rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply);
-}
-
-void dlm_rcom_in(struct dlm_rcom *rc)
-{
- header_in(&rc->rc_header);
-
- rc->rc_type = le32_to_cpu(rc->rc_type);
- rc->rc_result = le32_to_cpu(rc->rc_result);
- rc->rc_id = le64_to_cpu(rc->rc_id);
- rc->rc_seq = le64_to_cpu(rc->rc_seq);
- rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply);
-}
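With dlm_message_in()/out() gone, the errno translators are exported so callers can convert m_result at the point of use (see from_dlm_errno() in dlm_process_requestqueue() above). The invariant they provide is that architecture-dependent errno values round-trip through the fixed on-wire set; a small harness, assuming the two functions are linked in:

#include <assert.h>
#include <errno.h>

int to_dlm_errno(int err);      /* now declared in fs/dlm/util.h */
int from_dlm_errno(int err);

int main(void)
{
	/* -EDEADLK differs across architectures; its wire encoding is
	 * fixed, so decoding must recover the local value exactly. */
	assert(from_dlm_errno(to_dlm_errno(-EDEADLK)) == -EDEADLK);
	assert(from_dlm_errno(to_dlm_errno(-ETIMEDOUT)) == -ETIMEDOUT);
	return 0;
}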
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
index d46f23c7a6a0..b6a4b8adca8d 100644
--- a/fs/dlm/util.h
+++ b/fs/dlm/util.h
@@ -11,12 +11,8 @@
#ifndef __UTIL_DOT_H__
#define __UTIL_DOT_H__
-void dlm_message_out(struct dlm_message *ms);
-void dlm_message_in(struct dlm_message *ms);
-void dlm_rcom_out(struct dlm_rcom *rc);
-void dlm_rcom_in(struct dlm_rcom *rc);
-void header_out(struct dlm_header *hd);
-void header_in(struct dlm_header *hd);
+int to_dlm_errno(int err);
+int from_dlm_errno(int err);
#endif
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 5f2b49e13731..f2ed0c0266cb 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -506,7 +506,7 @@ ecryptfs_dentry_to_lower(struct dentry *dentry)
return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry;
}
-static inline struct path *
+static inline const struct path *
ecryptfs_dentry_to_lower_path(struct dentry *dentry)
{
return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 18d5b91cb573..268b74499c28 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -33,7 +33,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
struct iov_iter *to)
{
ssize_t rc;
- struct path *path;
+ const struct path *path;
struct file *file = iocb->ki_filp;
rc = generic_file_read_iter(iocb, to);
@@ -53,7 +53,7 @@ struct ecryptfs_getdents_callback {
};
/* Inspired by generic filldir in fs/readdir.c */
-static int
+static bool
ecryptfs_filldir(struct dir_context *ctx, const char *lower_name,
int lower_namelen, loff_t offset, u64 ino, unsigned int d_type)
{
@@ -61,18 +61,19 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name,
container_of(ctx, struct ecryptfs_getdents_callback, ctx);
size_t name_size;
char *name;
- int rc;
+ int err;
+ bool res;
buf->filldir_called++;
- rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
- buf->sb, lower_name,
- lower_namelen);
- if (rc) {
- if (rc != -EINVAL) {
+ err = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
+ buf->sb, lower_name,
+ lower_namelen);
+ if (err) {
+ if (err != -EINVAL) {
ecryptfs_printk(KERN_DEBUG,
"%s: Error attempting to decode and decrypt filename [%s]; rc = [%d]\n",
- __func__, lower_name, rc);
- return rc;
+ __func__, lower_name, err);
+ return false;
}
/* Mask -EINVAL errors as these are most likely due to a plaintext
@@ -81,16 +82,15 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name,
* the "lost+found" dentry in the root directory of an Ext4
* filesystem.
*/
- return 0;
+ return true;
}
buf->caller->pos = buf->ctx.pos;
- rc = !dir_emit(buf->caller, name, name_size, ino, d_type);
+ res = dir_emit(buf->caller, name, name_size, ino, d_type);
kfree(name);
- if (!rc)
+ if (res)
buf->entries_written++;
-
- return rc;
+ return res;
}
/**
@@ -111,14 +111,8 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
lower_file = ecryptfs_file_to_lower(file);
rc = iterate_dir(lower_file, &buf.ctx);
ctx->pos = buf.ctx.pos;
- if (rc < 0)
- goto out;
- if (buf.filldir_called && !buf.entries_written)
- goto out;
- if (rc >= 0)
- fsstack_copy_attr_atime(inode,
- file_inode(lower_file));
-out:
+ if (rc >= 0 && (buf.entries_written || !buf.filldir_called))
+ fsstack_copy_attr_atime(inode, file_inode(lower_file));
return rc;
}
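The filldir conversion follows the VFS-wide change of dir_context actors from int to bool: the return value now only answers "keep iterating?", and any error is recorded in the callback's own state (here ecryptfs silently drops undecodable names). A minimal sketch of the new contract, with a hypothetical context struct:

#include <stdbool.h>
#include <stddef.h>

struct walk_ctx { int error; size_t emitted; };

/* actor: return true to continue the directory walk, false to stop */
static bool emit_one(struct walk_ctx *c, const char *name)
{
	if (!name) {             /* stand-in for a decode/decrypt failure */
		c->error = -22;  /* report out of band, e.g. -EINVAL */
		return false;
	}
	c->emitted++;
	return true;
}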
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 16d50dface59..c214fe0981bd 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -317,7 +317,7 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
struct dentry *lower_dentry)
{
- struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
+ const struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
struct inode *inode, *lower_inode;
struct ecryptfs_dentry_info *dentry_info;
int rc = 0;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 2dd23a82e0de..2dc927ba067f 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -105,7 +105,7 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
struct file **lower_file)
{
const struct cred *cred = current_cred();
- struct path *path = ecryptfs_dentry_to_lower_path(dentry);
+ const struct path *path = ecryptfs_dentry_to_lower_path(dentry);
int rc;
rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 9ad61b582f07..19af229eb7ca 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -170,16 +170,17 @@ out:
}
/**
- * ecryptfs_readpage
+ * ecryptfs_read_folio
* @file: An eCryptfs file
- * @page: Page from eCryptfs inode mapping into which to stick the read data
+ * @folio: Folio from eCryptfs inode mapping into which to stick the read data
*
- * Read in a page, decrypting if necessary.
+ * Read in a folio, decrypting if necessary.
*
* Returns zero on success; non-zero on error.
*/
-static int ecryptfs_readpage(struct file *file, struct page *page)
+static int ecryptfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct ecryptfs_crypt_stat *crypt_stat =
&ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
int rc = 0;
@@ -264,7 +265,7 @@ out:
*/
static int ecryptfs_write_begin(struct file *file,
struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
pgoff_t index = pos >> PAGE_SHIFT;
@@ -272,7 +273,7 @@ static int ecryptfs_write_begin(struct file *file,
loff_t prev_page_end_size;
int rc = 0;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -549,7 +550,7 @@ const struct address_space_operations ecryptfs_aops = {
.invalidate_folio = block_invalidate_folio,
#endif
.writepage = ecryptfs_writepage,
- .readpage = ecryptfs_readpage,
+ .read_folio = ecryptfs_read_folio,
.write_begin = ecryptfs_write_begin,
.write_end = ecryptfs_write_end,
.bmap = ecryptfs_bmap,
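ecryptfs (like efs and erofs below) converts ->readpage to ->read_folio with the transitional shim of taking the folio's embedded head page and running the old page-based body unchanged; this is safe while these filesystems only ever see order-0 folios. The shape of the conversion, with a hypothetical legacy helper:

static int example_read_folio(struct file *file, struct folio *folio)
{
	struct page *page = &folio->page;    /* order-0 folio == one page */

	return example_readpage(file, page); /* hypothetical old page-based body */
}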
diff --git a/fs/efivarfs/Makefile b/fs/efivarfs/Makefile
index 0b1c5e63eb71..7bfc2f9754a8 100644
--- a/fs/efivarfs/Makefile
+++ b/fs/efivarfs/Makefile
@@ -5,4 +5,4 @@
obj-$(CONFIG_EFIVAR_FS) += efivarfs.o
-efivarfs-objs := inode.o file.o super.o
+efivarfs-objs := inode.o file.o super.o vars.o
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index 30ae44cb7453..8ebf3a6a8aa2 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -7,6 +7,46 @@
#define EFIVAR_FS_INTERNAL_H
#include <linux/list.h>
+#include <linux/efi.h>
+
+struct efi_variable {
+ efi_char16_t VariableName[EFI_VAR_NAME_LEN/sizeof(efi_char16_t)];
+ efi_guid_t VendorGuid;
+ unsigned long DataSize;
+ __u8 Data[1024];
+ efi_status_t Status;
+ __u32 Attributes;
+} __attribute__((packed));
+
+struct efivar_entry {
+ struct efi_variable var;
+ struct list_head list;
+ struct kobject kobj;
+};
+
+int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
+ void *data, bool duplicates, struct list_head *head);
+
+int efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
+void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
+void efivar_entry_remove(struct efivar_entry *entry);
+int efivar_entry_delete(struct efivar_entry *entry);
+
+int efivar_entry_size(struct efivar_entry *entry, unsigned long *size);
+int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
+ unsigned long *size, void *data);
+int efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
+ unsigned long *size, void *data);
+int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes,
+ unsigned long *size, void *data, bool *set);
+
+int efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
+ struct list_head *head, void *data);
+
+bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data,
+ unsigned long data_size);
+bool efivar_variable_is_removable(efi_guid_t vendor, const char *name,
+ size_t len);
extern const struct file_operations efivarfs_file_operations;
extern const struct inode_operations efivarfs_dir_inode_operations;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 15880a68faad..6780fc81cc11 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -155,10 +155,8 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
goto fail_inode;
}
- efivar_entry_size(entry, &size);
- err = efivar_entry_add(entry, &efivarfs_list);
- if (err)
- goto fail_inode;
+ __efivar_entry_get(entry, NULL, &size, NULL);
+ __efivar_entry_add(entry, &efivarfs_list);
/* copied by the above to local storage in the dentry. */
kfree(name);
@@ -182,10 +180,7 @@ fail:
static int efivarfs_destroy(struct efivar_entry *entry, void *data)
{
- int err = efivar_entry_remove(entry);
-
- if (err)
- return err;
+ efivar_entry_remove(entry);
kfree(entry);
return 0;
}
@@ -221,7 +216,7 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list);
if (err)
- __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
+ efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL);
return err;
}
@@ -246,7 +241,7 @@ static void efivarfs_kill_sb(struct super_block *sb)
kill_litter_super(sb);
/* Remove all entries and destroy */
- __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
+ efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL);
}
static struct file_system_type efivarfs_type = {
diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c
new file mode 100644
index 000000000000..a0ef63cfcecb
--- /dev/null
+++ b/fs/efivarfs/vars.c
@@ -0,0 +1,738 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Originally from efivars.c
+ *
+ * Copyright (C) 2001,2003,2004 Dell <Matt_Domsch@dell.com>
+ * Copyright (C) 2004 Intel Corporation <matthew.e.tolentino@intel.com>
+ */
+
+#include <linux/capability.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/smp.h>
+#include <linux/efi.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/ucs2_string.h>
+
+#include "internal.h"
+
+MODULE_IMPORT_NS(EFIVAR);
+
+static bool
+validate_device_path(efi_char16_t *var_name, int match, u8 *buffer,
+ unsigned long len)
+{
+ struct efi_generic_dev_path *node;
+ int offset = 0;
+
+ node = (struct efi_generic_dev_path *)buffer;
+
+ if (len < sizeof(*node))
+ return false;
+
+ while (offset <= len - sizeof(*node) &&
+ node->length >= sizeof(*node) &&
+ node->length <= len - offset) {
+ offset += node->length;
+
+ if ((node->type == EFI_DEV_END_PATH ||
+ node->type == EFI_DEV_END_PATH2) &&
+ node->sub_type == EFI_DEV_END_ENTIRE)
+ return true;
+
+ node = (struct efi_generic_dev_path *)(buffer + offset);
+ }
+
+ /*
+ * If we're here then either node->length pointed past the end
+ * of the buffer or we reached the end of the buffer without
+ * finding a device path end node.
+ */
+ return false;
+}
+
+static bool
+validate_boot_order(efi_char16_t *var_name, int match, u8 *buffer,
+ unsigned long len)
+{
+ /* An array of 16-bit integers */
+ if ((len % 2) != 0)
+ return false;
+
+ return true;
+}
+
+static bool
+validate_load_option(efi_char16_t *var_name, int match, u8 *buffer,
+ unsigned long len)
+{
+ u16 filepathlength;
+ int i, desclength = 0, namelen;
+
+ namelen = ucs2_strnlen(var_name, EFI_VAR_NAME_LEN);
+
+ /* Either "Boot" or "Driver" followed by four digits of hex */
+ for (i = match; i < match+4; i++) {
+ if (var_name[i] > 127 ||
+ hex_to_bin(var_name[i] & 0xff) < 0)
+ return true;
+ }
+
+ /* Reject it if there's 4 digits of hex and then further content */
+ if (namelen > match + 4)
+ return false;
+
+ /* A valid entry must be at least 8 bytes */
+ if (len < 8)
+ return false;
+
+ filepathlength = buffer[4] | buffer[5] << 8;
+
+ /*
+ * There's no stored length for the description, so it has to be
+ * found by hand
+ */
+ desclength = ucs2_strsize((efi_char16_t *)(buffer + 6), len - 6) + 2;
+
+ /* Each boot entry must have a descriptor */
+ if (!desclength)
+ return false;
+
+ /*
+ * If the sum of the length of the description, the claimed filepath
+ * length and the original header are greater than the length of the
+ * variable, it's malformed
+ */
+ if ((desclength + filepathlength + 6) > len)
+ return false;
+
+ /*
+ * And, finally, check the filepath
+ */
+ return validate_device_path(var_name, match, buffer + desclength + 6,
+ filepathlength);
+}
+
+static bool
+validate_uint16(efi_char16_t *var_name, int match, u8 *buffer,
+ unsigned long len)
+{
+ /* A single 16-bit integer */
+ if (len != 2)
+ return false;
+
+ return true;
+}
+
+static bool
+validate_ascii_string(efi_char16_t *var_name, int match, u8 *buffer,
+ unsigned long len)
+{
+ int i;
+
+ for (i = 0; i < len; i++) {
+ if (buffer[i] > 127)
+ return false;
+
+ if (buffer[i] == 0)
+ return true;
+ }
+
+ return false;
+}
+
+struct variable_validate {
+ efi_guid_t vendor;
+ char *name;
+ bool (*validate)(efi_char16_t *var_name, int match, u8 *data,
+ unsigned long len);
+};
+
+/*
+ * This is the list of variables we need to validate, as well as the
+ * whitelist for what we think is safe not to default to immutable.
+ *
+ * If it has a validate() method that's not NULL, it'll go into the
+ * validation routine. If not, it is assumed valid, but still used for
+ * whitelisting.
+ *
+ * Note that it's sorted by {vendor,name}, but globbed names must come after
+ * any other name with the same prefix.
+ */
+static const struct variable_validate variable_validate[] = {
+ { EFI_GLOBAL_VARIABLE_GUID, "BootNext", validate_uint16 },
+ { EFI_GLOBAL_VARIABLE_GUID, "BootOrder", validate_boot_order },
+ { EFI_GLOBAL_VARIABLE_GUID, "Boot*", validate_load_option },
+ { EFI_GLOBAL_VARIABLE_GUID, "DriverOrder", validate_boot_order },
+ { EFI_GLOBAL_VARIABLE_GUID, "Driver*", validate_load_option },
+ { EFI_GLOBAL_VARIABLE_GUID, "ConIn", validate_device_path },
+ { EFI_GLOBAL_VARIABLE_GUID, "ConInDev", validate_device_path },
+ { EFI_GLOBAL_VARIABLE_GUID, "ConOut", validate_device_path },
+ { EFI_GLOBAL_VARIABLE_GUID, "ConOutDev", validate_device_path },
+ { EFI_GLOBAL_VARIABLE_GUID, "ErrOut", validate_device_path },
+ { EFI_GLOBAL_VARIABLE_GUID, "ErrOutDev", validate_device_path },
+ { EFI_GLOBAL_VARIABLE_GUID, "Lang", validate_ascii_string },
+ { EFI_GLOBAL_VARIABLE_GUID, "OsIndications", NULL },
+ { EFI_GLOBAL_VARIABLE_GUID, "PlatformLang", validate_ascii_string },
+ { EFI_GLOBAL_VARIABLE_GUID, "Timeout", validate_uint16 },
+ { LINUX_EFI_CRASH_GUID, "*", NULL },
+ { NULL_GUID, "", NULL },
+};
+
+/*
+ * Check if @var_name matches the pattern given in @match_name.
+ *
+ * @var_name: an array of @len non-NUL characters.
+ * @match_name: a NUL-terminated pattern string, optionally ending in "*". A
+ * final "*" character matches any trailing characters in @var_name,
+ * including the case when there are none left in @var_name.
+ * @match: on output, the number of non-wildcard characters in @match_name
+ * that @var_name matches, regardless of the return value.
+ * @return: whether @var_name fully matches @match_name.
+ */
+static bool
+variable_matches(const char *var_name, size_t len, const char *match_name,
+ int *match)
+{
+ for (*match = 0; ; (*match)++) {
+ char c = match_name[*match];
+
+ switch (c) {
+ case '*':
+ /* Wildcard in @match_name means we've matched. */
+ return true;
+
+ case '\0':
+ /* @match_name has ended. Has @var_name too? */
+ return (*match == len);
+
+ default:
+ /*
+ * We've reached a non-wildcard char in @match_name.
+ * Continue only if there's an identical character in
+ * @var_name.
+ */
+ if (*match < len && c == var_name[*match])
+ continue;
+ return false;
+ }
+ }
+}
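A quick check of the matching semantics documented above, assuming the (static) function is made visible to a test harness: a trailing "*" matches any suffix, and *match reports how many literal characters matched:

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

bool variable_matches(const char *var_name, size_t len,
		      const char *match_name, int *match);

int main(void)
{
	int match;

	assert(variable_matches("Boot0001", strlen("Boot0001"),
				"Boot*", &match) && match == 4);
	assert(!variable_matches("BootNext", strlen("BootNext"),
				 "BootOrder", &match));
	return 0;
}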
+
+bool
+efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data,
+ unsigned long data_size)
+{
+ int i;
+ unsigned long utf8_size;
+ u8 *utf8_name;
+
+ utf8_size = ucs2_utf8size(var_name);
+ utf8_name = kmalloc(utf8_size + 1, GFP_KERNEL);
+ if (!utf8_name)
+ return false;
+
+ ucs2_as_utf8(utf8_name, var_name, utf8_size);
+ utf8_name[utf8_size] = '\0';
+
+ for (i = 0; variable_validate[i].name[0] != '\0'; i++) {
+ const char *name = variable_validate[i].name;
+ int match = 0;
+
+ if (efi_guidcmp(vendor, variable_validate[i].vendor))
+ continue;
+
+ if (variable_matches(utf8_name, utf8_size+1, name, &match)) {
+ if (variable_validate[i].validate == NULL)
+ break;
+ kfree(utf8_name);
+ return variable_validate[i].validate(var_name, match,
+ data, data_size);
+ }
+ }
+ kfree(utf8_name);
+ return true;
+}
+
+bool
+efivar_variable_is_removable(efi_guid_t vendor, const char *var_name,
+ size_t len)
+{
+ int i;
+ bool found = false;
+ int match = 0;
+
+ /*
+ * Check if our variable is in the validated variables list
+ */
+ for (i = 0; variable_validate[i].name[0] != '\0'; i++) {
+ if (efi_guidcmp(variable_validate[i].vendor, vendor))
+ continue;
+
+ if (variable_matches(var_name, len,
+ variable_validate[i].name, &match)) {
+ found = true;
+ break;
+ }
+ }
+
+ /*
+ * If it's in our list, it is removable.
+ */
+ return found;
+}
+
+static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor,
+ struct list_head *head)
+{
+ struct efivar_entry *entry, *n;
+ unsigned long strsize1, strsize2;
+ bool found = false;
+
+ strsize1 = ucs2_strsize(variable_name, 1024);
+ list_for_each_entry_safe(entry, n, head, list) {
+ strsize2 = ucs2_strsize(entry->var.VariableName, 1024);
+ if (strsize1 == strsize2 &&
+ !memcmp(variable_name, &(entry->var.VariableName),
+ strsize2) &&
+ !efi_guidcmp(entry->var.VendorGuid,
+ *vendor)) {
+ found = true;
+ break;
+ }
+ }
+ return found;
+}
+
+/*
+ * Returns the size of variable_name, in bytes, including the
+ * terminating NULL character, or variable_name_size if no NULL
+ * character is found among the first variable_name_size bytes.
+ */
+static unsigned long var_name_strnsize(efi_char16_t *variable_name,
+ unsigned long variable_name_size)
+{
+ unsigned long len;
+ efi_char16_t c;
+
+ /*
+ * The variable name is, by definition, a NULL-terminated
+ * string, so make absolutely sure that variable_name_size is
+ * the value we expect it to be. If not, return the real size.
+ */
+ for (len = 2; len <= variable_name_size; len += sizeof(c)) {
+ c = variable_name[(len / sizeof(c)) - 1];
+ if (!c)
+ break;
+ }
+
+ return min(len, variable_name_size);
+}
+
+/*
+ * Print a warning when duplicate EFI variables are encountered and
+ * disable the sysfs workqueue since the firmware is buggy.
+ */
+static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid,
+ unsigned long len16)
+{
+ size_t i, len8 = len16 / sizeof(efi_char16_t);
+ char *str8;
+
+ str8 = kzalloc(len8, GFP_KERNEL);
+ if (!str8)
+ return;
+
+ for (i = 0; i < len8; i++)
+ str8[i] = str16[i];
+
+ printk(KERN_WARNING "efivars: duplicate variable: %s-%pUl\n",
+ str8, vendor_guid);
+ kfree(str8);
+}
+
+/**
+ * efivar_init - build the initial list of EFI variables
+ * @func: callback function to invoke for every variable
+ * @data: function-specific data to pass to @func
+ * @duplicates: error if we encounter duplicates on @head?
+ * @head: initialised head of variable list
+ *
+ * Get every EFI variable from the firmware and invoke @func. @func
+ * should call efivar_entry_add() to build the list of variables.
+ *
+ * Returns 0 on success, or a kernel error code on failure.
+ */
+int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
+ void *data, bool duplicates, struct list_head *head)
+{
+ unsigned long variable_name_size = 1024;
+ efi_char16_t *variable_name;
+ efi_status_t status;
+ efi_guid_t vendor_guid;
+ int err = 0;
+
+ variable_name = kzalloc(variable_name_size, GFP_KERNEL);
+ if (!variable_name) {
+ printk(KERN_ERR "efivars: Memory allocation failed.\n");
+ return -ENOMEM;
+ }
+
+ err = efivar_lock();
+ if (err)
+ goto free;
+
+ /*
+ * Per EFI spec, the maximum storage allocated for both
+ * the variable name and variable data is 1024 bytes.
+ */
+
+ do {
+ variable_name_size = 1024;
+
+ status = efivar_get_next_variable(&variable_name_size,
+ variable_name,
+ &vendor_guid);
+ switch (status) {
+ case EFI_SUCCESS:
+ variable_name_size = var_name_strnsize(variable_name,
+ variable_name_size);
+
+ /*
+ * Some firmware implementations return the
+ * same variable name on multiple calls to
+ * get_next_variable(). Terminate the loop
+ * immediately as there is no guarantee that
+ * we'll ever see a different variable name,
+ * and may end up looping here forever.
+ */
+ if (duplicates &&
+ variable_is_present(variable_name, &vendor_guid,
+ head)) {
+ dup_variable_bug(variable_name, &vendor_guid,
+ variable_name_size);
+ status = EFI_NOT_FOUND;
+ } else {
+ err = func(variable_name, vendor_guid,
+ variable_name_size, data);
+ if (err)
+ status = EFI_NOT_FOUND;
+ }
+ break;
+ case EFI_UNSUPPORTED:
+ err = -EOPNOTSUPP;
+ status = EFI_NOT_FOUND;
+ break;
+ case EFI_NOT_FOUND:
+ break;
+ default:
+ printk(KERN_WARNING "efivars: get_next_variable: status=%lx\n",
+ status);
+ status = EFI_NOT_FOUND;
+ break;
+ }
+
+ } while (status != EFI_NOT_FOUND);
+
+ efivar_unlock();
+free:
+ kfree(variable_name);
+
+ return err;
+}
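A hedged usage sketch for efivar_init(): the callback sees each variable's name, vendor GUID and name size, and a non-zero return stops enumeration (mapped to EFI_NOT_FOUND above). Real callers such as efivarfs_callback() allocate an entry and add it to the list; this toy version only counts:

static int count_var(efi_char16_t *name, efi_guid_t vendor,
		     unsigned long name_size, void *data)
{
	(*(unsigned long *)data)++;
	return 0;                        /* non-zero would stop the walk */
}

/* caller sketch:
 *	unsigned long n = 0;
 *	int err = efivar_init(count_var, &n, true, &some_list);
 */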
+
+/**
+ * efivar_entry_add - add entry to variable list
+ * @entry: entry to add to list
+ * @head: list head
+ *
+ * Returns 0 on success, or a kernel error code on failure.
+ */
+int efivar_entry_add(struct efivar_entry *entry, struct list_head *head)
+{
+ int err;
+
+ err = efivar_lock();
+ if (err)
+ return err;
+ list_add(&entry->list, head);
+ efivar_unlock();
+
+ return 0;
+}
+
+/**
+ * __efivar_entry_add - add entry to variable list
+ * @entry: entry to add to list
+ * @head: list head
+ */
+void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head)
+{
+ list_add(&entry->list, head);
+}
+
+/**
+ * efivar_entry_remove - remove entry from variable list
+ * @entry: entry to remove from list
+ */
+void efivar_entry_remove(struct efivar_entry *entry)
+{
+ list_del(&entry->list);
+}
+
+/*
+ * efivar_entry_list_del_unlock - remove entry from variable list
+ * @entry: entry to remove
+ *
+ * Remove @entry from the variable list and release the list lock.
+ *
+ * NOTE: slightly weird locking semantics here - we expect to be
+ * called with the efivars lock already held, and we release it before
+ * returning. This is because this function is usually called after
+ * set_variable() while the lock is still held.
+ */
+static void efivar_entry_list_del_unlock(struct efivar_entry *entry)
+{
+ list_del(&entry->list);
+ efivar_unlock();
+}
+
+/**
+ * efivar_entry_delete - delete variable and remove entry from list
+ * @entry: entry containing variable to delete
+ *
+ * Delete the variable from the firmware and remove @entry from the
+ * variable list. It is the caller's responsibility to free @entry
+ * once we return.
+ *
+ * Returns 0 on success, -EINTR if we can't grab the semaphore,
+ * or a converted EFI status code if set_variable() fails.
+ */
+int efivar_entry_delete(struct efivar_entry *entry)
+{
+ efi_status_t status;
+ int err;
+
+ err = efivar_lock();
+ if (err)
+ return err;
+
+ status = efivar_set_variable_locked(entry->var.VariableName,
+ &entry->var.VendorGuid,
+ 0, 0, NULL, false);
+ if (!(status == EFI_SUCCESS || status == EFI_NOT_FOUND)) {
+ efivar_unlock();
+ return efi_status_to_err(status);
+ }
+
+ efivar_entry_list_del_unlock(entry);
+ return 0;
+}
+
+/**
+ * efivar_entry_size - obtain the size of a variable
+ * @entry: entry for this variable
+ * @size: location to store the variable's size
+ */
+int efivar_entry_size(struct efivar_entry *entry, unsigned long *size)
+{
+ efi_status_t status;
+ int err;
+
+ *size = 0;
+
+ err = efivar_lock();
+ if (err)
+ return err;
+
+ status = efivar_get_variable(entry->var.VariableName,
+ &entry->var.VendorGuid, NULL, size, NULL);
+ efivar_unlock();
+
+ if (status != EFI_BUFFER_TOO_SMALL)
+ return efi_status_to_err(status);
+
+ return 0;
+}
+
+/**
+ * __efivar_entry_get - call get_variable()
+ * @entry: read data for this variable
+ * @attributes: variable attributes
+ * @size: size of @data buffer
+ * @data: buffer to store variable data
+ *
+ * The caller MUST hold the efivars lock (see efivar_lock()) across
+ * the invocation of this function.
+ */
+int __efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
+ unsigned long *size, void *data)
+{
+ efi_status_t status;
+
+ status = efivar_get_variable(entry->var.VariableName,
+ &entry->var.VendorGuid,
+ attributes, size, data);
+
+ return efi_status_to_err(status);
+}
+
+/**
+ * efivar_entry_get - call get_variable()
+ * @entry: read data for this variable
+ * @attributes: variable attributes
+ * @size: size of @data buffer
+ * @data: buffer to store variable data
+ */
+int efivar_entry_get(struct efivar_entry *entry, u32 *attributes,
+ unsigned long *size, void *data)
+{
+ int err;
+
+ err = efivar_lock();
+ if (err)
+ return err;
+ err = __efivar_entry_get(entry, attributes, size, data);
+ efivar_unlock();
+
+ return 0;
+}
+
+/**
+ * efivar_entry_set_get_size - call set_variable() and get new size (atomic)
+ * @entry: entry containing variable to set and get
+ * @attributes: attributes of variable to be written
+ * @size: size of data buffer
+ * @data: buffer containing data to write
+ * @set: did the set_variable() call succeed?
+ *
+ * This is a pretty special (complex) function. See efivarfs_file_write().
+ *
+ * Atomically call set_variable() for @entry and if the call is
+ * successful, return the new size of the variable from get_variable()
+ * in @size. The success of set_variable() is indicated by @set.
+ *
+ * Returns 0 on success, -EINVAL if the variable data is invalid,
+ * -ENOSPC if the firmware does not have enough available space, or a
+ * converted EFI status code if either of set_variable() or
+ * get_variable() fail.
+ *
+ * If the EFI variable does not exist when calling set_variable()
+ * (EFI_NOT_FOUND), @entry is removed from the variable list.
+ */
+int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes,
+ unsigned long *size, void *data, bool *set)
+{
+ efi_char16_t *name = entry->var.VariableName;
+ efi_guid_t *vendor = &entry->var.VendorGuid;
+ efi_status_t status;
+ int err;
+
+ *set = false;
+
+ if (efivar_validate(*vendor, name, data, *size) == false)
+ return -EINVAL;
+
+ /*
+ * The lock here protects the get_variable call, the conditional
+ * set_variable call, and removal of the variable from the efivars
+ * list (in the case of an authenticated delete).
+ */
+ err = efivar_lock();
+ if (err)
+ return err;
+
+ /*
+ * Ensure that the available space hasn't shrunk below the safe level
+ */
+ status = check_var_size(attributes, *size + ucs2_strsize(name, 1024));
+ if (status != EFI_SUCCESS) {
+ if (status != EFI_UNSUPPORTED) {
+ err = efi_status_to_err(status);
+ goto out;
+ }
+
+ if (*size > 65536) {
+ err = -ENOSPC;
+ goto out;
+ }
+ }
+
+ status = efivar_set_variable_locked(name, vendor, attributes, *size,
+ data, false);
+ if (status != EFI_SUCCESS) {
+ err = efi_status_to_err(status);
+ goto out;
+ }
+
+ *set = true;
+
+ /*
+ * Writing to the variable may have caused a change in size (which
+ * could either be an append or an overwrite), or the variable to be
+ * deleted. Perform a GetVariable() so we can tell what actually
+ * happened.
+ */
+ *size = 0;
+ status = efivar_get_variable(entry->var.VariableName,
+ &entry->var.VendorGuid,
+ NULL, size, NULL);
+
+ if (status == EFI_NOT_FOUND)
+ efivar_entry_list_del_unlock(entry);
+ else
+ efivar_unlock();
+
+ if (status && status != EFI_BUFFER_TOO_SMALL)
+ return efi_status_to_err(status);
+
+ return 0;
+
+out:
+ efivar_unlock();
+ return err;
+
+}
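The post-write GetVariable() probe above exploits a UEFI convention: calling with *size == 0 and a NULL buffer returns EFI_BUFFER_TOO_SMALL along with the real size, while EFI_NOT_FOUND means the write was actually an authenticated delete. A toy model of that decision:

enum status { ST_BUFFER_TOO_SMALL, ST_NOT_FOUND };

/* toy stand-in for get_variable(name, guid, NULL, &size, NULL) */
static enum status probe_size(unsigned long stored_size, unsigned long *size)
{
	if (!stored_size)
		return ST_NOT_FOUND;     /* variable vanished: delete case */
	*size = stored_size;             /* firmware reports the new size */
	return ST_BUFFER_TOO_SMALL;      /* expected "error" on a probe */
}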
+
+/**
+ * efivar_entry_iter - iterate over variable list
+ * @func: callback function
+ * @head: head of variable list
+ * @data: function-specific data to pass to callback
+ *
+ * Iterate over the list of EFI variables and call @func with every
+ * entry on the list. It is safe for @func to remove entries in the
+ * list via efivar_entry_delete() while iterating.
+ *
+ * Some notes for the callback function:
+ * - a non-zero return value indicates an error and terminates the loop
+ * - @func is called with the efivars lock held
+ */
+int efivar_entry_iter(int (*func)(struct efivar_entry *, void *),
+ struct list_head *head, void *data)
+{
+ struct efivar_entry *entry, *n;
+ int err = 0;
+
+ err = efivar_lock();
+ if (err)
+ return err;
+
+ list_for_each_entry_safe(entry, n, head, list) {
+ err = func(entry, data);
+ if (err)
+ break;
+ }
+ efivar_unlock();
+
+ return err;
+}
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 89e73a6f0d36..3ba94bb005a6 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -14,16 +14,18 @@
#include "efs.h"
#include <linux/efs_fs_sb.h>
-static int efs_readpage(struct file *file, struct page *page)
+static int efs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page,efs_get_block);
+ return block_read_full_folio(folio, efs_get_block);
}
+
static sector_t _efs_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,efs_get_block);
}
+
static const struct address_space_operations efs_aops = {
- .readpage = efs_readpage,
+ .read_folio = efs_read_folio,
.bmap = _efs_bmap
};
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 923eb91654d5..3b03a573cb1a 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -12,8 +12,9 @@
#include <linux/buffer_head.h>
#include "efs.h"
-static int efs_symlink_readpage(struct file *file, struct page *page)
+static int efs_symlink_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
char *link = page_address(page);
struct buffer_head * bh;
struct inode * inode = page->mapping->host;
@@ -49,5 +50,5 @@ fail:
}
const struct address_space_operations efs_symlink_aops = {
- .readpage = efs_symlink_readpage
+ .read_folio = efs_symlink_read_folio
};
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index f57255ab88ed..85490370e0ca 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -98,3 +98,13 @@ config EROFS_FS_ZIP_LZMA
systems will be readable without selecting this option.
If unsure, say N.
+
+config EROFS_FS_ONDEMAND
+ bool "EROFS fscache-based on-demand read support"
+ depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y)
+ default n
+ help
+ This permits EROFS to use fscache-backed data blobs with on-demand
+ read support.
+
+ If unsure, say N.
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 8a3317e38e5a..99bbc597a3e9 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,3 +5,4 @@ erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o sysfs.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
+erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 19e6c56a9f47..26fa170090b8 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -17,7 +17,7 @@ struct z_erofs_decompress_req {
/* indicate the algorithm will be used for decompression */
unsigned int alg;
- bool inplace_io, partial_decoding;
+ bool inplace_io, partial_decoding, fillgaps;
};
struct z_erofs_decompressor {
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 780db1e5f4b7..fe8ac0e163f7 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -6,6 +6,7 @@
*/
#include "internal.h"
#include <linux/prefetch.h>
+#include <linux/sched/mm.h>
#include <linux/dax.h>
#include <trace/events/erofs.h>
@@ -35,14 +36,20 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
erofs_off_t offset = blknr_to_addr(blkaddr);
pgoff_t index = offset >> PAGE_SHIFT;
struct page *page = buf->page;
+ struct folio *folio;
+ unsigned int nofs_flag;
if (!page || page->index != index) {
erofs_put_metabuf(buf);
- page = read_cache_page_gfp(mapping, index,
- mapping_gfp_constraint(mapping, ~__GFP_FS));
- if (IS_ERR(page))
- return page;
+
+ nofs_flag = memalloc_nofs_save();
+ folio = read_cache_folio(mapping, index, NULL, NULL);
+ memalloc_nofs_restore(nofs_flag);
+ if (IS_ERR(folio))
+ return folio;
+
/* should already be PageUptodate, no need to lock page */
+ page = folio_file_page(folio, index);
buf->page = page;
}
if (buf->kmap_type == EROFS_NO_KMAP) {
@@ -63,6 +70,10 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
erofs_blk_t blkaddr, enum erofs_kmap_type type)
{
+ if (erofs_is_fscache_mode(sb))
+ return erofs_bread(buf, EROFS_SB(sb)->s_fscache->inode,
+ blkaddr, type);
+
return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type);
}
@@ -110,8 +121,8 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
return 0;
}
-static int erofs_map_blocks(struct inode *inode,
- struct erofs_map_blocks *map, int flags)
+int erofs_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map, int flags)
{
struct super_block *sb = inode->i_sb;
struct erofs_inode *vi = EROFS_I(inode);
@@ -199,6 +210,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
map->m_bdev = sb->s_bdev;
map->m_daxdev = EROFS_SB(sb)->dax_dev;
map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
+ map->m_fscache = EROFS_SB(sb)->s_fscache;
if (map->m_deviceid) {
down_read(&devs->rwsem);
@@ -210,6 +222,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
map->m_bdev = dif->bdev;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
+ map->m_fscache = dif->fscache;
up_read(&devs->rwsem);
} else if (devs->extra_devices) {
down_read(&devs->rwsem);
@@ -227,6 +240,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
map->m_bdev = dif->bdev;
map->m_daxdev = dif->dax_dev;
map->m_dax_part_off = dif->dax_part_off;
+ map->m_fscache = dif->fscache;
break;
}
}
@@ -337,9 +351,9 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* since we dont have write or truncate flows, so no inode
* locking needs to be held at the moment.
*/
-static int erofs_readpage(struct file *file, struct page *page)
+static int erofs_read_folio(struct file *file, struct folio *folio)
{
- return iomap_readpage(page, &erofs_iomap_ops);
+ return iomap_read_folio(folio, &erofs_iomap_ops);
}
static void erofs_readahead(struct readahead_control *rac)
@@ -352,49 +366,40 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
return iomap_bmap(mapping, block, &erofs_iomap_ops);
}
-static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to)
+static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
- loff_t align = iocb->ki_pos | iov_iter_count(to) |
- iov_iter_alignment(to);
- struct block_device *bdev = inode->i_sb->s_bdev;
- unsigned int blksize_mask;
-
- if (bdev)
- blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1;
- else
- blksize_mask = (1 << inode->i_blkbits) - 1;
-
- if (align & blksize_mask)
- return -EINVAL;
- return 0;
-}
-static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
-{
/* no need taking (shared) inode lock since it's a ro filesystem */
if (!iov_iter_count(to))
return 0;
#ifdef CONFIG_FS_DAX
- if (IS_DAX(iocb->ki_filp->f_mapping->host))
+ if (IS_DAX(inode))
return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
#endif
if (iocb->ki_flags & IOCB_DIRECT) {
- int err = erofs_prepare_dio(iocb, to);
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ unsigned int blksize_mask;
+
+ if (bdev)
+ blksize_mask = bdev_logical_block_size(bdev) - 1;
+ else
+ blksize_mask = (1 << inode->i_blkbits) - 1;
+
+ if ((iocb->ki_pos | iov_iter_count(to) |
+ iov_iter_alignment(to)) & blksize_mask)
+ return -EINVAL;
- if (!err)
- return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
- NULL, 0, 0);
- if (err < 0)
- return err;
+ return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
+ NULL, 0, NULL, 0);
}
return filemap_read(iocb, to, 0);
}
/* for uncompressed (aligned) files and raw access for other files */
const struct address_space_operations erofs_raw_access_aops = {
- .readpage = erofs_readpage,
+ .read_folio = erofs_read_folio,
.readahead = erofs_readahead,
.bmap = erofs_bmap,
.direct_IO = noop_direct_IO,
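erofs_file_read_iter() now folds the old erofs_prepare_dio() check in directly: OR-ing the file position, byte count and user-memory alignment lets a single mask test reject the request if any component is misaligned to the logical block size. Worked example with illustrative values:

#include <stdint.h>

static int dio_aligned(uint64_t pos, uint64_t count, uint64_t mem_align,
		       unsigned int lbs /* logical block size, power of 2 */)
{
	uint64_t mask = lbs - 1;

	/* one test covers all three: any set low bit fails the whole OR */
	return ((pos | count | mem_align) & mask) == 0;
}

/* dio_aligned(4096, 8192, 512, 512) == 1; dio_aligned(4100, 8192, 512, 512) == 0 */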
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 3efa686c7644..51b7ac7166d9 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -46,8 +46,6 @@ int z_erofs_load_lz4_config(struct super_block *sb,
erofs_err(sb, "too large lz4 pclusterblks %u",
sbi->lz4.max_pclusterblks);
return -EINVAL;
- } else if (sbi->lz4.max_pclusterblks >= 2) {
- erofs_info(sb, "EXPERIMENTAL big pcluster feature in use. Use at your own risk!");
}
} else {
distance = le16_to_cpu(dsb->u1.lz4_max_distance);
@@ -85,7 +83,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
j = 0;
/* 'valid' bounced can only be tested after a complete round */
- if (test_bit(j, bounced)) {
+ if (!rq->fillgaps && test_bit(j, bounced)) {
DBG_BUGON(i < lz4_max_distance_pages);
DBG_BUGON(top >= lz4_max_distance_pages);
availables[top++] = rq->out[i - lz4_max_distance_pages];
@@ -93,14 +91,18 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
if (page) {
__clear_bit(j, bounced);
- if (kaddr) {
- if (kaddr + PAGE_SIZE == page_address(page))
+ if (!PageHighMem(page)) {
+ if (!i) {
+ kaddr = page_address(page);
+ continue;
+ }
+ if (kaddr &&
+ kaddr + PAGE_SIZE == page_address(page)) {
kaddr += PAGE_SIZE;
- else
- kaddr = NULL;
- } else if (!i) {
- kaddr = page_address(page);
+ continue;
+ }
}
+ kaddr = NULL;
continue;
}
kaddr = NULL;
@@ -315,51 +317,61 @@ dstmap_out:
return ret;
}
-static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
- const unsigned int nrpages_out =
+ const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+ const unsigned int outpages =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
const unsigned int righthalf = min_t(unsigned int, rq->outputsize,
PAGE_SIZE - rq->pageofs_out);
+ const unsigned int lefthalf = rq->outputsize - righthalf;
+ const unsigned int interlaced_offset =
+ rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out;
unsigned char *src, *dst;
- if (nrpages_out > 2) {
+ if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) {
DBG_BUGON(1);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (rq->out[0] == *rq->in) {
- DBG_BUGON(nrpages_out != 1);
+ DBG_BUGON(rq->pageofs_out);
return 0;
}
- src = kmap_atomic(*rq->in) + rq->pageofs_in;
+ src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in;
if (rq->out[0]) {
- dst = kmap_atomic(rq->out[0]);
- memcpy(dst + rq->pageofs_out, src, righthalf);
- kunmap_atomic(dst);
+ dst = kmap_local_page(rq->out[0]);
+ memcpy(dst + rq->pageofs_out, src + interlaced_offset,
+ righthalf);
+ kunmap_local(dst);
}
- if (nrpages_out == 2) {
- DBG_BUGON(!rq->out[1]);
- if (rq->out[1] == *rq->in) {
- memmove(src, src + righthalf, rq->pageofs_out);
- } else {
- dst = kmap_atomic(rq->out[1]);
- memcpy(dst, src + righthalf, rq->pageofs_out);
- kunmap_atomic(dst);
+ if (outpages > inpages) {
+ DBG_BUGON(!rq->out[outpages - 1]);
+ if (rq->out[outpages - 1] != rq->in[inpages - 1]) {
+ dst = kmap_local_page(rq->out[outpages - 1]);
+ memcpy(dst, interlaced_offset ? src :
+ (src + righthalf), lefthalf);
+ kunmap_local(dst);
+ } else if (!interlaced_offset) {
+ memmove(src, src + righthalf, lefthalf);
}
}
- kunmap_atomic(src);
+ kunmap_local(src);
return 0;
}
static struct z_erofs_decompressor decompressors[] = {
[Z_EROFS_COMPRESSION_SHIFTED] = {
- .decompress = z_erofs_shifted_transform,
+ .decompress = z_erofs_transform_plain,
.name = "shifted"
},
+ [Z_EROFS_COMPRESSION_INTERLACED] = {
+ .decompress = z_erofs_transform_plain,
+ .name = "interlaced"
+ },
[Z_EROFS_COMPRESSION_LZ4] = {
.decompress = z_erofs_lz4_decompress,
.name = "lz4"
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 05a3063cf2bc..091fd5adf818 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -143,6 +143,7 @@ again:
DBG_BUGON(z_erofs_lzma_head);
z_erofs_lzma_head = head;
spin_unlock(&z_erofs_lzma_lock);
+ wake_up_all(&z_erofs_lzma_wq);
z_erofs_lzma_max_dictsize = dict_size;
mutex_unlock(&lzma_resize_mutex);
@@ -216,6 +217,9 @@ again:
strm->buf.out_size = min_t(u32, outlen,
PAGE_SIZE - pageofs);
outlen -= strm->buf.out_size;
+ if (!rq->out[no] && rq->fillgaps) /* deduped */
+ rq->out[no] = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
if (rq->out[no])
strm->buf.out = kmap(rq->out[no]) + pageofs;
pageofs = 0;
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index 18e59821c597..ecf28f66b97d 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -22,10 +22,9 @@ static void debug_one_dentry(unsigned char d_type, const char *de_name,
}
static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
- void *dentry_blk, unsigned int *ofs,
+ void *dentry_blk, struct erofs_dirent *de,
unsigned int nameoff, unsigned int maxsize)
{
- struct erofs_dirent *de = dentry_blk + *ofs;
const struct erofs_dirent *end = dentry_blk + nameoff;
while (de < end) {
@@ -59,9 +58,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
/* stopped by some reason */
return 1;
++de;
- *ofs += sizeof(struct erofs_dirent);
+ ctx->pos += sizeof(struct erofs_dirent);
}
- *ofs = maxsize;
return 0;
}
@@ -90,33 +88,33 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
nameoff = le16_to_cpu(de->nameoff);
if (nameoff < sizeof(struct erofs_dirent) ||
- nameoff >= PAGE_SIZE) {
+ nameoff >= EROFS_BLKSIZ) {
erofs_err(dir->i_sb,
"invalid de[0].nameoff %u @ nid %llu",
nameoff, EROFS_I(dir)->nid);
err = -EFSCORRUPTED;
- goto skip_this;
+ break;
}
maxsize = min_t(unsigned int,
- dirsize - ctx->pos + ofs, PAGE_SIZE);
+ dirsize - ctx->pos + ofs, EROFS_BLKSIZ);
/* search dirents at the arbitrary position */
if (initial) {
initial = false;
ofs = roundup(ofs, sizeof(struct erofs_dirent));
+ ctx->pos = blknr_to_addr(i) + ofs;
if (ofs >= nameoff)
goto skip_this;
}
- err = erofs_fill_dentries(dir, ctx, de, &ofs,
+ err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs,
nameoff, maxsize);
-skip_this:
- ctx->pos = blknr_to_addr(i) + ofs;
-
if (err)
break;
+skip_this:
+ ctx->pos = blknr_to_addr(i) + maxsize;
++i;
ofs = 0;
}
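The readdir fix decouples the directory cursor from PAGE_SIZE: offsets within a directory are computed in EROFS_BLKSIZ units, with ctx->pos advanced per dirent by erofs_fill_dentries() and jumped to the block end only once a block is fully consumed. The position arithmetic, in a small sketch assuming erofs's fixed 4KiB block:

#include <stdint.h>

#define EROFS_BLKSIZ 4096u	/* erofs block size assumed by this sketch */

/* byte position of offset `ofs` inside directory block `blk`,
 * cf. blknr_to_addr(i) + ofs in erofs_readdir() */
static uint64_t dir_pos(uint32_t blk, uint32_t ofs)
{
	return (uint64_t)blk * EROFS_BLKSIZ + ofs;
}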
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 1238ca104f09..dbcd24371002 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -25,6 +25,8 @@
#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008
#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008
#define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010
+#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020
+#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020
#define EROFS_ALL_FEATURE_INCOMPAT \
(EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \
EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
@@ -32,17 +34,16 @@
EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
- EROFS_FEATURE_INCOMPAT_ZTAILPACKING)
+ EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \
+ EROFS_FEATURE_INCOMPAT_FRAGMENTS | \
+ EROFS_FEATURE_INCOMPAT_DEDUPE)
#define EROFS_SB_EXTSLOT_SIZE 16
struct erofs_deviceslot {
- union {
- u8 uuid[16]; /* used for device manager later */
- u8 userdata[64]; /* digest(sha256), etc. */
- } u;
- __le32 blocks; /* total fs blocks of this device */
- __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
+ u8 tag[64]; /* digest(sha256), etc. */
+ __le32 blocks; /* total fs blocks of this device */
+ __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
u8 reserved[56];
};
#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
@@ -58,8 +59,8 @@ struct erofs_super_block {
__le16 root_nid; /* nid of root directory */
__le64 inos; /* total valid ino # (== f_files - f_favail) */
- __le64 build_time; /* inode v1 time derivation */
- __le32 build_time_nsec; /* inode v1 time derivation in nano scale */
+ __le64 build_time; /* compact inode time derivation */
+ __le32 build_time_nsec; /* compact inode time derivation in ns scale */
__le32 blocks; /* used for statfs */
__le32 meta_blkaddr; /* start block address of metadata area */
__le32 xattr_blkaddr; /* start block address of shared xattr area */
@@ -74,20 +75,22 @@ struct erofs_super_block {
} __packed u1;
__le16 extra_devices; /* # of devices besides the primary device */
__le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */
- __u8 reserved2[38];
+ __u8 reserved[6];
+ __le64 packed_nid; /* nid of the special packed inode */
+ __u8 reserved2[24];
};
/*
* erofs inode datalayout (i_format in on-disk inode):
- * 0 - inode plain without inline data A:
+ * 0 - uncompressed flat inode without tail-packing inline data:
* inode, [xattrs], ... | ... | no-holed data
- * 1 - inode VLE compression B (legacy):
- * inode, [xattrs], extents ... | ...
- * 2 - inode plain with inline data C:
- * inode, [xattrs], last_inline_data, ... | ... | no-holed data
- * 3 - inode compression D:
+ * 1 - compressed inode with non-compact indexes:
+ * inode, [xattrs], [map_header], extents ... | ...
+ * 2 - uncompressed flat inode with tail-packing inline data:
+ * inode, [xattrs], tailpacking data, ... | ... | no-holed data
+ * 3 - compressed inode with compact indexes:
* inode, [xattrs], map_header, extents ... | ...
- * 4 - inode chunk-based E:
+ * 4 - chunk-based inode with (optional) multi-device support:
* inode, [xattrs], chunk indexes ... | ...
* 5~7 - reserved
*/
@@ -106,7 +109,7 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY;
}
-/* bit definitions of inode i_advise */
+/* bit definitions of inode i_format */
#define EROFS_I_VERSION_BITS 1
#define EROFS_I_DATALAYOUT_BITS 3
@@ -140,8 +143,9 @@ struct erofs_inode_compact {
__le32 i_size;
__le32 i_reserved;
union {
- /* file total compressed blocks for data mapping 1 */
+ /* total compressed blocks for compressed inodes */
__le32 compressed_blocks;
+ /* block address for uncompressed flat inodes */
__le32 raw_blkaddr;
/* for device files, used to indicate old/new device # */
@@ -156,9 +160,9 @@ struct erofs_inode_compact {
__le32 i_reserved2;
};
-/* 32 bytes on-disk inode */
+/* 32-byte on-disk inode */
#define EROFS_INODE_LAYOUT_COMPACT 0
-/* 64 bytes on-disk inode */
+/* 64-byte on-disk inode */
#define EROFS_INODE_LAYOUT_EXTENDED 1
/* 64-byte complete form of an ondisk inode */
@@ -171,8 +175,9 @@ struct erofs_inode_extended {
__le16 i_reserved;
__le64 i_size;
union {
- /* file total compressed blocks for data mapping 1 */
+ /* total compressed blocks for compressed inodes */
__le32 compressed_blocks;
+ /* block address for uncompressed flat inodes */
__le32 raw_blkaddr;
/* for device files, used to indicate old/new device # */
@@ -296,16 +301,27 @@ struct z_erofs_lzma_cfgs {
* bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
* bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
* bit 3 : tailpacking inline pcluster (0 - off; 1 - on)
+ * bit 4 : interlaced plain pcluster (0 - off; 1 - on)
+ * bit 5 : fragment pcluster (0 - off; 1 - on)
*/
#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001
#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002
#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004
#define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008
+#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010
+#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020
+#define Z_EROFS_FRAGMENT_INODE_BIT 7
struct z_erofs_map_header {
- __le16 h_reserved1;
- /* indicates the encoded size of tailpacking data */
- __le16 h_idata_size;
+ union {
+ /* fragment data offset in the packed inode */
+ __le32 h_fragmentoff;
+ struct {
+ __le16 h_reserved1;
+ /* indicates the encoded size of tailpacking data */
+ __le16 h_idata_size;
+ };
+ };
__le16 h_advise;
/*
* bit 0-3 : algorithm type of head 1 (logical cluster type 01);
@@ -314,7 +330,8 @@ struct z_erofs_map_header {
__u8 h_algorithmtype;
/*
* bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
- * bit 3-7 : reserved.
+ * bit 3-6 : reserved;
+ * bit 7 : move the whole file into packed inode or not.
*/
__u8 h_clusterbits;
};
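
The reworked map header overlays a 32-bit h_fragmentoff on the former h_reserved1/h_idata_size pair, so fragment inodes and ztailpacking inodes share the same 4 bytes and the header stays 8 bytes. A standalone sketch of that overlay, with fixed-width host types standing in for the __le16/__le32 on-disk types:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct map_header {                 /* mirrors z_erofs_map_header above */
	union {
		uint32_t h_fragmentoff;        /* fragment inodes */
		struct {
			uint16_t h_reserved1;
			uint16_t h_idata_size; /* ztailpacking inodes */
		};
	};
	uint16_t h_advise;
	uint8_t  h_algorithmtype;
	uint8_t  h_clusterbits;
};

int main(void)
{
	/* the union costs nothing: the on-disk header stays 8 bytes */
	static_assert(sizeof(struct map_header) == 8, "must stay 8 bytes");
	static_assert(offsetof(struct map_header, h_advise) == 4, "layout");
	return 0;
}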
@@ -356,6 +373,9 @@ enum {
#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2
#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0
+/* (noncompact only, HEAD) This pcluster refers to partially decompressed data */
+#define Z_EROFS_VLE_DI_PARTIAL_REF (1 << 15)
+
/*
* D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
* compressed block count of a compressed extent (in logical clusters, aka.
@@ -365,17 +385,16 @@ enum {
struct z_erofs_vle_decompressed_index {
__le16 di_advise;
- /* where to decompress in the head cluster */
+ /* where to decompress in the head lcluster */
__le16 di_clusterofs;
union {
- /* for the head cluster */
+ /* for the HEAD lclusters */
__le32 blkaddr;
/*
- * for the rest clusters
- * eg. for 4k page-sized cluster, maximum 4K*64k = 256M)
- * [0] - pointing to the head cluster
- * [1] - pointing to the tail cluster
+ * for the NONHEAD lclusters
+ * [0] - distance to its HEAD lcluster
+ * [1] - distance to the next HEAD lcluster
*/
__le16 delta[2];
} di_u;
@@ -404,6 +423,10 @@ struct erofs_dirent {
/* check the EROFS on-disk layout strictly at compile time */
static inline void erofs_check_ondisk_layout_definitions(void)
{
+ const __le64 fmh = *(__le64 *)&(struct z_erofs_map_header) {
+ .h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT
+ };
+
BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128);
BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32);
BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64);
@@ -421,6 +444,9 @@ static inline void erofs_check_ondisk_layout_definitions(void)
BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
+ /* exclude old compiler versions like gcc 7.5.0 */
+ BUILD_BUG_ON(__builtin_constant_p(fmh) ?
+ fmh != cpu_to_le64(1ULL << 63) : 0);
}
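
The new BUILD_BUG_ON encodes a layout assumption: h_clusterbits must be the last byte of the header's first 8 bytes, so Z_EROFS_FRAGMENT_INODE_BIT (bit 7 of h_clusterbits) lands in bit 63 of the little-endian 64-bit word. A hedged userspace restatement of the same check (demonstrated only for a little-endian host here; the kernel's cpu_to_le64() form holds on both endiannesses):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct map_header {                 /* same 8-byte layout as above */
	uint32_t h_fragmentoff;
	uint16_t h_advise;
	uint8_t  h_algorithmtype;
	uint8_t  h_clusterbits;
};

int main(void)
{
	struct map_header h = { .h_clusterbits = 1u << 7 };
	uint64_t word;

	memcpy(&word, &h, sizeof(word));
	printf("fragment flag in bit 63: %s\n",
	       word == 1ULL << 63 ? "yes" : "no (big-endian host or padding)");
	return 0;
}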
#endif
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
new file mode 100644
index 000000000000..998cd26a1b3b
--- /dev/null
+++ b/fs/erofs/fscache.c
@@ -0,0 +1,671 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022, Alibaba Cloud
+ * Copyright (C) 2022, Bytedance Inc. All rights reserved.
+ */
+#include <linux/fscache.h>
+#include "internal.h"
+
+static DEFINE_MUTEX(erofs_domain_list_lock);
+static DEFINE_MUTEX(erofs_domain_cookies_lock);
+static LIST_HEAD(erofs_domain_list);
+static struct vfsmount *erofs_pseudo_mnt;
+
+static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping,
+ loff_t start, size_t len)
+{
+ struct netfs_io_request *rreq;
+
+ rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
+ if (!rreq)
+ return ERR_PTR(-ENOMEM);
+
+ rreq->start = start;
+ rreq->len = len;
+ rreq->mapping = mapping;
+ rreq->inode = mapping->host;
+ INIT_LIST_HEAD(&rreq->subrequests);
+ refcount_set(&rreq->ref, 1);
+ return rreq;
+}
+
+static void erofs_fscache_put_request(struct netfs_io_request *rreq)
+{
+ if (!refcount_dec_and_test(&rreq->ref))
+ return;
+ if (rreq->cache_resources.ops)
+ rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
+ kfree(rreq);
+}
+
+static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq)
+{
+ if (!refcount_dec_and_test(&subreq->ref))
+ return;
+ erofs_fscache_put_request(subreq->rreq);
+ kfree(subreq);
+}
+
+static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+
+ while (!list_empty(&rreq->subrequests)) {
+ subreq = list_first_entry(&rreq->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ list_del(&subreq->rreq_link);
+ erofs_fscache_put_subrequest(subreq);
+ }
+}
+
+static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct folio *folio;
+ unsigned int iopos = 0;
+ pgoff_t start_page = rreq->start / PAGE_SIZE;
+ pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
+ bool subreq_failed = false;
+
+ XA_STATE(xas, &rreq->mapping->i_pages, start_page);
+
+ subreq = list_first_entry(&rreq->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ subreq_failed = (subreq->error < 0);
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, last_page) {
+ unsigned int pgpos =
+ (folio_index(folio) - start_page) * PAGE_SIZE;
+ unsigned int pgend = pgpos + folio_size(folio);
+ bool pg_failed = false;
+
+ for (;;) {
+ if (!subreq) {
+ pg_failed = true;
+ break;
+ }
+
+ pg_failed |= subreq_failed;
+ if (pgend < iopos + subreq->len)
+ break;
+
+ iopos += subreq->len;
+ if (!list_is_last(&subreq->rreq_link,
+ &rreq->subrequests)) {
+ subreq = list_next_entry(subreq, rreq_link);
+ subreq_failed = (subreq->error < 0);
+ } else {
+ subreq = NULL;
+ subreq_failed = false;
+ }
+ if (pgend == iopos)
+ break;
+ }
+
+ if (!pg_failed)
+ folio_mark_uptodate(folio);
+
+ folio_unlock(folio);
+ }
+ rcu_read_unlock();
+}
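
The unlock walk above matches a list of variable-length subrequest extents against folio extents and marks a folio uptodate only if no overlapping subrequest failed. A compact model of that interval walk, with plain arrays standing in for the netfs structures:

#include <stdbool.h>
#include <stdio.h>

struct span { unsigned int len; bool failed; }; /* one "subrequest" */

int main(void)
{
	struct span sub[] = { {4096, false}, {8192, true}, {4096, false} };
	unsigned int nsub = 3, cur = 0, iopos = 0, folio_len = 4096;

	for (unsigned int pgpos = 0; pgpos < 16384; pgpos += folio_len) {
		unsigned int pgend = pgpos + folio_len;
		bool pg_failed = false;

		/* consume every span ending inside [pgpos, pgend) ... */
		while (cur < nsub && iopos + sub[cur].len <= pgend) {
			pg_failed |= sub[cur].failed;
			iopos += sub[cur].len;
			cur++;
		}
		/* ... and peek at a span straddling pgend */
		if (cur < nsub && iopos < pgend)
			pg_failed |= sub[cur].failed;

		printf("folio @%5u: %s\n", pgpos,
		       pg_failed ? "failed" : "uptodate");
	}
	return 0;
}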
+
+static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq)
+{
+ erofs_fscache_rreq_unlock_folios(rreq);
+ erofs_fscache_clear_subrequests(rreq);
+ erofs_fscache_put_request(rreq);
+}
+
+static void erofc_fscache_subreq_complete(void *priv,
+ ssize_t transferred_or_error, bool was_async)
+{
+ struct netfs_io_subrequest *subreq = priv;
+ struct netfs_io_request *rreq = subreq->rreq;
+
+ if (IS_ERR_VALUE(transferred_or_error))
+ subreq->error = transferred_or_error;
+
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ erofs_fscache_rreq_complete(rreq);
+
+ erofs_fscache_put_subrequest(subreq);
+}
+
+/*
+ * Read data from fscache into the page cache described by @rreq; both the
+ * start and the length shall be aligned with PAGE_SIZE. @pstart describes
+ * the start physical address in the cache file.
+ */
+static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
+ struct netfs_io_request *rreq, loff_t pstart)
+{
+ enum netfs_io_source source;
+ struct super_block *sb = rreq->mapping->host->i_sb;
+ struct netfs_io_subrequest *subreq;
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+ struct iov_iter iter;
+ loff_t start = rreq->start;
+ size_t len = rreq->len;
+ size_t done = 0;
+ int ret;
+
+ atomic_set(&rreq->nr_outstanding, 1);
+
+ ret = fscache_begin_read_operation(cres, cookie);
+ if (ret)
+ goto out;
+
+ while (done < len) {
+ subreq = kzalloc(sizeof(struct netfs_io_subrequest),
+ GFP_KERNEL);
+ if (subreq) {
+ INIT_LIST_HEAD(&subreq->rreq_link);
+ refcount_set(&subreq->ref, 2);
+ subreq->rreq = rreq;
+ refcount_inc(&rreq->ref);
+ } else {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ subreq->start = pstart + done;
+ subreq->len = len - done;
+ subreq->flags = 1 << NETFS_SREQ_ONDEMAND;
+
+ list_add_tail(&subreq->rreq_link, &rreq->subrequests);
+
+ source = cres->ops->prepare_read(subreq, LLONG_MAX);
+ if (WARN_ON(subreq->len == 0))
+ source = NETFS_INVALID_READ;
+ if (source != NETFS_READ_FROM_CACHE) {
+ erofs_err(sb, "failed to fscache prepare_read (source %d)",
+ source);
+ ret = -EIO;
+ subreq->error = ret;
+ erofs_fscache_put_subrequest(subreq);
+ goto out;
+ }
+
+ atomic_inc(&rreq->nr_outstanding);
+
+ iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
+ start + done, subreq->len);
+
+ ret = fscache_read(cres, subreq->start, &iter,
+ NETFS_READ_HOLE_FAIL,
+ erofc_fscache_subreq_complete, subreq);
+ if (ret == -EIOCBQUEUED)
+ ret = 0;
+ if (ret) {
+ erofs_err(sb, "failed to fscache_read (ret %d)", ret);
+ goto out;
+ }
+
+ done += subreq->len;
+ }
+out:
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ erofs_fscache_rreq_complete(rreq);
+
+ return ret;
+}
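
Completion here relies on the usual seed-at-one pattern: nr_outstanding starts at 1 for the submitter, each subrequest adds one, and whoever performs the final decrement, whether the last async completion or the fall-through at out:, completes the request exactly once. A minimal model using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int nr_outstanding = 1; /* seed: the submitter's reference */

static void complete_request(void)
{
	puts("request complete (exactly once)");
}

static void subreq_done(void)
{
	if (atomic_fetch_sub(&nr_outstanding, 1) == 1)
		complete_request();
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		atomic_fetch_add(&nr_outstanding, 1); /* one per subrequest */
		subreq_done();                        /* pretend it finished */
	}
	subreq_done(); /* the submitter drops its seed reference last */
	return 0;
}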
+
+static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
+{
+ int ret;
+ struct super_block *sb = folio_mapping(folio)->host->i_sb;
+ struct netfs_io_request *rreq;
+ struct erofs_map_dev mdev = {
+ .m_deviceid = 0,
+ .m_pa = folio_pos(folio),
+ };
+
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ goto out;
+
+ rreq = erofs_fscache_alloc_request(folio_mapping(folio),
+ folio_pos(folio), folio_size(folio));
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
+ goto out;
+ }
+
+ return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ rreq, mdev.m_pa);
+out:
+ folio_unlock(folio);
+ return ret;
+}
+
+/*
+ * Read into the page cache in the range described by (@pos, @len).
+ *
+ * On return, the caller is responsible for page unlocking if the output
+ * @unlock is true; otherwise the callee takes that responsibility through
+ * the netfs_io_request interface.
+ *
+ * The return value is the number of bytes successfully handled, or a negative
+ * error code on failure. The one exception: once the netfs_io_request has
+ * been allocated, the length of the range is returned instead of the error
+ * code on failure, so that .readahead() can still advance rac accordingly.
+ */
+static int erofs_fscache_data_read(struct address_space *mapping,
+ loff_t pos, size_t len, bool *unlock)
+{
+ struct inode *inode = mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct netfs_io_request *rreq;
+ struct erofs_map_blocks map;
+ struct erofs_map_dev mdev;
+ struct iov_iter iter;
+ size_t count;
+ int ret;
+
+ *unlock = true;
+
+ map.m_la = pos;
+ ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ if (ret)
+ return ret;
+
+ if (map.m_flags & EROFS_MAP_META) {
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ erofs_blk_t blknr;
+ size_t offset, size;
+ void *src;
+
+ /* For tail packing layout, the offset may be non-zero. */
+ offset = erofs_blkoff(map.m_pa);
+ blknr = erofs_blknr(map.m_pa);
+ size = map.m_llen;
+
+ src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
+ if (IS_ERR(src))
+ return PTR_ERR(src);
+
+ iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, PAGE_SIZE);
+ if (copy_to_iter(src + offset, size, &iter) != size)
+ return -EFAULT;
+ iov_iter_zero(PAGE_SIZE - size, &iter);
+ erofs_put_metabuf(&buf);
+ return PAGE_SIZE;
+ }
+
+ count = min_t(size_t, map.m_llen - (pos - map.m_la), len);
+ DBG_BUGON(!count || count % PAGE_SIZE);
+
+ if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count);
+ iov_iter_zero(count, &iter);
+ return count;
+ }
+
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ return ret;
+
+ rreq = erofs_fscache_alloc_request(mapping, pos, count);
+ if (IS_ERR(rreq))
+ return PTR_ERR(rreq);
+
+ *unlock = false;
+ erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ rreq, mdev.m_pa + (pos - map.m_la));
+ return count;
+}
+
+static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
+{
+ bool unlock;
+ int ret;
+
+ DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ);
+
+ ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio),
+ folio_size(folio), &unlock);
+ if (unlock) {
+ if (ret > 0)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ }
+ return ret < 0 ? ret : 0;
+}
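
The contract described above erofs_fscache_data_read() is easy to misread, so here is the caller side in miniature: the helper returns bytes handled (or a negative error) plus an unlock flag saying whether the caller still owns the folio lock. Names below are illustrative stand-ins, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

static void folio_mark_uptodate(void) { puts("uptodate"); }
static void folio_unlock(void)        { puts("unlocked"); }

/* hypothetical: returns bytes handled, <0 on error; *unlock reports
 * whether the caller still owns the folio lock (the async path hands
 * the lock to the request's completion handler instead) */
static int data_read(bool async_path, bool *unlock)
{
	*unlock = !async_path;
	return 4096;
}

int main(void)
{
	bool unlock;
	int ret = data_read(false, &unlock); /* synchronous path */

	if (unlock) {          /* mirrors erofs_fscache_read_folio() */
		if (ret > 0)
			folio_mark_uptodate();
		folio_unlock();
	}
	return ret < 0 ? 1 : 0;
}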
+
+static void erofs_fscache_readahead(struct readahead_control *rac)
+{
+ struct folio *folio;
+ size_t len, done = 0;
+ loff_t start, pos;
+ bool unlock;
+ int ret, size;
+
+ if (!readahead_count(rac))
+ return;
+
+ start = readahead_pos(rac);
+ len = readahead_length(rac);
+
+ do {
+ pos = start + done;
+ ret = erofs_fscache_data_read(rac->mapping, pos,
+ len - done, &unlock);
+ if (ret <= 0)
+ return;
+
+ size = ret;
+ while (size) {
+ folio = readahead_folio(rac);
+ size -= folio_size(folio);
+ if (unlock) {
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ }
+ }
+ } while ((done += ret) < len);
+}
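
The readahead loop slices the window one mapping extent at a time: each erofs_fscache_data_read() call reports how many bytes it handled, and the loop consumes exactly that many bytes of folios before issuing the next call. A small model of the slicing, assuming a fixed illustrative extent size:

#include <stdio.h>

/* hypothetical: each call handles one mapping extent, capped at the
 * remaining window; a real implementation may also return an error */
static int data_read(unsigned long remaining)
{
	unsigned long extent = 6 * 4096; /* illustrative extent size */

	return (int)(remaining < extent ? remaining : extent);
}

int main(void)
{
	unsigned long len = 16 * 4096, done = 0;
	int ret;

	do {
		ret = data_read(len - done);
		if (ret <= 0)
			break;
		/* consume 'ret' bytes worth of folios here */
		printf("handled [%lu, %lu)\n", done, done + ret);
	} while ((done += ret) < len);
	return 0;
}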
+
+static const struct address_space_operations erofs_fscache_meta_aops = {
+ .read_folio = erofs_fscache_meta_read_folio,
+};
+
+const struct address_space_operations erofs_fscache_access_aops = {
+ .read_folio = erofs_fscache_read_folio,
+ .readahead = erofs_fscache_readahead,
+};
+
+static void erofs_fscache_domain_put(struct erofs_domain *domain)
+{
+ if (!domain)
+ return;
+ mutex_lock(&erofs_domain_list_lock);
+ if (refcount_dec_and_test(&domain->ref)) {
+ list_del(&domain->list);
+ if (list_empty(&erofs_domain_list)) {
+ kern_unmount(erofs_pseudo_mnt);
+ erofs_pseudo_mnt = NULL;
+ }
+ mutex_unlock(&erofs_domain_list_lock);
+ fscache_relinquish_volume(domain->volume, NULL, false);
+ kfree(domain->domain_id);
+ kfree(domain);
+ return;
+ }
+ mutex_unlock(&erofs_domain_list_lock);
+}
+
+static int erofs_fscache_register_volume(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ char *domain_id = sbi->opt.domain_id;
+ struct fscache_volume *volume;
+ char *name;
+ int ret = 0;
+
+ name = kasprintf(GFP_KERNEL, "erofs,%s",
+ domain_id ? domain_id : sbi->opt.fsid);
+ if (!name)
+ return -ENOMEM;
+
+ volume = fscache_acquire_volume(name, NULL, NULL, 0);
+ if (IS_ERR_OR_NULL(volume)) {
+ erofs_err(sb, "failed to register volume for %s", name);
+ ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP;
+ volume = NULL;
+ }
+
+ sbi->volume = volume;
+ kfree(name);
+ return ret;
+}
+
+static int erofs_fscache_init_domain(struct super_block *sb)
+{
+ int err;
+ struct erofs_domain *domain;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ domain = kzalloc(sizeof(struct erofs_domain), GFP_KERNEL);
+ if (!domain)
+ return -ENOMEM;
+
+ domain->domain_id = kstrdup(sbi->opt.domain_id, GFP_KERNEL);
+ if (!domain->domain_id) {
+ kfree(domain);
+ return -ENOMEM;
+ }
+
+ err = erofs_fscache_register_volume(sb);
+ if (err)
+ goto out;
+
+ if (!erofs_pseudo_mnt) {
+ erofs_pseudo_mnt = kern_mount(&erofs_fs_type);
+ if (IS_ERR(erofs_pseudo_mnt)) {
+ err = PTR_ERR(erofs_pseudo_mnt);
+ goto out;
+ }
+ }
+
+ domain->volume = sbi->volume;
+ refcount_set(&domain->ref, 1);
+ list_add(&domain->list, &erofs_domain_list);
+ sbi->domain = domain;
+ return 0;
+out:
+ kfree(domain->domain_id);
+ kfree(domain);
+ return err;
+}
+
+static int erofs_fscache_register_domain(struct super_block *sb)
+{
+ int err;
+ struct erofs_domain *domain;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ mutex_lock(&erofs_domain_list_lock);
+ list_for_each_entry(domain, &erofs_domain_list, list) {
+ if (!strcmp(domain->domain_id, sbi->opt.domain_id)) {
+ sbi->domain = domain;
+ sbi->volume = domain->volume;
+ refcount_inc(&domain->ref);
+ mutex_unlock(&erofs_domain_list_lock);
+ return 0;
+ }
+ }
+ err = erofs_fscache_init_domain(sb);
+ mutex_unlock(&erofs_domain_list_lock);
+ return err;
+}
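
Domain registration is a lookup-or-create under a single mutex: an existing domain with a matching domain_id is shared by bumping its refcount, otherwise a fresh one is initialized and linked into the global list. A self-contained model of that pattern (a pthread mutex in place of erofs_domain_list_lock):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct domain {
	char id[32];
	int ref;
	struct domain *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct domain *domains;

static struct domain *get_domain(const char *id)
{
	struct domain *d;

	pthread_mutex_lock(&lock);
	for (d = domains; d; d = d->next) {
		if (!strcmp(d->id, id)) {
			d->ref++;       /* share the existing domain */
			pthread_mutex_unlock(&lock);
			return d;
		}
	}
	d = calloc(1, sizeof(*d));      /* otherwise create and link it */
	if (d) {
		snprintf(d->id, sizeof(d->id), "%s", id);
		d->ref = 1;
		d->next = domains;
		domains = d;
	}
	pthread_mutex_unlock(&lock);
	return d;
}

int main(void)
{
	struct domain *a = get_domain("domain1");
	struct domain *b = get_domain("domain1");

	printf("shared: %s, ref=%d\n", a == b ? "yes" : "no", a->ref);
	return 0;
}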
+
+static
+struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
+ char *name, bool need_inode)
+{
+ struct fscache_volume *volume = EROFS_SB(sb)->volume;
+ struct erofs_fscache *ctx;
+ struct fscache_cookie *cookie;
+ int ret;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE,
+ name, strlen(name), NULL, 0, 0);
+ if (!cookie) {
+ erofs_err(sb, "failed to get cookie for %s", name);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ fscache_use_cookie(cookie, false);
+ ctx->cookie = cookie;
+
+ if (need_inode) {
+ struct inode *const inode = new_inode(sb);
+
+ if (!inode) {
+ erofs_err(sb, "failed to get anon inode for %s", name);
+ ret = -ENOMEM;
+ goto err_cookie;
+ }
+
+ set_nlink(inode, 1);
+ inode->i_size = OFFSET_MAX;
+ inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+
+ ctx->inode = inode;
+ }
+
+ return ctx;
+
+err_cookie:
+ fscache_unuse_cookie(ctx->cookie, NULL, NULL);
+ fscache_relinquish_cookie(ctx->cookie, false);
+err:
+ kfree(ctx);
+ return ERR_PTR(ret);
+}
+
+static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx)
+{
+ fscache_unuse_cookie(ctx->cookie, NULL, NULL);
+ fscache_relinquish_cookie(ctx->cookie, false);
+ iput(ctx->inode);
+ kfree(ctx->name);
+ kfree(ctx);
+}
+
+static
+struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb,
+ char *name, bool need_inode)
+{
+ int err;
+ struct inode *inode;
+ struct erofs_fscache *ctx;
+ struct erofs_domain *domain = EROFS_SB(sb)->domain;
+
+ ctx = erofs_fscache_acquire_cookie(sb, name, need_inode);
+ if (IS_ERR(ctx))
+ return ctx;
+
+ ctx->name = kstrdup(name, GFP_KERNEL);
+ if (!ctx->name) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ inode = new_inode(erofs_pseudo_mnt->mnt_sb);
+ if (!inode) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ ctx->domain = domain;
+ ctx->anon_inode = inode;
+ inode->i_private = ctx;
+ refcount_inc(&domain->ref);
+ return ctx;
+out:
+ erofs_fscache_relinquish_cookie(ctx);
+ return ERR_PTR(err);
+}
+
+static
+struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
+ char *name, bool need_inode)
+{
+ struct inode *inode;
+ struct erofs_fscache *ctx;
+ struct erofs_domain *domain = EROFS_SB(sb)->domain;
+ struct super_block *psb = erofs_pseudo_mnt->mnt_sb;
+
+ mutex_lock(&erofs_domain_cookies_lock);
+ list_for_each_entry(inode, &psb->s_inodes, i_sb_list) {
+ ctx = inode->i_private;
+ if (!ctx || ctx->domain != domain || strcmp(ctx->name, name))
+ continue;
+ igrab(inode);
+ mutex_unlock(&erofs_domain_cookies_lock);
+ return ctx;
+ }
+ ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode);
+ mutex_unlock(&erofs_domain_cookies_lock);
+ return ctx;
+}
+
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+ char *name, bool need_inode)
+{
+ if (EROFS_SB(sb)->opt.domain_id)
+ return erofs_domain_register_cookie(sb, name, need_inode);
+ return erofs_fscache_acquire_cookie(sb, name, need_inode);
+}
+
+void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx)
+{
+ bool drop;
+ struct erofs_domain *domain;
+
+ if (!ctx)
+ return;
+ domain = ctx->domain;
+ if (domain) {
+ mutex_lock(&erofs_domain_cookies_lock);
+ drop = atomic_read(&ctx->anon_inode->i_count) == 1;
+ iput(ctx->anon_inode);
+ mutex_unlock(&erofs_domain_cookies_lock);
+ if (!drop)
+ return;
+ }
+
+ erofs_fscache_relinquish_cookie(ctx);
+ erofs_fscache_domain_put(domain);
+}
+
+int erofs_fscache_register_fs(struct super_block *sb)
+{
+ int ret;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_fscache *fscache;
+
+ if (sbi->opt.domain_id)
+ ret = erofs_fscache_register_domain(sb);
+ else
+ ret = erofs_fscache_register_volume(sb);
+ if (ret)
+ return ret;
+
+ /* acquired domain/volume will be relinquished in kill_sb() on error */
+ fscache = erofs_fscache_register_cookie(sb, sbi->opt.fsid, true);
+ if (IS_ERR(fscache))
+ return PTR_ERR(fscache);
+
+ sbi->s_fscache = fscache;
+ return 0;
+}
+
+void erofs_fscache_unregister_fs(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ erofs_fscache_unregister_cookie(sbi->s_fscache);
+
+ if (sbi->domain)
+ erofs_fscache_domain_put(sbi->domain);
+ else
+ fscache_relinquish_volume(sbi->volume, NULL, false);
+
+ sbi->s_fscache = NULL;
+ sbi->volume = NULL;
+ sbi->domain = NULL;
+}
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index e8b37ba5e9ad..ad2a82f2eb4c 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -8,11 +8,6 @@
#include <trace/events/erofs.h>
-/*
- * if inode is successfully read, return its inode page (or sometimes
- * the inode payload page if it's an extended inode) in order to fill
- * inline data if possible.
- */
static void *erofs_read_inode(struct erofs_buf *buf,
struct inode *inode, unsigned int *ofs)
{
@@ -219,7 +214,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr,
/* if it cannot be handled with fast symlink scheme */
if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
- inode->i_size >= EROFS_BLKSIZ) {
+ inode->i_size >= EROFS_BLKSIZ || inode->i_size < 0) {
inode->i_op = &erofs_symlink_iops;
return 0;
}
@@ -246,7 +241,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr,
return 0;
}
-static int erofs_fill_inode(struct inode *inode, int isdir)
+static int erofs_fill_inode(struct inode *inode)
{
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
@@ -254,7 +249,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
unsigned int ofs;
int err = 0;
- trace_erofs_fill_inode(inode, isdir);
+ trace_erofs_fill_inode(inode);
/* read inode base data from disk */
kaddr = erofs_read_inode(&buf, inode, &ofs);
@@ -293,10 +288,17 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
}
if (erofs_inode_is_data_compressed(vi->datalayout)) {
- err = z_erofs_fill_inode(inode);
+ if (!erofs_is_fscache_mode(inode->i_sb))
+ err = z_erofs_fill_inode(inode);
+ else
+ err = -EOPNOTSUPP;
goto out_unlock;
}
inode->i_mapping->a_ops = &erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ if (erofs_is_fscache_mode(inode->i_sb))
+ inode->i_mapping->a_ops = &erofs_fscache_access_aops;
+#endif
out_unlock:
erofs_put_metabuf(&buf);
@@ -322,21 +324,13 @@ static int erofs_iget_set_actor(struct inode *inode, void *opaque)
return 0;
}
-static inline struct inode *erofs_iget_locked(struct super_block *sb,
- erofs_nid_t nid)
+struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid)
{
const unsigned long hashval = erofs_inode_hash(nid);
+ struct inode *inode;
- return iget5_locked(sb, hashval, erofs_ilookup_test_actor,
+ inode = iget5_locked(sb, hashval, erofs_ilookup_test_actor,
erofs_iget_set_actor, &nid);
-}
-
-struct inode *erofs_iget(struct super_block *sb,
- erofs_nid_t nid,
- bool isdir)
-{
- struct inode *inode = erofs_iget_locked(sb, nid);
-
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -346,10 +340,10 @@ struct inode *erofs_iget(struct super_block *sb,
vi->nid = nid;
- err = erofs_fill_inode(inode, isdir);
- if (!err)
+ err = erofs_fill_inode(inode);
+ if (!err) {
unlock_new_inode(inode);
- else {
+ } else {
iget_failed(inode);
inode = ERR_PTR(err);
}
@@ -370,7 +364,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
return 0;
}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 5298c4ee277d..1701df48c446 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -49,6 +49,7 @@ typedef u32 erofs_blk_t;
struct erofs_device_info {
char *path;
+ struct erofs_fscache *fscache;
struct block_device *bdev;
struct dax_device *dax_dev;
u64 dax_part_off;
@@ -74,6 +75,8 @@ struct erofs_mount_opts {
unsigned int max_sync_decompress_pages;
#endif
unsigned int mount_opt;
+ char *fsid;
+ char *domain_id;
};
struct erofs_dev_context {
@@ -96,6 +99,21 @@ struct erofs_sb_lz4_info {
u16 max_pclusterblks;
};
+struct erofs_domain {
+ refcount_t ref;
+ struct list_head list;
+ struct fscache_volume *volume;
+ char *domain_id;
+};
+
+struct erofs_fscache {
+ struct fscache_cookie *cookie;
+ struct inode *inode;
+ struct inode *anon_inode;
+ struct erofs_domain *domain;
+ char *name;
+};
+
struct erofs_sb_info {
struct erofs_mount_opts opt; /* options */
#ifdef CONFIG_EROFS_FS_ZIP
@@ -113,6 +131,7 @@ struct erofs_sb_info {
struct inode *managed_cache;
struct erofs_sb_lz4_info lz4;
+ struct inode *packed_inode;
#endif /* CONFIG_EROFS_FS_ZIP */
struct erofs_dev_context *devs;
struct dax_device *dax_dev;
@@ -146,6 +165,11 @@ struct erofs_sb_info {
/* sysfs support */
struct kobject s_kobj; /* /sys/fs/erofs/<devname> */
struct completion s_kobj_unregister;
+
+ /* fscache support */
+ struct fscache_volume *volume;
+ struct erofs_fscache *s_fscache;
+ struct erofs_domain *domain;
};
#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
@@ -161,13 +185,17 @@ struct erofs_sb_info {
#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
+static inline bool erofs_is_fscache_mode(struct super_block *sb)
+{
+ return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev;
+}
+
enum {
EROFS_ZIP_CACHE_DISABLED,
EROFS_ZIP_CACHE_READAHEAD,
EROFS_ZIP_CACHE_READAROUND
};
-#ifdef CONFIG_EROFS_FS_ZIP
#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL)
/* basic unit of the workstation of a super_block */
@@ -179,7 +207,6 @@ struct erofs_workgroup {
atomic_t refcount;
};
-#if defined(CONFIG_SMP)
static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
int val)
{
@@ -208,35 +235,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
return atomic_cond_read_relaxed(&grp->refcount,
VAL != EROFS_LOCKED_MAGIC);
}
-#else
-static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
- int val)
-{
- preempt_disable();
- /* no need to spin on UP platforms, let's just disable preemption. */
- if (val != atomic_read(&grp->refcount)) {
- preempt_enable();
- return false;
- }
- return true;
-}
-
-static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
- int orig_val)
-{
- preempt_enable();
-}
-
-static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
-{
- int v = atomic_read(&grp->refcount);
-
- /* workgroup is never freezed on uniprocessor systems */
- DBG_BUGON(v == EROFS_LOCKED_MAGIC);
- return v;
-}
-#endif /* !CONFIG_SMP */
-#endif /* !CONFIG_EROFS_FS_ZIP */
/* we strictly follow PAGE_SIZE and no buffer head yet */
#define LOG_BLOCK_SIZE PAGE_SHIFT
@@ -290,6 +288,8 @@ EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE)
EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE)
EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2)
EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING)
+EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
+EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
/* atomic flag definitions */
@@ -325,8 +325,13 @@ struct erofs_inode {
unsigned char z_algorithmtype[2];
unsigned char z_logical_clusterbits;
unsigned long z_tailextent_headlcn;
- erofs_off_t z_idataoff;
- unsigned short z_idata_size;
+ union {
+ struct {
+ erofs_off_t z_idataoff;
+ unsigned short z_idata_size;
+ };
+ erofs_off_t z_fragmentoff;
+ };
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
@@ -377,38 +382,16 @@ struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
}
extern const struct super_operations erofs_sops;
+extern struct file_system_type erofs_fs_type;
extern const struct address_space_operations erofs_raw_access_aops;
extern const struct address_space_operations z_erofs_aops;
-/*
- * Logical to physical block mapping
- *
- * Different with other file systems, it is used for 2 access modes:
- *
- * 1) RAW access mode:
- *
- * Users pass a valid (m_lblk, m_lofs -- usually 0) pair,
- * and get the valid m_pblk, m_pofs and the longest m_len(in bytes).
- *
- * Note that m_lblk in the RAW access mode refers to the number of
- * the compressed ondisk block rather than the uncompressed
- * in-memory block for the compressed file.
- *
- * m_pofs equals to m_lofs except for the inline data page.
- *
- * 2) Normal access mode:
- *
- * If the inode is not compressed, it has no difference with
- * the RAW access mode. However, if the inode is compressed,
- * users should pass a valid (m_lblk, m_lofs) pair, and get
- * the needed m_pblk, m_pofs, m_len to get the compressed data
- * and the updated m_lblk, m_lofs which indicates the start
- * of the corresponding uncompressed data in the file.
- */
enum {
BH_Encoded = BH_PrivateStart,
BH_FullMapped,
+ BH_Fragment,
+ BH_Partialref,
};
/* Has a disk mapping */
@@ -419,6 +402,10 @@ enum {
#define EROFS_MAP_ENCODED (1 << BH_Encoded)
/* The length of extent is full */
#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped)
+/* Located in the special packed inode */
+#define EROFS_MAP_FRAGMENT (1 << BH_Fragment)
+/* The extent refers to partially decompressed data */
+#define EROFS_MAP_PARTIAL_REF (1 << BH_Partialref)
struct erofs_map_blocks {
struct erofs_buf buf;
@@ -440,11 +427,12 @@ struct erofs_map_blocks {
#define EROFS_GET_BLOCKS_FIEMAP 0x0002
/* Used to map the whole extent if non-negligible data is requested for LZMA */
#define EROFS_GET_BLOCKS_READMORE 0x0004
-/* Used to map tail extent for tailpacking inline pcluster */
+/* Used to map tail extent for tailpacking inline or fragment pcluster */
#define EROFS_GET_BLOCKS_FINDTAIL 0x0008
enum {
Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+ Z_EROFS_COMPRESSION_INTERLACED,
Z_EROFS_COMPRESSION_RUNTIME_MAX
};
@@ -467,6 +455,7 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
#endif /* !CONFIG_EROFS_FS_ZIP */
struct erofs_map_dev {
+ struct erofs_fscache *m_fscache;
struct block_device *m_bdev;
struct dax_device *m_daxdev;
u64 m_dax_part_off;
@@ -486,6 +475,8 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
+int erofs_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map, int flags);
/* inode.c */
static inline unsigned long erofs_inode_hash(erofs_nid_t nid)
@@ -501,7 +492,7 @@ extern const struct inode_operations erofs_generic_iops;
extern const struct inode_operations erofs_symlink_iops;
extern const struct inode_operations erofs_fast_symlink_iops;
-struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir);
+struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags);
@@ -509,7 +500,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
/* namei.c */
extern const struct inode_operations erofs_dir_iops;
-int erofs_namei(struct inode *dir, struct qstr *name,
+int erofs_namei(struct inode *dir, const struct qstr *name,
erofs_nid_t *nid, unsigned int *d_type);
/* dir.c */
@@ -611,6 +602,35 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
+/* fscache.c */
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+int erofs_fscache_register_fs(struct super_block *sb);
+void erofs_fscache_unregister_fs(struct super_block *sb);
+
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+ char *name, bool need_inode);
+void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache);
+
+extern const struct address_space_operations erofs_fscache_access_aops;
+#else
+static inline int erofs_fscache_register_fs(struct super_block *sb)
+{
+ return -EOPNOTSUPP;
+}
+static inline void erofs_fscache_unregister_fs(struct super_block *sb) {}
+
+static inline
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+ char *name, bool need_inode)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache)
+{
+}
+#endif
+
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#endif /* __EROFS_INTERNAL_H */
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index 554efa363317..0dc34721080c 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -165,9 +165,8 @@ out: /* free if the candidate is valid */
return candidate;
}
-int erofs_namei(struct inode *dir,
- struct qstr *name,
- erofs_nid_t *nid, unsigned int *d_type)
+int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
+ unsigned int *d_type)
{
int ndirents;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
@@ -186,7 +185,6 @@ int erofs_namei(struct inode *dir,
if (IS_ERR(de))
return PTR_ERR(de);
- /* the target page has been mapped */
if (ndirents)
de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents);
@@ -198,9 +196,7 @@ int erofs_namei(struct inode *dir,
return PTR_ERR_OR_ZERO(de);
}
-/* NOTE: i_mutex is already held by vfs */
-static struct dentry *erofs_lookup(struct inode *dir,
- struct dentry *dentry,
+static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
int err;
@@ -208,17 +204,11 @@ static struct dentry *erofs_lookup(struct inode *dir,
unsigned int d_type;
struct inode *inode;
- DBG_BUGON(!d_really_is_negative(dentry));
- /* dentry must be unhashed in lookup, no need to worry about */
- DBG_BUGON(!d_unhashed(dentry));
-
trace_erofs_lookup(dir, dentry, flags);
- /* file name exceeds fs limit */
if (dentry->d_name.len > EROFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- /* false uninitialized warnings on gcc 4.8.x */
err = erofs_namei(dir, &dentry->d_name, &nid, &d_type);
if (err == -ENOENT) {
@@ -229,7 +219,7 @@ static struct dentry *erofs_lookup(struct inode *dir,
} else {
erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__,
dentry, nid, d_type);
- inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR);
+ inode = erofs_iget(dir->i_sb, nid);
}
return d_splice_alias(inode, dentry);
}
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 0c4b41130c2f..2cf96ce1c32e 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -13,6 +13,7 @@
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/dax.h>
+#include <linux/exportfs.h>
#include "xattr.h"
#define CREATE_TRACE_POINTS
@@ -219,7 +220,53 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
}
#endif
-static int erofs_init_devices(struct super_block *sb,
+static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
+ struct erofs_device_info *dif, erofs_off_t *pos)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_fscache *fscache;
+ struct erofs_deviceslot *dis;
+ struct block_device *bdev;
+ void *ptr;
+
+ ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+ dis = ptr + erofs_blkoff(*pos);
+
+ if (!dif->path) {
+ if (!dis->tag[0]) {
+ erofs_err(sb, "empty device tag @ pos %llu", *pos);
+ return -EINVAL;
+ }
+ dif->path = kmemdup_nul(dis->tag, sizeof(dis->tag), GFP_KERNEL);
+ if (!dif->path)
+ return -ENOMEM;
+ }
+
+ if (erofs_is_fscache_mode(sb)) {
+ fscache = erofs_fscache_register_cookie(sb, dif->path, false);
+ if (IS_ERR(fscache))
+ return PTR_ERR(fscache);
+ dif->fscache = fscache;
+ } else {
+ bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL,
+ sb->s_type);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+ dif->bdev = bdev;
+ dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
+ NULL, NULL);
+ }
+
+ dif->blocks = le32_to_cpu(dis->blocks);
+ dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+ sbi->total_blocks += dif->blocks;
+ *pos += EROFS_DEVT_SLOT_SIZE;
+ return 0;
+}
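
When no device= paths were supplied at mount time, erofs_init_device() recovers the path from the fixed 64-byte on-disk tag via kmemdup_nul(), i.e. copy sizeof(tag) bytes and force NUL termination. A userspace model of that copy:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* copy exactly tagsz bytes and always NUL-terminate (like kmemdup_nul) */
static char *tag_to_path(const unsigned char *tag, size_t tagsz)
{
	char *path = malloc(tagsz + 1);

	if (!path)
		return NULL;
	memcpy(path, tag, tagsz);
	path[tagsz] = '\0';
	return path;
}

int main(void)
{
	unsigned char tag[64] = "/dev/loop1"; /* zero-padded on disk */
	char *path = tag_to_path(tag, sizeof(tag));

	printf("device path: %s\n", path ? path : "(oom)");
	free(path);
	return 0;
}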
+
+static int erofs_scan_devices(struct super_block *sb,
struct erofs_super_block *dsb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -227,8 +274,6 @@ static int erofs_init_devices(struct super_block *sb,
erofs_off_t pos;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_device_info *dif;
- struct erofs_deviceslot *dis;
- void *ptr;
int id, err = 0;
sbi->total_blocks = sbi->primarydevice_blocks;
@@ -237,7 +282,8 @@ static int erofs_init_devices(struct super_block *sb,
else
ondisk_extradevs = le16_to_cpu(dsb->extra_devices);
- if (ondisk_extradevs != sbi->devs->extra_devices) {
+ if (sbi->devs->extra_devices &&
+ ondisk_extradevs != sbi->devs->extra_devices) {
erofs_err(sb, "extra devices don't match (ondisk %u, given %u)",
ondisk_extradevs, sbi->devs->extra_devices);
return -EINVAL;
@@ -248,30 +294,31 @@ static int erofs_init_devices(struct super_block *sb,
sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1;
pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE;
down_read(&sbi->devs->rwsem);
- idr_for_each_entry(&sbi->devs->tree, dif, id) {
- struct block_device *bdev;
-
- ptr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos),
- EROFS_KMAP);
- if (IS_ERR(ptr)) {
- err = PTR_ERR(ptr);
- break;
+ if (sbi->devs->extra_devices) {
+ idr_for_each_entry(&sbi->devs->tree, dif, id) {
+ err = erofs_init_device(&buf, sb, dif, &pos);
+ if (err)
+ break;
}
- dis = ptr + erofs_blkoff(pos);
-
- bdev = blkdev_get_by_path(dif->path,
- FMODE_READ | FMODE_EXCL,
- sb->s_type);
- if (IS_ERR(bdev)) {
- err = PTR_ERR(bdev);
- break;
+ } else {
+ for (id = 0; id < ondisk_extradevs; id++) {
+ dif = kzalloc(sizeof(*dif), GFP_KERNEL);
+ if (!dif) {
+ err = -ENOMEM;
+ break;
+ }
+
+ err = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL);
+ if (err < 0) {
+ kfree(dif);
+ break;
+ }
+ ++sbi->devs->extra_devices;
+
+ err = erofs_init_device(&buf, sb, dif, &pos);
+ if (err)
+ break;
}
- dif->bdev = bdev;
- dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off);
- dif->blocks = le32_to_cpu(dis->blocks);
- dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
- sbi->total_blocks += dif->blocks;
- pos += EROFS_DEVT_SLOT_SIZE;
}
up_read(&sbi->devs->rwsem);
erofs_put_metabuf(&buf);
@@ -334,6 +381,17 @@ static int erofs_read_superblock(struct super_block *sb)
#endif
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
sbi->root_nid = le16_to_cpu(dsb->root_nid);
+#ifdef CONFIG_EROFS_FS_ZIP
+ sbi->packed_inode = NULL;
+ if (erofs_sb_has_fragments(sbi) && dsb->packed_nid) {
+ sbi->packed_inode =
+ erofs_iget(sb, le64_to_cpu(dsb->packed_nid));
+ if (IS_ERR(sbi->packed_inode)) {
+ ret = PTR_ERR(sbi->packed_inode);
+ goto out;
+ }
+ }
+#endif
sbi->inos = le64_to_cpu(dsb->inos);
sbi->build_time = le64_to_cpu(dsb->build_time);
@@ -358,10 +416,16 @@ static int erofs_read_superblock(struct super_block *sb)
goto out;
/* handle multiple devices */
- ret = erofs_init_devices(sb, dsb);
+ ret = erofs_scan_devices(sb, dsb);
if (erofs_sb_has_ztailpacking(sbi))
erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!");
+ if (erofs_is_fscache_mode(sb))
+ erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
+ if (erofs_sb_has_fragments(sbi))
+ erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!");
+ if (erofs_sb_has_dedupe(sbi))
+ erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!");
out:
erofs_put_metabuf(&buf);
return ret;
@@ -390,6 +454,8 @@ enum {
Opt_dax,
Opt_dax_enum,
Opt_device,
+ Opt_fsid,
+ Opt_domain_id,
Opt_err
};
@@ -414,6 +480,8 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums),
fsparam_string("device", Opt_device),
+ fsparam_string("fsid", Opt_fsid),
+ fsparam_string("domain_id", Opt_domain_id),
{}
};
@@ -509,6 +577,26 @@ static int erofs_fc_parse_param(struct fs_context *fc,
}
++ctx->devs->extra_devices;
break;
+ case Opt_fsid:
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ kfree(ctx->opt.fsid);
+ ctx->opt.fsid = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->opt.fsid)
+ return -ENOMEM;
+#else
+ errorfc(fc, "fsid option not supported");
+#endif
+ break;
+ case Opt_domain_id:
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ kfree(ctx->opt.domain_id);
+ ctx->opt.domain_id = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->opt.domain_id)
+ return -ENOMEM;
+#else
+ errorfc(fc, "domain_id option not supported");
+#endif
+ break;
default:
return -ENOPARAM;
}
@@ -518,16 +606,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
#ifdef CONFIG_EROFS_FS_ZIP
static const struct address_space_operations managed_cache_aops;
-static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
+static bool erofs_managed_cache_release_folio(struct folio *folio, gfp_t gfp)
{
- int ret = 1; /* 0 - busy */
- struct address_space *const mapping = page->mapping;
+ bool ret = true;
+ struct address_space *const mapping = folio->mapping;
- DBG_BUGON(!PageLocked(page));
+ DBG_BUGON(!folio_test_locked(folio));
DBG_BUGON(mapping->a_ops != &managed_cache_aops);
- if (PagePrivate(page))
- ret = erofs_try_to_free_cached_page(page);
+ if (folio_test_private(folio))
+ ret = erofs_try_to_free_cached_page(&folio->page);
return ret;
}
@@ -548,12 +636,12 @@ static void erofs_managed_cache_invalidate_folio(struct folio *folio,
DBG_BUGON(stop > folio_size(folio) || stop < length);
if (offset == 0 && stop == folio_size(folio))
- while (!erofs_managed_cache_releasepage(&folio->page, GFP_NOFS))
+ while (!erofs_managed_cache_release_folio(folio, GFP_NOFS))
cond_resched();
}
static const struct address_space_operations managed_cache_aops = {
- .releasepage = erofs_managed_cache_releasepage,
+ .release_folio = erofs_managed_cache_release_folio,
.invalidate_folio = erofs_managed_cache_invalidate_folio,
};
@@ -577,6 +665,51 @@ static int erofs_init_managed_cache(struct super_block *sb)
static int erofs_init_managed_cache(struct super_block *sb) { return 0; }
#endif
+static struct inode *erofs_nfs_get_inode(struct super_block *sb,
+ u64 ino, u32 generation)
+{
+ return erofs_iget(sb, ino);
+}
+
+static struct dentry *erofs_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+ erofs_nfs_get_inode);
+}
+
+static struct dentry *erofs_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+ erofs_nfs_get_inode);
+}
+
+static struct dentry *erofs_get_parent(struct dentry *child)
+{
+ erofs_nid_t nid;
+ unsigned int d_type;
+ int err;
+
+ err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type);
+ if (err)
+ return ERR_PTR(err);
+ return d_obtain_alias(erofs_iget(child->d_sb, nid));
+}
+
+static const struct export_operations erofs_export_ops = {
+ .fh_to_dentry = erofs_fh_to_dentry,
+ .fh_to_parent = erofs_fh_to_parent,
+ .get_parent = erofs_get_parent,
+};
+
+static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc)
+{
+ static const struct tree_descr empty_descr = {""};
+
+ return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr);
+}
+
static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode;
@@ -585,11 +718,9 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
int err;
sb->s_magic = EROFS_SUPER_MAGIC;
-
- if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) {
- erofs_err(sb, "failed to set erofs blksize");
- return -EINVAL;
- }
+ sb->s_flags |= SB_RDONLY | SB_NOATIME;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_op = &erofs_sops;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -597,10 +728,33 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_fs_info = sbi;
sbi->opt = ctx->opt;
- sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->dax_part_off);
+ ctx->opt.fsid = NULL;
+ ctx->opt.domain_id = NULL;
sbi->devs = ctx->devs;
ctx->devs = NULL;
+ if (erofs_is_fscache_mode(sb)) {
+ sb->s_blocksize = EROFS_BLKSIZ;
+ sb->s_blocksize_bits = LOG_BLOCK_SIZE;
+
+ err = erofs_fscache_register_fs(sb);
+ if (err)
+ return err;
+
+ err = super_setup_bdi(sb);
+ if (err)
+ return err;
+ } else {
+ if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) {
+ erofs_err(sb, "failed to set erofs blksize");
+ return -EINVAL;
+ }
+
+ sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
+ &sbi->dax_part_off,
+ NULL, NULL);
+ }
+
err = erofs_read_superblock(sb);
if (err)
return err;
@@ -613,12 +767,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
clear_opt(&sbi->opt, DAX_ALWAYS);
}
}
- sb->s_flags |= SB_RDONLY | SB_NOATIME;
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_time_gran = 1;
- sb->s_op = &erofs_sops;
+ sb->s_time_gran = 1;
sb->s_xattr = erofs_xattr_handlers;
+ sb->s_export_op = &erofs_export_ops;
if (test_opt(&sbi->opt, POSIX_ACL))
sb->s_flags |= SB_POSIXACL;
@@ -630,7 +782,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
#endif
/* get the root inode */
- inode = erofs_iget(sb, ROOT_NID(sbi), true);
+ inode = erofs_iget(sb, ROOT_NID(sbi));
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -659,8 +811,18 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return 0;
}
+static int erofs_fc_anon_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, erofs_fc_fill_pseudo_super);
+}
+
static int erofs_fc_get_tree(struct fs_context *fc)
{
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->opt.fsid)
+ return get_tree_nodev(fc, erofs_fc_fill_super);
+
return get_tree_bdev(fc, erofs_fc_fill_super);
}
@@ -687,9 +849,11 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
{
struct erofs_device_info *dif = ptr;
- fs_put_dax(dif->dax_dev);
+ fs_put_dax(dif->dax_dev, NULL);
if (dif->bdev)
blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
+ erofs_fscache_unregister_cookie(dif->fscache);
+ dif->fscache = NULL;
kfree(dif->path);
kfree(dif);
return 0;
@@ -709,6 +873,8 @@ static void erofs_fc_free(struct fs_context *fc)
struct erofs_fs_context *ctx = fc->fs_private;
erofs_free_dev_context(ctx->devs);
+ kfree(ctx->opt.fsid);
+ kfree(ctx->opt.domain_id);
kfree(ctx);
}
@@ -719,10 +885,21 @@ static const struct fs_context_operations erofs_context_ops = {
.free = erofs_fc_free,
};
+static const struct fs_context_operations erofs_anon_context_ops = {
+ .get_tree = erofs_fc_anon_get_tree,
+};
+
static int erofs_init_fs_context(struct fs_context *fc)
{
- struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ struct erofs_fs_context *ctx;
+
+ /* pseudo mount for anon inodes */
+ if (fc->sb_flags & SB_KERNMOUNT) {
+ fc->ops = &erofs_anon_context_ops;
+ return 0;
+ }
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
@@ -749,14 +926,26 @@ static void erofs_kill_sb(struct super_block *sb)
WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC);
- kill_block_super(sb);
+ /* pseudo mount for anon inodes */
+ if (sb->s_flags & SB_KERNMOUNT) {
+ kill_anon_super(sb);
+ return;
+ }
+
+ if (erofs_is_fscache_mode(sb))
+ kill_anon_super(sb);
+ else
+ kill_block_super(sb);
sbi = EROFS_SB(sb);
if (!sbi)
return;
erofs_free_dev_context(sbi->devs);
- fs_put_dax(sbi->dax_dev);
+ fs_put_dax(sbi->dax_dev, NULL);
+ erofs_fscache_unregister_fs(sb);
+ kfree(sbi->opt.fsid);
+ kfree(sbi->opt.domain_id);
kfree(sbi);
sb->s_fs_info = NULL;
}
@@ -773,15 +962,18 @@ static void erofs_put_super(struct super_block *sb)
#ifdef CONFIG_EROFS_FS_ZIP
iput(sbi->managed_cache);
sbi->managed_cache = NULL;
+ iput(sbi->packed_inode);
+ sbi->packed_inode = NULL;
#endif
+ erofs_fscache_unregister_fs(sb);
}
-static struct file_system_type erofs_fs_type = {
+struct file_system_type erofs_fs_type = {
.owner = THIS_MODULE,
.name = "erofs",
.init_fs_context = erofs_init_fs_context,
.kill_sb = erofs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("erofs");
@@ -857,7 +1049,10 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+ u64 id = 0;
+
+ if (!erofs_is_fscache_mode(sb))
+ id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = sb->s_magic;
buf->f_bsize = EROFS_BLKSIZ;
@@ -902,6 +1097,12 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",dax=always");
if (test_opt(opt, DAX_NEVER))
seq_puts(seq, ",dax=never");
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ if (opt->fsid)
+ seq_printf(seq, ",fsid=%s", opt->fsid);
+ if (opt->domain_id)
+ seq_printf(seq, ",domain_id=%s", opt->domain_id);
+#endif
return 0;
}
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index f3babf1e6608..783bb7b21b51 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -76,6 +76,8 @@ EROFS_ATTR_FEATURE(device_table);
EROFS_ATTR_FEATURE(compr_head2);
EROFS_ATTR_FEATURE(sb_chksum);
EROFS_ATTR_FEATURE(ztailpacking);
+EROFS_ATTR_FEATURE(fragments);
+EROFS_ATTR_FEATURE(dedupe);
static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(zero_padding),
@@ -86,6 +88,8 @@ static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(compr_head2),
ATTR_LIST(sb_chksum),
ATTR_LIST(ztailpacking),
+ ATTR_LIST(fragments),
+ ATTR_LIST(dedupe),
NULL,
};
ATTRIBUTE_GROUPS(erofs_feat);
@@ -201,12 +205,27 @@ static struct kobject erofs_feat = {
int erofs_register_sysfs(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
+ char *name;
+ char *str = NULL;
int err;
+ if (erofs_is_fscache_mode(sb)) {
+ if (sbi->opt.domain_id) {
+ str = kasprintf(GFP_KERNEL, "%s,%s", sbi->opt.domain_id,
+ sbi->opt.fsid);
+ if (!str)
+ return -ENOMEM;
+ name = str;
+ } else {
+ name = sbi->opt.fsid;
+ }
+ } else {
+ name = sb->s_id;
+ }
sbi->s_kobj.kset = &erofs_root;
init_completion(&sbi->s_kobj_unregister);
- err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL,
- "%s", sb->s_id);
+ err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name);
+ kfree(str);
if (err)
goto put_sb_kobj;
return 0;
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index ec9a1d780dc1..46627cb69abe 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -282,7 +282,7 @@ static struct shrinker erofs_shrinker_info = {
int __init erofs_init_shrinker(void)
{
- return register_shrinker(&erofs_shrinker_info);
+ return register_shrinker(&erofs_shrinker_info, "erofs-shrinker");
}
void erofs_exit_shrinker(void)
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index 332462c59f11..0a43c9ee9f8f 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -39,9 +39,7 @@ static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi,
#ifdef CONFIG_EROFS_FS_XATTR
extern const struct xattr_handler erofs_xattr_user_handler;
extern const struct xattr_handler erofs_xattr_trusted_handler;
-#ifdef CONFIG_EROFS_FS_SECURITY
extern const struct xattr_handler erofs_xattr_security_handler;
-#endif
static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx)
{
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 0ed880f42525..559380a535af 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -2,10 +2,12 @@
/*
* Copyright (C) 2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2022 Alibaba Cloud
*/
#include "zdata.h"
#include "compress.h"
#include <linux/prefetch.h>
+#include <linux/psi.h>
#include <trace/events/erofs.h>
@@ -26,6 +28,82 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
_PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
};
+struct z_erofs_bvec_iter {
+ struct page *bvpage;
+ struct z_erofs_bvset *bvset;
+ unsigned int nr, cur;
+};
+
+static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
+{
+ if (iter->bvpage)
+ kunmap_local(iter->bvset);
+ return iter->bvpage;
+}
+
+static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
+{
+ unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
+ /* have to access nextpage in advance, otherwise it will be unmapped */
+ struct page *nextpage = iter->bvset->nextpage;
+ struct page *oldpage;
+
+ DBG_BUGON(!nextpage);
+ oldpage = z_erofs_bvec_iter_end(iter);
+ iter->bvpage = nextpage;
+ iter->bvset = kmap_local_page(nextpage);
+ iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
+ iter->cur = 0;
+ return oldpage;
+}
+
+static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
+ struct z_erofs_bvset_inline *bvset,
+ unsigned int bootstrap_nr,
+ unsigned int cur)
+{
+ *iter = (struct z_erofs_bvec_iter) {
+ .nr = bootstrap_nr,
+ .bvset = (struct z_erofs_bvset *)bvset,
+ };
+
+ while (cur > iter->nr) {
+ cur -= iter->nr;
+ z_erofs_bvset_flip(iter);
+ }
+ iter->cur = cur;
+}
+
+static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
+ struct z_erofs_bvec *bvec,
+ struct page **candidate_bvpage)
+{
+ if (iter->cur == iter->nr) {
+ if (!*candidate_bvpage)
+ return -EAGAIN;
+
+ DBG_BUGON(iter->bvset->nextpage);
+ iter->bvset->nextpage = *candidate_bvpage;
+ z_erofs_bvset_flip(iter);
+
+ iter->bvset->nextpage = NULL;
+ *candidate_bvpage = NULL;
+ }
+ iter->bvset->bvec[iter->cur++] = *bvec;
+ return 0;
+}
+
+static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
+ struct z_erofs_bvec *bvec,
+ struct page **old_bvpage)
+{
+ if (iter->cur == iter->nr)
+ *old_bvpage = z_erofs_bvset_flip(iter);
+ else
+ *old_bvpage = NULL;
+ *bvec = iter->bvset->bvec[iter->cur++];
+}
+
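The iterator above starts on a small inline bvec array and spills into full pages chained through ->nextpage; the (struct z_erofs_bvset *)0 expression is a hand-rolled offsetof() that sizes how many bvecs fit in a spilled page. A rough user-space sketch of the same spill-over scheme, using malloc() in place of page allocation and kmap (all names here are invented):

	#include <stdio.h>
	#include <stdlib.h>
	#include <stddef.h>

	struct bvec { void *page; int offset; unsigned int end; };

	#define SETSZ 64	/* stand-in for PAGE_SIZE */
	struct bvset {
		struct bvset *nextpage;	/* next spill-over set */
		struct bvec bvec[];
	};
	struct bvset_inline { struct bvset *nextpage; struct bvec bvec[2]; };

	#define SPILL_NR ((SETSZ - offsetof(struct bvset, bvec)) / \
			  sizeof(struct bvec))

	struct bv_iter { struct bvset *set; unsigned int nr, cur; };

	static void bv_enqueue(struct bv_iter *it, struct bvec v)
	{
		if (it->cur == it->nr) {	/* set full: chain a new one */
			it->set->nextpage = calloc(1, SETSZ);
			it->set = it->set->nextpage;
			it->nr = SPILL_NR;
			it->cur = 0;
		}
		it->set->bvec[it->cur++] = v;
	}

	int main(void)
	{
		struct bvset_inline boot = { 0 };
		struct bv_iter it = { .set = (struct bvset *)&boot, .nr = 2 };

		for (int i = 0; i < 5; i++)	/* 2 inline, rest spill */
			bv_enqueue(&it, (struct bvec){ NULL, i, 0 });
		/* spilled sets are leaked for brevity */
		printf("cur=%u of %zu in spilled set\n", it.cur,
		       (size_t)SPILL_NR);
		return 0;
	}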
static void z_erofs_destroy_pcluster_pool(void)
{
int i;
@@ -46,7 +124,7 @@ static int z_erofs_create_pcluster_pool(void)
for (pcs = pcluster_pool;
pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
- size = struct_size(a, compressed_pages, pcs->maxpages);
+ size = struct_size(a, compressed_bvecs, pcs->maxpages);
sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
pcs->slab = kmem_cache_create(pcs->name, size, 0,
@@ -150,30 +228,29 @@ int __init z_erofs_init_zip_subsystem(void)
return err;
}
-enum z_erofs_collectmode {
- COLLECT_SECONDARY,
- COLLECT_PRIMARY,
+enum z_erofs_pclustermode {
+ Z_EROFS_PCLUSTER_INFLIGHT,
/*
- * The current collection was the tail of an exist chain, in addition
- * that the previous processed chained collections are all decided to
+ * The current pcluster was the tail of an existing chain; in addition,
+ * the previously processed chained pclusters have all been decided to
* be hooked up to it.
- * A new chain will be created for the remaining collections which are
- * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED,
- * the next collection cannot reuse the whole page safely in
- * the following scenario:
+ * A new chain will be created for the remaining pclusters which are
+ * not processed yet, so unlike Z_EROFS_PCLUSTER_FOLLOWED,
+ * the next pcluster cannot reuse the whole page safely for inplace I/O
+ * in the following scenario:
* ________________________________________________________________
* | tail (partial) page | head (partial) page |
- * | (belongs to the next cl) | (belongs to the current cl) |
- * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
+ * | (belongs to the next pcl) | (belongs to the current pcl) |
+ * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
*/
- COLLECT_PRIMARY_HOOKED,
+ Z_EROFS_PCLUSTER_HOOKED,
/*
- * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it
+ * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
* could be dispatched into bypass queue later due to uptodated managed
* pages. All related online pages cannot be reused for inplace I/O (or
- * pagevec) since it can be directly decoded without I/O submission.
+ * bvpage) since it can be directly decoded without I/O submission.
*/
- COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
+ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
/*
* The current collection has been linked with the owned chain, and
* could also be linked with the remaining collections, which means
@@ -184,40 +261,36 @@ enum z_erofs_collectmode {
* ________________________________________________________________
* | tail (partial) page | head (partial) page |
* | (of the current cl) | (of the previous collection) |
- * | PRIMARY_FOLLOWED or | |
- * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________|
+ * | PCLUSTER_FOLLOWED or | |
+ * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________|
*
* [ (*) the above page can be used as inplace I/O. ]
*/
- COLLECT_PRIMARY_FOLLOWED,
+ Z_EROFS_PCLUSTER_FOLLOWED,
};
struct z_erofs_decompress_frontend {
struct inode *const inode;
struct erofs_map_blocks map;
+ struct z_erofs_bvec_iter biter;
- struct z_erofs_pagevec_ctor vector;
-
+ struct page *candidate_bvpage;
struct z_erofs_pcluster *pcl, *tailpcl;
- struct z_erofs_collection *cl;
- /* a pointer used to pick up inplace I/O pages */
- struct page **icpage_ptr;
z_erofs_next_pcluster_t owned_head;
-
- enum z_erofs_collectmode mode;
+ enum z_erofs_pclustermode mode;
bool readahead;
/* used for applying cache strategy on the fly */
bool backmost;
erofs_off_t headoffset;
+
+ /* a pointer used to pick up inplace I/O pages */
+ unsigned int icur;
};
#define DECOMPRESS_FRONTEND_INIT(__i) { \
.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = COLLECT_PRIMARY_FOLLOWED }
-
-static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
-static DEFINE_MUTEX(z_pagemap_global_lock);
+ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true }
static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
enum z_erofs_cache_alloctype type,
@@ -232,24 +305,21 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
*/
gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
- struct page **pages;
- pgoff_t index;
+ unsigned int i;
- if (fe->mode < COLLECT_PRIMARY_FOLLOWED)
+ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
return;
- pages = pcl->compressed_pages;
- index = pcl->obj.index;
- for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) {
+ for (i = 0; i < pcl->pclusterpages; ++i) {
struct page *page;
compressed_page_t t;
struct page *newpage = NULL;
/* the compressed page was loaded before */
- if (READ_ONCE(*pages))
+ if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
- page = find_get_page(mc, index);
+ page = find_get_page(mc, pcl->obj.index + i);
if (page) {
t = tag_compressed_page_justfound(page);
@@ -270,7 +340,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
}
}
- if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
+ if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL,
+ tagptr_cast_ptr(t)))
continue;
if (page)
@@ -284,7 +355,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
* managed cache since it can be moved to the bypass queue instead.
*/
if (standalone)
- fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
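
The slot claim above is lock-free: a compressed page slot is taken only if it is still NULL, so concurrent cache binding and in-place I/O cannot double-claim it. The same idiom in portable C11 atomics, as a standalone sketch (not kernel code):

	#include <stdatomic.h>
	#include <stdio.h>

	/* claim a NULL slot exactly once, like cmpxchg_relaxed() above */
	static int claim_slot(_Atomic(void *) *slot, void *page)
	{
		void *expected = NULL;

		return atomic_compare_exchange_strong_explicit(slot,
				&expected, page,
				memory_order_relaxed, memory_order_relaxed);
	}

	int main(void)
	{
		_Atomic(void *) slot = NULL;
		int a = claim_slot(&slot, (void *)0x1);
		int b = claim_slot(&slot, (void *)0x2);	/* loses: taken */

		printf("first=%d second=%d\n", a, b);	/* first=1 second=0 */
		return 0;
	}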
/* called by erofs_shrinker to get rid of all compressed_pages */
@@ -301,7 +372,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
* therefore no need to worry about available decompression users.
*/
for (i = 0; i < pcl->pclusterpages; ++i) {
- struct page *page = pcl->compressed_pages[i];
+ struct page *page = pcl->compressed_bvecs[i].page;
if (!page)
continue;
@@ -314,7 +385,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
continue;
/* barrier is implied in the following 'unlock_page' */
- WRITE_ONCE(pcl->compressed_pages[i], NULL);
+ WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
detach_page_private(page);
unlock_page(page);
}
@@ -324,56 +395,59 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
int erofs_try_to_free_cached_page(struct page *page)
{
struct z_erofs_pcluster *const pcl = (void *)page_private(page);
- int ret = 0; /* 0 - busy */
+ int ret, i;
- if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
- unsigned int i;
+ if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1))
+ return 0;
- DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
- for (i = 0; i < pcl->pclusterpages; ++i) {
- if (pcl->compressed_pages[i] == page) {
- WRITE_ONCE(pcl->compressed_pages[i], NULL);
- ret = 1;
- break;
- }
+ ret = 0;
+ DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+ for (i = 0; i < pcl->pclusterpages; ++i) {
+ if (pcl->compressed_bvecs[i].page == page) {
+ WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
+ ret = 1;
+ break;
}
- erofs_workgroup_unfreeze(&pcl->obj, 1);
-
- if (ret)
- detach_page_private(page);
}
+ erofs_workgroup_unfreeze(&pcl->obj, 1);
+ if (ret)
+ detach_page_private(page);
return ret;
}
-/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
- struct page *page)
+ struct z_erofs_bvec *bvec)
{
struct z_erofs_pcluster *const pcl = fe->pcl;
- while (fe->icpage_ptr > pcl->compressed_pages)
- if (!cmpxchg(--fe->icpage_ptr, NULL, page))
+ while (fe->icur > 0) {
+ if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
+ NULL, bvec->page)) {
+ pcl->compressed_bvecs[fe->icur] = *bvec;
return true;
+ }
+ }
return false;
}
-/* callers must be with collection lock held */
+/* callers must hold the pcluster lock */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
- struct page *page, enum z_erofs_page_type type,
- bool pvec_safereuse)
+ struct z_erofs_bvec *bvec, bool exclusive)
{
int ret;
- /* give priority for inplaceio */
- if (fe->mode >= COLLECT_PRIMARY &&
- type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
- z_erofs_try_inplace_io(fe, page))
- return 0;
-
- ret = z_erofs_pagevec_enqueue(&fe->vector, page, type,
- pvec_safereuse);
- fe->cl->vcnt += (unsigned int)ret;
- return ret ? 0 : -EAGAIN;
+ if (exclusive) {
+ /* give priority to in-place I/O: use file pages first */
+ if (z_erofs_try_inplace_io(fe, bvec))
+ return 0;
+ /* otherwise, check if it can be used as a bvpage */
+ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
+ !fe->candidate_bvpage)
+ fe->candidate_bvpage = bvec->page;
+ }
+ ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage);
+ fe->pcl->vcnt += (ret >= 0);
+ return ret;
}
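
z_erofs_bvec_enqueue() deliberately fails with -EAGAIN when the bvset is full and no candidate bvpage has been staged; the caller (see the retry: label in z_erofs_do_read_page() below) allocates a short-lived page and retries. A toy, self-contained model of that contract (simplified: the old entries are kept in a single spill pointer rather than a page chain; all names invented):

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define NR 2
	struct ring { int v[NR]; unsigned int cur; void *spill; };

	/* refuse to spill unless the caller staged a spare buffer */
	static int try_enqueue(struct ring *r, int val, void **candidate)
	{
		if (r->cur == NR) {
			if (!*candidate)
				return -EAGAIN;
			r->spill = *candidate;	/* chain the staged buffer */
			*candidate = NULL;
			r->cur = 0;		/* start filling the new set */
		}
		r->v[r->cur++] = val;
		return 0;
	}

	int main(void)
	{
		struct ring r = { 0 };
		void *candidate = NULL;

		for (int i = 0; i < 3; i++)
			while (try_enqueue(&r, i, &candidate) == -EAGAIN)
				candidate = malloc(64);	/* stage and retry */
		printf("spilled=%s\n", r.spill ? "yes" : "no");
		return 0;
	}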
static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
@@ -386,7 +460,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
*owned_head) == Z_EROFS_PCLUSTER_NIL) {
*owned_head = &pcl->next;
/* so we can attach this pcluster to our submission chain. */
- f->mode = COLLECT_PRIMARY_FOLLOWED;
+ f->mode = Z_EROFS_PCLUSTER_FOLLOWED;
return;
}
@@ -394,72 +468,23 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
* type 2, link to the end of an existing open chain, be careful
* that its submission is controlled by the original attached chain.
*/
- if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
+ if (*owned_head != &pcl->next && pcl != f->tailpcl &&
+ cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
*owned_head) == Z_EROFS_PCLUSTER_TAIL) {
*owned_head = Z_EROFS_PCLUSTER_TAIL;
- f->mode = COLLECT_PRIMARY_HOOKED;
+ f->mode = Z_EROFS_PCLUSTER_HOOKED;
f->tailpcl = NULL;
return;
}
/* type 3, it belongs to a chain, but it isn't the end of the chain */
- f->mode = COLLECT_PRIMARY;
-}
-
-static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe,
- struct inode *inode,
- struct erofs_map_blocks *map)
-{
- struct z_erofs_pcluster *pcl = fe->pcl;
- struct z_erofs_collection *cl;
- unsigned int length;
-
- /* to avoid unexpected loop formed by corrupted images */
- if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
-
- cl = z_erofs_primarycollection(pcl);
- if (cl->pageofs != (map->m_la & ~PAGE_MASK)) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
-
- length = READ_ONCE(pcl->length);
- if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
- if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- } else {
- unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;
-
- if (map->m_flags & EROFS_MAP_FULL_MAPPED)
- llen |= Z_EROFS_PCLUSTER_FULL_LENGTH;
-
- while (llen > length &&
- length != cmpxchg_relaxed(&pcl->length, length, llen)) {
- cpu_relax();
- length = READ_ONCE(pcl->length);
- }
- }
- mutex_lock(&cl->lock);
- /* used to check tail merging loop due to corrupted images */
- if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
- fe->tailpcl = pcl;
-
- z_erofs_try_to_claim_pcluster(fe);
- fe->cl = cl;
- return 0;
+ f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
}
-static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe,
- struct inode *inode,
- struct erofs_map_blocks *map)
+static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
{
+ struct erofs_map_blocks *map = &fe->map;
bool ztailpacking = map->m_flags & EROFS_MAP_META;
struct z_erofs_pcluster *pcl;
- struct z_erofs_collection *cl;
struct erofs_workgroup *grp;
int err;
@@ -476,23 +501,20 @@ static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe,
atomic_set(&pcl->obj.refcount, 1);
pcl->algorithmformat = map->m_algorithmformat;
- pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
- (map->m_flags & EROFS_MAP_FULL_MAPPED ?
- Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
+ pcl->length = 0;
+ pcl->partial = true;
/* new pclusters should be claimed as type 1, primary and followed */
pcl->next = fe->owned_head;
- fe->mode = COLLECT_PRIMARY_FOLLOWED;
-
- cl = z_erofs_primarycollection(pcl);
- cl->pageofs = map->m_la & ~PAGE_MASK;
+ pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
/*
* lock all primary followed works before visible to others
* and mutex_trylock *never* fails for a new pcluster.
*/
- mutex_init(&cl->lock);
- DBG_BUGON(!mutex_trylock(&cl->lock));
+ mutex_init(&pcl->lock);
+ DBG_BUGON(!mutex_trylock(&pcl->lock));
if (ztailpacking) {
pcl->obj.index = 0; /* which indicates ztailpacking */
@@ -501,7 +523,7 @@ static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe,
} else {
pcl->obj.index = map->m_pa >> PAGE_SHIFT;
- grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj);
+ grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
if (IS_ERR(grp)) {
err = PTR_ERR(grp);
goto err_out;
@@ -519,60 +541,55 @@ static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe,
fe->tailpcl = pcl;
fe->owned_head = &pcl->next;
fe->pcl = pcl;
- fe->cl = cl;
return 0;
err_out:
- mutex_unlock(&cl->lock);
+ mutex_unlock(&pcl->lock);
z_erofs_free_pcluster(pcl);
return err;
}
-static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe,
- struct inode *inode,
- struct erofs_map_blocks *map)
+static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
{
- struct erofs_workgroup *grp;
+ struct erofs_map_blocks *map = &fe->map;
+ struct erofs_workgroup *grp = NULL;
int ret;
- DBG_BUGON(fe->cl);
+ DBG_BUGON(fe->pcl);
- /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */
+ /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
- if (map->m_flags & EROFS_MAP_META) {
- if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- goto tailpacking;
+ if (!(map->m_flags & EROFS_MAP_META)) {
+ grp = erofs_find_workgroup(fe->inode->i_sb,
+ map->m_pa >> PAGE_SHIFT);
+ } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
}
- grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
if (grp) {
fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ ret = -EEXIST;
} else {
-tailpacking:
- ret = z_erofs_register_collection(fe, inode, map);
- if (!ret)
- goto out;
- if (ret != -EEXIST)
- return ret;
+ ret = z_erofs_register_pcluster(fe);
}
- ret = z_erofs_lookup_collection(fe, inode, map);
- if (ret) {
- erofs_workgroup_put(&fe->pcl->obj);
+ if (ret == -EEXIST) {
+ mutex_lock(&fe->pcl->lock);
+ /* used to check tail merging loop due to corrupted images */
+ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ fe->tailpcl = fe->pcl;
+
+ z_erofs_try_to_claim_pcluster(fe);
+ } else if (ret) {
return ret;
}
-
-out:
- z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS,
- fe->cl->pagevec, fe->cl->vcnt);
+ z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
+ Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
/* since file-backed online pages are traversed in reverse order */
- fe->icpage_ptr = fe->pcl->compressed_pages +
- z_erofs_pclusterpages(fe->pcl);
+ fe->icur = z_erofs_pclusterpages(fe->pcl);
return 0;
}
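
The rewritten lookup path folds "found an existing pcluster" into the same flow as registration by borrowing -EEXIST as an internal result code, so both paths converge on a single claim step. A toy model of that flow (invented names, fixed-size table):

	#include <errno.h>
	#include <stdio.h>

	#define SLOTS 8
	static int table[SLOTS];	/* 0 means an empty slot */

	static int find(int key)
	{
		for (int i = 0; i < SLOTS; i++)
			if (table[i] == key)
				return 1;
		return 0;
	}

	static int reg(int key)
	{
		for (int i = 0; i < SLOTS; i++)
			if (!table[i]) {
				table[i] = key;
				return 0;
			}
		return -ENOSPC;
	}

	static int begin(int key)
	{
		int ret = find(key) ? -EEXIST : reg(key);

		if (ret == -EEXIST)		/* reuse: lock and claim */
			printf("key %d: claim existing\n", key);
		else if (ret)
			return ret;
		else
			printf("key %d: registered new\n", key);
		return 0;
	}

	int main(void)
	{
		begin(42);	/* registered new */
		begin(42);	/* claim existing */
		return 0;
	}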
@@ -582,48 +599,41 @@ out:
*/
static void z_erofs_rcu_callback(struct rcu_head *head)
{
- struct z_erofs_collection *const cl =
- container_of(head, struct z_erofs_collection, rcu);
-
- z_erofs_free_pcluster(container_of(cl, struct z_erofs_pcluster,
- primary_collection));
+ z_erofs_free_pcluster(container_of(head,
+ struct z_erofs_pcluster, rcu));
}
void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
{
struct z_erofs_pcluster *const pcl =
container_of(grp, struct z_erofs_pcluster, obj);
- struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl);
- call_rcu(&cl->rcu, z_erofs_rcu_callback);
-}
-
-static void z_erofs_collection_put(struct z_erofs_collection *cl)
-{
- struct z_erofs_pcluster *const pcl =
- container_of(cl, struct z_erofs_pcluster, primary_collection);
-
- erofs_workgroup_put(&pcl->obj);
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}
static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
{
- struct z_erofs_collection *cl = fe->cl;
+ struct z_erofs_pcluster *pcl = fe->pcl;
- if (!cl)
+ if (!pcl)
return false;
- z_erofs_pagevec_ctor_exit(&fe->vector, false);
- mutex_unlock(&cl->lock);
+ z_erofs_bvec_iter_end(&fe->biter);
+ mutex_unlock(&pcl->lock);
+
+ if (fe->candidate_bvpage) {
+ DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage));
+ fe->candidate_bvpage = NULL;
+ }
/*
* if all pending pages are added, don't hold its reference
* any longer if the pcluster isn't hosted by ourselves.
*/
- if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
- z_erofs_collection_put(cl);
+ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
+ erofs_workgroup_put(&pcl->obj);
- fe->cl = NULL;
+ fe->pcl = NULL;
return true;
}
@@ -641,6 +651,35 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
la < fe->headoffset;
}
+static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
+ struct page *page, unsigned int pageofs,
+ unsigned int len)
+{
+ struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ u8 *src, *dst;
+ unsigned int i, cnt;
+
+ pos += EROFS_I(inode)->z_fragmentoff;
+ for (i = 0; i < len; i += cnt) {
+ cnt = min_t(unsigned int, len - i,
+ EROFS_BLKSIZ - erofs_blkoff(pos));
+ src = erofs_bread(&buf, packed_inode,
+ erofs_blknr(pos), EROFS_KMAP);
+ if (IS_ERR(src)) {
+ erofs_put_metabuf(&buf);
+ return PTR_ERR(src);
+ }
+
+ dst = kmap_local_page(page);
+ memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt);
+ kunmap_local(dst);
+ pos += cnt;
+ }
+ erofs_put_metabuf(&buf);
+ return 0;
+}
+
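The copy loop in z_erofs_read_fragment() may cross filesystem block boundaries, so each round is capped at the distance to the next block edge. The same chunking arithmetic as a runnable sketch (assuming 4 KiB blocks; erofs_bread() is replaced by a printf):

	#include <stdio.h>

	#define BLKSIZ 4096u

	static void walk_chunks(unsigned long long pos, unsigned int len)
	{
		unsigned int i, cnt;

		for (i = 0; i < len; i += cnt) {
			unsigned int blkoff = pos % BLKSIZ;

			cnt = len - i < BLKSIZ - blkoff ?
			      len - i : BLKSIZ - blkoff;
			printf("copy %u bytes from block %llu offset %u\n",
			       cnt, pos / BLKSIZ, blkoff);
			pos += cnt;
		}
	}

	int main(void)
	{
		walk_chunks(4000, 300);	/* splits into 96 + 204 bytes */
		return 0;
	}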
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct page *page, struct page **pagepool)
{
@@ -648,11 +687,10 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct erofs_map_blocks *const map = &fe->map;
const loff_t offset = page_offset(page);
- bool tight = true;
+ bool tight = true, exclusive;
enum z_erofs_cache_alloctype cache_strategy;
- enum z_erofs_page_type page_type;
- unsigned int cur, end, spiltted, index;
+ unsigned int cur, end, spiltted;
int err = 0;
/* register locked file pages as online pages in pack */
@@ -663,34 +701,30 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
repeat:
cur = end - 1;
- /* lucky, within the range of the current map_blocks */
- if (offset + cur >= map->m_la &&
- offset + cur < map->m_la + map->m_llen) {
- /* didn't get a valid collection previously (very rare) */
- if (!fe->cl)
- goto restart_now;
- goto hitted;
- }
-
- /* go ahead the next map_blocks */
- erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur);
-
- if (z_erofs_collector_end(fe))
- fe->backmost = false;
+ if (offset + cur < map->m_la ||
+ offset + cur >= map->m_la + map->m_llen) {
+ erofs_dbg("out-of-range map @ pos %llu", offset + cur);
- map->m_la = offset + cur;
- map->m_llen = 0;
- err = z_erofs_map_blocks_iter(inode, map, 0);
- if (err)
- goto err_out;
+ if (z_erofs_collector_end(fe))
+ fe->backmost = false;
+ map->m_la = offset + cur;
+ map->m_llen = 0;
+ err = z_erofs_map_blocks_iter(inode, map, 0);
+ if (err)
+ goto out;
+ } else {
+ if (fe->pcl)
+ goto hitted;
+ /* didn't get a valid pcluster previously (very rare) */
+ }
-restart_now:
- if (!(map->m_flags & EROFS_MAP_MAPPED))
+ if (!(map->m_flags & EROFS_MAP_MAPPED) ||
+ map->m_flags & EROFS_MAP_FRAGMENT)
goto hitted;
- err = z_erofs_collector_begin(fe, inode, map);
+ err = z_erofs_collector_begin(fe);
if (err)
- goto err_out;
+ goto out;
if (z_erofs_is_inline_pcluster(fe->pcl)) {
void *mp;
@@ -701,11 +735,12 @@ restart_now:
err = PTR_ERR(mp);
erofs_err(inode->i_sb,
"failed to get inline page, err %d", err);
- goto err_out;
+ goto out;
}
get_page(fe->map.buf.page);
- WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page);
- fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+ WRITE_ONCE(fe->pcl->compressed_bvecs[0].page,
+ fe->map.buf.page);
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
} else {
/* bind cache first when cached decompression is preferred */
if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
@@ -721,77 +756,95 @@ hitted:
* Ensure the current partial page belongs to this submit chain rather
* than other concurrent submit chains or the noio(bypass) chain since
* those chains are handled asynchronously thus the page cannot be used
- * for inplace I/O or pagevec (should be processed in strict order.)
+ * for inplace I/O or bvpage (should be processed in a strict order).
*/
- tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED &&
- fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
+ tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED &&
+ fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
cur = end - min_t(unsigned int, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
zero_user_segment(page, cur, end);
goto next_part;
}
+ if (map->m_flags & EROFS_MAP_FRAGMENT) {
+ unsigned int pageofs, skip, len;
- /* let's derive page type */
- page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
- (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
- (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
- Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+ if (offset > map->m_la) {
+ pageofs = 0;
+ skip = offset - map->m_la;
+ } else {
+ pageofs = map->m_la & ~PAGE_MASK;
+ skip = 0;
+ }
+ len = min_t(unsigned int, map->m_llen - skip, end - cur);
+ err = z_erofs_read_fragment(inode, skip, page, pageofs, len);
+ if (err)
+ goto out;
+ ++spiltted;
+ tight = false;
+ goto next_part;
+ }
+ exclusive = (!cur && (!spiltted || tight));
if (cur)
- tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED);
+ tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
retry:
- err = z_erofs_attach_page(fe, page, page_type,
- fe->mode >= COLLECT_PRIMARY_FOLLOWED);
- /* should allocate an additional short-lived page for pagevec */
- if (err == -EAGAIN) {
- struct page *const newpage =
- alloc_page(GFP_NOFS | __GFP_NOFAIL);
-
- set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE);
- err = z_erofs_attach_page(fe, newpage,
- Z_EROFS_PAGE_TYPE_EXCLUSIVE, true);
- if (!err)
- goto retry;
+ err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
+ .page = page,
+ .offset = offset - map->m_la,
+ .end = end,
+ }), exclusive);
+ /* should allocate an additional short-lived page for bvset */
+ if (err == -EAGAIN && !fe->candidate_bvpage) {
+ fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL);
+ set_page_private(fe->candidate_bvpage,
+ Z_EROFS_SHORTLIVED_PAGE);
+ goto retry;
}
- if (err)
- goto err_out;
-
- index = page->index - (map->m_la >> PAGE_SHIFT);
-
- z_erofs_onlinepage_fixup(page, index, true);
+ if (err) {
+ DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage);
+ goto out;
+ }
+ z_erofs_onlinepage_split(page);
/* bump up the number of spiltted parts of a page */
++spiltted;
- /* also update nr_pages */
- fe->cl->nr_pages = max_t(pgoff_t, fe->cl->nr_pages, index + 1);
+ if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
+ fe->pcl->multibases = true;
+
+ if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
+ !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
+ fe->pcl->length == map->m_llen)
+ fe->pcl->partial = false;
+ if (fe->pcl->length < offset + end - map->m_la) {
+ fe->pcl->length = offset + end - map->m_la;
+ fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ }
next_part:
- /* can be used for verification */
+ /* shorten the remaining extent to update progress */
map->m_llen = offset + cur - map->m_la;
+ map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
end = cur;
if (end > 0)
goto repeat;
out:
+ if (err)
+ z_erofs_page_mark_eio(page);
z_erofs_onlinepage_endio(page);
erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu",
__func__, page, spiltted, map->m_llen);
return err;
-
- /* if some error occurred while processing this page */
-err_out:
- SetPageError(page);
- goto out;
}
static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi,
unsigned int readahead_pages)
{
- /* auto: enable for readpage, disable for readahead */
+ /* auto: enable for read_folio, disable for readahead */
if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
!readahead_pages)
return true;
@@ -808,99 +861,137 @@ static bool z_erofs_page_is_invalidated(struct page *page)
return !page->mapping && !z_erofs_is_shortlived_page(page);
}
-static int z_erofs_decompress_pcluster(struct super_block *sb,
- struct z_erofs_pcluster *pcl,
- struct page **pagepool)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
- unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
- struct z_erofs_pagevec_ctor ctor;
- unsigned int i, inputsize, outputsize, llen, nr_pages;
- struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
- struct page **pages, **compressed_pages, *page;
-
- enum z_erofs_page_type page_type;
- bool overlapped, partial;
- struct z_erofs_collection *cl;
- int err;
-
- might_sleep();
- cl = z_erofs_primarycollection(pcl);
- DBG_BUGON(!READ_ONCE(cl->nr_pages));
+struct z_erofs_decompress_backend {
+ struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
+ struct super_block *sb;
+ struct z_erofs_pcluster *pcl;
- mutex_lock(&cl->lock);
- nr_pages = cl->nr_pages;
+ /* pages with the longest decompressed length for deduplication */
+ struct page **decompressed_pages;
+ /* pages to keep the compressed data */
+ struct page **compressed_pages;
- if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) {
- pages = pages_onstack;
- } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES &&
- mutex_trylock(&z_pagemap_global_lock)) {
- pages = z_pagemap_global;
- } else {
- gfp_t gfp_flags = GFP_KERNEL;
+ struct list_head decompressed_secondary_bvecs;
+ struct page **pagepool;
+ unsigned int onstack_used, nr_pages;
+};
- if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES)
- gfp_flags |= __GFP_NOFAIL;
+struct z_erofs_bvec_item {
+ struct z_erofs_bvec bvec;
+ struct list_head list;
+};
- pages = kvmalloc_array(nr_pages, sizeof(struct page *),
- gfp_flags);
+static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
+ struct z_erofs_bvec *bvec)
+{
+ struct z_erofs_bvec_item *item;
- /* fallback to global pagemap for the lowmem scenario */
- if (!pages) {
- mutex_lock(&z_pagemap_global_lock);
- pages = z_pagemap_global;
- }
- }
+ if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) {
+ unsigned int pgnr;
+ struct page *oldpage;
- for (i = 0; i < nr_pages; ++i)
- pages[i] = NULL;
+ pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
+ DBG_BUGON(pgnr >= be->nr_pages);
+ oldpage = be->decompressed_pages[pgnr];
+ be->decompressed_pages[pgnr] = bvec->page;
- err = 0;
- z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS,
- cl->pagevec, 0);
+ if (!oldpage)
+ return;
+ }
- for (i = 0; i < cl->vcnt; ++i) {
- unsigned int pagenr;
+ /* (cold path) one pcluster is requested multiple times */
+ item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL);
+ item->bvec = *bvec;
+ list_add(&item->list, &be->decompressed_secondary_bvecs);
+}
- page = z_erofs_pagevec_dequeue(&ctor, &page_type);
+static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
+ int err)
+{
+ unsigned int off0 = be->pcl->pageofs_out;
+ struct list_head *p, *n;
+
+ list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) {
+ struct z_erofs_bvec_item *bvi;
+ unsigned int end, cur;
+ void *dst, *src;
+
+ bvi = container_of(p, struct z_erofs_bvec_item, list);
+ cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0;
+ end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset,
+ bvi->bvec.end);
+ dst = kmap_local_page(bvi->bvec.page);
+ while (cur < end) {
+ unsigned int pgnr, scur, len;
+
+ pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT;
+ DBG_BUGON(pgnr >= be->nr_pages);
+
+ scur = bvi->bvec.offset + cur -
+ ((pgnr << PAGE_SHIFT) - off0);
+ len = min_t(unsigned int, end - cur, PAGE_SIZE - scur);
+ if (!be->decompressed_pages[pgnr]) {
+ err = -EFSCORRUPTED;
+ cur += len;
+ continue;
+ }
+ src = kmap_local_page(be->decompressed_pages[pgnr]);
+ memcpy(dst + cur, src + scur, len);
+ kunmap_local(src);
+ cur += len;
+ }
+ kunmap_local(dst);
+ if (err)
+ z_erofs_page_mark_eio(bvi->bvec.page);
+ z_erofs_onlinepage_endio(bvi->bvec.page);
+ list_del(p);
+ kfree(bvi);
+ }
+}
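
z_erofs_fill_other_copies() services duplicate requests by copying from the primary decompressed pages, translating each buffer offset into a (page index, in-page offset) pair relative to pageofs_out. That translation in isolation (4 KiB pages assumed):

	#include <stdio.h>

	#define PGSZ 4096u

	static void locate(unsigned int off, unsigned int pageofs_out)
	{
		unsigned int pgnr = (off + pageofs_out) / PGSZ;
		unsigned int scur = off + pageofs_out - pgnr * PGSZ;

		printf("offset %u -> page %u, in-page %u\n", off, pgnr, scur);
	}

	int main(void)
	{
		locate(0, 512);		/* page 0, in-page 512 */
		locate(8000, 512);	/* page 2, in-page 320 */
		return 0;
	}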
- /* all pages in pagevec ought to be valid */
- DBG_BUGON(!page);
- DBG_BUGON(z_erofs_page_is_invalidated(page));
+static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
+{
+ struct z_erofs_pcluster *pcl = be->pcl;
+ struct z_erofs_bvec_iter biter;
+ struct page *old_bvpage;
+ int i;
- if (z_erofs_put_shortlivedpage(pagepool, page))
- continue;
+ z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
+ for (i = 0; i < pcl->vcnt; ++i) {
+ struct z_erofs_bvec bvec;
- if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
- pagenr = 0;
- else
- pagenr = z_erofs_onlinepage_index(page);
+ z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);
- DBG_BUGON(pagenr >= nr_pages);
+ if (old_bvpage)
+ z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
- /*
- * currently EROFS doesn't support multiref(dedup),
- * so here erroring out one multiref page.
- */
- if (pages[pagenr]) {
- DBG_BUGON(1);
- SetPageError(pages[pagenr]);
- z_erofs_onlinepage_endio(pages[pagenr]);
- err = -EFSCORRUPTED;
- }
- pages[pagenr] = page;
+ DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
+ z_erofs_do_decompressed_bvec(be, &bvec);
}
- z_erofs_pagevec_ctor_exit(&ctor, true);
- overlapped = false;
- compressed_pages = pcl->compressed_pages;
+ old_bvpage = z_erofs_bvec_iter_end(&biter);
+ if (old_bvpage)
+ z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
+}
+static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
+ bool *overlapped)
+{
+ struct z_erofs_pcluster *pcl = be->pcl;
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ int i, err = 0;
+
+ *overlapped = false;
for (i = 0; i < pclusterpages; ++i) {
- unsigned int pagenr;
+ struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
+ struct page *page = bvec->page;
- page = compressed_pages[i];
- /* all compressed pages ought to be valid */
- DBG_BUGON(!page);
+ /* compressed pages ought to be present before decompressing */
+ if (!page) {
+ DBG_BUGON(1);
+ continue;
+ }
+ be->compressed_pages[i] = page;
if (z_erofs_is_inline_pcluster(pcl)) {
if (!PageUptodate(page))
@@ -910,139 +1001,159 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
DBG_BUGON(z_erofs_page_is_invalidated(page));
if (!z_erofs_is_shortlived_page(page)) {
- if (erofs_page_is_managed(sbi, page)) {
+ if (erofs_page_is_managed(EROFS_SB(be->sb), page)) {
if (!PageUptodate(page))
err = -EIO;
continue;
}
-
- /*
- * only if non-head page can be selected
- * for inplace decompression
- */
- pagenr = z_erofs_onlinepage_index(page);
-
- DBG_BUGON(pagenr >= nr_pages);
- if (pages[pagenr]) {
- DBG_BUGON(1);
- SetPageError(pages[pagenr]);
- z_erofs_onlinepage_endio(pages[pagenr]);
- err = -EFSCORRUPTED;
- }
- pages[pagenr] = page;
-
- overlapped = true;
- }
-
- /* PG_error needs checking for all non-managed pages */
- if (PageError(page)) {
- DBG_BUGON(PageUptodate(page));
- err = -EIO;
+ z_erofs_do_decompressed_bvec(be, bvec);
+ *overlapped = true;
}
}
if (err)
- goto out;
+ return err;
+ return 0;
+}
- llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
- if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) {
- outputsize = llen;
- partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH);
- } else {
- outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs;
- partial = true;
+static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
+ int err)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
+ struct z_erofs_pcluster *pcl = be->pcl;
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ unsigned int i, inputsize;
+ int err2;
+ struct page *page;
+ bool overlapped;
+
+ mutex_lock(&pcl->lock);
+ be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
+
+ /* allocate (de)compressed page arrays if cannot be kept on stack */
+ be->decompressed_pages = NULL;
+ be->compressed_pages = NULL;
+ be->onstack_used = 0;
+ if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) {
+ be->decompressed_pages = be->onstack_pages;
+ be->onstack_used = be->nr_pages;
+ memset(be->decompressed_pages, 0,
+ sizeof(struct page *) * be->nr_pages);
}
+ if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES)
+ be->compressed_pages = be->onstack_pages + be->onstack_used;
+
+ if (!be->decompressed_pages)
+ be->decompressed_pages =
+ kvcalloc(be->nr_pages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_NOFAIL);
+ if (!be->compressed_pages)
+ be->compressed_pages =
+ kvcalloc(pclusterpages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_NOFAIL);
+
+ z_erofs_parse_out_bvecs(be);
+ err2 = z_erofs_parse_in_bvecs(be, &overlapped);
+ if (err2)
+ err = err2;
+ if (err)
+ goto out;
+
if (z_erofs_is_inline_pcluster(pcl))
inputsize = pcl->tailpacking_size;
else
inputsize = pclusterpages * PAGE_SIZE;
err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
- .sb = sb,
- .in = compressed_pages,
- .out = pages,
+ .sb = be->sb,
+ .in = be->compressed_pages,
+ .out = be->decompressed_pages,
.pageofs_in = pcl->pageofs_in,
- .pageofs_out = cl->pageofs,
+ .pageofs_out = pcl->pageofs_out,
.inputsize = inputsize,
- .outputsize = outputsize,
+ .outputsize = pcl->length,
.alg = pcl->algorithmformat,
.inplace_io = overlapped,
- .partial_decoding = partial
- }, pagepool);
+ .partial_decoding = pcl->partial,
+ .fillgaps = pcl->multibases,
+ }, be->pagepool);
out:
/* must handle all compressed pages before actual file pages */
if (z_erofs_is_inline_pcluster(pcl)) {
- page = compressed_pages[0];
- WRITE_ONCE(compressed_pages[0], NULL);
+ page = pcl->compressed_bvecs[0].page;
+ WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
put_page(page);
} else {
for (i = 0; i < pclusterpages; ++i) {
- page = compressed_pages[i];
+ page = pcl->compressed_bvecs[i].page;
if (erofs_page_is_managed(sbi, page))
continue;
/* recycle all individual short-lived pages */
- (void)z_erofs_put_shortlivedpage(pagepool, page);
- WRITE_ONCE(compressed_pages[i], NULL);
+ (void)z_erofs_put_shortlivedpage(be->pagepool, page);
+ WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
}
}
+ if (be->compressed_pages < be->onstack_pages ||
+ be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
+ kvfree(be->compressed_pages);
+ z_erofs_fill_other_copies(be, err);
- for (i = 0; i < nr_pages; ++i) {
- page = pages[i];
+ for (i = 0; i < be->nr_pages; ++i) {
+ page = be->decompressed_pages[i];
if (!page)
continue;
DBG_BUGON(z_erofs_page_is_invalidated(page));
/* recycle all individual short-lived pages */
- if (z_erofs_put_shortlivedpage(pagepool, page))
+ if (z_erofs_put_shortlivedpage(be->pagepool, page))
continue;
-
- if (err < 0)
- SetPageError(page);
-
+ if (err)
+ z_erofs_page_mark_eio(page);
z_erofs_onlinepage_endio(page);
}
- if (pages == z_pagemap_global)
- mutex_unlock(&z_pagemap_global_lock);
- else if (pages != pages_onstack)
- kvfree(pages);
+ if (be->decompressed_pages != be->onstack_pages)
+ kvfree(be->decompressed_pages);
- cl->nr_pages = 0;
- cl->vcnt = 0;
+ pcl->length = 0;
+ pcl->partial = true;
+ pcl->multibases = false;
+ pcl->bvset.nextpage = NULL;
+ pcl->vcnt = 0;
- /* all cl locks MUST be taken before the following line */
+ /* pcluster lock MUST be taken before the following line */
WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
-
- /* all cl locks SHOULD be released right now */
- mutex_unlock(&cl->lock);
-
- z_erofs_collection_put(cl);
+ mutex_unlock(&pcl->lock);
return err;
}
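
The backend keeps one small on-stack array and carves both page arrays out of it when they fit, falling back to kvcalloc() otherwise; freeing then has to check whether a pointer landed inside the on-stack region. A user-space rendering of that placement logic (calloc stands in for kvcalloc):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define ONSTACK 32	/* stand-in for Z_EROFS_ONSTACK_PAGES */

	int main(void)
	{
		void *onstack[ONSTACK];
		unsigned int nr_out = 20, nr_in = 8;	/* example sizes */
		void **out = NULL, **in = NULL;
		unsigned int used = 0;

		if (nr_out <= ONSTACK) {
			out = onstack;
			used = nr_out;
			memset(out, 0, sizeof(void *) * nr_out);
		}
		if (nr_in + used <= ONSTACK)
			in = onstack + used;
		if (!out)
			out = calloc(nr_out, sizeof(void *));
		if (!in)
			in = calloc(nr_in, sizeof(void *));

		printf("out on %s, in on %s\n",
		       out == onstack ? "stack" : "heap",
		       in >= onstack && in < onstack + ONSTACK ?
		       "stack" : "heap");

		if (out != onstack)
			free(out);
		if (in < onstack || in >= onstack + ONSTACK)
			free(in);
		return 0;
	}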
static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
struct page **pagepool)
{
+ struct z_erofs_decompress_backend be = {
+ .sb = io->sb,
+ .pagepool = pagepool,
+ .decompressed_secondary_bvecs =
+ LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
+ };
z_erofs_next_pcluster_t owned = io->head;
while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
- struct z_erofs_pcluster *pcl;
-
- /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
+ /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
-
- /* no possible that 'owned' equals NULL */
+ /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
- pcl = container_of(owned, struct z_erofs_pcluster, next);
- owned = READ_ONCE(pcl->next);
+ be.pcl = container_of(owned, struct z_erofs_pcluster, next);
+ owned = READ_ONCE(be.pcl->next);
- z_erofs_decompress_pcluster(io->sb, pcl, pagepool);
+ z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
+ erofs_workgroup_put(&be.pcl->obj);
}
}
@@ -1066,12 +1177,8 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
/* wake up the caller thread for sync decompression */
if (sync) {
- unsigned long flags;
-
- spin_lock_irqsave(&io->u.wait.lock, flags);
if (!atomic_add_return(bios, &io->pending_bios))
- wake_up_locked(&io->u.wait);
- spin_unlock_irqrestore(&io->u.wait.lock, flags);
+ complete(&io->u.done);
return;
}
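
Switching from a locked waitqueue to a completion simplifies the last-bio handoff: each endio drops pending_bios and the one that reaches zero completes the queue. A POSIX-threads analogue of the same counting handoff (a condvar in place of struct completion):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
	static int pending;

	static void *bio_endio(void *arg)
	{
		pthread_mutex_lock(&lock);
		if (--pending == 0)	/* last bio completes the queue */
			pthread_cond_signal(&done);
		pthread_mutex_unlock(&lock);
		return NULL;
	}

	int main(void)
	{
		pthread_t t[3];

		pending = 3;	/* expected I/Os, set at submission time */
		for (int i = 0; i < 3; i++)
			pthread_create(&t[i], NULL, bio_endio, NULL);

		pthread_mutex_lock(&lock);	/* wait_for_completion_io() */
		while (pending)
			pthread_cond_wait(&done, &lock);
		pthread_mutex_unlock(&lock);
		puts("all bios completed");

		for (int i = 0; i < 3; i++)
			pthread_join(t[i], NULL);
		return 0;
	}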
@@ -1104,7 +1211,7 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
int justfound;
repeat:
- page = READ_ONCE(pcl->compressed_pages[nr]);
+ page = READ_ONCE(pcl->compressed_bvecs[nr].page);
oldpage = page;
if (!page)
@@ -1120,7 +1227,7 @@ repeat:
* otherwise, it will go inplace I/O path instead.
*/
if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
- WRITE_ONCE(pcl->compressed_pages[nr], page);
+ WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
set_page_private(page, 0);
tocache = true;
goto out_tocache;
@@ -1146,14 +1253,13 @@ repeat:
/* the page is still in manage cache */
if (page->mapping == mc) {
- WRITE_ONCE(pcl->compressed_pages[nr], page);
+ WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
- ClearPageError(page);
if (!PagePrivate(page)) {
/*
* impossible to be !PagePrivate(page) for
* the current restriction as well if
- * the page is already in compressed_pages[].
+ * the page is already in compressed_bvecs[].
*/
DBG_BUGON(!justfound);
@@ -1182,7 +1288,8 @@ repeat:
put_page(page);
out_allocpage:
page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
- if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
+ if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page,
+ oldpage, page)) {
erofs_pagepool_add(pagepool, page);
cond_resched();
goto repeat;
@@ -1217,8 +1324,9 @@ jobqueue_init(struct super_block *sb,
} else {
fg_out:
q = fgq;
- init_waitqueue_head(&fgq->u.wait);
+ init_completion(&fgq->u.done);
atomic_set(&fgq->pending_bios, 0);
+ q->eio = false;
}
q->sb = sb;
q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
@@ -1279,26 +1387,25 @@ static void z_erofs_decompressqueue_endio(struct bio *bio)
DBG_BUGON(PageUptodate(page));
DBG_BUGON(z_erofs_page_is_invalidated(page));
- if (err)
- SetPageError(page);
-
if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
if (!err)
SetPageUptodate(page);
unlock_page(page);
}
}
+ if (err)
+ q->eio = true;
z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
bio_put(bio);
}
-static void z_erofs_submit_queue(struct super_block *sb,
- struct z_erofs_decompress_frontend *f,
+static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
struct page **pagepool,
struct z_erofs_decompressqueue *fgq,
bool *force_fg)
{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
+ struct super_block *sb = f->inode->i_sb;
+ struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
@@ -1308,6 +1415,8 @@ static void z_erofs_submit_queue(struct super_block *sb,
struct block_device *last_bdev;
unsigned int nr_bios = 0;
struct bio *bio = NULL;
+ /* initialize to 1 so that psi_memstall_leave is skipped unless needed */
+ unsigned long pflags = 1;
bi_private = jobqueueset_init(sb, q, fgq, force_fg);
qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
@@ -1350,17 +1459,22 @@ static void z_erofs_submit_queue(struct super_block *sb,
struct page *page;
page = pickup_page_for_submission(pcl, i++, pagepool,
- MNGD_MAPPING(sbi));
+ mc);
if (!page)
continue;
if (bio && (cur != last_index + 1 ||
last_bdev != mdev.m_bdev)) {
submit_bio_retry:
+ if (!pflags)
+ psi_memstall_leave(&pflags);
submit_bio(bio);
bio = NULL;
}
+ if (unlikely(PageWorkingset(page)))
+ psi_memstall_enter(&pflags);
+
if (!bio) {
bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
REQ_OP_READ, GFP_NOIO);
@@ -1388,8 +1502,11 @@ submit_bio_retry:
move_to_bypass_jobqueue(pcl, qtail, owned_head);
} while (owned_head != Z_EROFS_PCLUSTER_TAIL);
- if (bio)
+ if (bio) {
+ if (!pflags)
+ psi_memstall_leave(&pflags);
submit_bio(bio);
+ }
/*
* although background is preferred, no one is pending for submission.
@@ -1402,15 +1519,14 @@ submit_bio_retry:
z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios);
}
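
Note the pflags sentinel above: psi_memstall_enter() records whether the task was already stalled, so initializing pflags to 1 makes the !pflags checks fire only when enter actually ran and took the stall. The pattern in isolation (stall_enter/stall_leave are stand-ins for the psi API):

	#include <stdio.h>

	static void stall_enter(unsigned long *pflags) { *pflags = 0; }
	static void stall_leave(unsigned long *pflags)
	{
		(void)pflags;
		puts("leave");
	}

	static void submit(int stalled)
	{
		unsigned long pflags = 1;	/* sentinel: no stall entered */

		if (stalled)
			stall_enter(&pflags);
		/* ... build and submit the bio ... */
		if (!pflags)			/* only if stall_enter() ran */
			stall_leave(&pflags);
	}

	int main(void)
	{
		submit(0);	/* prints nothing */
		submit(1);	/* prints "leave" */
		return 0;
	}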
-static void z_erofs_runqueue(struct super_block *sb,
- struct z_erofs_decompress_frontend *f,
+static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
struct page **pagepool, bool force_fg)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
- z_erofs_submit_queue(sb, f, pagepool, io, &force_fg);
+ z_erofs_submit_queue(f, pagepool, io, &force_fg);
/* handle bypass queue (no i/o pclusters) immediately */
z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
@@ -1419,8 +1535,7 @@ static void z_erofs_runqueue(struct super_block *sb,
return;
/* wait until all bios are completed */
- io_wait_event(io[JQ_SUBMIT].u.wait,
- !atomic_read(&io[JQ_SUBMIT].pending_bios));
+ wait_for_completion_io(&io[JQ_SUBMIT].u.done);
/* handle synchronous decompress queue in the caller context */
z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
@@ -1470,30 +1585,28 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
struct page *page;
page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
- if (!page)
- goto skip;
-
- if (PageUptodate(page)) {
- unlock_page(page);
+ if (page) {
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ } else {
+ err = z_erofs_do_read_page(f, page, pagepool);
+ if (err)
+ erofs_err(inode->i_sb,
+ "readmore error at page %lu @ nid %llu",
+ index, EROFS_I(inode)->nid);
+ }
put_page(page);
- goto skip;
}
- err = z_erofs_do_read_page(f, page, pagepool);
- if (err)
- erofs_err(inode->i_sb,
- "readmore error at page %lu @ nid %llu",
- index, EROFS_I(inode)->nid);
- put_page(page);
-skip:
if (cur < PAGE_SIZE)
break;
cur = (index << PAGE_SHIFT) - 1;
}
}
-static int z_erofs_readpage(struct file *file, struct page *page)
+static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *const inode = page->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
@@ -1511,7 +1624,7 @@ static int z_erofs_readpage(struct file *file, struct page *page)
(void)z_erofs_collector_end(&f);
/* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(inode->i_sb, &f, &pagepool,
+ z_erofs_runqueue(&f, &pagepool,
z_erofs_get_sync_decompress_policy(sbi, 0));
if (err)
@@ -1560,13 +1673,13 @@ static void z_erofs_readahead(struct readahead_control *rac)
z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
(void)z_erofs_collector_end(&f);
- z_erofs_runqueue(inode->i_sb, &f, &pagepool,
+ z_erofs_runqueue(&f, &pagepool,
z_erofs_get_sync_decompress_policy(sbi, nr_pages));
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&pagepool);
}
const struct address_space_operations z_erofs_aops = {
- .readpage = z_erofs_readpage,
+ .read_folio = z_erofs_read_folio,
.readahead = z_erofs_readahead,
};
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index e043216b545f..e7f04c4fbb81 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -7,10 +7,31 @@
#define __EROFS_FS_ZDATA_H
#include "internal.h"
-#include "zpvec.h"
+#include "tagptr.h"
#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
-#define Z_EROFS_NR_INLINE_PAGEVECS 3
+#define Z_EROFS_INLINE_BVECS 2
+
+/*
+ * let's leave a type here in case another
+ * tagged pointer is introduced later.
+ */
+typedef void *z_erofs_next_pcluster_t;
+
+struct z_erofs_bvec {
+ struct page *page;
+ int offset;
+ unsigned int end;
+};
+
+#define __Z_EROFS_BVSET(name, total) \
+struct name { \
+ /* point to the next page which contains the following bvecs */ \
+ struct page *nextpage; \
+ struct z_erofs_bvec bvec[total]; \
+}
+__Z_EROFS_BVSET(z_erofs_bvset,);
+__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
/*
* Structure fields follow one of the following exclusion rules.
@@ -18,52 +39,36 @@
* I: Modifiable by initialization/destruction paths and read-only
* for everyone else;
*
- * L: Field should be protected by pageset lock;
+ * L: Field should be protected by the pcluster lock;
*
* A: Field should be accessed / updated in atomic for parallelized code.
*/
-struct z_erofs_collection {
+struct z_erofs_pcluster {
+ struct erofs_workgroup obj;
struct mutex lock;
- /* I: page offset of start position of decompression */
- unsigned short pageofs;
+ /* A: point to next chained pcluster or TAILs */
+ z_erofs_next_pcluster_t next;
- /* L: maximum relative page index in pagevec[] */
- unsigned short nr_pages;
+ /* L: the maximum decompression size of this round */
+ unsigned int length;
- /* L: total number of pages in pagevec[] */
+ /* L: total number of bvecs */
unsigned int vcnt;
+ /* I: page offset of start position of decompression */
+ unsigned short pageofs_out;
+
+ /* I: page offset of inline compressed data */
+ unsigned short pageofs_in;
+
union {
- /* L: inline a certain number of pagevecs for bootstrap */
- erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS];
+ /* L: inline a certain number of bvec for bootstrap */
+ struct z_erofs_bvset_inline bvset;
/* I: can be used to free the pcluster by RCU. */
struct rcu_head rcu;
};
-};
-
-#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001
-#define Z_EROFS_PCLUSTER_LENGTH_BIT 1
-
-/*
- * let's leave a type here in case of introducing
- * another tagged pointer later.
- */
-typedef void *z_erofs_next_pcluster_t;
-
-struct z_erofs_pcluster {
- struct erofs_workgroup obj;
- struct z_erofs_collection primary_collection;
-
- /* A: point to next chained pcluster or TAILs */
- z_erofs_next_pcluster_t next;
-
- /* A: lower limit of decompressed length and if full length or not */
- unsigned int length;
-
- /* I: page offset of inline compressed data */
- unsigned short pageofs_in;
union {
/* I: physical cluster size in pages */
@@ -76,11 +81,15 @@ struct z_erofs_pcluster {
/* I: compression algorithm format */
unsigned char algorithmformat;
- /* A: compressed pages (can be cached or inplaced pages) */
- struct page *compressed_pages[];
-};
+ /* L: whether partial decompression or not */
+ bool partial;
-#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection)
+ /* L: indicate several pageofs_outs or not */
+ bool multibases;
+
+ /* A: compressed bvecs (can be cached or inplaced pages) */
+ struct z_erofs_bvec compressed_bvecs[];
+};
/* let's avoid the valid 32-bit kernel addresses */
@@ -97,9 +106,11 @@ struct z_erofs_decompressqueue {
z_erofs_next_pcluster_t head;
union {
- wait_queue_head_t wait;
+ struct completion done;
struct work_struct work;
} u;
+
+ bool eio;
};
static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
@@ -114,38 +125,17 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
return pcl->pclusterpages;
}
-#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2
-#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
-#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS)
-
/*
- * waiters (aka. ongoing_packs): # to unlock the page
- * sub-index: 0 - for partial page, >= 1 full page sub-index
+ * bit 31: I/O error occurred on this page
+ * bit 0 - 30: remaining parts to complete this page
*/
-typedef atomic_t z_erofs_onlinepage_t;
-
-/* type punning */
-union z_erofs_onlinepage_converter {
- z_erofs_onlinepage_t *o;
- unsigned long *v;
-};
-
-static inline unsigned int z_erofs_onlinepage_index(struct page *page)
-{
- union z_erofs_onlinepage_converter u;
-
- DBG_BUGON(!PagePrivate(page));
- u.v = &page_private(page);
-
- return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
-}
+#define Z_EROFS_PAGE_EIO (1 << 31)
static inline void z_erofs_onlinepage_init(struct page *page)
{
union {
- z_erofs_onlinepage_t o;
+ atomic_t o;
unsigned long v;
- /* keep from being unlocked in advance */
} u = { .o = ATOMIC_INIT(1) };
set_page_private(page, u.v);
@@ -153,49 +143,36 @@ static inline void z_erofs_onlinepage_init(struct page *page)
SetPagePrivate(page);
}
-static inline void z_erofs_onlinepage_fixup(struct page *page,
- uintptr_t index, bool down)
+static inline void z_erofs_onlinepage_split(struct page *page)
{
- union z_erofs_onlinepage_converter u = { .v = &page_private(page) };
- int orig, orig_index, val;
-
-repeat:
- orig = atomic_read(u.o);
- orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
- if (orig_index) {
- if (!index)
- return;
+ atomic_inc((atomic_t *)&page->private);
+}
- DBG_BUGON(orig_index != index);
- }
+static inline void z_erofs_page_mark_eio(struct page *page)
+{
+ int orig;
- val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
- ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
- if (atomic_cmpxchg(u.o, orig, val) != orig)
- goto repeat;
+ do {
+ orig = atomic_read((atomic_t *)&page->private);
+ } while (atomic_cmpxchg((atomic_t *)&page->private, orig,
+ orig | Z_EROFS_PAGE_EIO) != orig);
}
static inline void z_erofs_onlinepage_endio(struct page *page)
{
- union z_erofs_onlinepage_converter u;
unsigned int v;
DBG_BUGON(!PagePrivate(page));
- u.v = &page_private(page);
-
- v = atomic_dec_return(u.o);
- if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+ v = atomic_dec_return((atomic_t *)&page->private);
+ if (!(v & ~Z_EROFS_PAGE_EIO)) {
set_page_private(page, 0);
ClearPagePrivate(page);
- if (!PageError(page))
+ if (!(v & Z_EROFS_PAGE_EIO))
SetPageUptodate(page);
unlock_page(page);
}
- erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o));
}
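
The reworked online-page state packs an error flag and a split-part count into one atomic word: bit 31 records an I/O error, bits 0-30 count the remaining parts. A standalone C11 rendering (atomic_fetch_or replaces the cmpxchg loop used above):

	#include <stdatomic.h>
	#include <stdio.h>

	#define PAGE_EIO (1u << 31)	/* bit 31: error, bits 0-30: parts */

	static atomic_uint priv;

	static void page_split(void)    { atomic_fetch_add(&priv, 1); }
	static void page_mark_eio(void) { atomic_fetch_or(&priv, PAGE_EIO); }

	static void page_endio(void)
	{
		unsigned int v = atomic_fetch_sub(&priv, 1) - 1;

		if (!(v & ~PAGE_EIO))	/* last part of this page done */
			printf("page done, %s\n",
			       v & PAGE_EIO ? "with I/O error" : "uptodate");
	}

	int main(void)
	{
		atomic_init(&priv, 1);	/* like z_erofs_onlinepage_init() */
		page_split();		/* page covered by a second extent */
		page_endio();
		page_mark_eio();
		page_endio();		/* page done, with I/O error */
		return 0;
	}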
-#define Z_EROFS_VMAP_ONSTACK_PAGES \
- min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U)
-#define Z_EROFS_VMAP_GLOBAL_PAGES 2048
+#define Z_EROFS_ONSTACK_PAGES 32
#endif
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 572f0b8151ba..44c27ef39c43 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -17,7 +17,7 @@ int z_erofs_fill_inode(struct inode *inode)
struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
if (!erofs_sb_has_big_pcluster(sbi) &&
- !erofs_sb_has_ztailpacking(sbi) &&
+ !erofs_sb_has_ztailpacking(sbi) && !erofs_sb_has_fragments(sbi) &&
vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
vi->z_advise = 0;
vi->z_algorithmtype[0] = 0;
@@ -55,10 +55,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
goto out_unlock;
- DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
- !erofs_sb_has_ztailpacking(EROFS_SB(sb)) &&
- vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
-
pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
vi->xattr_isize, 8);
kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos),
@@ -69,6 +65,16 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
}
h = kaddr + erofs_blkoff(pos);
+ /*
+ * if the highest bit of the 8-byte map header is set, the whole file
+ * is stored in the packed inode. The remaining bits keep z_fragmentoff.
+ */
+ if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) {
+ vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+ vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63);
+ vi->z_tailextent_headlcn = 0;
+ goto unmap_done;
+ }
vi->z_advise = le16_to_cpu(h->h_advise);
vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
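
In other words, the 8-byte map header doubles as a fragment pointer: when bit 63 is set the file lives entirely in the packed inode and the low 63 bits are z_fragmentoff, recovered by XORing the top bit away. Decoding a sample value:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t h = (1ULL << 63) | 0x12345ULL;	/* sample header */

		if (h >> 63) {			/* whole-file fragment */
			uint64_t z_fragmentoff = h ^ (1ULL << 63);

			printf("fragment inode, z_fragmentoff=%#llx\n",
			       (unsigned long long)z_fragmentoff);
		}
		return 0;
	}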
@@ -123,6 +129,20 @@ unmap_done:
if (err < 0)
goto out_unlock;
}
+
+ if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER &&
+ !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) {
+ struct erofs_map_blocks map = {
+ .buf = __EROFS_BUF_INITIALIZER
+ };
+
+ vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
+ err = z_erofs_do_map_blocks(inode, &map,
+ EROFS_GET_BLOCKS_FINDTAIL);
+ erofs_put_metabuf(&map.buf);
+ if (err < 0)
+ goto out_unlock;
+ }
/* paired with smp_mb() at the beginning of the function */
smp_mb();
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
@@ -141,22 +161,11 @@ struct z_erofs_maprecorder {
u8 type, headtype;
u16 clusterofs;
u16 delta[2];
- erofs_blk_t pblk, compressedlcs;
+ erofs_blk_t pblk, compressedblks;
erofs_off_t nextpackoff;
+ bool partialref;
};
-static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
- erofs_blk_t eblk)
-{
- struct super_block *const sb = m->inode->i_sb;
-
- m->kaddr = erofs_read_metabuf(&m->map->buf, sb, eblk,
- EROFS_KMAP_ATOMIC);
- if (IS_ERR(m->kaddr))
- return PTR_ERR(m->kaddr);
- return 0;
-}
-
static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
unsigned long lcn)
{
@@ -169,11 +178,11 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
lcn * sizeof(struct z_erofs_vle_decompressed_index);
struct z_erofs_vle_decompressed_index *di;
unsigned int advise, type;
- int err;
- err = z_erofs_reload_indexes(m, erofs_blknr(pos));
- if (err)
- return err;
+ m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
+ erofs_blknr(pos), EROFS_KMAP_ATOMIC);
+ if (IS_ERR(m->kaddr))
+ return PTR_ERR(m->kaddr);
m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index);
m->lcn = lcn;
@@ -192,7 +201,7 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- m->compressedlcs = m->delta[0] &
+ m->compressedblks = m->delta[0] &
~Z_EROFS_VLE_DI_D0_CBLKCNT;
m->delta[0] = 1;
}
@@ -201,6 +210,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ if (advise & Z_EROFS_VLE_DI_PARTIAL_REF)
+ m->partialref = true;
m->clusterofs = le16_to_cpu(di->di_clusterofs);
m->pblk = le32_to_cpu(di->di_u.blkaddr);
break;
@@ -293,7 +304,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- m->compressedlcs = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+ m->compressedblks = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
m->delta[0] = 1;
return 0;
} else if (i + 1 != (int)vcnt) {
@@ -370,7 +381,6 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int compacted_4b_initial, compacted_2b;
unsigned int amortizedshift;
erofs_off_t pos;
- int err;
if (lclusterbits != 12)
return -EOPNOTSUPP;
@@ -407,9 +417,10 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
amortizedshift = 2;
out:
pos += lcn * (1 << amortizedshift);
- err = z_erofs_reload_indexes(m, erofs_blknr(pos));
- if (err)
- return err;
+ m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
+ erofs_blknr(pos), EROFS_KMAP_ATOMIC);
+ if (IS_ERR(m->kaddr))
+ return PTR_ERR(m->kaddr);
return unpack_compacted_index(m, amortizedshift, pos, lookahead);
}
@@ -497,7 +508,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
return 0;
}
lcn = m->lcn + 1;
- if (m->compressedlcs)
+ if (m->compressedblks)
goto out;
err = z_erofs_load_cluster_from_disk(m, lcn, false);
@@ -506,7 +517,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
/*
* If the 1st NONHEAD lcluster has already been handled initially w/o
- * valid compressedlcs, which means at least it mustn't be CBLKCNT, or
+ * valid compressedblks, which means at least it mustn't be CBLKCNT, or
 * an internal implementation error is detected.
*
* The following code can also handle it properly anyway, but let's
@@ -523,12 +534,12 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
* if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
* rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
*/
- m->compressedlcs = 1;
+ m->compressedblks = 1 << (lclusterbits - LOG_BLOCK_SIZE);
break;
case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
if (m->delta[0] != 1)
goto err_bonus_cblkcnt;
- if (m->compressedlcs)
+ if (m->compressedblks)
break;
fallthrough;
default:
@@ -539,7 +550,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
return -EFSCORRUPTED;
}
out:
- map->m_plen = (u64)m->compressedlcs << lclusterbits;
+ map->m_plen = (u64)m->compressedblks << LOG_BLOCK_SIZE;
return 0;
err_bonus_cblkcnt:
erofs_err(m->inode->i_sb,
@@ -598,6 +609,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
{
struct erofs_inode *const vi = EROFS_I(inode);
bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER;
+ bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
struct z_erofs_maprecorder m = {
.inode = inode,
.map = map,
@@ -663,15 +675,23 @@ static int z_erofs_do_map_blocks(struct inode *inode,
err = -EOPNOTSUPP;
goto unmap_out;
}
-
+ if (m.partialref)
+ map->m_flags |= EROFS_MAP_PARTIAL_REF;
map->m_llen = end - map->m_la;
- if (flags & EROFS_GET_BLOCKS_FINDTAIL)
+ if (flags & EROFS_GET_BLOCKS_FINDTAIL) {
vi->z_tailextent_headlcn = m.lcn;
+ /* for non-compact indexes, fragmentoff is 64 bits */
+ if (fragment &&
+ vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY)
+ vi->z_fragmentoff |= (u64)m.pblk << 32;
+ }
if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) {
map->m_flags |= EROFS_MAP_META;
map->m_pa = vi->z_idataoff;
map->m_plen = vi->z_idata_size;
+ } else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
+ map->m_flags |= EROFS_MAP_FRAGMENT;
} else {
map->m_pa = blknr_to_addr(m.pblk);
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
@@ -679,12 +699,18 @@ static int z_erofs_do_map_blocks(struct inode *inode,
goto out;
}
- if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN)
- map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
- else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2)
+ if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) {
+ if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
+ map->m_algorithmformat =
+ Z_EROFS_COMPRESSION_INTERLACED;
+ else
+ map->m_algorithmformat =
+ Z_EROFS_COMPRESSION_SHIFTED;
+ } else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) {
map->m_algorithmformat = vi->z_algorithmtype[1];
- else
+ } else {
map->m_algorithmformat = vi->z_algorithmtype[0];
+ }
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
@@ -705,10 +731,10 @@ out:
return err;
}
-int z_erofs_map_blocks_iter(struct inode *inode,
- struct erofs_map_blocks *map,
+int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags)
{
+ struct erofs_inode *const vi = EROFS_I(inode);
int err = 0;
trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
@@ -725,6 +751,15 @@ int z_erofs_map_blocks_iter(struct inode *inode,
if (err)
goto out;
+ if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
+ !vi->z_tailextent_headlcn) {
+ map->m_la = 0;
+ map->m_llen = inode->i_size;
+ map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED |
+ EROFS_MAP_FRAGMENT;
+ goto out;
+ }
+
err = z_erofs_do_map_blocks(inode, map, flags);
out:
trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
@@ -751,7 +786,8 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
iomap->length = map.m_llen;
if (map.m_flags & EROFS_MAP_MAPPED) {
iomap->type = IOMAP_MAPPED;
- iomap->addr = map.m_pa;
+ iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ?
+ IOMAP_NULL_ADDR : map.m_pa;
} else {
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
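
The rename from compressedlcs to compressedblks throughout the hunks above is a unit change: pcluster sizes are now tracked in filesystem blocks rather than logical clusters, so m_plen is derived with LOG_BLOCK_SIZE instead of lclusterbits. A quick userspace sketch with hypothetical sizes, showing the old and new expressions agree for a one-lcluster pcluster:

#include <stdio.h>

#define LOG_BLOCK_SIZE	12	/* EROFS uses 4KiB blocks */

int main(void)
{
	unsigned int lclusterbits = 14;	/* hypothetical 16KiB lclusters */
	unsigned long long plen_old, compressedblks;

	/* before: a 1-lcluster pcluster, counted in lclusters */
	plen_old = 1ULL << lclusterbits;

	/* after: the same pcluster, counted in blocks (see the hunk above) */
	compressedblks = 1ULL << (lclusterbits - LOG_BLOCK_SIZE);

	printf("old m_plen=%llu new m_plen=%llu\n",
	       plen_old, compressedblks << LOG_BLOCK_SIZE);
	return 0;
}
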
diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h
deleted file mode 100644
index b05464f4a808..000000000000
--- a/fs/erofs/zpvec.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2018 HUAWEI, Inc.
- * https://www.huawei.com/
- */
-#ifndef __EROFS_FS_ZPVEC_H
-#define __EROFS_FS_ZPVEC_H
-
-#include "tagptr.h"
-
-/* page type in pagevec for decompress subsystem */
-enum z_erofs_page_type {
- /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
- Z_EROFS_PAGE_TYPE_EXCLUSIVE,
-
- Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
-
- Z_EROFS_VLE_PAGE_TYPE_HEAD,
- Z_EROFS_VLE_PAGE_TYPE_MAX
-};
-
-extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
- __bad_page_type_exclusive(void);
-
-/* pagevec tagged pointer */
-typedef tagptr2_t erofs_vtptr_t;
-
-/* pagevec collector */
-struct z_erofs_pagevec_ctor {
- struct page *curr, *next;
- erofs_vtptr_t *pages;
-
- unsigned int nr, index;
-};
-
-static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
- bool atomic)
-{
- if (!ctor->curr)
- return;
-
- if (atomic)
- kunmap_atomic(ctor->pages);
- else
- kunmap(ctor->curr);
-}
-
-static inline struct page *
-z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
- unsigned int nr)
-{
- unsigned int index;
-
- /* keep away from occupied pages */
- if (ctor->next)
- return ctor->next;
-
- for (index = 0; index < nr; ++index) {
- const erofs_vtptr_t t = ctor->pages[index];
- const unsigned int tags = tagptr_unfold_tags(t);
-
- if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
- return tagptr_unfold_ptr(t);
- }
- DBG_BUGON(nr >= ctor->nr);
- return NULL;
-}
-
-static inline void
-z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
- bool atomic)
-{
- struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
-
- z_erofs_pagevec_ctor_exit(ctor, atomic);
-
- ctor->curr = next;
- ctor->next = NULL;
- ctor->pages = atomic ?
- kmap_atomic(ctor->curr) : kmap(ctor->curr);
-
- ctor->nr = PAGE_SIZE / sizeof(struct page *);
- ctor->index = 0;
-}
-
-static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
- unsigned int nr,
- erofs_vtptr_t *pages,
- unsigned int i)
-{
- ctor->nr = nr;
- ctor->curr = ctor->next = NULL;
- ctor->pages = pages;
-
- if (i >= nr) {
- i -= nr;
- z_erofs_pagevec_ctor_pagedown(ctor, false);
- while (i > ctor->nr) {
- i -= ctor->nr;
- z_erofs_pagevec_ctor_pagedown(ctor, false);
- }
- }
- ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
- ctor->index = i;
-}
-
-static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor,
- struct page *page,
- enum z_erofs_page_type type,
- bool pvec_safereuse)
-{
- if (!ctor->next) {
- /* some pages cannot be reused as pvec safely without I/O */
- if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse)
- type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED;
-
- if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
- ctor->index + 1 == ctor->nr)
- return false;
- }
-
- if (ctor->index >= ctor->nr)
- z_erofs_pagevec_ctor_pagedown(ctor, false);
-
- /* exclusive page type must be 0 */
- if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
- __bad_page_type_exclusive();
-
- /* should remind that collector->next never equal to 1, 2 */
- if (type == (uintptr_t)ctor->next) {
- ctor->next = page;
- }
- ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type);
- return true;
-}
-
-static inline struct page *
-z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor,
- enum z_erofs_page_type *type)
-{
- erofs_vtptr_t t;
-
- if (ctor->index >= ctor->nr) {
- DBG_BUGON(!ctor->next);
- z_erofs_pagevec_ctor_pagedown(ctor, true);
- }
-
- t = ctor->pages[ctor->index];
-
- *type = tagptr_unfold_tags(t);
-
- /* should remind that collector->next never equal to 1, 2 */
- if (*type == (uintptr_t)ctor->next)
- ctor->next = tagptr_unfold_ptr(t);
-
- ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0);
- return tagptr_unfold_ptr(t);
-}
-#endif
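
The deleted pagevec machinery was built on tagged pointers: tagptr2_t packs a 2-bit page type into the low bits of an aligned struct page pointer. A standalone userspace sketch of that fold/unfold trick, assuming pointers with at least 4-byte alignment:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TAG_BITS	2
#define TAG_MASK	((1UL << TAG_BITS) - 1)

static uintptr_t tagptr_fold(void *ptr, unsigned int tag)
{
	/* the low bits must be free, and the tag must fit in them */
	assert(((uintptr_t)ptr & TAG_MASK) == 0 && tag <= TAG_MASK);
	return (uintptr_t)ptr | tag;
}

static void *tagptr_unfold_ptr(uintptr_t t)
{
	return (void *)(t & ~TAG_MASK);
}

static unsigned int tagptr_unfold_tags(uintptr_t t)
{
	return t & TAG_MASK;
}

int main(void)
{
	static int page;	/* stand-in for a struct page */
	uintptr_t t = tagptr_fold(&page, 1);

	printf("ptr ok=%d tag=%u\n",
	       tagptr_unfold_ptr(t) == &page, tagptr_unfold_tags(t));
	return 0;
}
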
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 3627dd7d25db..c0ffee99ad23 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -69,17 +69,17 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
* it returns false, the eventfd_signal() call should be deferred to a
* safe context.
*/
- if (WARN_ON_ONCE(current->in_eventfd_signal))
+ if (WARN_ON_ONCE(current->in_eventfd))
return 0;
spin_lock_irqsave(&ctx->wqh.lock, flags);
- current->in_eventfd_signal = 1;
+ current->in_eventfd = 1;
if (ULLONG_MAX - ctx->count < n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
- current->in_eventfd_signal = 0;
+ current->in_eventfd = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
return n;
@@ -253,8 +253,10 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
__set_current_state(TASK_RUNNING);
}
eventfd_ctx_do_read(ctx, &ucnt);
+ current->in_eventfd = 1;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
+ current->in_eventfd = 0;
spin_unlock_irq(&ctx->wqh.lock);
if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
return -EFAULT;
@@ -301,8 +303,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
}
if (likely(res > 0)) {
ctx->count += ucnt;
+ current->in_eventfd = 1;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+ current->in_eventfd = 0;
}
spin_unlock_irq(&ctx->wqh.lock);
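
The in_eventfd flag above is a per-task reentrancy guard: it is held across the wakeup so that any code reached from the wakeup path (an io_uring or aio completion, say) can detect it is already inside eventfd and defer rather than recurse on ctx->wqh.lock. A userspace sketch of the pattern using a thread-local flag:

#include <stdbool.h>
#include <stdio.h>

static _Thread_local bool in_eventfd;

static bool signal_event(void);

static void wake_up(void)
{
	/* a waiter's callback might try to signal the eventfd again */
	if (!signal_event())
		printf("nested signal deferred\n");
}

static bool signal_event(void)
{
	if (in_eventfd)	/* mirrors WARN_ON_ONCE(current->in_eventfd) */
		return false;
	in_eventfd = true;
	wake_up();
	in_eventfd = false;
	return true;
}

int main(void)
{
	signal_event();
	return 0;
}
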
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index e2daa940ebce..8b56b94e2f56 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1747,6 +1747,21 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
return to;
}
+/*
+ * autoremove_wake_function, but remove even on failure to wake up, because we
+ * know that default_wake_function/ttwu will only fail if the thread is already
+ * woken, and in that case the ep_poll loop will remove the entry anyway, not
+ * try to reuse it.
+ */
+static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
+ unsigned int mode, int sync, void *key)
+{
+ int ret = default_wake_function(wq_entry, mode, sync, key);
+
+ list_del_init(&wq_entry->entry);
+ return ret;
+}
+
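
For contrast with the stock autoremove_wake_function(), which unlinks the wait entry only when the wakeup succeeded, here is a compressed sketch of the two behaviours with the wakeup stubbed to fail (as it does when the thread was already woken):

#include <stdbool.h>
#include <stdio.h>

struct wait_entry {
	bool linked;
};

static bool try_to_wake_up(void)
{
	return false;	/* thread already woken by someone else */
}

static bool autoremove(struct wait_entry *wq)
{
	bool ret = try_to_wake_up();

	if (ret)		/* stock helper: remove only on success */
		wq->linked = false;
	return ret;
}

static bool ep_autoremove(struct wait_entry *wq)
{
	bool ret = try_to_wake_up();

	wq->linked = false;	/* new helper: remove unconditionally */
	return ret;
}

int main(void)
{
	struct wait_entry a = { true }, b = { true };

	autoremove(&a);
	ep_autoremove(&b);
	printf("stock still linked=%d, ep variant linked=%d\n",
	       a.linked, b.linked);
	return 0;
}
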
/**
* ep_poll - Retrieves ready events, and delivers them to the caller-supplied
* event buffer.
@@ -1828,8 +1843,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* normal wakeup path no need to call __remove_wait_queue()
* explicitly, thus ep->lock is not taken, which halts the
* event delivery.
+ *
+ * In fact, we now use an even more aggressive function that
+ * unconditionally removes, because we don't reuse the wait
+ * entry between loop iterations. This lets us also avoid the
+ * performance issue if a process is killed, causing all of its
+ * threads to wake up without being removed normally.
*/
init_wait(&wait);
+ wait.func = ep_autoremove_wake_function;
write_lock_irq(&ep->lock);
/*
diff --git a/fs/exec.c b/fs/exec.c
index 4ba780cae64c..de084e485462 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -583,11 +583,11 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
if (kmapped_page) {
flush_dcache_page(kmapped_page);
- kunmap(kmapped_page);
+ kunmap_local(kaddr);
put_arg_page(kmapped_page);
}
kmapped_page = page;
- kaddr = kmap(kmapped_page);
+ kaddr = kmap_local_page(kmapped_page);
kpos = pos & PAGE_MASK;
flush_arg_page(bprm, kpos, kmapped_page);
}
@@ -601,7 +601,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
out:
if (kmapped_page) {
flush_dcache_page(kmapped_page);
- kunmap(kmapped_page);
+ kunmap_local(kaddr);
put_arg_page(kmapped_page);
}
return ret;
@@ -630,7 +630,6 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
unsigned int bytes_to_copy = min_t(unsigned int, len,
min_not_zero(offset_in_page(pos), PAGE_SIZE));
struct page *page;
- char *kaddr;
pos -= bytes_to_copy;
arg -= bytes_to_copy;
@@ -639,11 +638,8 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
page = get_arg_page(bprm, pos, 1);
if (!page)
return -E2BIG;
- kaddr = kmap_atomic(page);
flush_arg_page(bprm, pos & PAGE_MASK, page);
- memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
+ memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy);
put_arg_page(page);
}
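
The open-coded kmap_atomic()/memcpy()/flush_dcache_page()/kunmap_atomic() sequence collapses into memcpy_to_page(). Roughly, that helper does the following (userspace sketch with the mapping and cache-flush primitives stubbed as no-ops):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

struct page { char data[PAGE_SIZE]; };

static void *kmap_local_page(struct page *p) { return p->data; }
static void kunmap_local(void *addr) { (void)addr; }
static void flush_dcache_page(struct page *p) { (void)p; }

static void memcpy_to_page(struct page *page, size_t offset,
			   const char *from, size_t len)
{
	char *kaddr = kmap_local_page(page);

	memcpy(kaddr + offset, from, len);
	flush_dcache_page(page);	/* keep I-cache/D-cache coherent */
	kunmap_local(kaddr);
}

int main(void)
{
	static struct page pg;

	memcpy_to_page(&pg, 16, "argv0", 6);
	printf("%s\n", pg.data + 16);
	return 0;
}
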
@@ -758,6 +754,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
unsigned long stack_size;
unsigned long stack_expand;
unsigned long rlim_stack;
+ struct mmu_gather tlb;
#ifdef CONFIG_STACK_GROWSUP
/* Limit stack size */
@@ -812,8 +809,11 @@ int setup_arg_pages(struct linux_binprm *bprm,
vm_flags |= mm->def_flags;
vm_flags |= VM_STACK_INCOMPLETE_SETUP;
- ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
+ tlb_gather_mmu(&tlb, mm);
+ ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end,
vm_flags);
+ tlb_finish_mmu(&tlb);
+
if (ret)
goto out_unlock;
BUG_ON(prev != vma);
@@ -879,11 +879,11 @@ int transfer_args_to_stack(struct linux_binprm *bprm,
for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
- char *src = kmap(bprm->page[index]) + offset;
+ char *src = kmap_local_page(bprm->page[index]) + offset;
sp -= PAGE_SIZE - offset;
if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
ret = -EFAULT;
- kunmap(bprm->page[index]);
+ kunmap_local(src);
if (ret)
goto out;
}
@@ -957,8 +957,7 @@ struct file *open_exec(const char *name)
}
EXPORT_SYMBOL(open_exec);
-#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \
- defined(CONFIG_BINFMT_ELF_FDPIC)
+#if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC)
ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
{
ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
@@ -1145,7 +1144,7 @@ static int de_thread(struct task_struct *tsk)
/*
* We are going to release_task()->ptrace_unlink() silently,
* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
- * the tracer wont't block again waiting for this thread.
+ * the tracer won't block again waiting for this thread.
*/
if (unlikely(leader->ptrace))
__wake_up_parent(leader, leader->parent);
@@ -1297,7 +1296,10 @@ int begin_new_exec(struct linux_binprm * bprm)
bprm->mm = NULL;
#ifdef CONFIG_POSIX_TIMERS
- exit_itimers(me->signal);
+ spin_lock_irq(&me->sighand->siglock);
+ posix_cpu_timers_exit(me);
+ spin_unlock_irq(&me->sighand->siglock);
+ exit_itimers(me);
flush_itimer_signals();
#endif
@@ -1308,9 +1310,7 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
goto out_unlock;
- if (me->flags & PF_KTHREAD)
- free_kthread_struct(me);
- me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
+ me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC |
PF_NOFREEZE | PF_NO_SETAFFINITY);
flush_thread();
me->personality &= ~bprm->per_clear;
@@ -1587,7 +1587,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
{
/* Handle suid and sgid on files */
struct user_namespace *mnt_userns;
- struct inode *inode;
+ struct inode *inode = file_inode(file);
unsigned int mode;
kuid_t uid;
kgid_t gid;
@@ -1598,7 +1598,6 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
if (task_no_new_privs(current))
return;
- inode = file->f_path.dentry->d_inode;
mode = READ_ONCE(inode->i_mode);
if (!(mode & (S_ISUID|S_ISGID)))
return;
@@ -1678,13 +1677,13 @@ int remove_arg_zero(struct linux_binprm *bprm)
ret = -EFAULT;
goto out;
}
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
for (; offset < PAGE_SIZE && kaddr[offset];
offset++, bprm->p++)
;
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
put_arg_page(page);
} while (offset == PAGE_SIZE);
@@ -1955,6 +1954,10 @@ int kernel_execve(const char *kernel_filename,
int fd = AT_FDCWD;
int retval;
+	/* It makes no sense for kernel threads to call execve */
+ if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
+ return -EINVAL;
+
filename = getname_kernel(kernel_filename);
if (IS_ERR(filename))
return PTR_ERR(filename);
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index 03f142307174..9f42f25fab92 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -148,7 +148,9 @@ int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync)
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- WARN_ON(clu < EXFAT_FIRST_CLUSTER);
+ if (!is_valid_cluster(sbi, clu))
+ return -EINVAL;
+
ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
@@ -166,7 +168,9 @@ void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_mount_options *opts = &sbi->options;
- WARN_ON(clu < EXFAT_FIRST_CLUSTER);
+ if (!is_valid_cluster(sbi, clu))
+ return;
+
ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
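
The old WARN_ON() only complained about clusters below EXFAT_FIRST_CLUSTER and then indexed the bitmap regardless; the new check validates both ends of the range before any bitmap access, which matters for crafted or corrupted images. A toy model of the check:

#include <stdbool.h>
#include <stdio.h>

#define EXFAT_FIRST_CLUSTER 2

struct sb_info { unsigned int num_clusters; };

static bool is_valid_cluster(const struct sb_info *sbi, unsigned int clu)
{
	return clu >= EXFAT_FIRST_CLUSTER && clu < sbi->num_clusters;
}

int main(void)
{
	struct sb_info sbi = { .num_clusters = 100 };

	printf("%d %d %d\n",
	       is_valid_cluster(&sbi, 1),	/* below range */
	       is_valid_cluster(&sbi, 50),	/* ok */
	       is_valid_cluster(&sbi, 100));	/* beyond range */
	return 0;
}
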
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index a27b55ec060a..0fc08fdcba73 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -212,9 +212,9 @@ static void exfat_free_namebuf(struct exfat_dentry_namebuf *nb)
/* skip iterating emit_dots when dir is empty */
#define ITER_POS_FILLED_DOTS (2)
-static int exfat_iterate(struct file *filp, struct dir_context *ctx)
+static int exfat_iterate(struct file *file, struct dir_context *ctx)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct inode *tmp;
struct exfat_dir_entry de;
@@ -228,7 +228,7 @@ static int exfat_iterate(struct file *filp, struct dir_context *ctx)
mutex_lock(&EXFAT_SB(sb)->s_lock);
cpos = ctx->pos;
- if (!dir_emit_dots(filp, ctx))
+ if (!dir_emit_dots(file, ctx))
goto unlock;
if (ctx->pos == ITER_POS_FILLED_DOTS) {
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index c6800b880920..a8f8eee4937c 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -27,9 +27,9 @@ enum exfat_error_mode {
* exfat nls lossy flag
*/
enum {
- NLS_NAME_NO_LOSSY, /* no lossy */
- NLS_NAME_LOSSY, /* just detected incorrect filename(s) */
- NLS_NAME_OVERLEN, /* the length is over than its limit */
+ NLS_NAME_NO_LOSSY = 0, /* no lossy */
+ NLS_NAME_LOSSY = 1 << 0, /* just detected incorrect filename(s) */
+	NLS_NAME_OVERLEN = 1 << 1,	/* the length exceeds the limit */
};
#define EXFAT_HASH_BITS 8
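
Turning the lossy enum into bit flags lets a single name carry several conditions at once, and lets callers test for one specifically, as __exfat_resolve_path() does later in this diff with (lossy & NLS_NAME_OVERLEN). Minimal demonstration:

#include <stdio.h>

enum {
	NLS_NAME_NO_LOSSY = 0,
	NLS_NAME_LOSSY = 1 << 0,
	NLS_NAME_OVERLEN = 1 << 1,
};

int main(void)
{
	int lossy = NLS_NAME_LOSSY | NLS_NAME_OVERLEN;

	if (lossy & NLS_NAME_OVERLEN)
		printf("-ENAMETOOLONG\n");	/* the specific error wins */
	else if (lossy)
		printf("-EINVAL\n");
	return 0;
}
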
@@ -203,6 +203,7 @@ struct exfat_mount_options {
/* on error: continue, panic, remount-ro */
enum exfat_error_mode errors;
unsigned utf8:1, /* Use of UTF-8 character set */
+ sys_tz:1, /* Use local timezone */
discard:1, /* Issue discard requests on deletions */
keep_last_dots:1; /* Keep trailing periods in paths */
int time_offset; /* Offset of timestamps from UTC (in minutes) */
@@ -381,6 +382,12 @@ static inline int exfat_sector_to_cluster(struct exfat_sb_info *sbi,
EXFAT_RESERVED_CLUSTERS;
}
+static inline bool is_valid_cluster(struct exfat_sb_info *sbi,
+ unsigned int clus)
+{
+ return clus >= EXFAT_FIRST_CLUSTER && clus < sbi->num_clusters;
+}
+
/* super.c */
int exfat_set_volume_dirty(struct super_block *sb);
int exfat_clear_volume_dirty(struct super_block *sb);
@@ -476,6 +483,7 @@ struct inode *exfat_build_inode(struct super_block *sb,
void exfat_hash_inode(struct inode *inode, loff_t i_pos);
void exfat_unhash_inode(struct inode *inode);
struct inode *exfat_iget(struct super_block *sb, loff_t i_pos);
+int __exfat_write_inode(struct inode *inode, int sync);
int exfat_write_inode(struct inode *inode, struct writeback_control *wbc);
void exfat_evict_inode(struct inode *inode);
int exfat_block_truncate_page(struct inode *inode, loff_t from);
@@ -501,14 +509,16 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
#define exfat_fs_error_ratelimit(sb, fmt, args...) \
__exfat_fs_error(sb, __ratelimit(&EXFAT_SB(sb)->ratelimit), \
fmt, ## args)
-void exfat_msg(struct super_block *sb, const char *lv, const char *fmt, ...)
- __printf(3, 4) __cold;
+
+/* expand to pr_*() with prefix */
#define exfat_err(sb, fmt, ...) \
- exfat_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__)
+ pr_err("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__)
#define exfat_warn(sb, fmt, ...) \
- exfat_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__)
+ pr_warn("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__)
#define exfat_info(sb, fmt, ...) \
- exfat_msg(sb, KERN_INFO, fmt, ##__VA_ARGS__)
+ pr_info("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__)
+#define exfat_debug(sb, fmt, ...) \
+ pr_debug("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__)
void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
u8 tz, __le16 time, __le16 date, u8 time_cs);
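
With exfat_msg() gone, each log level maps straight onto pr_err()/pr_warn()/pr_info()/pr_debug() with the device prefix and trailing newline baked into the macro, which is why later hunks drop the explicit "\n" from call sites. A userspace sketch of the same macro shape over fprintf():

#include <stdio.h>

#define log_err(id, fmt, ...) \
	fprintf(stderr, "exFAT-fs (%s): " fmt "\n", id, ##__VA_ARGS__)

int main(void)
{
	/* call sites no longer pass "\n"; the macro appends it */
	log_err("sda1", "bogus sector size bits : %u", 42U);
	return 0;
}
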
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index a3464e56a7e1..41ae4cce1f42 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <asm/unaligned.h>
#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
@@ -81,12 +82,6 @@ int exfat_ent_set(struct super_block *sb, unsigned int loc,
return 0;
}
-static inline bool is_valid_cluster(struct exfat_sb_info *sbi,
- unsigned int clus)
-{
- return clus >= EXFAT_FIRST_CLUSTER && clus < sbi->num_clusters;
-}
-
int exfat_ent_get(struct super_block *sb, unsigned int loc,
unsigned int *content)
{
@@ -274,10 +269,8 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu)
{
struct super_block *sb = dir->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct buffer_head *bhs[MAX_BUF_PER_PAGE];
- int nr_bhs = MAX_BUF_PER_PAGE;
- sector_t blknr, last_blknr;
- int err, i, n;
+ struct buffer_head *bh;
+ sector_t blknr, last_blknr, i;
blknr = exfat_cluster_to_sector(sbi, clu);
last_blknr = blknr + sbi->sect_per_clus;
@@ -291,30 +284,23 @@ int exfat_zeroed_cluster(struct inode *dir, unsigned int clu)
}
/* Zeroing the unused blocks on this cluster */
- while (blknr < last_blknr) {
- for (n = 0; n < nr_bhs && blknr < last_blknr; n++, blknr++) {
- bhs[n] = sb_getblk(sb, blknr);
- if (!bhs[n]) {
- err = -ENOMEM;
- goto release_bhs;
- }
- memset(bhs[n]->b_data, 0, sb->s_blocksize);
- }
-
- err = exfat_update_bhs(bhs, n, IS_DIRSYNC(dir));
- if (err)
- goto release_bhs;
+ for (i = blknr; i < last_blknr; i++) {
+ bh = sb_getblk(sb, i);
+ if (!bh)
+ return -ENOMEM;
- for (i = 0; i < n; i++)
- brelse(bhs[i]);
+ memset(bh->b_data, 0, sb->s_blocksize);
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ brelse(bh);
}
- return 0;
-release_bhs:
- exfat_err(sb, "failed zeroed sect %llu\n", (unsigned long long)blknr);
- for (i = 0; i < n; i++)
- bforget(bhs[i]);
- return err;
+ if (IS_DIRSYNC(dir))
+ return sync_blockdev_range(sb->s_bdev,
+ EXFAT_BLK_TO_B(blknr, sb),
+ EXFAT_BLK_TO_B(last_blknr, sb) - 1);
+
+ return 0;
}
int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
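
In the DIRSYNC case the rewritten function flushes all the zeroed buffers with one ranged sync instead of batched synchronous writes. sync_blockdev_range() takes an inclusive byte range, hence the -1 on the end offset; a sketch of the block-to-byte conversion (block-size bits parameterized here for illustration):

#include <stdio.h>

#define EXFAT_BLK_TO_B(blk, bits)	((unsigned long long)(blk) << (bits))

int main(void)
{
	unsigned int blocksize_bits = 9;	/* 512-byte sectors */
	unsigned long long blknr = 100, last_blknr = 108;	/* [100, 108) */

	printf("sync bytes [%llu, %llu]\n",
	       EXFAT_BLK_TO_B(blknr, blocksize_bits),
	       EXFAT_BLK_TO_B(last_blknr, blocksize_bits) - 1);
	return 0;
}
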
@@ -344,7 +330,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
/* find new cluster */
if (hint_clu == EXFAT_EOF_CLUSTER) {
if (sbi->clu_srch_ptr < EXFAT_FIRST_CLUSTER) {
- exfat_err(sb, "sbi->clu_srch_ptr is invalid (%u)\n",
+ exfat_err(sb, "sbi->clu_srch_ptr is invalid (%u)",
sbi->clu_srch_ptr);
sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER;
}
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 2f5130059236..4e0793f35e8f 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -101,7 +101,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- int evict = (ei->dir.dir == DIR_DELETED) ? 1 : 0;
/* check if the given file ID is opened */
if (ei->type != TYPE_FILE && ei->type != TYPE_DIR)
@@ -149,50 +148,19 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
if (ei->type == TYPE_FILE)
ei->attr |= ATTR_ARCHIVE;
- /* update the directory entry */
- if (!evict) {
- struct timespec64 ts;
- struct exfat_dentry *ep, *ep2;
- struct exfat_entry_set_cache *es;
- int err;
-
- es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry,
- ES_ALL_ENTRIES);
- if (!es)
- return -EIO;
- ep = exfat_get_dentry_cached(es, 0);
- ep2 = exfat_get_dentry_cached(es, 1);
-
- ts = current_time(inode);
- exfat_set_entry_time(sbi, &ts,
- &ep->dentry.file.modify_tz,
- &ep->dentry.file.modify_time,
- &ep->dentry.file.modify_date,
- &ep->dentry.file.modify_time_cs);
- ep->dentry.file.attr = cpu_to_le16(ei->attr);
-
- /* File size should be zero if there is no cluster allocated */
- if (ei->start_clu == EXFAT_EOF_CLUSTER) {
- ep2->dentry.stream.valid_size = 0;
- ep2->dentry.stream.size = 0;
- } else {
- ep2->dentry.stream.valid_size = cpu_to_le64(new_size);
- ep2->dentry.stream.size = ep2->dentry.stream.valid_size;
- }
-
- if (new_size == 0) {
- /* Any directory can not be truncated to zero */
- WARN_ON(ei->type != TYPE_FILE);
-
- ep2->dentry.stream.flags = ALLOC_FAT_CHAIN;
- ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER;
- }
-
- exfat_update_dir_chksum_with_entry_set(es);
- err = exfat_free_dentry_set(es, inode_needs_sync(inode));
- if (err)
- return err;
- }
+ /*
+ * update the directory entry
+ *
+ * If the directory entry is updated by mark_inode_dirty(), the
+ * directory entry will be written after a writeback cycle of
+ * updating the bitmap/FAT, which may result in clusters being
+ * freed but referenced by the directory entry in the event of a
+ * sudden power failure.
+ * __exfat_write_inode() is called for directory entry, bitmap
+ * and FAT to be written in a same writeback.
+ */
+ if (__exfat_write_inode(inode, inode_needs_sync(inode)))
+ return -EIO;
/* cut off from the FAT chain */
if (ei->flags == ALLOC_FAT_CHAIN && last_clu != EXFAT_FREE_CLUSTER &&
@@ -243,12 +211,6 @@ void exfat_truncate(struct inode *inode, loff_t size)
if (err)
goto write_size;
- inode->i_ctime = inode->i_mtime = current_time(inode);
- if (IS_DIRSYNC(inode))
- exfat_sync_inode(inode);
- else
- mark_inode_dirty(inode);
-
inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >>
inode->i_blkbits;
write_size:
@@ -330,6 +292,12 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
attr->ia_valid &= ~ATTR_MODE;
}
+ if (attr->ia_valid & ATTR_SIZE)
+ inode->i_mtime = inode->i_ctime = current_time(inode);
+
+ setattr_copy(&init_user_ns, inode, attr);
+ exfat_truncate_atime(&inode->i_atime);
+
if (attr->ia_valid & ATTR_SIZE) {
error = exfat_block_truncate_page(inode, attr->ia_size);
if (error)
@@ -337,13 +305,15 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
down_write(&EXFAT_I(inode)->truncate_lock);
truncate_setsize(inode, attr->ia_size);
+
+ /*
+ * __exfat_write_inode() is called from exfat_truncate(), inode
+ * is already written by it, so mark_inode_dirty() is unneeded.
+ */
exfat_truncate(inode, attr->ia_size);
up_write(&EXFAT_I(inode)->truncate_lock);
- }
-
- setattr_copy(&init_user_ns, inode, attr);
- exfat_truncate_atime(&inode->i_atime);
- mark_inode_dirty(inode);
+ } else
+ mark_inode_dirty(inode);
out:
return error;
@@ -351,21 +321,20 @@ out:
static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg)
{
- struct request_queue *q = bdev_get_queue(inode->i_sb->s_bdev);
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(inode->i_sb->s_bdev))
return -EOPNOTSUPP;
if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range)))
return -EFAULT;
range.minlen = max_t(unsigned int, range.minlen,
- q->limits.discard_granularity);
+ bdev_discard_granularity(inode->i_sb->s_bdev));
ret = exfat_trim_fs(inode, &range);
if (ret < 0)
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index fc0ea1684880..a795437b86d0 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -17,7 +17,7 @@
#include "exfat_raw.h"
#include "exfat_fs.h"
-static int __exfat_write_inode(struct inode *inode, int sync)
+int __exfat_write_inode(struct inode *inode, int sync)
{
unsigned long long on_disk_size;
struct exfat_dentry *ep, *ep2;
@@ -75,6 +75,13 @@ static int __exfat_write_inode(struct inode *inode, int sync)
ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size);
ep2->dentry.stream.size = ep2->dentry.stream.valid_size;
+ if (on_disk_size) {
+ ep2->dentry.stream.flags = ei->flags;
+ ep2->dentry.stream.start_clu = cpu_to_le32(ei->start_clu);
+ } else {
+ ep2->dentry.stream.flags = ALLOC_FAT_CHAIN;
+ ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER;
+ }
exfat_update_dir_chksum_with_entry_set(es);
return exfat_free_dentry_set(es, sync);
@@ -105,7 +112,7 @@ void exfat_sync_inode(struct inode *inode)
static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
unsigned int *clu, int create)
{
- int ret, modified = false;
+ int ret;
unsigned int last_clu;
struct exfat_chain new_clu;
struct super_block *sb = inode->i_sb;
@@ -196,7 +203,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
if (new_clu.flags == ALLOC_FAT_CHAIN)
ei->flags = ALLOC_FAT_CHAIN;
ei->start_clu = new_clu.dir;
- modified = true;
} else {
if (new_clu.flags != ei->flags) {
/* no-fat-chain bit is disabled,
@@ -206,7 +212,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
exfat_chain_cont_cluster(sb, ei->start_clu,
num_clusters);
ei->flags = ALLOC_FAT_CHAIN;
- modified = true;
}
if (new_clu.flags == ALLOC_FAT_CHAIN)
if (exfat_ent_set(sb, last_clu, new_clu.dir))
@@ -216,33 +221,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
num_clusters += num_to_be_allocated;
*clu = new_clu.dir;
- if (ei->dir.dir != DIR_DELETED && modified) {
- struct exfat_dentry *ep;
- struct exfat_entry_set_cache *es;
- int err;
-
- es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry,
- ES_ALL_ENTRIES);
- if (!es)
- return -EIO;
- /* get stream entry */
- ep = exfat_get_dentry_cached(es, 1);
-
- /* update directory entry */
- ep->dentry.stream.flags = ei->flags;
- ep->dentry.stream.start_clu =
- cpu_to_le32(ei->start_clu);
- ep->dentry.stream.valid_size =
- cpu_to_le64(i_size_read(inode));
- ep->dentry.stream.size =
- ep->dentry.stream.valid_size;
-
- exfat_update_dir_chksum_with_entry_set(es);
- err = exfat_free_dentry_set(es, inode_needs_sync(inode));
- if (err)
- return err;
- } /* end of if != DIR_DELETED */
-
inode->i_blocks +=
num_to_be_allocated << sbi->sect_per_clus_bits;
@@ -357,9 +335,9 @@ unlock_ret:
return err;
}
-static int exfat_readpage(struct file *file, struct page *page)
+static int exfat_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, exfat_get_block);
+ return mpage_read_folio(folio, exfat_get_block);
}
static void exfat_readahead(struct readahead_control *rac)
@@ -384,18 +362,19 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
if (to > i_size_read(inode)) {
truncate_pagecache(inode, i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = current_time(inode);
exfat_truncate(inode, EXFAT_I(inode)->i_size_aligned);
}
}
static int exfat_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int flags,
+ loff_t pos, unsigned int len,
struct page **pagep, void **fsdata)
{
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
exfat_get_block,
&EXFAT_I(mapping->host)->i_size_ondisk);
@@ -492,7 +471,7 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from)
static const struct address_space_operations exfat_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = exfat_readpage,
+ .read_folio = exfat_read_folio,
.readahead = exfat_readahead,
.writepage = exfat_writepage,
.writepages = exfat_writepages,
diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c
index d5bd8e6d9741..2e1a1a6b1021 100644
--- a/fs/exfat/misc.c
+++ b/fs/exfat/misc.c
@@ -46,23 +46,6 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
}
}
-/*
- * exfat_msg() - print preformated EXFAT specific messages.
- * All logs except what uses exfat_fs_error() should be written by exfat_msg()
- */
-void exfat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- /* level means KERN_ pacility level */
- printk("%sexFAT-fs (%s): %pV\n", level, sb->s_id, &vaf);
- va_end(args);
-}
-
#define SECS_PER_MIN (60)
#define TIMEZONE_SEC(x) ((x) * 15 * SECS_PER_MIN)
@@ -74,6 +57,13 @@ static void exfat_adjust_tz(struct timespec64 *ts, u8 tz_off)
ts->tv_sec += TIMEZONE_SEC(0x80 - tz_off);
}
+static inline int exfat_tz_offset(struct exfat_sb_info *sbi)
+{
+ if (sbi->options.sys_tz)
+ return -sys_tz.tz_minuteswest;
+ return sbi->options.time_offset;
+}
+
/* Convert a EXFAT time/date pair to a UNIX date (seconds since 1 1 70). */
void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
u8 tz, __le16 time, __le16 date, u8 time_cs)
@@ -96,8 +86,7 @@ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts,
/* Adjust timezone to UTC0. */
exfat_adjust_tz(ts, tz & ~EXFAT_TZ_VALID);
else
- /* Convert from local time to UTC using time_offset. */
- ts->tv_sec -= sbi->options.time_offset * SECS_PER_MIN;
+ ts->tv_sec -= exfat_tz_offset(sbi) * SECS_PER_MIN;
}
/* Convert linear UNIX date to a EXFAT time/date pair. */
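
The new helper gives timestamps stored without a UTC offset a second interpretation: with the sys_tz mount option, the kernel's global sys_tz.tz_minuteswest (minutes west of UTC, hence the negation) is used instead of the fixed mount-time time_offset. A userspace sketch of the conversion:

#include <stdio.h>

#define SECS_PER_MIN	60

struct opts { int sys_tz; int time_offset; };

/* stand-in for the kernel's global sys_tz; UTC-8 is 480 min west */
static struct { int tz_minuteswest; } sys_tz = { 480 };

static int tz_offset(const struct opts *o)
{
	if (o->sys_tz)
		return -sys_tz.tz_minuteswest;
	return o->time_offset;
}

int main(void)
{
	struct opts o = { .sys_tz = 1 };
	long long ts = 1000000;	/* local seconds */

	ts -= (long long)tz_offset(&o) * SECS_PER_MIN;	/* local -> UTC */
	printf("utc=%lld (offset %d min)\n", ts, tz_offset(&o));
	return 0;
}
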
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index a02a04a993bf..b617bebc3d0f 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -318,7 +318,6 @@ static int exfat_find_empty_entry(struct inode *inode,
unsigned int ret, last_clu;
loff_t size = 0;
struct exfat_chain clu;
- struct exfat_dentry *ep = NULL;
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
@@ -383,25 +382,6 @@ static int exfat_find_empty_entry(struct inode *inode,
p_dir->size++;
size = EXFAT_CLU_TO_B(p_dir->size, sbi);
- /* update the directory entry */
- if (p_dir->dir != sbi->root_dir) {
- struct buffer_head *bh;
-
- ep = exfat_get_dentry(sb,
- &(ei->dir), ei->entry + 1, &bh);
- if (!ep)
- return -EIO;
-
- ep->dentry.stream.valid_size = cpu_to_le64(size);
- ep->dentry.stream.size = ep->dentry.stream.valid_size;
- ep->dentry.stream.flags = p_dir->flags;
- exfat_update_bh(bh, IS_DIRSYNC(inode));
- brelse(bh);
- if (exfat_update_dir_chksum(inode, &(ei->dir),
- ei->entry))
- return -EIO;
- }
-
/* directory inode should be updated in here */
i_size_write(inode, size);
ei->i_size_ondisk += sbi->cluster_size;
@@ -462,7 +442,7 @@ static int __exfat_resolve_path(struct inode *inode, const unsigned char *path,
return namelen; /* return error value */
if ((lossy && !lookup) || !namelen)
- return -EINVAL;
+ return (lossy & NLS_NAME_OVERLEN) ? -ENAMETOOLONG : -EINVAL;
exfat_chain_set(p_dir, ei->start_clu,
EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags);
@@ -1080,6 +1060,7 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
exfat_remove_entries(inode, p_dir, oldentry, 0,
num_old_entries);
+ ei->dir = *p_dir;
ei->entry = newentry;
} else {
if (exfat_get_entry_type(epold) == TYPE_FILE) {
@@ -1167,28 +1148,6 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
return 0;
}
-static void exfat_update_parent_info(struct exfat_inode_info *ei,
- struct inode *parent_inode)
-{
- struct exfat_sb_info *sbi = EXFAT_SB(parent_inode->i_sb);
- struct exfat_inode_info *parent_ei = EXFAT_I(parent_inode);
- loff_t parent_isize = i_size_read(parent_inode);
-
- /*
- * the problem that struct exfat_inode_info caches wrong parent info.
- *
- * because of flag-mismatch of ei->dir,
- * there is abnormal traversing cluster chain.
- */
- if (unlikely(parent_ei->flags != ei->dir.flags ||
- parent_isize != EXFAT_CLU_TO_B(ei->dir.size, sbi) ||
- parent_ei->start_clu != ei->dir.dir)) {
- exfat_chain_set(&ei->dir, parent_ei->start_clu,
- EXFAT_B_TO_CLU_ROUND_UP(parent_isize, sbi),
- parent_ei->flags);
- }
-}
-
/* rename or move a old file into a new file */
static int __exfat_rename(struct inode *old_parent_inode,
struct exfat_inode_info *ei, struct inode *new_parent_inode,
@@ -1219,9 +1178,9 @@ static int __exfat_rename(struct inode *old_parent_inode,
return -ENOENT;
}
- exfat_update_parent_info(ei, old_parent_inode);
-
- exfat_chain_dup(&olddir, &ei->dir);
+ exfat_chain_set(&olddir, EXFAT_I(old_parent_inode)->start_clu,
+ EXFAT_B_TO_CLU_ROUND_UP(i_size_read(old_parent_inode), sbi),
+ EXFAT_I(old_parent_inode)->flags);
dentry = ei->entry;
ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh);
@@ -1241,8 +1200,6 @@ static int __exfat_rename(struct inode *old_parent_inode,
goto out;
}
- exfat_update_parent_info(new_ei, new_parent_inode);
-
p_dir = &(new_ei->dir);
new_entry = new_ei->entry;
ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh);
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index ef115e673406..705710f93e2d 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -509,7 +509,7 @@ static int exfat_utf8_to_utf16(struct super_block *sb,
}
if (unilen > MAX_NAME_LENGTH) {
- exfat_err(sb, "failed to %s (estr:ENAMETOOLONG) nls len : %d, unilen : %d > %d",
+ exfat_debug(sb, "failed to %s (estr:ENAMETOOLONG) nls len : %d, unilen : %d > %d",
__func__, len, unilen, MAX_NAME_LENGTH);
return -ENAMETOOLONG;
}
@@ -671,7 +671,7 @@ static int exfat_load_upcase_table(struct super_block *sb,
bh = sb_bread(sb, sector);
if (!bh) {
- exfat_err(sb, "failed to read sector(0x%llx)\n",
+ exfat_err(sb, "failed to read sector(0x%llx)",
(unsigned long long)sector);
ret = -EIO;
goto free_table;
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 8ca21e7917d1..35f0305cd493 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -170,7 +170,9 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",discard");
if (opts->keep_last_dots)
seq_puts(m, ",keep_last_dots");
- if (opts->time_offset)
+ if (opts->sys_tz)
+ seq_puts(m, ",sys_tz");
+ else if (opts->time_offset)
seq_printf(m, ",time_offset=%d", opts->time_offset);
return 0;
}
@@ -214,6 +216,7 @@ enum {
Opt_errors,
Opt_discard,
Opt_keep_last_dots,
+ Opt_sys_tz,
Opt_time_offset,
/* Deprecated options */
@@ -241,6 +244,7 @@ static const struct fs_parameter_spec exfat_parameters[] = {
fsparam_enum("errors", Opt_errors, exfat_param_enums),
fsparam_flag("discard", Opt_discard),
fsparam_flag("keep_last_dots", Opt_keep_last_dots),
+ fsparam_flag("sys_tz", Opt_sys_tz),
fsparam_s32("time_offset", Opt_time_offset),
__fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated,
NULL),
@@ -298,6 +302,9 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_keep_last_dots:
opts->keep_last_dots = 1;
break;
+ case Opt_sys_tz:
+ opts->sys_tz = 1;
+ break;
case Opt_time_offset:
/*
* Make the limit 24 just in case someone invents something
@@ -457,7 +464,7 @@ static int exfat_read_boot_sector(struct super_block *sb)
*/
if (p_boot->sect_size_bits < EXFAT_MIN_SECT_SIZE_BITS ||
p_boot->sect_size_bits > EXFAT_MAX_SECT_SIZE_BITS) {
- exfat_err(sb, "bogus sector size bits : %u\n",
+ exfat_err(sb, "bogus sector size bits : %u",
p_boot->sect_size_bits);
return -EINVAL;
}
@@ -466,7 +473,7 @@ static int exfat_read_boot_sector(struct super_block *sb)
* sect_per_clus_bits could be at least 0 and at most 25 - sect_size_bits.
*/
if (p_boot->sect_per_clus_bits > EXFAT_MAX_SECT_PER_CLUS_BITS(p_boot)) {
- exfat_err(sb, "bogus sectors bits per cluster : %u\n",
+ exfat_err(sb, "bogus sectors bits per cluster : %u",
p_boot->sect_per_clus_bits);
return -EINVAL;
}
@@ -627,13 +634,9 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
if (opts->allow_utime == (unsigned short)-1)
opts->allow_utime = ~opts->fs_dmask & 0022;
- if (opts->discard) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
-
- if (!blk_queue_discard(q)) {
- exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard");
- opts->discard = 0;
- }
+ if (opts->discard && !bdev_max_discard_sectors(sb->s_bdev)) {
+ exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard");
+ opts->discard = 0;
}
sb->s_flags |= SB_NODIRATIME;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 0106eba46d5a..c648a493faf2 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -145,7 +145,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
if (err)
goto out_err;
dprintk("%s: found name: %s\n", __func__, nbuf);
- tmp = lookup_one_len_unlocked(nbuf, parent, strlen(nbuf));
+ tmp = lookup_one_unlocked(mnt_user_ns(mnt), nbuf, parent, strlen(nbuf));
if (IS_ERR(tmp)) {
dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
err = PTR_ERR(tmp);
@@ -248,21 +248,20 @@ struct getdents_callback {
* A rather strange filldir function to capture
* the name matching the specified inode number.
*/
-static int filldir_one(struct dir_context *ctx, const char *name, int len,
+static bool filldir_one(struct dir_context *ctx, const char *name, int len,
loff_t pos, u64 ino, unsigned int d_type)
{
struct getdents_callback *buf =
container_of(ctx, struct getdents_callback, ctx);
- int result = 0;
buf->sequence++;
if (buf->ino == ino && len <= NAME_MAX) {
memcpy(buf->name, name, len);
buf->name[len] = '\0';
buf->found = 1;
- result = -1;
+ return false; // no more
}
- return result;
+ return true;
}
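
Directory-iteration actors now return bool, true meaning keep going, replacing the old convention of 0 to continue and a negative value to stop. A minimal model of an iterator driving such a callback:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* actor returns true to keep iterating, false to stop (entry found) */
static bool find_name(const char *name, const void *wanted)
{
	return strcmp(name, wanted) != 0;
}

int main(void)
{
	const char *entries[] = { ".", "..", "foo", "bar" };
	size_t i;

	for (i = 0; i < 4; i++)
		if (!find_name(entries[i], "foo"))
			break;
	printf("stopped at entry %zu\n", i);
	return 0;
}
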
/**
@@ -525,7 +524,8 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
}
inode_lock(target_dir->d_inode);
- nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf));
+ nresult = lookup_one(mnt_user_ns(mnt), nbuf,
+ target_dir, strlen(nbuf));
if (!IS_ERR(nresult)) {
if (unlikely(nresult->d_inode != result->d_inode)) {
dput(nresult);
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2c2f179b6977..8f597753ac12 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -200,19 +200,19 @@ static struct page * ext2_get_page(struct inode *dir, unsigned long n,
int quiet, void **page_addr)
{
struct address_space *mapping = dir->i_mapping;
- struct page *page = read_mapping_page(mapping, n, NULL);
- if (!IS_ERR(page)) {
- *page_addr = kmap_local_page(page);
- if (unlikely(!PageChecked(page))) {
- if (PageError(page) || !ext2_check_page(page, quiet,
- *page_addr))
- goto fail;
- }
+ struct folio *folio = read_mapping_folio(mapping, n, NULL);
+
+ if (IS_ERR(folio))
+ return &folio->page;
+ *page_addr = kmap_local_folio(folio, n & (folio_nr_pages(folio) - 1));
+ if (unlikely(!folio_test_checked(folio))) {
+ if (!ext2_check_page(&folio->page, quiet, *page_addr))
+ goto fail;
}
- return page;
+ return &folio->page;
fail:
- ext2_put_page(page, *page_addr);
+ ext2_put_page(&folio->page, *page_addr);
return ERR_PTR(-EIO);
}
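
In the folio conversion above, page index n is reduced to a slot within its folio with n & (folio_nr_pages(folio) - 1), which is valid because folio sizes are powers of two. A quick demonstration of that masking with a hypothetical 4-page folio:

#include <stdio.h>

int main(void)
{
	unsigned long nr_pages = 4;	/* an order-2 (4-page) folio */
	unsigned long n;

	for (n = 5; n < 9; n++)
		printf("page index %lu -> slot %lu within its folio\n",
		       n, n & (nr_pages - 1));
	return 0;
}
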
@@ -672,17 +672,14 @@ int ext2_empty_dir (struct inode * inode)
void *page_addr = NULL;
struct page *page = NULL;
unsigned long i, npages = dir_pages(inode);
- int dir_has_error = 0;
for (i = 0; i < npages; i++) {
char *kaddr;
ext2_dirent * de;
- page = ext2_get_page(inode, i, dir_has_error, &page_addr);
+ page = ext2_get_page(inode, i, 0, &page_addr);
- if (IS_ERR(page)) {
- dir_has_error = 1;
- continue;
- }
+ if (IS_ERR(page))
+ goto not_empty;
kaddr = page_addr;
de = (ext2_dirent *)kaddr;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index d4f306aa5ace..28de11a22e5f 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -795,7 +795,6 @@ extern const struct file_operations ext2_file_operations;
/* inode.c */
extern void ext2_set_file_ops(struct inode *inode);
extern const struct address_space_operations ext2_aops;
-extern const struct address_space_operations ext2_nobh_aops;
extern const struct iomap_ops ext2_iomap_ops;
/* namei.c */
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 52377a0ee735..918ab2f9e4c0 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -36,7 +36,6 @@
#include <linux/iomap.h>
#include <linux/namei.h>
#include <linux/uio.h>
-#include <linux/dax.h>
#include "ext2.h"
#include "acl.h"
#include "xattr.h"
@@ -875,9 +874,9 @@ static int ext2_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, ext2_get_block, wbc);
}
-static int ext2_readpage(struct file *file, struct page *page)
+static int ext2_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, ext2_get_block);
+ return mpage_read_folio(folio, ext2_get_block);
}
static void ext2_readahead(struct readahead_control *rac)
@@ -887,13 +886,11 @@ static void ext2_readahead(struct readahead_control *rac)
static int
ext2_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep,
- ext2_get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, ext2_get_block);
if (ret < 0)
ext2_write_failed(mapping, pos + len);
return ret;
@@ -911,26 +908,6 @@ static int ext2_write_end(struct file *file, struct address_space *mapping,
return ret;
}
-static int
-ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- int ret;
-
- ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
- ext2_get_block);
- if (ret < 0)
- ext2_write_failed(mapping, pos + len);
- return ret;
-}
-
-static int ext2_nobh_writepage(struct page *page,
- struct writeback_control *wbc)
-{
- return nobh_writepage(page, ext2_get_block, wbc);
-}
-
static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,ext2_get_block);
@@ -969,7 +946,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc
const struct address_space_operations ext2_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = ext2_readpage,
+ .read_folio = ext2_read_folio,
.readahead = ext2_readahead,
.writepage = ext2_writepage,
.write_begin = ext2_write_begin,
@@ -977,26 +954,11 @@ const struct address_space_operations ext2_aops = {
.bmap = ext2_bmap,
.direct_IO = ext2_direct_IO,
.writepages = ext2_writepages,
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
-const struct address_space_operations ext2_nobh_aops = {
- .dirty_folio = block_dirty_folio,
- .invalidate_folio = block_invalidate_folio,
- .readpage = ext2_readpage,
- .readahead = ext2_readahead,
- .writepage = ext2_nobh_writepage,
- .write_begin = ext2_nobh_write_begin,
- .write_end = nobh_write_end,
- .bmap = ext2_bmap,
- .direct_IO = ext2_direct_IO,
- .writepages = ext2_writepages,
- .migratepage = buffer_migrate_page,
- .error_remove_page = generic_error_remove_page,
-};
-
static const struct address_space_operations ext2_dax_aops = {
.writepages = ext2_dax_writepages,
.direct_IO = noop_direct_IO,
@@ -1302,13 +1264,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
inode_dio_wait(inode);
- if (IS_DAX(inode)) {
+ if (IS_DAX(inode))
error = dax_zero_range(inode, newsize,
PAGE_ALIGN(newsize) - newsize, NULL,
&ext2_iomap_ops);
- } else if (test_opt(inode->i_sb, NOBH))
- error = nobh_truncate_page(inode->i_mapping,
- newsize, ext2_get_block);
else
error = block_truncate_page(inode->i_mapping,
newsize, ext2_get_block);
@@ -1400,8 +1359,6 @@ void ext2_set_file_ops(struct inode *inode)
inode->i_fop = &ext2_file_operations;
if (IS_DAX(inode))
inode->i_mapping->a_ops = &ext2_dax_aops;
- else if (test_opt(inode->i_sb, NOBH))
- inode->i_mapping->a_ops = &ext2_nobh_aops;
else
inode->i_mapping->a_ops = &ext2_aops;
}
@@ -1501,10 +1458,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &ext2_dir_inode_operations;
inode->i_fop = &ext2_dir_operations;
- if (test_opt(inode->i_sb, NOBH))
- inode->i_mapping->a_ops = &ext2_nobh_aops;
- else
- inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_mapping->a_ops = &ext2_aops;
} else if (S_ISLNK(inode->i_mode)) {
if (ext2_inode_is_fast_symlink(inode)) {
inode->i_link = (char *)ei->i_data;
@@ -1514,10 +1468,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
} else {
inode->i_op = &ext2_symlink_inode_operations;
inode_nohighmem(inode);
- if (test_opt(inode->i_sb, NOBH))
- inode->i_mapping->a_ops = &ext2_nobh_aops;
- else
- inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_mapping->a_ops = &ext2_aops;
}
} else {
inode->i_op = &ext2_special_inode_operations;
@@ -1553,7 +1504,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
if (IS_ERR(raw_inode))
return -EIO;
- /* For fields not not tracking in the in-memory inode,
+	/* For fields not tracked in the in-memory inode,
* initialise them to zero for new inodes. */
if (ei->i_state & EXT2_STATE_NEW)
memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size);
@@ -1683,14 +1634,14 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (error)
return error;
- if (is_quota_modification(inode, iattr)) {
+ if (is_quota_modification(mnt_userns, inode, iattr)) {
error = dquot_initialize(inode);
if (error)
return error;
}
- if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
- (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
- error = dquot_transfer(inode, iattr);
+ if (i_uid_needs_update(mnt_userns, iattr, inode) ||
+ i_gid_needs_update(mnt_userns, iattr, inode)) {
+ error = dquot_transfer(mnt_userns, inode, iattr);
if (error)
return error;
}
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 5f6b7560eb3f..5fd9a22d2b70 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -178,10 +178,7 @@ static int ext2_symlink (struct user_namespace * mnt_userns, struct inode * dir,
/* slow symlink */
inode->i_op = &ext2_symlink_inode_operations;
inode_nohighmem(inode);
- if (test_opt(inode->i_sb, NOBH))
- inode->i_mapping->a_ops = &ext2_nobh_aops;
- else
- inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_mapping->a_ops = &ext2_aops;
err = page_symlink(inode, symname, l);
if (err)
goto out_fail;
@@ -247,10 +244,7 @@ static int ext2_mkdir(struct user_namespace * mnt_userns,
inode->i_op = &ext2_dir_inode_operations;
inode->i_fop = &ext2_dir_operations;
- if (test_opt(inode->i_sb, NOBH))
- inode->i_mapping->a_ops = &ext2_nobh_aops;
- else
- inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_mapping->a_ops = &ext2_aops;
inode_inc_link_count(inode);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f6a19f6d9f6d..03f2af98b1b4 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -163,7 +163,7 @@ static void ext2_put_super (struct super_block * sb)
db_count = sbi->s_gdb_count;
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
- kfree(sbi->s_group_desc);
+ kvfree(sbi->s_group_desc);
kfree(sbi->s_debts);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -171,7 +171,7 @@ static void ext2_put_super (struct super_block * sb)
brelse (sbi->s_sbh);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
}
@@ -296,9 +296,6 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",noacl");
#endif
- if (test_opt(sb, NOBH))
- seq_puts(seq, ",nobh");
-
if (test_opt(sb, USRQUOTA))
seq_puts(seq, ",usrquota");
@@ -551,7 +548,8 @@ static int parse_options(char *options, struct super_block *sb,
clear_opt (opts->s_mount_opt, OLDALLOC);
break;
case Opt_nobh:
- set_opt (opts->s_mount_opt, NOBH);
+ ext2_msg(sb, KERN_INFO,
+ "nobh option not supported");
break;
#ifdef CONFIG_EXT2_FS_XATTR
case Opt_user_xattr:
@@ -835,7 +833,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_fs_info = sbi;
sbi->s_sb_block = sb_block;
- sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
+ NULL, NULL);
spin_lock_init(&sbi->s_lock);
ret = -EINVAL;
@@ -1053,27 +1052,47 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_blocks_per_group);
goto failed_mount;
}
+ /* At least inode table, bitmaps, and sb have to fit in one group */
+ if (sbi->s_blocks_per_group <= sbi->s_itb_per_group + 3) {
+ ext2_msg(sb, KERN_ERR,
+ "error: #blocks per group smaller than metadata size: %lu <= %lu",
+			sbi->s_blocks_per_group, sbi->s_itb_per_group + 3);
+ goto failed_mount;
+ }
if (sbi->s_frags_per_group > sb->s_blocksize * 8) {
ext2_msg(sb, KERN_ERR,
"error: #fragments per group too big: %lu",
sbi->s_frags_per_group);
goto failed_mount;
}
- if (sbi->s_inodes_per_group > sb->s_blocksize * 8) {
+ if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
+ sbi->s_inodes_per_group > sb->s_blocksize * 8) {
ext2_msg(sb, KERN_ERR,
- "error: #inodes per group too big: %lu",
+ "error: invalid #inodes per group: %lu",
sbi->s_inodes_per_group);
goto failed_mount;
}
+ if (sb_bdev_nr_blocks(sb) < le32_to_cpu(es->s_blocks_count)) {
+ ext2_msg(sb, KERN_ERR,
+ "bad geometry: block count %u exceeds size of device (%u blocks)",
+ le32_to_cpu(es->s_blocks_count),
+ (unsigned)sb_bdev_nr_blocks(sb));
+ goto failed_mount;
+ }
- if (EXT2_BLOCKS_PER_GROUP(sb) == 0)
- goto cantfind_ext2;
sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
le32_to_cpu(es->s_first_data_block) - 1)
/ EXT2_BLOCKS_PER_GROUP(sb)) + 1;
+ if ((u64)sbi->s_groups_count * sbi->s_inodes_per_group !=
+ le32_to_cpu(es->s_inodes_count)) {
+ ext2_msg(sb, KERN_ERR, "error: invalid #inodes: %u vs computed %llu",
+ le32_to_cpu(es->s_inodes_count),
+ (u64)sbi->s_groups_count * sbi->s_inodes_per_group);
+ goto failed_mount;
+ }
db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
EXT2_DESC_PER_BLOCK(sb);
- sbi->s_group_desc = kmalloc_array(db_count,
+ sbi->s_group_desc = kvmalloc_array(db_count,
sizeof(struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
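
The added validation cross-checks the superblock's totals: the group count is derived from the block count, and s_inodes_count must equal groups times inodes-per-group exactly, or the mount is rejected. A sketch of the arithmetic with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long blocks_count = 8193, first_data_block = 1;
	unsigned long blocks_per_group = 8192, inodes_per_group = 2048;
	unsigned long long groups, expected;

	groups = (blocks_count - first_data_block - 1) / blocks_per_group + 1;
	expected = groups * inodes_per_group;
	printf("groups=%llu, s_inodes_count must equal %llu\n",
	       groups, expected);
	return 0;
}
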
@@ -1199,12 +1218,12 @@ failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
failed_mount_group_desc:
- kfree(sbi->s_group_desc);
+ kvfree(sbi->s_group_desc);
kfree(sbi->s_debts);
failed_mount:
brelse(bh);
failed_sbi:
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
@@ -1490,8 +1509,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data,
len = i_size-off;
toread = len;
while (toread > 0) {
- tocopy = sb->s_blocksize - offset < toread ?
- sb->s_blocksize - offset : toread;
+ tocopy = min_t(size_t, sb->s_blocksize - offset, toread);
tmp_bh.b_state = 0;
tmp_bh.b_size = sb->s_blocksize;
@@ -1529,8 +1547,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
struct buffer_head *bh;
while (towrite > 0) {
- tocopy = sb->s_blocksize - offset < towrite ?
- sb->s_blocksize - offset : towrite;
+ tocopy = min_t(size_t, sb->s_blocksize - offset, towrite);
tmp_bh.b_state = 0;
tmp_bh.b_size = sb->s_blocksize;
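
The open-coded ternaries become min_t(size_t, ...), which casts both operands to one type before comparing and so avoids signed/unsigned surprises. A userspace approximation of the kernel macro (GNU C statement expression, as in the kernel):

#include <stdio.h>

#define min_t(type, x, y) ({		\
	type __x = (x);			\
	type __y = (y);			\
	__x < __y ? __x : __y; })

int main(void)
{
	size_t blocksize = 1024, offset = 200, toread = 5000;

	printf("tocopy=%zu\n", min_t(size_t, blocksize - offset, toread));
	return 0;
}
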
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 841fa6d9d744..641abfa4b718 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -517,36 +517,36 @@ bad_block:
/* Here we know that we can set the new attribute. */
if (header) {
- /* assert(header == HDR(bh)); */
+ int offset;
+
lock_buffer(bh);
if (header->h_refcount == cpu_to_le32(1)) {
__u32 hash = le32_to_cpu(header->h_hash);
+ struct mb_cache_entry *oe;
- ea_bdebug(bh, "modifying in-place");
+ oe = mb_cache_entry_delete_or_get(EA_BLOCK_CACHE(inode),
+ hash, bh->b_blocknr);
+ if (!oe) {
+ ea_bdebug(bh, "modifying in-place");
+ goto update_block;
+ }
/*
- * This must happen under buffer lock for
- * ext2_xattr_set2() to reliably detect modified block
+ * Someone is trying to reuse the block, leave it alone
*/
- mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash,
- bh->b_blocknr);
-
- /* keep the buffer locked while modifying it. */
- } else {
- int offset;
-
- unlock_buffer(bh);
- ea_bdebug(bh, "cloning");
- header = kmemdup(HDR(bh), bh->b_size, GFP_KERNEL);
- error = -ENOMEM;
- if (header == NULL)
- goto cleanup;
- header->h_refcount = cpu_to_le32(1);
-
- offset = (char *)here - bh->b_data;
- here = ENTRY((char *)header + offset);
- offset = (char *)last - bh->b_data;
- last = ENTRY((char *)header + offset);
+ mb_cache_entry_put(EA_BLOCK_CACHE(inode), oe);
}
+ unlock_buffer(bh);
+ ea_bdebug(bh, "cloning");
+ header = kmemdup(HDR(bh), bh->b_size, GFP_KERNEL);
+ error = -ENOMEM;
+ if (header == NULL)
+ goto cleanup;
+ header->h_refcount = cpu_to_le32(1);
+
+ offset = (char *)here - bh->b_data;
+ here = ENTRY((char *)header + offset);
+ offset = (char *)last - bh->b_data;
+ last = ENTRY((char *)header + offset);
} else {
/* Allocate a buffer where we construct the new block. */
header = kzalloc(sb->s_blocksize, GFP_KERNEL);
@@ -559,6 +559,7 @@ bad_block:
last = here = ENTRY(header+1);
}
+update_block:
/* Iff we are modifying the block in-place, bh is locked here. */
if (not_found) {
@@ -651,6 +652,55 @@ cleanup:
return error;
}
+static void ext2_xattr_release_block(struct inode *inode,
+ struct buffer_head *bh)
+{
+ struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
+
+retry_ref:
+ lock_buffer(bh);
+ if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
+ __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
+ struct mb_cache_entry *oe;
+
+ /*
+ * This must happen under buffer lock to properly
+ * serialize with ext2_xattr_set() reusing the block.
+ */
+ oe = mb_cache_entry_delete_or_get(ea_block_cache, hash,
+ bh->b_blocknr);
+ if (oe) {
+ /*
+ * Someone is trying to reuse the block. Wait
+ * and retry.
+ */
+ unlock_buffer(bh);
+ mb_cache_entry_wait_unused(oe);
+ mb_cache_entry_put(ea_block_cache, oe);
+ goto retry_ref;
+ }
+
+ /* Free the old block. */
+ ea_bdebug(bh, "freeing");
+ ext2_free_blocks(inode, bh->b_blocknr, 1);
+	/* We let our caller release bh, so take an
+	 * extra reference before bforget() drops one. */
+ get_bh(bh);
+ bforget(bh);
+ unlock_buffer(bh);
+ } else {
+ /* Decrement the refcount only. */
+ le32_add_cpu(&HDR(bh)->h_refcount, -1);
+ dquot_free_block(inode, 1);
+ mark_buffer_dirty(bh);
+ unlock_buffer(bh);
+ ea_bdebug(bh, "refcount now=%d",
+ le32_to_cpu(HDR(bh)->h_refcount));
+ if (IS_SYNC(inode))
+ sync_dirty_buffer(bh);
+ }
+}
+
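
Together with the set-path hunk above, freeing and reusing an xattr block now follow a small handshake built on the mb_cache calls this patch introduces: mb_cache_entry_delete_or_get() unhashes the entry when nobody else holds it and otherwise returns it with an extra reference, and mb_cache_entry_wait_unused() sleeps until those references are dropped. A hedged sketch of the reuse side (the real code lives in ext2_xattr_cache_find()/ext2_xattr_set2(), outside this excerpt) shows what the freeing side synchronizes against:

	ce = mb_cache_entry_find_first(cache, hash);	/* takes a reference */
	lock_buffer(bh);
	le32_add_cpu(&HDR(bh)->h_refcount, 1);		/* share the block */
	unlock_buffer(bh);
	mb_cache_entry_put(cache, ce);		/* lets a waiting freer proceed */

The reference held across the refcount bump is exactly what delete_or_get() detects in ext2_xattr_release_block(), and the final put() is what wait_unused() waits for.
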
/*
* Second half of ext2_xattr_set(): Update the file system.
*/
@@ -747,34 +797,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
* If there was an old block and we are no longer using it,
* release the old block.
*/
- lock_buffer(old_bh);
- if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
- __u32 hash = le32_to_cpu(HDR(old_bh)->h_hash);
-
- /*
- * This must happen under buffer lock for
- * ext2_xattr_set2() to reliably detect freed block
- */
- mb_cache_entry_delete(ea_block_cache, hash,
- old_bh->b_blocknr);
- /* Free the old block. */
- ea_bdebug(old_bh, "freeing");
- ext2_free_blocks(inode, old_bh->b_blocknr, 1);
- mark_inode_dirty(inode);
- /* We let our caller release old_bh, so we
- * need to duplicate the buffer before. */
- get_bh(old_bh);
- bforget(old_bh);
- } else {
- /* Decrement the refcount only. */
- le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
- dquot_free_block_nodirty(inode, 1);
- mark_inode_dirty(inode);
- mark_buffer_dirty(old_bh);
- ea_bdebug(old_bh, "refcount now=%d",
- le32_to_cpu(HDR(old_bh)->h_refcount));
- }
- unlock_buffer(old_bh);
+ ext2_xattr_release_block(inode, old_bh);
}
cleanup:
@@ -828,30 +851,7 @@ ext2_xattr_delete_inode(struct inode *inode)
EXT2_I(inode)->i_file_acl);
goto cleanup;
}
- lock_buffer(bh);
- if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
- __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
-
- /*
- * This must happen under buffer lock for ext2_xattr_set2() to
- * reliably detect freed block
- */
- mb_cache_entry_delete(EA_BLOCK_CACHE(inode), hash,
- bh->b_blocknr);
- ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
- get_bh(bh);
- bforget(bh);
- unlock_buffer(bh);
- } else {
- le32_add_cpu(&HDR(bh)->h_refcount, -1);
- ea_bdebug(bh, "refcount now=%d",
- le32_to_cpu(HDR(bh)->h_refcount));
- unlock_buffer(bh);
- mark_buffer_dirty(bh);
- if (IS_SYNC(inode))
- sync_dirty_buffer(bh);
- dquot_free_block_nodirty(inode, 1);
- }
+ ext2_xattr_release_block(inode, bh);
EXT2_I(inode)->i_file_acl = 0;
cleanup:
@@ -943,7 +943,7 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
-again:
+
ce = mb_cache_entry_find_first(ea_block_cache, hash);
while (ce) {
struct buffer_head *bh;
@@ -955,22 +955,8 @@ again:
inode->i_ino, (unsigned long) ce->e_value);
} else {
lock_buffer(bh);
- /*
- * We have to be careful about races with freeing or
- * rehashing of xattr block. Once we hold buffer lock
- * xattr block's state is stable so we can check
- * whether the block got freed / rehashed or not.
- * Since we unhash mbcache entry under buffer lock when
- * freeing / rehashing xattr block, checking whether
- * entry is still hashed is reliable.
- */
- if (hlist_bl_unhashed(&ce->e_hash_list)) {
- mb_cache_entry_put(ea_block_cache, ce);
- unlock_buffer(bh);
- brelse(bh);
- goto again;
- } else if (le32_to_cpu(HDR(bh)->h_refcount) >
- EXT2_XATTR_REFCOUNT_MAX) {
+ if (le32_to_cpu(HDR(bh)->h_refcount) >
+ EXT2_XATTR_REFCOUNT_MAX) {
ea_idebug(inode, "block %ld refcount %d>%d",
(unsigned long) ce->e_value,
le32_to_cpu(HDR(bh)->h_refcount),
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 7d89142e1421..72206a292676 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -17,3 +17,4 @@ ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
ext4-inode-test-objs += inode-test.o
obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-inode-test.o
ext4-$(CONFIG_FS_VERITY) += verity.o
+ext4-$(CONFIG_FS_ENCRYPTION) += crypto.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 78ee3ef795ae..8ff4b9192a9f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -666,7 +666,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* it's possible we've just missed a transaction commit here,
* so ignore the returned status
*/
- jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
+ ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id);
(void) jbd2_journal_force_commit_nested(sbi->s_journal);
return 1;
}
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
new file mode 100644
index 000000000000..e20ac0654b3f
--- /dev/null
+++ b/fs/ext4/crypto.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/quotaops.h>
+#include <linux/uuid.h>
+
+#include "ext4.h"
+#include "xattr.h"
+#include "ext4_jbd2.h"
+
+static void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
+ const struct fscrypt_name *src)
+{
+ memset(dst, 0, sizeof(*dst));
+
+ dst->usr_fname = src->usr_fname;
+ dst->disk_name = src->disk_name;
+ dst->hinfo.hash = src->hash;
+ dst->hinfo.minor_hash = src->minor_hash;
+ dst->crypto_buf = src->crypto_buf;
+}
+
+int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct ext4_filename *fname)
+{
+ struct fscrypt_name name;
+ int err;
+
+ err = fscrypt_setup_filename(dir, iname, lookup, &name);
+ if (err)
+ return err;
+
+ ext4_fname_from_fscrypt_name(fname, &name);
+
+#if IS_ENABLED(CONFIG_UNICODE)
+ err = ext4_fname_setup_ci_filename(dir, iname, fname);
+#endif
+ return err;
+}
+
+int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
+ struct ext4_filename *fname)
+{
+ struct fscrypt_name name;
+ int err;
+
+ err = fscrypt_prepare_lookup(dir, dentry, &name);
+ if (err)
+ return err;
+
+ ext4_fname_from_fscrypt_name(fname, &name);
+
+#if IS_ENABLED(CONFIG_UNICODE)
+ err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
+#endif
+ return err;
+}
+
+void ext4_fname_free_filename(struct ext4_filename *fname)
+{
+ struct fscrypt_name name;
+
+ name.crypto_buf = fname->crypto_buf;
+ fscrypt_free_filename(&name);
+
+ fname->crypto_buf.name = NULL;
+ fname->usr_fname = NULL;
+ fname->disk_name.name = NULL;
+
+#if IS_ENABLED(CONFIG_UNICODE)
+ kfree(fname->cf_name.name);
+ fname->cf_name.name = NULL;
+#endif
+}
+
+static bool uuid_is_zero(__u8 u[16])
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ if (u[i])
+ return false;
+ return true;
+}
+
+int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg)
+{
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err, err2;
+ handle_t *handle;
+
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+
+ if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto pwsalt_err_exit;
+ }
+ err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto pwsalt_err_journal;
+ lock_buffer(sbi->s_sbh);
+ generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+pwsalt_err_journal:
+ err2 = ext4_journal_stop(handle);
+ if (err2 && !err)
+ err = err2;
+pwsalt_err_exit:
+ mnt_drop_write_file(filp);
+ if (err)
+ return err;
+ }
+
+ if (copy_to_user(arg, sbi->s_es->s_encrypt_pw_salt, 16))
+ return -EFAULT;
+ return 0;
+}
+
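The salt initialization above is the standard jbd2 metadata-update sequence, stripped to its skeleton (error handling elided, names as in the hunk):

	handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);	/* 1 credit */
	ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE);
	lock_buffer(sbi->s_sbh);
	/* ... modify the superblock buffer ... */
	ext4_superblock_csum_set(sb);		/* recompute sb checksum */
	unlock_buffer(sbi->s_sbh);
	ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
	ext4_journal_stop(handle);
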
+static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
+{
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
+}
+
+static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
+ void *fs_data)
+{
+ handle_t *handle = fs_data;
+ int res, res2, credits, retries = 0;
+
+ /*
+ * Encrypting the root directory is not allowed because e2fsck expects
+ * lost+found to exist and be unencrypted, and encrypting the root
+ * directory would imply encrypting the lost+found directory as well as
+ * the filename "lost+found" itself.
+ */
+ if (inode->i_ino == EXT4_ROOT_INO)
+ return -EPERM;
+
+ if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
+ return -EINVAL;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+ return -EOPNOTSUPP;
+
+ res = ext4_convert_inline_data(inode);
+ if (res)
+ return res;
+
+ /*
+ * If a journal handle was specified, then the encryption context is
+ * being set on a new inode via inheritance and is part of a larger
+ * transaction to create the inode. Otherwise the encryption context is
+ * being set on an existing inode in its own transaction. Only in the
+ * latter case should the "retry on ENOSPC" logic be used.
+ */
+
+ if (handle) {
+ res = ext4_xattr_set_handle(handle, inode,
+ EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
+ if (!res) {
+ ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ ext4_clear_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA);
+ /*
+ * Update inode->i_flags - S_ENCRYPTED will be enabled,
+ * S_DAX may be disabled
+ */
+ ext4_set_inode_flags(inode, false);
+ }
+ return res;
+ }
+
+ res = dquot_initialize(inode);
+ if (res)
+ return res;
+retry:
+ res = ext4_xattr_set_credits(inode, len, false /* is_create */,
+ &credits);
+ if (res)
+ return res;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
+ if (!res) {
+ ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ /*
+ * Update inode->i_flags - S_ENCRYPTED will be enabled,
+ * S_DAX may be disabled
+ */
+ ext4_set_inode_flags(inode, false);
+ res = ext4_mark_inode_dirty(handle, inode);
+ if (res)
+ EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
+ }
+ res2 = ext4_journal_stop(handle);
+
+ if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ if (!res)
+ res = res2;
+ return res;
+}
+
+static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
+{
+ return EXT4_SB(sb)->s_dummy_enc_policy.policy;
+}
+
+static bool ext4_has_stable_inodes(struct super_block *sb)
+{
+ return ext4_has_feature_stable_inodes(sb);
+}
+
+static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
+ int *ino_bits_ret, int *lblk_bits_ret)
+{
+ *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
+ *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
+}
+
+const struct fscrypt_operations ext4_cryptops = {
+ .key_prefix = "ext4:",
+ .get_context = ext4_get_context,
+ .set_context = ext4_set_context,
+ .get_dummy_policy = ext4_get_dummy_policy,
+ .empty_dir = ext4_empty_dir,
+ .has_stable_inodes = ext4_has_stable_inodes,
+ .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
+};
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index a6bb86f52b9a..3985f8c33f95 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -412,7 +412,7 @@ struct fname {
};
/*
- * This functoin implements a non-recursive way of freeing all of the
+ * This function implements a non-recursive way of freeing all of the
* nodes in the red-black tree.
*/
static void free_rb_tree_fname(struct rb_root *root)
@@ -515,7 +515,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
/*
* This is a helper function for ext4_dx_readdir. It calls filldir
- * for all entres on the fname linked list. (Normally there is only
+ * for all entries on the fname linked list. (Normally there is only
* one entry on the linked list, unless there are 62 bit hash collisions.)
*/
static int call_filldir(struct file *file, struct dir_context *ctx,
@@ -648,7 +648,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
unsigned int offset = 0;
char *top;
- de = (struct ext4_dir_entry_2 *)buf;
+ de = buf;
top = buf + buf_size;
while ((char *) de < top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3f87cca49f0c..8d5453852f98 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -167,8 +167,6 @@ enum SHIFT_DIRECTION {
#define EXT4_MB_CR0_OPTIMIZED 0x8000
/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
#define EXT4_MB_CR1_OPTIMIZED 0x00010000
-/* Perform linear traversal for one group */
-#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000
struct ext4_allocation_request {
/* target inode for block we're allocating */
struct inode *inode;
@@ -673,6 +671,8 @@ enum {
/* Caller will submit data before dropping transaction handle. This
* allows jbd2 to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
+	/* Caller is in an atomic context; find the extent only if it is cached */
+#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800
/*
* The bit position of these flags must not overlap with any of the
@@ -722,6 +722,8 @@ enum {
#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
#define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32)
+#define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid)
+#define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid)
#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
@@ -751,6 +753,15 @@ enum {
EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \
EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
+/*
+ * Structure for EXT4_IOC_GETFSUUID/EXT4_IOC_SETFSUUID
+ */
+struct fsuuid {
+ __u32 fsu_len;
+ __u32 fsu_flags;
+ __u8 fsu_uuid[];
+};
+
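
A minimal userspace sketch of calling the new GETFSUUID ioctl; the exact fsu_len convention (the sketch assumes the caller advertises its buffer capacity and a 16-byte UUID) is defined by the ioctl handler, which is not part of this hunk:

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/types.h>

	struct fsuuid {
		__u32 fsu_len;
		__u32 fsu_flags;
		__u8  fsu_uuid[];
	};
	#define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid)

	int main(void)
	{
		struct fsuuid *fsu = malloc(sizeof(*fsu) + 16);
		int fd = open("/mnt", O_RDONLY);	/* hypothetical mount point */

		if (!fsu || fd < 0)
			return 1;
		fsu->fsu_len = 16;	/* capacity of fsu_uuid[] */
		fsu->fsu_flags = 0;
		if (ioctl(fd, EXT4_IOC_GETFSUUID, fsu) == 0) {
			for (int i = 0; i < 16; i++)
				printf("%02x", fsu->fsu_uuid[i]);
			printf("\n");
		}
		return 0;
	}
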
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
@@ -1440,12 +1451,6 @@ struct ext4_super_block {
#ifdef __KERNEL__
-#ifdef CONFIG_FS_ENCRYPTION
-#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL)
-#else
-#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
-#endif
-
/* Number of quota types we support */
#define EXT4_MAXQUOTAS 3
@@ -1593,8 +1598,8 @@ struct ext4_sb_info {
struct list_head s_discard_list;
struct work_struct s_discard_work;
atomic_t s_retry_alloc_pending;
- struct rb_root s_mb_avg_fragment_size_root;
- rwlock_t s_mb_rb_lock;
+ struct list_head *s_mb_avg_fragment_size;
+ rwlock_t *s_mb_avg_fragment_size_locks;
struct list_head *s_mb_largest_free_orders;
rwlock_t *s_mb_largest_free_orders_locks;
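
The single rb-tree (and its one global rwlock) that indexed block groups by average fragment size is replaced by an array of lists, one per power-of-two order, each with its own rwlock, mirroring the existing s_mb_largest_free_orders arrays. A hedged sketch of the setup this implies (the actual allocation lives in the corresponding mballoc.c hunk, not in this excerpt):

	sbi->s_mb_avg_fragment_size =
		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
			      GFP_KERNEL);
	sbi->s_mb_avg_fragment_size_locks =
		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), GFP_KERNEL);
	for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
		INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
		rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
	}
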
@@ -2273,6 +2278,10 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
* Structure of a directory entry
*/
#define EXT4_NAME_LEN 255
+/*
+ * Base length of the ext4 directory entry excluding the name length
+ */
+#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN)
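
EXT4_BASE_DIR_LEN is the fixed header of ext4_dir_entry_2 (inode, rec_len, name_len, file_type) without the 255-byte name array. As an illustration of how such a constant is used, a hypothetical helper computing the minimal on-disk record size for a name (the tree's real equivalent is the EXT4_DIR_REC_LEN() macro):

	static inline unsigned int dirent_min_size(unsigned int name_len)
	{
		/* header + name, rounded up to 4-byte alignment */
		return (EXT4_BASE_DIR_LEN + name_len + 3) & ~3U;
	}
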
struct ext4_dir_entry {
__le32 inode; /* Inode number */
@@ -2727,74 +2736,20 @@ extern int ext4_fname_setup_ci_filename(struct inode *dir,
struct ext4_filename *fname);
#endif
+/* ext4 encryption-related code lives in crypto.c */
#ifdef CONFIG_FS_ENCRYPTION
-static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
- const struct fscrypt_name *src)
-{
- memset(dst, 0, sizeof(*dst));
-
- dst->usr_fname = src->usr_fname;
- dst->disk_name = src->disk_name;
- dst->hinfo.hash = src->hash;
- dst->hinfo.minor_hash = src->minor_hash;
- dst->crypto_buf = src->crypto_buf;
-}
-
-static inline int ext4_fname_setup_filename(struct inode *dir,
- const struct qstr *iname,
- int lookup,
- struct ext4_filename *fname)
-{
- struct fscrypt_name name;
- int err;
-
- err = fscrypt_setup_filename(dir, iname, lookup, &name);
- if (err)
- return err;
-
- ext4_fname_from_fscrypt_name(fname, &name);
-
-#if IS_ENABLED(CONFIG_UNICODE)
- err = ext4_fname_setup_ci_filename(dir, iname, fname);
-#endif
- return err;
-}
-
-static inline int ext4_fname_prepare_lookup(struct inode *dir,
- struct dentry *dentry,
- struct ext4_filename *fname)
-{
- struct fscrypt_name name;
- int err;
-
- err = fscrypt_prepare_lookup(dir, dentry, &name);
- if (err)
- return err;
+extern const struct fscrypt_operations ext4_cryptops;
- ext4_fname_from_fscrypt_name(fname, &name);
-
-#if IS_ENABLED(CONFIG_UNICODE)
- err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
-#endif
- return err;
-}
+int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct ext4_filename *fname);
-static inline void ext4_fname_free_filename(struct ext4_filename *fname)
-{
- struct fscrypt_name name;
+int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
+ struct ext4_filename *fname);
- name.crypto_buf = fname->crypto_buf;
- fscrypt_free_filename(&name);
+void ext4_fname_free_filename(struct ext4_filename *fname);
- fname->crypto_buf.name = NULL;
- fname->usr_fname = NULL;
- fname->disk_name.name = NULL;
+int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg);
-#if IS_ENABLED(CONFIG_UNICODE)
- kfree(fname->cf_name.name);
- fname->cf_name.name = NULL;
-#endif
-}
#else /* !CONFIG_FS_ENCRYPTION */
static inline int ext4_fname_setup_filename(struct inode *dir,
const struct qstr *iname,
@@ -2827,6 +2782,12 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname)
fname->cf_name.name = NULL;
#endif
}
+
+static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp,
+ void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
#endif /* !CONFIG_FS_ENCRYPTION */
/* dir.c */
@@ -3016,6 +2977,7 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
extern int ext4_write_inode(struct inode *, struct writeback_control *);
extern int ext4_setattr(struct user_namespace *, struct dentry *,
struct iattr *);
+extern u32 ext4_dio_alignment(struct inode *inode);
extern int ext4_getattr(struct user_namespace *, const struct path *,
struct kstat *, u32, unsigned int);
extern void ext4_evict_inode(struct inode *);
@@ -3032,7 +2994,7 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
-extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
@@ -3064,6 +3026,7 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns,
struct dentry *dentry, struct fileattr *fa);
int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
extern void ext4_reset_inode_seed(struct inode *inode);
+int ext4_update_overhead(struct super_block *sb, bool force);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
@@ -3105,14 +3068,14 @@ extern unsigned int ext4_list_backups(struct super_block *sb,
/* super.c */
extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
- sector_t block, int op_flags);
+ sector_t block, blk_opf_t op_flags);
extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
sector_t block);
-extern void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags,
+extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
bh_end_io_t *end_io);
-extern int ext4_read_bh(struct buffer_head *bh, int op_flags,
+extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
bh_end_io_t *end_io);
-extern int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait);
+extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
@@ -3449,6 +3412,8 @@ struct ext4_group_info {
ext4_grpblk_t bb_first_free; /* first free block */
ext4_grpblk_t bb_free; /* total free blocks */
ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
+ int bb_avg_fragment_size_order; /* order of average
+ fragment in BG */
ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
ext4_group_t bb_group; /* Group number */
struct list_head bb_prealloc_list;
@@ -3456,7 +3421,7 @@ struct ext4_group_info {
void *bb_bitmap;
#endif
struct rw_semaphore alloc_sem;
- struct rb_node bb_avg_fragment_size_rb;
+ struct list_head bb_avg_fragment_size_node;
struct list_head bb_largest_free_order_node;
ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
* regions, index is order.
@@ -3586,7 +3551,6 @@ extern int ext4_readpage_inline(struct inode *inode, struct page *page);
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- unsigned flags,
struct page **pagep);
extern int ext4_write_inline_data_end(struct inode *inode,
loff_t pos, unsigned len,
@@ -3599,7 +3563,6 @@ ext4_journalled_write_inline_data(struct inode *inode,
extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- unsigned flags,
struct page **pagep,
void **fsdata);
extern int ext4_try_add_inline_entry(handle_t *handle,
@@ -3629,9 +3592,7 @@ extern bool empty_inline_dir(struct inode *dir, int *has_inline_data);
extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
struct ext4_dir_entry_2 **parent_de,
int *retval);
-extern int ext4_inline_data_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo,
- int *has_inline, __u64 start, __u64 len);
+extern void *ext4_read_inline_link(struct inode *inode);
struct iomap;
extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap);
@@ -3749,7 +3710,7 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *,
extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
struct ext4_ext_path **,
int flags);
-extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern void ext4_free_ext_path(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -3848,7 +3809,7 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
extern int ext4_resize_begin(struct super_block *sb);
-extern void ext4_resize_end(struct super_block *sb);
+extern int ext4_resize_end(struct super_block *sb, bool update_backups);
static inline void ext4_set_io_unwritten_flag(struct inode *inode,
struct ext4_io_end *io_end)
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 3477a16d08ae..8e1fb18f465e 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -267,8 +267,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
trace_ext4_forget(inode, is_metadata, blocknr);
BUFFER_TRACE(bh, "enter");
- jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
- "data mode %x\n",
+ ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n",
bh, is_metadata, inode->i_mode,
test_opt(inode->i_sb, DATA_FLAGS));
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0d98cf402282..f1956288307f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -106,6 +106,25 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
return 0;
}
+static void ext4_ext_drop_refs(struct ext4_ext_path *path)
+{
+ int depth, i;
+
+ if (!path)
+ return;
+ depth = path->p_depth;
+ for (i = 0; i <= depth; i++, path++) {
+ brelse(path->p_bh);
+ path->p_bh = NULL;
+ }
+}
+
+void ext4_free_ext_path(struct ext4_ext_path *path)
+{
+ ext4_ext_drop_refs(path);
+ kfree(path);
+}
+
/*
* Make sure 'handle' has at least 'check_cred' credits. If not, restart
* transaction with 'restart_cred' credits. The function drops i_data_sem
@@ -372,7 +391,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
{
unsigned short entries;
ext4_lblk_t lblock = 0;
- ext4_lblk_t prev = 0;
+ ext4_lblk_t cur = 0;
if (eh->eh_entries == 0)
return 1;
@@ -396,11 +415,11 @@ static int ext4_valid_extent_entries(struct inode *inode,
/* Check for overlapping extents */
lblock = le32_to_cpu(ext->ee_block);
- if ((lblock <= prev) && prev) {
+ if (lblock < cur) {
*pblk = ext4_ext_pblock(ext);
return 0;
}
- prev = lblock + ext4_ext_get_actual_len(ext) - 1;
+ cur = lblock + ext4_ext_get_actual_len(ext);
ext++;
entries--;
}
@@ -420,13 +439,13 @@ static int ext4_valid_extent_entries(struct inode *inode,
/* Check for overlapping index extents */
lblock = le32_to_cpu(ext_idx->ei_block);
- if ((lblock <= prev) && prev) {
+ if (lblock < cur) {
*pblk = ext4_idx_pblock(ext_idx);
return 0;
}
ext_idx++;
entries--;
- prev = lblock;
+ cur = lblock + 1;
}
}
return 1;
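
The rewritten check also closes a blind spot: the old test `(lblock <= prev) && prev` used prev == 0 to mean "no previous extent", so an extent ending at logical block 0 could be overlapped by its successor without being noticed. Tracking cur as the first block not yet covered removes the sentinel. A small standalone demonstration of the corner case:

	#include <stdio.h>

	int main(void)
	{
		/* extent 0 covers only block 0; extent 1 also starts at 0 */
		unsigned int start[] = { 0, 0 }, len[] = { 1, 2 };
		unsigned int prev = 0, cur = 0;
		int old_caught = 0, new_caught = 0;

		for (int i = 0; i < 2; i++) {
			if (start[i] <= prev && prev)	/* old check */
				old_caught = 1;
			prev = start[i] + len[i] - 1;
			if (start[i] < cur)		/* new check */
				new_caught = 1;
			cur = start[i] + len[i];
		}
		printf("old: %s, new: %s\n",
		       old_caught ? "caught" : "missed",
		       new_caught ? "caught" : "missed");
		return 0;	/* prints: old: missed, new: caught */
	}
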
@@ -460,6 +479,10 @@ static int __ext4_ext_check(const char *function, unsigned int line,
error_msg = "invalid eh_entries";
goto corrupted;
}
+ if (unlikely((eh->eh_entries == 0) && (depth > 0))) {
+ error_msg = "eh_entries is 0 but eh_depth is > 0";
+ goto corrupted;
+ }
if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
error_msg = "invalid extent entries";
goto corrupted;
@@ -632,8 +655,7 @@ int ext4_ext_precache(struct inode *inode)
ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
up_read(&ei->i_data_sem);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return ret;
}
@@ -720,19 +742,6 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
#define ext4_ext_show_move(inode, path, newblock, level)
#endif
-void ext4_ext_drop_refs(struct ext4_ext_path *path)
-{
- int depth, i;
-
- if (!path)
- return;
- depth = path->p_depth;
- for (i = 0; i <= depth; i++, path++) {
- brelse(path->p_bh);
- path->p_bh = NULL;
- }
-}
-
/*
* ext4_ext_binsearch_idx:
* binary search for the closest index of the given block
@@ -951,8 +960,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
return path;
err:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (orig_path)
*orig_path = NULL;
return ERR_PTR(ret);
@@ -2170,8 +2178,7 @@ merge:
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
cleanup:
- ext4_ext_drop_refs(npath);
- kfree(npath);
+ ext4_free_ext_path(npath);
return err;
}
@@ -3057,8 +3064,7 @@ again:
}
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
path = NULL;
if (err == -EAGAIN)
goto again;
@@ -4371,8 +4377,7 @@ got_allocated_blocks:
allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
trace_ext4_ext_map_blocks_exit(inode, flags, map,
err ? err : allocated);
@@ -4500,9 +4505,9 @@ retry:
return ret > 0 ? ret2 : ret;
}
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len);
static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
@@ -4574,6 +4579,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
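
The same three lines are added to each fallocate path in this patch (zero-range here, then the plain-allocate, collapse, and insert hunks below). file_modified() is the VFS helper that strips the setuid/setgid bits and updates ctime/mtime before a write-like operation, and it must run under i_rwsem once DIO has drained:

	inode_dio_wait(inode);		/* no DIO in flight past this point */

	ret = file_modified(file);	/* remove privs, update timestamps */
	if (ret)
		goto out_mutex;
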
/* Preallocate the range including the unaligned edges */
if (partial_begin || partial_end) {
ret = ext4_alloc_file_blocks(file,
@@ -4689,22 +4698,24 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
FALLOC_FL_INSERT_RANGE))
return -EOPNOTSUPP;
- if (mode & FALLOC_FL_PUNCH_HOLE) {
- ret = ext4_punch_hole(inode, offset, len);
- goto exit;
- }
-
+ inode_lock(inode);
ret = ext4_convert_inline_data(inode);
+ inode_unlock(inode);
if (ret)
goto exit;
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ ret = ext4_punch_hole(file, offset, len);
+ goto exit;
+ }
+
if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- ret = ext4_collapse_range(inode, offset, len);
+ ret = ext4_collapse_range(file, offset, len);
goto exit;
}
if (mode & FALLOC_FL_INSERT_RANGE) {
- ret = ext4_insert_range(inode, offset, len);
+ ret = ext4_insert_range(file, offset, len);
goto exit;
}
@@ -4740,6 +4751,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
if (ret)
goto out;
@@ -5231,8 +5246,7 @@ again:
break;
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return ret;
}
@@ -5241,8 +5255,9 @@ out:
* This implements the fallocate's collapse range functionality for ext4
* Returns: 0 and non-zero on error.
*/
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
ext4_lblk_t punch_start, punch_stop;
@@ -5294,6 +5309,10 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
/* Wait for existing dio to complete */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
@@ -5387,8 +5406,9 @@ out_mutex:
* by len bytes.
* Returns 0 on success, error otherwise.
*/
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
handle_t *handle;
@@ -5445,6 +5465,10 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
/* Wait for existing dio to complete */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
@@ -5514,15 +5538,13 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
EXT4_GET_BLOCKS_METADATA_NOFAIL);
}
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (ret < 0) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
} else {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
ret = ext4_es_remove_extent(inode, offset_lblk,
@@ -5742,10 +5764,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
count -= len;
repeat:
- ext4_ext_drop_refs(path1);
- kfree(path1);
- ext4_ext_drop_refs(path2);
- kfree(path2);
+ ext4_free_ext_path(path1);
+ ext4_free_ext_path(path2);
path1 = path2 = NULL;
}
return replaced_count;
@@ -5824,8 +5844,7 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return err ? err : mapped;
}
@@ -5892,8 +5911,7 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
up_write(&EXT4_I(inode)->i_data_sem);
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
ext4_mark_inode_dirty(NULL, inode);
return ret;
}
@@ -5911,8 +5929,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
return;
ex = path[path->p_depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
ext4_mark_inode_dirty(NULL, inode);
return;
}
@@ -5925,8 +5942,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
up_write(&EXT4_I(inode)->i_data_sem);
ext4_mark_inode_dirty(NULL, inode);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
}
@@ -5965,13 +5981,11 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
goto out;
}
end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
/* Count the number of data blocks */
cur = 0;
@@ -6001,30 +6015,26 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
if (IS_ERR(path))
goto out;
numblks += path->p_depth;
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
while (cur < end) {
path = ext4_find_extent(inode, cur, NULL, 0);
if (IS_ERR(path))
break;
ex = path[path->p_depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return 0;
}
cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
ext4_ext_get_actual_len(ex));
ret = skip_hole(inode, &cur);
if (ret < 0) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
break;
}
path2 = ext4_find_extent(inode, cur, NULL, 0);
if (IS_ERR(path2)) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
break;
}
for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
@@ -6038,10 +6048,8 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
if (cmp1 != cmp2 && cmp2 != 0)
numblks++;
}
- ext4_ext_drop_refs(path);
- ext4_ext_drop_refs(path2);
- kfree(path);
- kfree(path2);
+ ext4_free_ext_path(path);
+ ext4_free_ext_path(path2);
}
out:
@@ -6068,13 +6076,11 @@ int ext4_ext_clear_bb(struct inode *inode)
return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return 0;
}
end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
cur = 0;
while (cur < end) {
@@ -6093,8 +6099,7 @@ int ext4_ext_clear_bb(struct inode *inode)
ext4_fc_record_regions(inode->i_sb, inode->i_ino,
0, path[j].p_block, 1, 1);
}
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
ext4_fc_record_regions(inode->i_sb, inode->i_ino,
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 9a3a8996aacf..cd0a861853e3 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -667,8 +667,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
}
}
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
static void ext4_es_insert_extent_ind_check(struct inode *inode,
@@ -1654,7 +1653,8 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
sbi->s_es_shrinker.scan_objects = ext4_es_scan;
sbi->s_es_shrinker.count_objects = ext4_es_count;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
- err = register_shrinker(&sbi->s_es_shrinker);
+ err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
+ sbi->s_sb->s_id);
if (err)
goto err4;
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 3d72565ec6e8..ef05bfa87798 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -229,6 +229,12 @@ __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
finish_wait(wq, &wait.wq_entry);
}
+static bool ext4_fc_disabled(struct super_block *sb)
+{
+ return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
+}
+
/*
 * Inform ext4's fast commit subsystem about the start of an inode update
*
@@ -240,8 +246,7 @@ void ext4_fc_start_update(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
restart:
@@ -265,8 +270,7 @@ void ext4_fc_stop_update(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (atomic_dec_and_test(&ei->i_fc_updates))
@@ -283,8 +287,7 @@ void ext4_fc_del(struct inode *inode)
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_fc_dentry_update *fc_dentry;
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
restart:
@@ -337,8 +340,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
struct ext4_sb_info *sbi = EXT4_SB(sb);
tid_t tid;
- if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
- (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(sb))
return;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
@@ -493,10 +495,8 @@ void __ext4_fc_track_unlink(handle_t *handle,
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
@@ -522,10 +522,8 @@ void __ext4_fc_track_link(handle_t *handle,
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
@@ -551,10 +549,8 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
@@ -576,22 +572,20 @@ static int __track_inode(struct inode *inode, void *arg, bool update)
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
if (S_ISDIR(inode->i_mode))
return;
+ if (ext4_fc_disabled(inode->i_sb))
+ return;
+
if (ext4_should_journal_data(inode)) {
ext4_fc_mark_ineligible(inode->i_sb,
EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
return;
}
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
- return;
-
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
return;
@@ -634,15 +628,13 @@ static int __track_range(struct inode *inode, void *arg, bool update)
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct __track_range_args args;
int ret;
if (S_ISDIR(inode->i_mode))
return;
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
+ if (ext4_fc_disabled(inode->i_sb))
return;
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
@@ -658,7 +650,7 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star
static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
- int write_flags = REQ_SYNC;
+ blk_opf_t write_flags = REQ_SYNC;
struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
	/* Add REQ_FUA | REQ_PREFLUSH only for its tail */
@@ -668,7 +660,7 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
set_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = ext4_end_buffer_io_sync;
- submit_bh(REQ_OP_WRITE, write_flags, bh);
+ submit_bh(REQ_OP_WRITE | write_flags, bh);
EXT4_SB(sb)->s_fc_bh = NULL;
}
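
Here and throughout this series, the block layer's request operation and its flags are merged into a single blk_opf_t argument, so submit_bh() loses its separate flags parameter. Composing the value, using the flags from this function:

	blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC;

	if (is_tail)
		opf |= REQ_FUA | REQ_PREFLUSH;	/* durable, ordered tail block */
	submit_bh(opf, bh);
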
@@ -710,10 +702,10 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
	 * After allocating len bytes, we should still have space for at least
	 * a zero-byte padding.
*/
- if (len + sizeof(struct ext4_fc_tl) > bsize)
+ if (len + EXT4_FC_TAG_BASE_LEN > bsize)
return NULL;
- if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
+ if (bsize - off - 1 > len + EXT4_FC_TAG_BASE_LEN) {
/*
* Only allocate from current buffer if we have enough space for
* this request AND we have space to add a zero byte padding.
@@ -730,10 +722,10 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
/* Need to add PAD tag */
tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
- pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
+ pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN;
tl->fc_len = cpu_to_le16(pad_len);
if (crc)
- *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
+ *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN);
if (pad_len > 0)
ext4_fc_memzero(sb, tl + 1, pad_len, crc);
ext4_fc_submit_bh(sb, false);
@@ -775,7 +767,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
* ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space in this block to accommodate this tail.
*/
- dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
+ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
if (!dst)
return -ENOSPC;
@@ -785,8 +777,8 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
- ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
- dst += sizeof(tl);
+ ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc);
+ dst += EXT4_FC_TAG_BASE_LEN;
tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
dst += sizeof(tail.fc_tid);
@@ -808,15 +800,15 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
struct ext4_fc_tl tl;
u8 *dst;
- dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
+ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
if (!dst)
return false;
tl.fc_tag = cpu_to_le16(tag);
tl.fc_len = cpu_to_le16(len);
- ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
- ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
+ ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
+ ext4_fc_memcpy(sb, dst + EXT4_FC_TAG_BASE_LEN, val, len, crc);
return true;
}
@@ -828,8 +820,8 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
struct ext4_fc_dentry_info fcd;
struct ext4_fc_tl tl;
int dlen = fc_dentry->fcd_name.len;
- u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
- crc);
+ u8 *dst = ext4_fc_reserve_space(sb,
+ EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
if (!dst)
return false;
@@ -838,8 +830,8 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
- ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
- dst += sizeof(tl);
+ ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
+ dst += EXT4_FC_TAG_BASE_LEN;
ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
dst += sizeof(fcd);
ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
@@ -874,22 +866,25 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
+ ret = -ECANCELED;
dst = ext4_fc_reserve_space(inode->i_sb,
- sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
+ EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
if (!dst)
- return -ECANCELED;
+ goto err;
- if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
- return -ECANCELED;
- dst += sizeof(tl);
+ if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc))
+ goto err;
+ dst += EXT4_FC_TAG_BASE_LEN;
if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
- return -ECANCELED;
+ goto err;
dst += sizeof(fc_inode);
if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
inode_len, crc))
- return -ECANCELED;
-
- return 0;
+ goto err;
+ ret = 0;
+err:
+ brelse(iloc.bh);
+ return ret;
}
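
Besides switching to EXT4_FC_TAG_BASE_LEN, this hunk plugs a buffer_head leak: the iloc obtained earlier in the function (via ext4_get_inode_loc(), outside this hunk) carries a referenced bh, and every early return used to skip releasing it. The ownership rule, sketched:

	ret = ext4_get_inode_loc(inode, &iloc);	/* takes a ref on iloc.bh */
	if (ret)
		return ret;
	/* ... use ext4_raw_inode(&iloc) ... */
	brelse(iloc.bh);		/* mandatory on every exit path */
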
/*
@@ -917,8 +912,8 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
mutex_unlock(&ei->i_fc_lock);
cur_lblk_off = old_blk_size;
- jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
- __func__, cur_lblk_off, new_blk_size, inode->i_ino);
+ ext4_debug("will try writing %d to %d for inode %ld\n",
+ cur_lblk_off, new_blk_size, inode->i_ino);
while (cur_lblk_off <= new_blk_size) {
map.m_lblk = cur_lblk_off;
@@ -970,7 +965,7 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
- struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *ei;
int ret = 0;
@@ -1004,7 +999,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
- struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *pos, *n;
int ret = 0;
@@ -1031,7 +1026,7 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
- struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
struct inode *inode;
@@ -1093,7 +1088,7 @@ lock_and_exit:
static int ext4_fc_perform_commit(journal_t *journal)
{
- struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *iter;
struct ext4_fc_head head;
@@ -1168,7 +1163,7 @@ static void ext4_fc_update_stats(struct super_block *sb, int status,
{
struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
- jbd_debug(1, "Fast commit ended with status = %d for tid %u",
+ ext4_debug("Fast commit ended with status = %d for tid %u",
status, commit_tid);
if (status == EXT4_FC_STATUS_OK) {
stats->fc_num_commits++;
@@ -1198,7 +1193,7 @@ static void ext4_fc_update_stats(struct super_block *sb, int status,
*/
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
- struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int nblks = 0, ret, bsize = journal->j_blocksize;
int subtid = atomic_read(&sbi->s_fc_subtid);
@@ -1343,7 +1338,7 @@ struct dentry_info_args {
};
static inline void tl_to_darg(struct dentry_info_args *darg,
- struct ext4_fc_tl *tl, u8 *val)
+ struct ext4_fc_tl *tl, u8 *val)
{
struct ext4_fc_dentry_info fcd;
@@ -1352,8 +1347,14 @@ static inline void tl_to_darg(struct dentry_info_args *darg,
darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
darg->ino = le32_to_cpu(fcd.fc_ino);
darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
- darg->dname_len = le16_to_cpu(tl->fc_len) -
- sizeof(struct ext4_fc_dentry_info);
+ darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
+}
+
+static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val)
+{
+ memcpy(tl, val, EXT4_FC_TAG_BASE_LEN);
+ tl->fc_len = le16_to_cpu(tl->fc_len);
+ tl->fc_tag = le16_to_cpu(tl->fc_tag);
}
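
ext4_fc_get_tl() copies the fixed EXT4_FC_TAG_BASE_LEN header out of the block and converts both fields to host byte order once, which is why the le16_to_cpu() calls disappear from the scan and replay loops below. The walk both loops now share, with a hypothetical handle_tag() standing in for their switch statements:

	for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
		ext4_fc_get_tl(&tl, cur);	/* header -> host order */
		handle_tag(&tl, cur + EXT4_FC_TAG_BASE_LEN);
	}

The tightened loop bound guarantees a complete header can always be read before it is parsed.
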
/* Unlink replay function */
@@ -1375,14 +1376,14 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "Inode %d not found", darg.ino);
+ ext4_debug("Inode %d not found", darg.ino);
return 0;
}
old_parent = ext4_iget(sb, darg.parent_ino,
EXT4_IGET_NORMAL);
if (IS_ERR(old_parent)) {
- jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
+ ext4_debug("Dir with inode %d not found", darg.parent_ino);
iput(inode);
return 0;
}
@@ -1407,21 +1408,21 @@ static int ext4_fc_replay_link_internal(struct super_block *sb,
dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
if (IS_ERR(dir)) {
- jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
+ ext4_debug("Dir with inode %d not found.", darg->parent_ino);
dir = NULL;
goto out;
}
dentry_dir = d_obtain_alias(dir);
if (IS_ERR(dentry_dir)) {
- jbd_debug(1, "Failed to obtain dentry");
+ ext4_debug("Failed to obtain dentry");
dentry_dir = NULL;
goto out;
}
dentry_inode = d_alloc(dentry_dir, &qstr_dname);
if (!dentry_inode) {
- jbd_debug(1, "Inode dentry not created.");
+ ext4_debug("Inode dentry not created.");
ret = -ENOMEM;
goto out;
}
@@ -1434,7 +1435,7 @@ static int ext4_fc_replay_link_internal(struct super_block *sb,
* could complete.
*/
if (ret && ret != -EEXIST) {
- jbd_debug(1, "Failed to link\n");
+ ext4_debug("Failed to link\n");
goto out;
}
@@ -1468,7 +1469,7 @@ static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "Inode not found.");
+ ext4_debug("Inode not found.");
return 0;
}
@@ -1491,13 +1492,15 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
if (state->fc_modified_inodes[i] == ino)
return 0;
if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
- state->fc_modified_inodes = krealloc(
- state->fc_modified_inodes,
+ int *fc_modified_inodes;
+
+ fc_modified_inodes = krealloc(state->fc_modified_inodes,
sizeof(int) * (state->fc_modified_inodes_size +
EXT4_FC_REPLAY_REALLOC_INCREMENT),
GFP_KERNEL);
- if (!state->fc_modified_inodes)
+ if (!fc_modified_inodes)
return -ENOMEM;
+ state->fc_modified_inodes = fc_modified_inodes;
state->fc_modified_inodes_size +=
EXT4_FC_REPLAY_REALLOC_INCREMENT;
}
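
This hunk (and the matching one in ext4_fc_record_regions() further down) fixes the classic krealloc() misuse: on failure krealloc() returns NULL but leaves the original allocation intact, so assigning the result straight back to the only pointer leaks it. The general pattern, with generic names:

	/* WRONG: old 'buf' is leaked if krealloc() fails */
	buf = krealloc(buf, new_size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* RIGHT: stage the result in a temporary */
	void *tmp = krealloc(buf, new_size, GFP_KERNEL);
	if (!tmp)
		return -ENOMEM;	/* 'buf' is still owned and reachable */
	buf = tmp;
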
@@ -1516,7 +1519,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
struct ext4_inode *raw_fc_inode;
struct inode *inode = NULL;
struct ext4_iloc iloc;
- int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
+ int inode_len, ino, ret, tag = tl->fc_tag;
struct ext4_extent_header *eh;
memcpy(&fc_inode, val, sizeof(fc_inode));
@@ -1541,7 +1544,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
if (ret)
goto out;
- inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
+ inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
raw_inode = ext4_raw_inode(&iloc);
memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
@@ -1576,7 +1579,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
/* Given that we just wrote the inode on disk, this SHOULD succeed. */
inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "Inode not found.");
+ ext4_debug("Inode not found.");
return -EFSCORRUPTED;
}
@@ -1630,7 +1633,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "inode %d not found.", darg.ino);
+ ext4_debug("inode %d not found.", darg.ino);
inode = NULL;
ret = -EINVAL;
goto out;
@@ -1643,7 +1646,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
*/
dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
if (IS_ERR(dir)) {
- jbd_debug(1, "Dir %d not found.", darg.ino);
+ ext4_debug("Dir %d not found.", darg.ino);
goto out;
}
ret = ext4_init_new_dir(NULL, dir, inode);
@@ -1659,8 +1662,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
set_nlink(inode, 1);
ext4_mark_inode_dirty(NULL, inode);
out:
- if (inode)
- iput(inode);
+ iput(inode);
return ret;
}
@@ -1683,15 +1685,18 @@ int ext4_fc_record_regions(struct super_block *sb, int ino,
if (replay && state->fc_regions_used != state->fc_regions_valid)
state->fc_regions_used = state->fc_regions_valid;
if (state->fc_regions_used == state->fc_regions_size) {
+ struct ext4_fc_alloc_region *fc_regions;
+
+ fc_regions = krealloc(state->fc_regions,
+ sizeof(struct ext4_fc_alloc_region) *
+ (state->fc_regions_size +
+ EXT4_FC_REPLAY_REALLOC_INCREMENT),
+ GFP_KERNEL);
+ if (!fc_regions)
+ return -ENOMEM;
state->fc_regions_size +=
EXT4_FC_REPLAY_REALLOC_INCREMENT;
- state->fc_regions = krealloc(
- state->fc_regions,
- state->fc_regions_size *
- sizeof(struct ext4_fc_alloc_region),
- GFP_KERNEL);
- if (!state->fc_regions)
- return -ENOMEM;
+ state->fc_regions = fc_regions;
}
region = &state->fc_regions[state->fc_regions_used++];
region->ino = ino;
@@ -1728,7 +1733,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "Inode not found.");
+ ext4_debug("Inode not found.");
return 0;
}
@@ -1742,7 +1747,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
cur = start;
remaining = len;
- jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
+ ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
start, start_pblk, len, ext4_ext_is_unwritten(ex),
inode->i_ino);
@@ -1771,8 +1776,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
ret = ext4_ext_insert_extent(
NULL, inode, &path, &newex, 0);
up_write((&EXT4_I(inode)->i_data_sem));
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (ret)
goto out;
goto next;
@@ -1803,7 +1807,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
}
/* Range is mapped and needs a state change */
- jbd_debug(1, "Converting from %ld to %d %lld",
+ ext4_debug("Converting from %ld to %d %lld",
map.m_flags & EXT4_MAP_UNWRITTEN,
ext4_ext_is_unwritten(ex), map.m_pblk);
ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
@@ -1846,7 +1850,7 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
+ ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
return 0;
}
@@ -1854,7 +1858,7 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
if (ret)
goto out;
- jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
+ ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
inode->i_ino, le32_to_cpu(lrange.fc_lblk),
le32_to_cpu(lrange.fc_len));
while (remaining > 0) {
@@ -1903,7 +1907,7 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
inode = ext4_iget(sb, state->fc_modified_inodes[i],
EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "Inode %d not found.",
+ ext4_debug("Inode %d not found.",
state->fc_modified_inodes[i]);
continue;
}
@@ -1927,8 +1931,7 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
for (j = 0; j < path->p_depth; j++)
ext4_mb_mark_bb(inode->i_sb,
path[j].p_block, 1, 1);
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
}
cur += ret;
ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
@@ -1973,6 +1976,34 @@ void ext4_fc_replay_cleanup(struct super_block *sb)
kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}
+static inline bool ext4_fc_tag_len_isvalid(struct ext4_fc_tl *tl,
+ u8 *val, u8 *end)
+{
+ if (val + tl->fc_len > end)
+ return false;
+
+	/* Only the lengths of ADD_RANGE/TAIL/HEAD are validated here,
+	 * because their payloads are read during the journal rescan,
+	 * before the CRC has been verified. Length checks for the other
+	 * tags can rely on the CRC check instead.
+	 */
+ switch (tl->fc_tag) {
+ case EXT4_FC_TAG_ADD_RANGE:
+ return (sizeof(struct ext4_fc_add_range) == tl->fc_len);
+ case EXT4_FC_TAG_TAIL:
+ return (sizeof(struct ext4_fc_tail) <= tl->fc_len);
+ case EXT4_FC_TAG_HEAD:
+ return (sizeof(struct ext4_fc_head) == tl->fc_len);
+ case EXT4_FC_TAG_DEL_RANGE:
+ case EXT4_FC_TAG_LINK:
+ case EXT4_FC_TAG_UNLINK:
+ case EXT4_FC_TAG_CREAT:
+ case EXT4_FC_TAG_INODE:
+ case EXT4_FC_TAG_PAD:
+ default:
+ return true;
+ }
+}
+
/*
* Recovery Scan phase handler
*
@@ -2029,12 +2060,18 @@ static int ext4_fc_replay_scan(journal_t *journal,
}
state->fc_replay_expected_off++;
- for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
- memcpy(&tl, cur, sizeof(tl));
- val = cur + sizeof(tl);
- jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
- tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
- switch (le16_to_cpu(tl.fc_tag)) {
+ for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
+ cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
+ ext4_fc_get_tl(&tl, cur);
+ val = cur + EXT4_FC_TAG_BASE_LEN;
+ if (!ext4_fc_tag_len_isvalid(&tl, val, end)) {
+ ret = state->fc_replay_num_tags ?
+ JBD2_FC_REPLAY_STOP : -ECANCELED;
+ goto out_err;
+ }
+ ext4_debug("Scan phase, tag:%s, blk %lld\n",
+ tag2str(tl.fc_tag), bh->b_blocknr);
+ switch (tl.fc_tag) {
case EXT4_FC_TAG_ADD_RANGE:
memcpy(&ext, val, sizeof(ext));
ex = (struct ext4_extent *)&ext.fc_ex;
@@ -2054,13 +2091,13 @@ static int ext4_fc_replay_scan(journal_t *journal,
case EXT4_FC_TAG_PAD:
state->fc_cur_tag++;
state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
- sizeof(tl) + le16_to_cpu(tl.fc_len));
+ EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
case EXT4_FC_TAG_TAIL:
state->fc_cur_tag++;
memcpy(&tail, val, sizeof(tail));
state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
- sizeof(tl) +
+ EXT4_FC_TAG_BASE_LEN +
offsetof(struct ext4_fc_tail,
fc_crc));
if (le32_to_cpu(tail.fc_tid) == expected_tid &&
@@ -2087,7 +2124,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
}
state->fc_cur_tag++;
state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
- sizeof(tl) + le16_to_cpu(tl.fc_len));
+ EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
default:
ret = state->fc_replay_num_tags ?
@@ -2127,7 +2164,7 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
sbi->s_mount_state |= EXT4_FC_REPLAY;
}
if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
- jbd_debug(1, "Replay stops\n");
+ ext4_debug("Replay stops\n");
ext4_fc_set_bitmaps_and_counters(sb);
return 0;
}
@@ -2142,19 +2179,20 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
start = (u8 *)bh->b_data;
end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
- for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
- memcpy(&tl, cur, sizeof(tl));
- val = cur + sizeof(tl);
+ for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
+ cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
+ ext4_fc_get_tl(&tl, cur);
+ val = cur + EXT4_FC_TAG_BASE_LEN;
if (state->fc_replay_num_tags == 0) {
ret = JBD2_FC_REPLAY_STOP;
ext4_fc_set_bitmaps_and_counters(sb);
break;
}
- jbd_debug(3, "Replay phase, tag:%s\n",
- tag2str(le16_to_cpu(tl.fc_tag)));
+
+ ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
state->fc_replay_num_tags--;
- switch (le16_to_cpu(tl.fc_tag)) {
+ switch (tl.fc_tag) {
case EXT4_FC_TAG_LINK:
ret = ext4_fc_replay_link(sb, &tl, val);
break;
@@ -2175,19 +2213,18 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
break;
case EXT4_FC_TAG_PAD:
trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
- le16_to_cpu(tl.fc_len), 0);
+ tl.fc_len, 0);
break;
case EXT4_FC_TAG_TAIL:
- trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
- le16_to_cpu(tl.fc_len), 0);
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
+ 0, tl.fc_len, 0);
memcpy(&tail, val, sizeof(tail));
WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
break;
case EXT4_FC_TAG_HEAD:
break;
default:
- trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
- le16_to_cpu(tl.fc_len), 0);
+ trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
ret = -ECANCELED;
break;
}
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index 1db12847a83b..a6154c3ed135 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -70,6 +70,9 @@ struct ext4_fc_tail {
__le32 fc_crc;
};
+/* Tag base length */
+#define EXT4_FC_TAG_BASE_LEN (sizeof(struct ext4_fc_tl))
+
/*
* Fast commit status codes
*/
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6feb07e3e1eb..a7a597c727e6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -36,19 +36,34 @@
#include "acl.h"
#include "truncate.h"
-static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+/*
+ * Returns %true if the given DIO request should be attempted with DIO, or
+ * %false if it should fall back to buffered I/O.
+ *
+ * DIO isn't well specified; when it's unsupported (either due to the request
+ * being misaligned, or due to the file not supporting DIO at all), filesystems
+ * either fall back to buffered I/O or return EINVAL. For files that don't use
+ * any special features like encryption or verity, ext4 has traditionally
+ * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too.
+ * In this case, we should attempt the DIO, *not* fall back to buffered I/O.
+ *
+ * In contrast, in cases where DIO is unsupported due to ext4 features, ext4
+ * traditionally falls back to buffered I/O.
+ *
+ * This function implements the traditional ext4 behavior in all these cases.
+ */
+static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
{
struct inode *inode = file_inode(iocb->ki_filp);
+ u32 dio_align = ext4_dio_alignment(inode);
- if (!fscrypt_dio_supported(iocb, iter))
- return false;
- if (fsverity_active(inode))
- return false;
- if (ext4_should_journal_data(inode))
+ if (dio_align == 0)
return false;
- if (ext4_has_inline_data(inode))
- return false;
- return true;
+
+ if (dio_align == 1)
+ return true;
+
+ return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
}
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
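The alignment value consumed here is tri-state: 0 means DIO is unsupported, 1 means no filesystem-imposed limit (iomap defaults apply), and any other value is a power-of-two alignment requirement; it pairs with ext4_dio_alignment(), added later in this series in the fs/ext4/inode.c hunks. A self-contained sketch of how the predicate consumes that value (should_use_dio() and its arguments are illustrative stand-ins):

#include <stdbool.h>
#include <stdint.h>

/* dio_align: 0 = DIO unsupported, 1 = no fs-imposed limit, else pow2. */
static bool should_use_dio(uint32_t dio_align, uint64_t pos, uint64_t iov_align)
{
	if (dio_align == 0)
		return false;		/* fall back to buffered I/O */
	if (dio_align == 1)
		return true;		/* let the iomap defaults decide */
	/* IS_ALIGNED() equivalent: file offset and memory must both match. */
	return ((pos | iov_align) & ((uint64_t)dio_align - 1)) == 0;
}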
@@ -63,7 +78,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
inode_lock_shared(inode);
}
- if (!ext4_dio_supported(iocb, to)) {
+ if (!ext4_should_use_dio(iocb, to)) {
inode_unlock_shared(inode);
/*
* Fallback to buffered I/O if the operation being performed on
@@ -76,7 +91,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
return generic_file_read_iter(iocb, to);
}
- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0);
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
@@ -511,7 +526,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
/* Fallback to buffered I/O if the inode does not support direct I/O. */
- if (!ext4_dio_supported(iocb, from)) {
+ if (!ext4_should_use_dio(iocb, from)) {
if (ilock_shared)
inode_unlock_shared(inode);
else
@@ -528,6 +543,12 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = -EAGAIN;
goto out;
}
+ /*
+ * Make sure inline data cannot be created anymore since we are going
+ * to allocate blocks for DIO. We know the inode does not have any
+ * inline data now because ext4_should_use_dio() checked for that.
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
offset = iocb->ki_pos;
count = ret;
@@ -565,7 +586,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
(unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
- 0);
+ NULL, 0);
if (ret == -ENOTBLK)
ret = 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f73e5eb43eae..208b87ce8858 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -510,7 +510,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
goto fallback;
}
- max_dirs = ndirs / ngroups + inodes_per_group / 16;
+ max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16;
min_inodes = avefreei - inodes_per_group*flex_size / 4;
if (min_inodes < 1)
min_inodes = 1;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 07a8c75b65ed..860fc5119009 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -460,7 +460,7 @@ static int ext4_splice_branch(handle_t *handle,
* the new i_size. But that is not done here - it is done in
* generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
*/
- jbd_debug(5, "splicing indirect only\n");
+ ext4_debug("splicing indirect only\n");
BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh);
if (err)
@@ -472,7 +472,7 @@ static int ext4_splice_branch(handle_t *handle,
err = ext4_mark_inode_dirty(handle, ar->inode);
if (unlikely(err))
goto err_out;
- jbd_debug(5, "splicing direct\n");
+ ext4_debug("splicing direct\n");
}
return err;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 9c076262770d..a4fbe825694b 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -6,6 +6,7 @@
#include <linux/iomap.h>
#include <linux/fiemap.h>
+#include <linux/namei.h>
#include <linux/iversion.h>
#include <linux/sched/mm.h>
@@ -35,6 +36,9 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
struct ext4_inode *raw_inode;
int free, min_offs;
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
+ return 0;
+
min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
EXT4_GOOD_OLD_INODE_SIZE -
EXT4_I(inode)->i_extra_isize -
@@ -527,13 +531,13 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
}
static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
- struct inode *inode,
- unsigned flags)
+ struct inode *inode)
{
int ret, needed_blocks, no_expand;
handle_t *handle = NULL;
int retries = 0, sem_held = 0;
struct page *page = NULL;
+ unsigned int flags;
unsigned from, to;
struct ext4_iloc iloc;
@@ -562,9 +566,9 @@ retry:
/* We cannot recurse into the filesystem as the transaction is already
* started */
- flags |= AOP_FLAG_NOFS;
-
- page = grab_cache_page_write_begin(mapping, 0, flags);
+ flags = memalloc_nofs_save();
+ page = grab_cache_page_write_begin(mapping, 0);
+ memalloc_nofs_restore(flags);
if (!page) {
ret = -ENOMEM;
goto out;
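This hunk and the similar ones below replace the per-call AOP_FLAG_NOFS with the scoped allocation API from linux/sched/mm.h, so the NOFS constraint no longer has to be threaded through ->write_begin flags. A kernel-context sketch of the general pattern (grab_page_nofs() is an illustrative wrapper, not a real helper):

static struct page *grab_page_nofs(struct address_space *mapping, pgoff_t index)
{
	unsigned int flags;
	struct page *page;

	/*
	 * Every allocation inside the save/restore window implicitly
	 * behaves as GFP_NOFS, so the callee cannot recurse into the
	 * filesystem while our transaction is running.
	 */
	flags = memalloc_nofs_save();
	page = grab_cache_page_write_begin(mapping, index);
	memalloc_nofs_restore(flags);
	return page;
}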
@@ -649,11 +653,11 @@ out:
int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- unsigned flags,
struct page **pagep)
{
int ret;
handle_t *handle;
+ unsigned int flags;
struct page *page;
struct ext4_iloc iloc;
@@ -691,9 +695,9 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
if (ret)
goto out;
- flags |= AOP_FLAG_NOFS;
-
- page = grab_cache_page_write_begin(mapping, 0, flags);
+ flags = memalloc_nofs_save();
+ page = grab_cache_page_write_begin(mapping, 0);
+ memalloc_nofs_restore(flags);
if (!page) {
ret = -ENOMEM;
goto out;
@@ -727,8 +731,7 @@ out:
brelse(iloc.bh);
return ret;
convert:
- return ext4_convert_inline_data_to_extent(mapping,
- inode, flags);
+ return ext4_convert_inline_data_to_extent(mapping, inode);
}
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
@@ -848,13 +851,12 @@ ext4_journalled_write_inline_data(struct inode *inode,
*/
static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
struct inode *inode,
- unsigned flags,
void **fsdata)
{
int ret = 0, inline_size;
struct page *page;
- page = grab_cache_page_write_begin(mapping, 0, flags);
+ page = grab_cache_page_write_begin(mapping, 0);
if (!page)
return -ENOMEM;
@@ -907,7 +909,6 @@ out:
int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- unsigned flags,
struct page **pagep,
void **fsdata)
{
@@ -916,6 +917,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct page *page;
struct ext4_iloc iloc;
int retries = 0;
+ unsigned int flags;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -932,17 +934,10 @@ retry_journal:
if (ret && ret != -ENOSPC)
goto out_journal;
- /*
- * We cannot recurse into the filesystem as the transaction
- * is already started.
- */
- flags |= AOP_FLAG_NOFS;
-
if (ret == -ENOSPC) {
ext4_journal_stop(handle);
ret = ext4_da_convert_inline_data_to_extent(mapping,
inode,
- flags,
fsdata);
if (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -950,7 +945,13 @@ retry_journal:
goto out;
}
- page = grab_cache_page_write_begin(mapping, 0, flags);
+ /*
+ * We cannot recurse into the filesystem as the transaction
+ * is already started.
+ */
+ flags = memalloc_nofs_save();
+ page = grab_cache_page_write_begin(mapping, 0);
+ memalloc_nofs_restore(flags);
if (!page) {
ret = -ENOMEM;
goto out_journal;
@@ -1083,14 +1084,14 @@ static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
void *limit;
int de_len;
- de = (struct ext4_dir_entry_2 *)de_buf;
+ de = de_buf;
if (old_size) {
limit = de_buf + old_size;
do {
prev_de = de;
de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
de_buf += de_len;
- de = (struct ext4_dir_entry_2 *)de_buf;
+ de = de_buf;
} while (de_buf < limit);
prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size -
@@ -1155,7 +1156,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle,
* First create "." and ".." and then copy the dir information
* back to the block.
*/
- de = (struct ext4_dir_entry_2 *)target;
+ de = target;
de = ext4_init_dot_dotdot(inode, de,
inode->i_sb->s_blocksize, csum_size,
le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
@@ -1591,6 +1592,35 @@ out:
return ret;
}
+void *ext4_read_inline_link(struct inode *inode)
+{
+ struct ext4_iloc iloc;
+ int ret, inline_size;
+ void *link;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ret = -ENOMEM;
+ inline_size = ext4_get_inline_size(inode);
+ link = kmalloc(inline_size + 1, GFP_NOFS);
+ if (!link)
+ goto out;
+
+ ret = ext4_read_inline_data(inode, link, inline_size, &iloc);
+ if (ret < 0) {
+ kfree(link);
+ goto out;
+ }
+ nd_terminate_link(link, inode->i_size, ret);
+out:
+ if (ret < 0)
+ link = ERR_PTR(ret);
+ brelse(iloc.bh);
+ return link;
+}
+
struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
struct ext4_dir_entry_2 **parent_de,
int *retval)
@@ -2005,6 +2035,18 @@ int ext4_convert_inline_data(struct inode *inode)
if (!ext4_has_inline_data(inode)) {
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
return 0;
+ } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ /*
+ * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is
+ * cleared. This means we are in the middle of moving the
+ * inline data to a delayed-allocation block. Just force a
+ * writeout here to finish the conversion.
+ */
+ error = filemap_flush(inode->i_mapping);
+ if (error)
+ return error;
+ if (!ext4_has_inline_data(inode))
+ return 0;
}
needed_blocks = ext4_writepage_trans_blocks(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 13740f2d0e61..2b5ef1b64249 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -41,7 +41,6 @@
#include <linux/bitops.h>
#include <linux/iomap.h>
#include <linux/iversion.h>
-#include <linux/dax.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -178,6 +177,8 @@ void ext4_evict_inode(struct inode *inode)
trace_ext4_evict_inode(inode);
+ if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
+ ext4_evict_ea_inode(inode);
if (inode->i_nlink) {
/*
* When journalling data dirty buffers are tracked only in the
@@ -199,8 +200,7 @@ void ext4_evict_inode(struct inode *inode)
*/
if (inode->i_ino != EXT4_JOURNAL_INO &&
ext4_should_journal_data(inode) &&
- (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
- inode->i_data.nrpages) {
+ S_ISREG(inode->i_mode) && inode->i_data.nrpages) {
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
@@ -545,12 +545,21 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
} else {
BUG();
}
+
+ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
+ return retval;
#ifdef ES_AGGRESSIVE_TEST
ext4_map_blocks_es_recheck(handle, inode, map,
&orig_map, flags);
#endif
goto found;
}
+ /*
+ * In cached no-wait mode, there is nothing more we can do if
+ * the extent is not found in the cache.
+ */
+ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
+ return 0;
/*
* Try to see if we can get the block without requesting a new
@@ -822,7 +831,7 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
inode->i_ino, create);
return _ext4_get_block(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
}
/* Maximum number of blocks we map for direct IO at once. */
@@ -837,10 +846,12 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
struct ext4_map_blocks map;
struct buffer_head *bh;
int create = map_flags & EXT4_GET_BLOCKS_CREATE;
+ bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT;
int err;
ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
|| handle != NULL || create == 0);
+ ASSERT(create == 0 || !nowait);
map.m_lblk = block;
map.m_len = 1;
@@ -851,6 +862,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
if (err < 0)
return ERR_PTR(err);
+ if (nowait)
+ return sb_find_get_block(inode->i_sb, map.m_pblk);
+
bh = sb_getblk(inode->i_sb, map.m_pblk);
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
@@ -1130,7 +1144,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
#endif
static int ext4_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
@@ -1144,7 +1158,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
- trace_ext4_write_begin(inode, pos, len, flags);
+ trace_ext4_write_begin(inode, pos, len);
/*
* Reserve one block more for addition to orphan list in case
* we allocate blocks but write fails for some reason
@@ -1156,7 +1170,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
- flags, pagep);
+ pagep);
if (ret < 0)
return ret;
if (ret == 1)
@@ -1171,9 +1185,16 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
* the page (if needed) without using GFP_NOFS.
*/
retry_grab:
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
+ /*
+ * As with the page allocation, preallocate the buffer heads
+ * before starting the handle.
+ */
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, inode->i_sb->s_blocksize, 0);
+
unlock_page(page);
retry_journal:
@@ -1542,9 +1563,9 @@ struct mpage_da_data {
static void mpage_release_unused_pages(struct mpage_da_data *mpd,
bool invalidate)
{
- int nr_pages, i;
+ unsigned nr, i;
pgoff_t index, end;
- struct pagevec pvec;
+ struct folio_batch fbatch;
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
@@ -1559,18 +1580,28 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
ext4_lblk_t start, last;
start = index << (PAGE_SHIFT - inode->i_blkbits);
last = end << (PAGE_SHIFT - inode->i_blkbits);
+
+ /*
+ * avoid racing with extent status tree scans made by
+ * ext4_insert_delayed_block()
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
ext4_es_remove_extent(inode, start, last - start + 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
}
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
while (index <= end) {
- nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
- if (nr_pages == 0)
+ nr = filemap_get_folios(mapping, &index, end, &fbatch);
+ if (nr == 0)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct folio *folio = page_folio(page);
+ for (i = 0; i < nr; i++) {
+ struct folio *folio = fbatch.folios[i];
+ if (folio->index < mpd->first_page)
+ continue;
+ if (folio->index + folio_nr_pages(folio) - 1 > end)
+ continue;
BUG_ON(!folio_test_locked(folio));
BUG_ON(folio_test_writeback(folio));
if (invalidate) {
@@ -1582,7 +1613,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
}
folio_unlock(folio);
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
}
}
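The conversion above follows the generic folio-batch lookup pattern that replaced pagevec_lookup_range(). A condensed sketch of the loop shape (mapping, index and end stand for the caller's state); note that filemap_get_folios() can return large folios, which is why the converted code re-checks folio->index against the first_page/end window:

struct folio_batch fbatch;
unsigned int i, nr;

folio_batch_init(&fbatch);
while (index <= end) {
	nr = filemap_get_folios(mapping, &index, end, &fbatch);
	if (nr == 0)
		break;
	for (i = 0; i < nr; i++) {
		struct folio *folio = fbatch.folios[i];
		/* ... per-folio work; one folio may span many pages ... */
	}
	folio_batch_release(&fbatch);
}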
@@ -2299,8 +2330,8 @@ out:
*/
static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
{
- struct pagevec pvec;
- int nr_pages, i;
+ struct folio_batch fbatch;
+ unsigned nr, i;
struct inode *inode = mpd->inode;
int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
pgoff_t start, end;
@@ -2314,14 +2345,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
lblk = start << bpp_bits;
pblock = mpd->map.m_pblk;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
while (start <= end) {
- nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
- &start, end);
- if (nr_pages == 0)
+ nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch);
+ if (nr == 0)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < nr; i++) {
+ struct page *page = &fbatch.folios[i]->page;
err = mpage_process_page(mpd, page, &lblk, &pblock,
&map_bh);
@@ -2337,14 +2367,14 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
if (err < 0)
goto out;
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
}
/* Extent fully mapped and matches with page boundary. We are done. */
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
return 0;
out:
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
return err;
}
@@ -2931,7 +2961,7 @@ static int ext4_nonda_switch(struct super_block *sb)
}
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret, retries = 0;
@@ -2944,18 +2974,16 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_SHIFT;
- if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) ||
- ext4_verity_in_progress(inode)) {
+ if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
return ext4_write_begin(file, mapping, pos,
- len, flags, pagep, fsdata);
+ len, pagep, fsdata);
}
*fsdata = (void *)0;
- trace_ext4_da_write_begin(inode, pos, len, flags);
+ trace_ext4_da_write_begin(inode, pos, len);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
- ret = ext4_da_write_inline_data_begin(mapping, inode,
- pos, len, flags,
+ ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
pagep, fsdata);
if (ret < 0)
return ret;
@@ -2964,7 +2992,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
}
retry:
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
@@ -3130,13 +3158,15 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
journal_t *journal;
+ sector_t ret = 0;
int err;
+ inode_lock_shared(inode);
/*
* We can get here for an inline file via the FIBMAP ioctl
*/
if (ext4_has_inline_data(inode))
- return 0;
+ goto out;
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
test_opt(inode->i_sb, DELALLOC)) {
@@ -3175,14 +3205,19 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
jbd2_journal_unlock_updates(journal);
if (err)
- return 0;
+ goto out;
}
- return iomap_bmap(mapping, block, &ext4_iomap_ops);
+ ret = iomap_bmap(mapping, block, &ext4_iomap_ops);
+
+out:
+ inode_unlock_shared(inode);
+ return ret;
}
-static int ext4_readpage(struct file *file, struct page *page)
+static int ext4_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
int ret = -EAGAIN;
struct inode *inode = page->mapping->host;
@@ -3243,19 +3278,19 @@ static void ext4_journalled_invalidate_folio(struct folio *folio,
WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0);
}
-static int ext4_releasepage(struct page *page, gfp_t wait)
+static bool ext4_release_folio(struct folio *folio, gfp_t wait)
{
- journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+ journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
- trace_ext4_releasepage(page);
+ trace_ext4_releasepage(&folio->page);
/* Page has dirty journalled data -> cannot release */
- if (PageChecked(page))
- return 0;
+ if (folio_test_checked(folio))
+ return false;
if (journal)
- return jbd2_journal_try_to_free_buffers(journal, page);
+ return jbd2_journal_try_to_free_buffers(journal, folio);
else
- return try_to_free_buffers(page);
+ return try_to_free_buffers(folio);
}
static bool ext4_inode_datasync_dirty(struct inode *inode)
@@ -3609,7 +3644,7 @@ static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
}
static const struct address_space_operations ext4_aops = {
- .readpage = ext4_readpage,
+ .read_folio = ext4_read_folio,
.readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
@@ -3618,16 +3653,16 @@ static const struct address_space_operations ext4_aops = {
.dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
.invalidate_folio = ext4_invalidate_folio,
- .releasepage = ext4_releasepage,
+ .release_folio = ext4_release_folio,
.direct_IO = noop_direct_IO,
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
.swap_activate = ext4_iomap_swap_activate,
};
static const struct address_space_operations ext4_journalled_aops = {
- .readpage = ext4_readpage,
+ .read_folio = ext4_read_folio,
.readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
@@ -3636,7 +3671,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.dirty_folio = ext4_journalled_dirty_folio,
.bmap = ext4_bmap,
.invalidate_folio = ext4_journalled_invalidate_folio,
- .releasepage = ext4_releasepage,
+ .release_folio = ext4_release_folio,
.direct_IO = noop_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
@@ -3644,7 +3679,7 @@ static const struct address_space_operations ext4_journalled_aops = {
};
static const struct address_space_operations ext4_da_aops = {
- .readpage = ext4_readpage,
+ .read_folio = ext4_read_folio,
.readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
@@ -3653,9 +3688,9 @@ static const struct address_space_operations ext4_da_aops = {
.dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
.invalidate_folio = ext4_invalidate_folio,
- .releasepage = ext4_releasepage,
+ .release_folio = ext4_release_folio,
.direct_IO = noop_direct_IO,
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
.swap_activate = ext4_iomap_swap_activate,
@@ -3953,27 +3988,20 @@ int ext4_break_layouts(struct inode *inode)
* Returns: 0 on success or negative on failure
*/
-int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t first_block, stop_block;
struct address_space *mapping = inode->i_mapping;
- loff_t first_block_offset, last_block_offset;
+ loff_t first_block_offset, last_block_offset, max_length;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
handle_t *handle;
unsigned int credits;
int ret = 0, ret2 = 0;
trace_ext4_punch_hole(inode, offset, length, 0);
- ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
- if (ext4_has_inline_data(inode)) {
- filemap_invalidate_lock(mapping);
- ret = ext4_convert_inline_data(inode);
- filemap_invalidate_unlock(mapping);
- if (ret)
- return ret;
- }
-
/*
* Write out all dirty pages to avoid race conditions
* Then release them.
@@ -4001,6 +4029,14 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
offset;
}
+ /*
+ * For punch hole, offset + length must stay at least one block short
+ * of s_bitmap_maxbytes. Trim the length if it goes beyond that limit.
+ */
+ max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
+ if (offset + length > max_length)
+ length = max_length - offset;
+
if (offset & (sb->s_blocksize - 1) ||
(offset + length) & (sb->s_blocksize - 1)) {
/*
@@ -4016,6 +4052,10 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
@@ -4669,8 +4709,7 @@ static inline int ext4_iget_extra_inode(struct inode *inode,
__le32 *magic = (void *)raw_inode +
EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
- if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
- EXT4_INODE_SIZE(inode->i_sb) &&
+ if (EXT4_INODE_HAS_XATTR_SPACE(inode) &&
*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
return ext4_find_inline_data_nolock(inode);
@@ -4977,7 +5016,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
}
if (IS_ENCRYPTED(inode)) {
inode->i_op = &ext4_encrypted_symlink_inode_operations;
- ext4_set_aops(inode);
} else if (ext4_inode_is_fast_symlink(inode)) {
inode->i_link = (char *)ei->i_data;
inode->i_op = &ext4_fast_symlink_inode_operations;
@@ -4985,9 +5023,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
sizeof(ei->i_data) - 1);
} else {
inode->i_op = &ext4_symlink_inode_operations;
- ext4_set_aops(inode);
}
- inode_nohighmem(inode);
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
@@ -5200,7 +5236,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (EXT4_SB(inode->i_sb)->s_journal) {
if (ext4_journal_current_handle()) {
- jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+ ext4_debug("called recursively, non-PF_MEMALLOC!\n");
dump_stack();
return -EIO;
}
@@ -5313,6 +5349,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
int error, rc = 0;
int orphan = 0;
const unsigned int ia_valid = attr->ia_valid;
+ bool inc_ivers = true;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
@@ -5337,14 +5374,14 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (error)
return error;
- if (is_quota_modification(inode, attr)) {
+ if (is_quota_modification(mnt_userns, inode, attr)) {
error = dquot_initialize(inode);
if (error)
return error;
}
- if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
- (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
+ if (i_uid_needs_update(mnt_userns, attr, inode) ||
+ i_gid_needs_update(mnt_userns, attr, inode)) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
@@ -5361,7 +5398,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* counts xattr inode references.
*/
down_read(&EXT4_I(inode)->xattr_sem);
- error = dquot_transfer(inode, attr);
+ error = dquot_transfer(mnt_userns, inode, attr);
up_read(&EXT4_I(inode)->xattr_sem);
if (error) {
@@ -5370,10 +5407,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
/* Update corresponding info in inode so that everything is in
* one transaction */
- if (attr->ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (attr->ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(mnt_userns, attr, inode);
+ i_gid_update(mnt_userns, attr, inode);
error = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
if (unlikely(error)) {
@@ -5384,6 +5419,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (attr->ia_valid & ATTR_SIZE) {
handle_t *handle;
loff_t oldsize = inode->i_size;
+ loff_t old_disksize;
int shrink = (attr->ia_size < inode->i_size);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
@@ -5397,8 +5433,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return -EINVAL;
}
- if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
- inode_inc_iversion(inode);
+ if (attr->ia_size == inode->i_size)
+ inc_ivers = false;
if (shrink) {
if (ext4_should_order_data(inode)) {
@@ -5455,6 +5491,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
inode->i_sb->s_blocksize_bits);
down_write(&EXT4_I(inode)->i_data_sem);
+ old_disksize = EXT4_I(inode)->i_disksize;
EXT4_I(inode)->i_disksize = attr->ia_size;
rc = ext4_mark_inode_dirty(handle, inode);
if (!error)
@@ -5466,6 +5503,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
*/
if (!error)
i_size_write(inode, attr->ia_size);
+ else
+ EXT4_I(inode)->i_disksize = old_disksize;
up_write(&EXT4_I(inode)->i_data_sem);
ext4_journal_stop(handle);
if (error)
@@ -5497,6 +5536,8 @@ out_mmap_sem:
}
if (!error) {
+ if (inc_ivers)
+ inode_inc_iversion(inode);
setattr_copy(mnt_userns, inode, attr);
mark_inode_dirty(inode);
}
@@ -5519,6 +5560,22 @@ err_out:
return error;
}
+u32 ext4_dio_alignment(struct inode *inode)
+{
+ if (fsverity_active(inode))
+ return 0;
+ if (ext4_should_journal_data(inode))
+ return 0;
+ if (ext4_has_inline_data(inode))
+ return 0;
+ if (IS_ENCRYPTED(inode)) {
+ if (!fscrypt_dio_supported(inode))
+ return 0;
+ return i_blocksize(inode);
+ }
+ return 1; /* use the iomap defaults */
+}
+
int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
@@ -5534,6 +5591,27 @@ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
}
+ /*
+ * Return the DIO alignment restrictions if requested. We only return
+ * this information when requested, since on encrypted files it might
+ * take a fair bit of work to get if the file wasn't opened recently.
+ */
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+ u32 dio_align = ext4_dio_alignment(inode);
+
+ stat->result_mask |= STATX_DIOALIGN;
+ if (dio_align == 1) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+
+ /* iomap defaults */
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ } else {
+ stat->dio_mem_align = dio_align;
+ stat->dio_offset_align = dio_align;
+ }
+ }
+
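From userspace this surfaces through statx(2). A minimal query sketch, assuming a libc and kernel new enough to define STATX_DIOALIGN and the stx_dio_* fields added alongside this series:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2 || statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx) != 0) {
		perror("statx");
		return 1;
	}
	if (!(stx.stx_mask & STATX_DIOALIGN))
		printf("kernel did not report DIO alignment\n");
	else if (stx.stx_dio_mem_align == 0)
		printf("direct I/O not supported on this file\n");
	else
		printf("dio mem align: %u, dio offset align: %u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	return 0;
}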
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (flags & EXT4_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
@@ -5700,9 +5778,6 @@ int ext4_mark_iloc_dirty(handle_t *handle,
}
ext4_fc_track_inode(handle, inode);
- if (IS_I_VERSION(inode))
- inode_inc_iversion(inode);
-
/* the do_update_inode consumes one bh->b_count */
get_bh(iloc->bh);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 992229ca2d83..4d49c5cfb690 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -16,11 +16,11 @@
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/random.h>
-#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <linux/delay.h>
#include <linux/iversion.h>
#include <linux/fileattr.h>
+#include <linux/uuid.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include <linux/fsmap.h>
@@ -42,6 +42,15 @@ static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg)
memcpy(es->s_volume_name, (char *)arg, EXT4_LABEL_MAX);
}
+/*
+ * Superblock modification callback function for changing file system
+ * UUID.
+ */
+static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg)
+{
+ memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE);
+}
+
static
int ext4_update_primary_sb(struct super_block *sb, handle_t *handle,
ext4_update_sb_callback func,
@@ -443,6 +452,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
swap_inode_data(inode, inode_bl);
inode->i_ctime = inode_bl->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
inode->i_generation = prandom_u32();
inode_bl->i_generation = prandom_u32();
@@ -504,18 +514,6 @@ journal_err_out:
return err;
}
-#ifdef CONFIG_FS_ENCRYPTION
-static int uuid_is_zero(__u8 u[16])
-{
- int i;
-
- for (i = 0; i < 16; i++)
- if (u[i])
- return 0;
- return 1;
-}
-#endif
-
/*
* If immutable is set and we are not clearing it, we're not allowed to change
* anything else in the inode. Don't error out if we're only trying to set
@@ -668,6 +666,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
ext4_set_inode_flags(inode, false);
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
@@ -778,6 +777,7 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
EXT4_I(inode)->i_projid = kprojid;
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
out_dirty:
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (!err)
@@ -957,7 +957,9 @@ static long ext4_ioctl_group_add(struct file *file,
test_opt(sb, INIT_INODE_TABLE))
err = ext4_register_li_request(sb, input->group);
group_add_out:
- ext4_resize_end(sb);
+ err2 = ext4_resize_end(sb, false);
+ if (err == 0)
+ err = err2;
return err;
}
@@ -1044,7 +1046,6 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
__u32 flags = 0;
unsigned int flush_flags = 0;
struct super_block *sb = file_inode(filp)->i_sb;
- struct request_queue *q;
if (copy_from_user(&flags, (__u32 __user *)arg,
sizeof(__u32)))
@@ -1062,13 +1063,8 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
if (!EXT4_SB(sb)->s_journal)
return -ENODEV;
- if (flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID)
- return -EINVAL;
-
- q = bdev_get_queue(EXT4_SB(sb)->s_journal->j_dev);
- if (!q)
- return -ENXIO;
- if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q))
+ if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev))
return -EOPNOTSUPP;
if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
@@ -1147,6 +1143,73 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label
return 0;
}
+static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi,
+ struct fsuuid __user *ufsuuid)
+{
+ struct fsuuid fsuuid;
+ __u8 uuid[UUID_SIZE];
+
+ if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid)))
+ return -EFAULT;
+
+ if (fsuuid.fsu_len == 0) {
+ fsuuid.fsu_len = UUID_SIZE;
+ if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid.fsu_len)))
+ return -EFAULT;
+ return -EINVAL;
+ }
+
+ if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0)
+ return -EINVAL;
+
+ lock_buffer(sbi->s_sbh);
+ memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE);
+ unlock_buffer(sbi->s_sbh);
+
+ if (copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE))
+ return -EFAULT;
+ return 0;
+}
+
+static int ext4_ioctl_setuuid(struct file *filp,
+ const struct fsuuid __user *ufsuuid)
+{
+ int ret = 0;
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct fsuuid fsuuid;
+ __u8 uuid[UUID_SIZE];
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * If any checksums (group descriptors or metadata) are being used
+ * then the checksum seed feature is required to change the UUID.
+ */
+ if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb))
+ && !ext4_has_feature_csum_seed(sb))
+ || ext4_has_feature_stable_inodes(sb))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid)))
+ return -EFAULT;
+
+ if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0)
+ return -EINVAL;
+
+ if (copy_from_user(uuid, &ufsuuid->fsu_uuid[0], UUID_SIZE))
+ return -EFAULT;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = ext4_update_superblocks_fn(sb, ext4_sb_setuuid, &uuid);
+ mnt_drop_write_file(filp);
+
+ return ret;
+}
+
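For completeness, a hedged userspace sketch of the get side. The struct fsuuid layout and the ioctl number are copied here for illustration and assumed to match the definitions this series adds to fs/ext4/ext4.h; verify both against your kernel headers before relying on them:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Local mirror of the kernel's struct fsuuid added by this series. */
struct fsuuid {
	uint32_t fsu_len;
	uint32_t fsu_flags;
	uint8_t  fsu_uuid[];
};

/* Assumed to match fs/ext4/ext4.h in this series; verify before use. */
#define EXT4_IOC_GETFSUUID	_IOR('f', 44, struct fsuuid)

int main(int argc, char **argv)
{
	struct fsuuid *fsuuid;
	int fd, i;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	fsuuid = calloc(1, sizeof(*fsuuid) + 16);
	if (!fsuuid)
		return 1;
	fsuuid->fsu_len = 16;	/* must equal UUID_SIZE; fsu_flags must be 0 */
	if (ioctl(fd, EXT4_IOC_GETFSUUID, fsuuid) != 0) {
		perror("EXT4_IOC_GETFSUUID");
		return 1;
	}
	for (i = 0; i < 16; i++)
		printf("%02x", fsuuid->fsu_uuid[i]);
	putchar('\n');
	return 0;
}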
static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -1194,6 +1257,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
inode->i_generation = generation;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
@@ -1239,7 +1303,9 @@ setversion_out:
err = err2;
mnt_drop_write_file(filp);
group_extend_out:
- ext4_resize_end(sb);
+ err2 = ext4_resize_end(sb, false);
+ if (err == 0)
+ err = err2;
return err;
}
@@ -1387,20 +1453,21 @@ mext_out:
err = ext4_register_li_request(sb, o_group);
resizefs_out:
- ext4_resize_end(sb);
+ err2 = ext4_resize_end(sb, true);
+ if (err == 0)
+ err = err2;
return err;
}
case FITRIM:
{
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(sb->s_bdev))
return -EOPNOTSUPP;
/*
@@ -1432,51 +1499,9 @@ resizefs_out:
return -EOPNOTSUPP;
return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
- case FS_IOC_GET_ENCRYPTION_PWSALT: {
-#ifdef CONFIG_FS_ENCRYPTION
- int err, err2;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- handle_t *handle;
+ case FS_IOC_GET_ENCRYPTION_PWSALT:
+ return ext4_ioctl_get_encryption_pwsalt(filp, (void __user *)arg);
- if (!ext4_has_feature_encrypt(sb))
- return -EOPNOTSUPP;
- if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
- err = mnt_want_write_file(filp);
- if (err)
- return err;
- handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto pwsalt_err_exit;
- }
- err = ext4_journal_get_write_access(handle, sb,
- sbi->s_sbh,
- EXT4_JTR_NONE);
- if (err)
- goto pwsalt_err_journal;
- lock_buffer(sbi->s_sbh);
- generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
- ext4_superblock_csum_set(sb);
- unlock_buffer(sbi->s_sbh);
- err = ext4_handle_dirty_metadata(handle, NULL,
- sbi->s_sbh);
- pwsalt_err_journal:
- err2 = ext4_journal_stop(handle);
- if (err2 && !err)
- err = err2;
- pwsalt_err_exit:
- mnt_drop_write_file(filp);
- if (err)
- return err;
- }
- if (copy_to_user((void __user *) arg,
- sbi->s_es->s_encrypt_pw_salt, 16))
- return -EFAULT;
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
- }
case FS_IOC_GET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
@@ -1568,6 +1593,10 @@ resizefs_out:
return ext4_ioctl_setlabel(filp,
(const void __user *)arg);
+ case EXT4_IOC_GETFSUUID:
+ return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg);
+ case EXT4_IOC_SETFSUUID:
+ return ext4_ioctl_setuuid(filp, (const void __user *)arg);
default:
return -ENOTTY;
}
@@ -1645,6 +1674,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_CHECKPOINT:
case FS_IOC_GETFSLABEL:
case FS_IOC_SETFSLABEL:
+ case EXT4_IOC_GETFSUUID:
+ case EXT4_IOC_SETFSUUID:
break;
default:
return -ENOIOCTLCMD;
@@ -1652,3 +1683,21 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif
+
+static void set_overhead(struct ext4_super_block *es, const void *arg)
+{
+ es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg));
+}
+
+int ext4_update_overhead(struct super_block *sb, bool force)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (sb_rdonly(sb))
+ return 0;
+ if (!force &&
+ (sbi->s_overhead == 0 ||
+ sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters)))
+ return 0;
+ return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead);
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 252c168454c7..9dad93059945 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -140,13 +140,15 @@
* number of buddy bitmap orders possible) number of lists. Group-infos are
* placed in appropriate lists.
*
- * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root)
+ * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
*
- * Locking: sbi->s_mb_rb_lock (rwlock)
+ * Locking: sbi->s_mb_avg_fragment_size_locks (array of rwlocks)
*
- * This is a red black tree consisting of group infos and the tree is sorted
- * by average fragment sizes (which is calculated as ext4_group_info->bb_free
- * / ext4_group_info->bb_fragments).
+ * This is an array of lists where in the i-th list there are groups with
+ * average fragment size >= 2^i and < 2^(i+1). The average fragment size
+ * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
+ * Note that we don't bother with a special list for completely empty groups
+ * so we only have MB_NUM_ORDERS(sb) lists.
*
* When "mb_optimize_scan" mount option is set, mballoc consults the above data
* structures to decide the order in which groups are to be traversed for
@@ -160,7 +162,8 @@
*
* At CR = 1, we only consider groups where average fragment size > request
* size. So, we lookup a group which has average fragment size just above or
- * equal to request size using our rb tree (data structure 2) in O(log N) time.
+ * equal to request size using our average fragment size group lists (data
+ * structure 2) in O(1) time.
*
* If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
* linear order which requires O(N) search time for each CR 0 and CR 1 phase.
@@ -695,13 +698,10 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
for (i = 0; i < max; i++) {
if (mb_test_bit(i, buddy)) {
- /* only single bit in buddy2 may be 1 */
+ /* only single bit in buddy2 may be 0 */
if (!mb_test_bit(i << 1, buddy2)) {
MB_CHECK_ASSERT(
mb_test_bit((i<<1)+1, buddy2));
- } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
- MB_CHECK_ASSERT(
- mb_test_bit(i << 1, buddy2));
}
continue;
}
@@ -805,65 +805,51 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
}
}
-static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new,
- int (*cmp)(struct rb_node *, struct rb_node *))
+static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
{
- struct rb_node **iter = &root->rb_node, *parent = NULL;
-
- while (*iter) {
- parent = *iter;
- if (cmp(new, *iter) > 0)
- iter = &((*iter)->rb_left);
- else
- iter = &((*iter)->rb_right);
- }
+ int order;
- rb_link_node(new, parent, iter);
- rb_insert_color(new, root);
-}
-
-static int
-ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2)
-{
- struct ext4_group_info *grp1 = rb_entry(rb1,
- struct ext4_group_info,
- bb_avg_fragment_size_rb);
- struct ext4_group_info *grp2 = rb_entry(rb2,
- struct ext4_group_info,
- bb_avg_fragment_size_rb);
- int num_frags_1, num_frags_2;
-
- num_frags_1 = grp1->bb_fragments ?
- grp1->bb_free / grp1->bb_fragments : 0;
- num_frags_2 = grp2->bb_fragments ?
- grp2->bb_free / grp2->bb_fragments : 0;
-
- return (num_frags_2 - num_frags_1);
+ /*
+ * We don't maintain dedicated lists for groups whose free extents
+ * are only one block long, nor for completely empty groups.
+ */
+ order = fls(len) - 2;
+ if (order < 0)
+ return 0;
+ if (order == MB_NUM_ORDERS(sb))
+ order--;
+ return order;
}
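The bucket mapping above is cheap to verify in isolation. A standalone sketch (fls32() re-implements the kernel's fls() for userspace; 14 stands in for MB_NUM_ORDERS(sb) on a 4K-block filesystem, which is an assumption of this example):

#include <stdio.h>

static int fls32(unsigned int x)	/* position of the MSB, 1-based */
{
	return x ? 32 - __builtin_clz(x) : 0;
}

static int avg_fragment_size_order(unsigned int len, int mb_num_orders)
{
	int order = fls32(len) - 2;

	if (order < 0)
		return 0;	/* 1-block extents share list 0 */
	if (order == mb_num_orders)
		order--;	/* clamp to the last list */
	return order;
}

int main(void)
{
	unsigned int len;

	/* lens 1-3 map to list 0, 4-7 to list 1, 8-15 to list 2, ... */
	for (len = 1; len <= 4096; len <<= 1)
		printf("avg fragment %4u blocks -> list %d\n",
		       len, avg_fragment_size_order(len, 14));
	return 0;
}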
-/*
- * Reinsert grpinfo into the avg_fragment_size tree with new average
- * fragment size.
- */
+/* Move group to appropriate avg_fragment_size list */
static void
mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int new_order;
if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
return;
- write_lock(&sbi->s_mb_rb_lock);
- if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) {
- rb_erase(&grp->bb_avg_fragment_size_rb,
- &sbi->s_mb_avg_fragment_size_root);
- RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb);
- }
+ new_order = mb_avg_fragment_size_order(sb,
+ grp->bb_free / grp->bb_fragments);
+ if (new_order == grp->bb_avg_fragment_size_order)
+ return;
- ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root,
- &grp->bb_avg_fragment_size_rb,
- ext4_mb_avg_fragment_size_cmp);
- write_unlock(&sbi->s_mb_rb_lock);
+ if (grp->bb_avg_fragment_size_order != -1) {
+ write_lock(&sbi->s_mb_avg_fragment_size_locks[
+ grp->bb_avg_fragment_size_order]);
+ list_del(&grp->bb_avg_fragment_size_node);
+ write_unlock(&sbi->s_mb_avg_fragment_size_locks[
+ grp->bb_avg_fragment_size_order]);
+ }
+ grp->bb_avg_fragment_size_order = new_order;
+ write_lock(&sbi->s_mb_avg_fragment_size_locks[
+ grp->bb_avg_fragment_size_order]);
+ list_add_tail(&grp->bb_avg_fragment_size_node,
+ &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
+ write_unlock(&sbi->s_mb_avg_fragment_size_locks[
+ grp->bb_avg_fragment_size_order]);
}
/*
@@ -912,86 +898,55 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
*new_cr = 1;
} else {
*group = grp->bb_group;
- ac->ac_last_optimal_group = *group;
ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
}
}
/*
- * Choose next group by traversing average fragment size tree. Updates *new_cr
- * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that
- * the linear search should continue for one iteration since there's lock
- * contention on the rb tree lock.
+ * Choose next group by traversing average fragment size list of suitable
+ * order. Updates *new_cr if cr level needs an update.
*/
static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- int avg_fragment_size, best_so_far;
- struct rb_node *node, *found;
- struct ext4_group_info *grp;
-
- /*
- * If there is contention on the lock, instead of waiting for the lock
- * to become available, just continue searching lineraly. We'll resume
- * our rb tree search later starting at ac->ac_last_optimal_group.
- */
- if (!read_trylock(&sbi->s_mb_rb_lock)) {
- ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR;
- return;
- }
+ struct ext4_group_info *grp = NULL, *iter;
+ int i;
if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
if (sbi->s_mb_stats)
atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
- /* We have found something at CR 1 in the past */
- grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group);
- for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL;
- found = rb_next(found)) {
- grp = rb_entry(found, struct ext4_group_info,
- bb_avg_fragment_size_rb);
+ }
+
+ for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
+ i < MB_NUM_ORDERS(ac->ac_sb); i++) {
+ if (list_empty(&sbi->s_mb_avg_fragment_size[i]))
+ continue;
+ read_lock(&sbi->s_mb_avg_fragment_size_locks[i]);
+ if (list_empty(&sbi->s_mb_avg_fragment_size[i])) {
+ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
+ continue;
+ }
+ list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i],
+ bb_avg_fragment_size_node) {
if (sbi->s_mb_stats)
atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
- if (likely(ext4_mb_good_group(ac, grp->bb_group, 1)))
+ if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) {
+ grp = iter;
break;
- }
- goto done;
- }
-
- node = sbi->s_mb_avg_fragment_size_root.rb_node;
- best_so_far = 0;
- found = NULL;
-
- while (node) {
- grp = rb_entry(node, struct ext4_group_info,
- bb_avg_fragment_size_rb);
- avg_fragment_size = 0;
- if (ext4_mb_good_group(ac, grp->bb_group, 1)) {
- avg_fragment_size = grp->bb_fragments ?
- grp->bb_free / grp->bb_fragments : 0;
- if (!best_so_far || avg_fragment_size < best_so_far) {
- best_so_far = avg_fragment_size;
- found = node;
}
}
- if (avg_fragment_size > ac->ac_g_ex.fe_len)
- node = node->rb_right;
- else
- node = node->rb_left;
+ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
+ if (grp)
+ break;
}
-done:
- if (found) {
- grp = rb_entry(found, struct ext4_group_info,
- bb_avg_fragment_size_rb);
+ if (grp) {
*group = grp->bb_group;
ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
} else {
*new_cr = 2;
}
-
- read_unlock(&sbi->s_mb_rb_lock);
- ac->ac_last_optimal_group = *group;
}
static inline int should_optimize_scan(struct ext4_allocation_context *ac)
@@ -1020,11 +975,6 @@ next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
goto inc_and_return;
}
- if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) {
- ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR;
- goto inc_and_return;
- }
-
return group;
inc_and_return:
/*
@@ -1052,8 +1002,10 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
{
*new_cr = ac->ac_criteria;
- if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining)
+ if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
+ *group = next_linear_group(ac, *group, ngroups);
return;
+ }
if (*new_cr == 0) {
ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
@@ -1078,23 +1030,25 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
struct ext4_sb_info *sbi = EXT4_SB(sb);
int i;
- if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) {
+ for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
+ if (grp->bb_counters[i] > 0)
+ break;
+ /* No need to move between order lists? */
+ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
+ i == grp->bb_largest_free_order) {
+ grp->bb_largest_free_order = i;
+ return;
+ }
+
+ if (grp->bb_largest_free_order >= 0) {
write_lock(&sbi->s_mb_largest_free_orders_locks[
grp->bb_largest_free_order]);
list_del_init(&grp->bb_largest_free_order_node);
write_unlock(&sbi->s_mb_largest_free_orders_locks[
grp->bb_largest_free_order]);
}
- grp->bb_largest_free_order = -1; /* uninit */
-
- for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) {
- if (grp->bb_counters[i] > 0) {
- grp->bb_largest_free_order = i;
- break;
- }
- }
- if (test_opt2(sb, MB_OPTIMIZE_SCAN) &&
- grp->bb_largest_free_order >= 0 && grp->bb_free) {
+ grp->bb_largest_free_order = i;
+ if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
write_lock(&sbi->s_mb_largest_free_orders_locks[
grp->bb_largest_free_order]);
list_add_tail(&grp->bb_largest_free_order_node,
@@ -1151,13 +1105,13 @@ void ext4_mb_generate_buddy(struct super_block *sb,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
}
mb_set_largest_free_order(sb, grp);
+ mb_update_avg_fragment_size(sb, grp);
clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
period = get_cycles() - period;
atomic_inc(&sbi->s_mb_buddies_generated);
atomic64_add(period, &sbi->s_mb_generation_time);
- mb_update_avg_fragment_size(sb, grp);
}
/* The buddy information is attached the buddy cache inode
@@ -1936,6 +1890,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
unsigned ret = 0;
int len0 = len;
void *buddy;
+ bool split = false;
BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
BUG_ON(e4b->bd_group != ex->fe_group);
@@ -1960,12 +1915,16 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
/* let's maintain buddy itself */
while (len) {
- ord = mb_find_order_for_block(e4b, start);
+ if (!split)
+ ord = mb_find_order_for_block(e4b, start);
if (((start >> ord) << ord) == start && len >= (1 << ord)) {
/* the whole chunk may be allocated at once! */
mlen = 1 << ord;
- buddy = mb_find_buddy(e4b, ord, &max);
+ if (!split)
+ buddy = mb_find_buddy(e4b, ord, &max);
+ else
+ split = false;
BUG_ON((start >> ord) >= max);
mb_set_bit(start >> ord, buddy);
e4b->bd_info->bb_counters[ord]--;
@@ -1992,6 +1951,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
mb_clear_bit(cur + 1, buddy);
e4b->bd_info->bb_counters[ord]++;
e4b->bd_info->bb_counters[ord]++;
+ split = true;
}
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
@@ -2633,7 +2593,7 @@ static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
ext4_group_t prefetch_grp = 0, ngroups, group, i;
- int cr = -1;
+ int cr = -1, new_cr;
int err = 0, first_err = 0;
unsigned int nr = 0, prefetch_ios = 0;
struct ext4_sb_info *sbi;
@@ -2704,17 +2664,14 @@ repeat:
* from the goal value specified
*/
group = ac->ac_g_ex.fe_group;
- ac->ac_last_optimal_group = group;
ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
prefetch_grp = group;
- for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups),
- i++) {
- int ret = 0, new_cr;
+ for (i = 0, new_cr = cr; i < ngroups; i++,
+ ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
+ int ret = 0;
cond_resched();
-
- ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups);
if (new_cr != cr) {
cr = new_cr;
goto repeat;
@@ -2919,7 +2876,7 @@ const struct seq_operations ext4_mb_seq_groups_ops = {
int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
{
- struct super_block *sb = (struct super_block *)seq->private;
+ struct super_block *sb = seq->private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
seq_puts(seq, "mballoc:\n");
@@ -2988,9 +2945,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock)
struct super_block *sb = pde_data(file_inode(seq->file));
unsigned long position;
- read_lock(&EXT4_SB(sb)->s_mb_rb_lock);
-
- if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
+ if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
return NULL;
position = *pos + 1;
return (void *) ((unsigned long) position);
@@ -3002,7 +2957,7 @@ static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, lof
unsigned long position;
++*pos;
- if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
+ if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
return NULL;
position = *pos + 1;
return (void *) ((unsigned long) position);
@@ -3014,29 +2969,22 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned long position = ((unsigned long) v);
struct ext4_group_info *grp;
- struct rb_node *n;
- unsigned int count, min, max;
+ unsigned int count;
position--;
if (position >= MB_NUM_ORDERS(sb)) {
- seq_puts(seq, "fragment_size_tree:\n");
- n = rb_first(&sbi->s_mb_avg_fragment_size_root);
- if (!n) {
- seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n");
- return 0;
- }
- grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
- min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
- count = 1;
- while (rb_next(n)) {
- count++;
- n = rb_next(n);
- }
- grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
- max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
+ position -= MB_NUM_ORDERS(sb);
+ if (position == 0)
+ seq_puts(seq, "avg_fragment_size_lists:\n");
- seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n",
- min, max, count);
+ count = 0;
+ read_lock(&sbi->s_mb_avg_fragment_size_locks[position]);
+ list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position],
+ bb_avg_fragment_size_node)
+ count++;
+ read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]);
+ seq_printf(seq, "\tlist_order_%u_groups: %u\n",
+ (unsigned int)position, count);
return 0;
}
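Each group now sits on one of MB_NUM_ORDERS(sb) plain lists keyed by the order of its average fragment size (bb_free / bb_fragments, the same ratio the deleted rb-tree code computed), guarded by the per-order rwlocks read-locked above. A rough, self-contained model of the bucketing; the kernel's exact helper and rounding may differ:

    /* Bucket an average free-fragment size into one of num_orders lists;
     * avg_frag_order() is an illustrative name, not the kernel helper. */
    static int avg_frag_order(unsigned int avg_frag_clusters, int num_orders)
    {
        int order = 0;

        while (order < num_orders - 1 && (2U << order) <= avg_frag_clusters)
            order++;
        return order;   /* index into s_mb_avg_fragment_size[] */
    }

For example, a group with bb_free = 512 spread over bb_fragments = 8 extents (average 64 clusters) lands on list 6; mb_update_avg_fragment_size() re-files the group whenever that ratio changes.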
@@ -3046,9 +2994,11 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
seq_puts(seq, "max_free_order_lists:\n");
}
count = 0;
+ read_lock(&sbi->s_mb_largest_free_orders_locks[position]);
list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
bb_largest_free_order_node)
count++;
+ read_unlock(&sbi->s_mb_largest_free_orders_locks[position]);
seq_printf(seq, "\tlist_order_%u_groups: %u\n",
(unsigned int)position, count);
@@ -3056,11 +3006,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
}
static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
-__releases(&EXT4_SB(sb)->s_mb_rb_lock)
{
- struct super_block *sb = pde_data(file_inode(seq->file));
-
- read_unlock(&EXT4_SB(sb)->s_mb_rb_lock);
}
const struct seq_operations ext4_mb_seq_structs_summary_ops = {
@@ -3173,8 +3119,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
init_rwsem(&meta_group_info[i]->alloc_sem);
meta_group_info[i]->bb_free_root = RB_ROOT;
INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
- RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb);
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
+ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */
meta_group_info[i]->bb_group = group;
mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
@@ -3423,7 +3370,24 @@ int ext4_mb_init(struct super_block *sb)
i++;
} while (i < MB_NUM_ORDERS(sb));
- sbi->s_mb_avg_fragment_size_root = RB_ROOT;
+ sbi->s_mb_avg_fragment_size =
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!sbi->s_mb_avg_fragment_size) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ sbi->s_mb_avg_fragment_size_locks =
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
+ GFP_KERNEL);
+ if (!sbi->s_mb_avg_fragment_size_locks) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
+ INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
+ rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
+ }
sbi->s_mb_largest_free_orders =
kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
GFP_KERNEL);
@@ -3442,7 +3406,6 @@ int ext4_mb_init(struct super_block *sb)
INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
}
- rwlock_init(&sbi->s_mb_rb_lock);
spin_lock_init(&sbi->s_md_lock);
sbi->s_mb_free_pending = 0;
@@ -3498,7 +3461,7 @@ int ext4_mb_init(struct super_block *sb)
spin_lock_init(&lg->lg_prealloc_lock);
}
- if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev)))
+ if (bdev_nonrot(sb->s_bdev))
sbi->s_mb_max_linear_groups = 0;
else
sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
@@ -3513,6 +3476,8 @@ out_free_locality_groups:
free_percpu(sbi->s_locality_groups);
sbi->s_locality_groups = NULL;
out:
+ kfree(sbi->s_mb_avg_fragment_size);
+ kfree(sbi->s_mb_avg_fragment_size_locks);
kfree(sbi->s_mb_largest_free_orders);
kfree(sbi->s_mb_largest_free_orders_locks);
kfree(sbi->s_mb_offsets);
@@ -3579,6 +3544,8 @@ int ext4_mb_release(struct super_block *sb)
kvfree(group_info);
rcu_read_unlock();
}
+ kfree(sbi->s_mb_avg_fragment_size);
+ kfree(sbi->s_mb_avg_fragment_size_locks);
kfree(sbi->s_mb_largest_free_orders);
kfree(sbi->s_mb_largest_free_orders_locks);
kfree(sbi->s_mb_offsets);
@@ -3629,7 +3596,7 @@ static inline int ext4_issue_discard(struct super_block *sb,
return __blkdev_issue_discard(sb->s_bdev,
(sector_t)discard_block << (sb->s_blocksize_bits - 9),
(sector_t)count << (sb->s_blocksize_bits - 9),
- GFP_NOFS, 0, biop);
+ GFP_NOFS, biop);
} else
return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
}
@@ -4107,6 +4074,15 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
size = size >> bsbits;
start = start_off >> bsbits;
+ /*
+ * For tiny groups (smaller than 8MB) the chosen allocation
+ * alignment may be larger than group size. Make sure the
+ * alignment does not move allocation to a different group which
+ * makes mballoc fail assertions later.
+ */
+ start = max(start, rounddown(ac->ac_o_ex.fe_logical,
+ (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
+
/* don't cover already allocated blocks in selected range */
if (ar->pleft && start <= ar->lleft) {
size -= ar->lleft + 1 - start;
@@ -4179,7 +4155,22 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
}
rcu_read_unlock();
- if (start + size <= ac->ac_o_ex.fe_logical &&
+ /*
+ * In this function "start" and "size" are normalized for better
+ * alignment and length such that we could preallocate more blocks.
+ * This normalization is done such that the original request of
+ * ac->ac_o_ex.fe_logical & fe_len should always lie within the "start"
+ * and "size" boundaries.
+ * (Note fe_len can be relaxed since the FS block allocation API does not
+ * guarantee the number of contiguous blocks allocated; that depends upon
+ * the free space left, etc.)
+ * In case of inode pa, later we use the allocated blocks
+ * [pa_start + fe_logical - pa_lstart, fe_len/size] from the preallocated
+ * range of goal/best blocks [start, size] to put it at the
+ * ac_o_ex.fe_logical extent of this inode.
+ * (See ext4_mb_use_inode_pa() for more details)
+ */
+ if (start + size <= ac->ac_o_ex.fe_logical ||
start > ac->ac_o_ex.fe_logical) {
ext4_msg(ac->ac_sb, KERN_ERR,
"start %lu, size %lu, fe_logical %lu",
@@ -5166,6 +5157,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int bsbits = ac->ac_sb->s_blocksize_bits;
loff_t size, isize;
+ bool inode_pa_eligible, group_pa_eligible;
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return;
@@ -5173,25 +5165,27 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
return;
+ group_pa_eligible = sbi->s_mb_group_prealloc > 0;
+ inode_pa_eligible = true;
size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
+ /* No point in using inode preallocation for closed files */
if ((size == isize) && !ext4_fs_is_busy(sbi) &&
- !inode_is_open_for_write(ac->ac_inode)) {
- ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
- return;
- }
-
- if (sbi->s_mb_group_prealloc <= 0) {
- ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
- return;
- }
+ !inode_is_open_for_write(ac->ac_inode))
+ inode_pa_eligible = false;
- /* don't use group allocation for large files */
size = max(size, isize);
- if (size > sbi->s_mb_stream_request) {
- ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ /* Don't use group allocation for large files */
+ if (size > sbi->s_mb_stream_request)
+ group_pa_eligible = false;
+
+ if (!group_pa_eligible) {
+ if (inode_pa_eligible)
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ else
+ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
}
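Rather than returning early from a chain of special cases, the rewritten ext4_mb_group_or_file() first computes both eligibilities: group PA requires s_mb_group_prealloc > 0 and size <= s_mb_stream_request, while inode PA is ruled out only for a closed, fully written file on a non-busy filesystem. The resulting choice reduces to a small table; a condensed model with illustrative names:

    enum pa_strategy { PA_GROUP, PA_INODE_STREAM, PA_NONE };

    static enum pa_strategy pick_strategy(bool group_pa_eligible,
                                          bool inode_pa_eligible)
    {
        if (group_pa_eligible)
            return PA_GROUP;            /* small file: locality-group PA */
        if (inode_pa_eligible)
            return PA_INODE_STREAM;     /* EXT4_MB_STREAM_ALLOC */
        return PA_NONE;                 /* EXT4_MB_HINT_NOPREALLOC */
    }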
@@ -5538,6 +5532,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
ext4_fsblk_t block = 0;
unsigned int inquota = 0;
unsigned int reserv_clstrs = 0;
+ int retries = 0;
u64 seq;
might_sleep();
@@ -5640,7 +5635,8 @@ repeat:
ar->len = ac->ac_b_ex.fe_len;
}
} else {
- if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
+ if (++retries < 3 &&
+ ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
goto repeat;
/*
* If block allocation fails then the pa allocated above
@@ -5907,6 +5903,15 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
sbi = EXT4_SB(sb);
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
+ ext4_error(sb, "Freeing blocks in system zone - "
+ "Block = %llu, count = %lu", block, count);
+ /* err = 0. ext4_std_error should be a no op */
+ goto error_return;
+ }
+ flags |= EXT4_FREE_BLOCKS_VALIDATED;
+
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -5923,6 +5928,8 @@ do_more:
overflow = EXT4_C2B(sbi, bit) + count -
EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
}
count_clusters = EXT4_NUM_B2C(sbi, count);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
@@ -5937,7 +5944,8 @@ do_more:
goto error_return;
}
- if (!ext4_inode_block_valid(inode, block, count)) {
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
@@ -6060,6 +6068,8 @@ do_more:
block += count;
count = overflow;
put_bh(bitmap_bh);
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
goto do_more;
}
error_return:
@@ -6106,6 +6116,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
"block = %llu, count = %lu", block, count);
return;
}
+ flags |= EXT4_FREE_BLOCKS_VALIDATED;
ext4_debug("freeing block %llu\n", block);
trace_ext4_free_blocks(inode, block, count, flags);
@@ -6137,6 +6148,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
block -= overflow;
count += overflow;
}
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
}
overflow = EXT4_LBLK_COFF(sbi, count);
if (overflow) {
@@ -6147,6 +6160,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
return;
} else
count += sbi->s_cluster_ratio - overflow;
+ /* The range changed so it's no longer validated */
+ flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
}
if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
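EXT4_FREE_BLOCKS_VALIDATED works as a memo: "this exact [block, block + count) range already passed ext4_inode_block_valid()". Every place above that shifts, clips, or extends the range clears the flag so the system-zone check is repeated for the new values. A self-contained sketch of the rule (names are illustrative):

    #define FREE_BLOCKS_VALIDATED 0x1   /* models EXT4_FREE_BLOCKS_VALIDATED */

    struct free_req {
        unsigned long long block;       /* first block to free */
        unsigned long count;            /* number of blocks */
        unsigned int flags;
    };

    /* Any mutation of the range drops the memoized validation. */
    static void change_range(struct free_req *req, unsigned long long block,
                             unsigned long count)
    {
        req->block = block;
        req->count = count;
        req->flags &= ~FREE_BLOCKS_VALIDATED;
    }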
@@ -6398,6 +6413,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
* @start: first group block to examine
* @max: last group block to examine
* @minblocks: minimum extent block count
+ * @set_trimmed: set the trimmed flag if at least one block is trimmed
*
* ext4_trim_all_free walks through group's block bitmap searching for free
* extents. When the free extent is found, mark it as used in group buddy
@@ -6407,7 +6423,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_grpblk_t start, ext4_grpblk_t max,
- ext4_grpblk_t minblocks)
+ ext4_grpblk_t minblocks, bool set_trimmed)
{
struct ext4_buddy e4b;
int ret;
@@ -6426,7 +6442,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
minblocks < EXT4_SB(sb)->s_last_trim_minblks) {
ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
- if (ret >= 0)
+ if (ret >= 0 && set_trimmed)
EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
} else {
ret = 0;
@@ -6455,7 +6471,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
*/
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev);
struct ext4_group_info *grp;
ext4_group_t group, first_group, last_group;
ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
@@ -6463,6 +6479,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
ext4_fsblk_t first_data_blk =
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
+ bool whole_group, eof = false;
int ret = 0;
start = range->start >> sb->s_blocksize_bits;
@@ -6475,14 +6492,16 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
range->len < sb->s_blocksize)
return -EINVAL;
/* No point in trying to trim less than the discard granularity */
- if (range->minlen < q->limits.discard_granularity) {
+ if (range->minlen < discard_granularity) {
minlen = EXT4_NUM_B2C(EXT4_SB(sb),
- q->limits.discard_granularity >> sb->s_blocksize_bits);
+ discard_granularity >> sb->s_blocksize_bits);
if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
goto out;
}
- if (end >= max_blks)
+ if (end >= max_blks - 1) {
end = max_blks - 1;
+ eof = true;
+ }
if (end <= first_data_blk)
goto out;
if (start < first_data_blk)
@@ -6496,6 +6515,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
/* end now represents the last cluster to discard in this group */
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+ whole_group = true;
for (group = first_group; group <= last_group; group++) {
grp = ext4_get_group_info(sb, group);
@@ -6512,12 +6532,13 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
* change it for the last group, note that last_cluster is
* already computed earlier by ext4_get_group_no_and_offset()
*/
- if (group == last_group)
+ if (group == last_group) {
end = last_cluster;
-
+ whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+ }
if (grp->bb_free >= minlen) {
cnt = ext4_trim_all_free(sb, group, first_cluster,
- end, minlen);
+ end, minlen, whole_group);
if (cnt < 0) {
ret = cnt;
break;
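With the set_trimmed parameter, EXT4_MB_GRP_SET_TRIMMED is only recorded when the group really was trimmed end to end: every group except the last is fully covered, and the last group counts as whole either when the request was clamped at end-of-filesystem (eof) or when it ends exactly at the group's final cluster. A compact model of that decision:

    #include <stdbool.h>

    /* Only a fully covered group may keep the WAS_TRIMMED optimization. */
    static bool trimmed_whole_group(bool last_group, bool eof,
                                    unsigned int end_cluster,
                                    unsigned int clusters_per_group)
    {
        if (!last_group)
            return true;    /* middle groups are trimmed end to end */
        return eof || end_cluster == clusters_per_group - 1;
    }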
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 39da92ceabf8..dcda2a943cee 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -178,7 +178,6 @@ struct ext4_allocation_context {
/* copy of the best found extent taken before preallocation efforts */
struct ext4_free_extent ac_f_ex;
- ext4_group_t ac_last_optimal_group;
__u32 ac_groups_considered;
__u32 ac_flags; /* allocation hints */
__u16 ac_groups_scanned;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 7a5353a8cfd7..0a220ec9862d 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -56,8 +56,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
err_out:
up_write((&EXT4_I(inode)->i_data_sem));
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
lb->first_pblock = 0;
return retval;
}
@@ -417,7 +416,7 @@ int ext4_ext_migrate(struct inode *inode)
struct inode *tmp_inode = NULL;
struct migrate_struct lb;
unsigned long max_entries;
- __u32 goal;
+ __u32 goal, tmp_csum_seed;
uid_t owner[2];
/*
@@ -438,7 +437,7 @@ int ext4_ext_migrate(struct inode *inode)
/*
* Worst case we can touch the allocation bitmaps and a block
- * group descriptor block. We do need need to worry about
+ * group descriptor block. We do need to worry about
* credits for modifying the quota inode.
*/
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
@@ -465,6 +464,7 @@ int ext4_ext_migrate(struct inode *inode)
* the migration.
*/
ei = EXT4_I(inode);
+ tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed;
EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed;
i_size_write(tmp_inode, i_size_read(inode));
/*
@@ -575,6 +575,7 @@ err_out:
* the inode is not visible to user space.
*/
tmp_inode->i_blocks = 0;
+ EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed;
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index cebea4270817..9af68a7ecdcf 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh);
+ submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
@@ -127,7 +127,7 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
*/
static int kmmpd(void *data)
{
- struct super_block *sb = (struct super_block *) data;
+ struct super_block *sb = data;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh;
struct mmp_struct *mmp;
@@ -150,8 +150,6 @@ static int kmmpd(void *data)
mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
EXT4_MMP_MIN_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
- BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
- bdevname(bh->b_bdev, mmp->mmp_bdevname);
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
sizeof(mmp->mmp_nodename));
@@ -372,13 +370,16 @@ skip:
EXT4_SB(sb)->s_mmp_bh = bh;
+ BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
+ snprintf(mmp->mmp_bdevname, sizeof(mmp->mmp_bdevname),
+ "%pg", bh->b_bdev);
+
/*
* Start a kernel thread to update the MMP block periodically.
*/
EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s",
(int)sizeof(mmp->mmp_bdevname),
- bdevname(bh->b_bdev,
- mmp->mmp_bdevname));
+ mmp->mmp_bdevname);
if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
EXT4_SB(sb)->s_mmp_tsk = NULL;
ext4_warning(sb, "Unable to create kmmpd thread for %s.",
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 95aa212f0863..044e34cd835c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "ext4_extents.h"
@@ -31,8 +32,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
if (IS_ERR(path))
return PTR_ERR(path);
if (path[ext_depth(inode)].p_ext == NULL) {
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
*ppath = NULL;
return -ENODATA;
}
@@ -102,12 +102,10 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
if (unwritten != ext4_ext_is_unwritten(ext))
goto out;
from += ext4_ext_get_actual_len(ext);
- ext4_ext_drop_refs(path);
}
ret = 1;
out:
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return ret;
}
@@ -127,7 +125,7 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
pgoff_t index1, pgoff_t index2, struct page *page[2])
{
struct address_space *mapping[2];
- unsigned fl = AOP_FLAG_NOFS;
+ unsigned int flags;
BUG_ON(!inode1 || !inode2);
if (inode1 < inode2) {
@@ -139,11 +137,15 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
mapping[1] = inode1->i_mapping;
}
- page[0] = grab_cache_page_write_begin(mapping[0], index1, fl);
- if (!page[0])
+ flags = memalloc_nofs_save();
+ page[0] = grab_cache_page_write_begin(mapping[0], index1);
+ if (!page[0]) {
+ memalloc_nofs_restore(flags);
return -ENOMEM;
+ }
- page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
+ page[1] = grab_cache_page_write_begin(mapping[1], index2);
+ memalloc_nofs_restore(flags);
if (!page[1]) {
unlock_page(page[0]);
put_page(page[0]);
@@ -467,19 +469,17 @@ mext_check_arguments(struct inode *orig_inode,
if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
return -EPERM;
- /* Ext4 move extent does not support swapfile */
+ /* Ext4 move extent does not support swap files */
if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
- ext4_debug("ext4 move extent: The argument files should "
- "not be swapfile [ino:orig %lu, donor %lu]\n",
+ ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
- return -EBUSY;
+ return -ETXTBSY;
}
if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) {
- ext4_debug("ext4 move extent: The argument files should "
- "not be quota files [ino:orig %lu, donor %lu]\n",
+ ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n",
orig_inode->i_ino, donor_inode->i_ino);
- return -EBUSY;
+ return -EOPNOTSUPP;
}
/* Ext4 move extent supports only extent-based files */
@@ -626,11 +626,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
if (ret)
goto out;
ex = path[path->p_depth].p_ext;
- next_blk = ext4_ext_next_allocated_block(path);
cur_blk = le32_to_cpu(ex->ee_block);
cur_len = ext4_ext_get_actual_len(ex);
/* Check hole before the start pos */
if (cur_blk + cur_len - 1 < o_start) {
+ next_blk = ext4_ext_next_allocated_block(path);
if (next_blk == EXT_MAX_BLOCKS) {
ret = -ENODATA;
goto out;
@@ -658,14 +658,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
donor_page_index = d_start >> (PAGE_SHIFT -
donor_inode->i_blkbits);
offset_in_page = o_start % blocks_per_page;
- if (cur_len > blocks_per_page- offset_in_page)
+ if (cur_len > blocks_per_page - offset_in_page)
cur_len = blocks_per_page - offset_in_page;
/*
* Up semaphore to avoid following problems:
* a. transaction deadlock among ext4_journal_start,
* ->write_begin via pagefault, and jbd2_journal_commit
- * b. racing with ->readpage, ->write_begin, and ext4_get_block
- * in move_extent_per_page
+ * b. racing with ->read_folio, ->write_begin, and
+ * ext4_get_block in move_extent_per_page
*/
ext4_double_up_write_data_sem(orig_inode, donor_inode);
/* Swap original branches with new branches */
@@ -689,8 +689,7 @@ out:
ext4_discard_preallocations(donor_inode, 0);
}
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
unlock_two_nondirectories(orig_inode, donor_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index e37da8d5cd0c..4183a4cb4a21 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -54,6 +54,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
struct inode *inode,
ext4_lblk_t *block)
{
+ struct ext4_map_blocks map;
struct buffer_head *bh;
int err;
@@ -63,21 +64,41 @@ static struct buffer_head *ext4_append(handle_t *handle,
return ERR_PTR(-ENOSPC);
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+ map.m_lblk = *block;
+ map.m_len = 1;
+
+ /*
+ * We're appending new directory block. Make sure the block is not
+ * allocated yet, otherwise we will end up corrupting the
+ * directory.
+ */
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err < 0)
+ return ERR_PTR(err);
+ if (err) {
+ EXT4_ERROR_INODE(inode, "Logical block already allocated");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
if (IS_ERR(bh))
return bh;
inode->i_size += inode->i_sb->s_blocksize;
EXT4_I(inode)->i_disksize = inode->i_size;
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err)
+ goto out;
BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
EXT4_JTR_NONE);
- if (err) {
- brelse(bh);
- ext4_std_error(inode->i_sb, err);
- return ERR_PTR(err);
- }
+ if (err)
+ goto out;
return bh;
+
+out:
+ brelse(bh);
+ ext4_std_error(inode->i_sb, err);
+ return ERR_PTR(err);
}
static int ext4_dx_csum_verify(struct inode *inode,
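The lookup in ext4_append() relies on the usual ext4_map_blocks() convention: with a NULL handle and no EXT4_GET_BLOCKS_CREATE flag it is a pure query, returning the number of blocks mapped, 0 for a hole, or a negative error. An annotated sketch of the call as used above:

    struct ext4_map_blocks map = {
        .m_lblk = *block,   /* logical block we are about to append */
        .m_len  = 1,
    };

    err = ext4_map_blocks(NULL, inode, &map, 0);  /* query only, no allocation */
    if (err < 0)
        return ERR_PTR(err);            /* the lookup itself failed */
    if (err > 0)                        /* already mapped: corrupt directory */
        return ERR_PTR(-EFSCORRUPTED);
    /* err == 0: a hole, safe to allocate via EXT4_GET_BLOCKS_CREATE */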
@@ -110,6 +131,13 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
struct ext4_dir_entry *dirent;
int is_dx_block = 0;
+ if (block >= inode->i_size >> inode->i_blkbits) {
+ ext4_error_inode(inode, func, line, block,
+ "Attempting to read directory block (%u) that is past i_size (%llu)",
+ block, inode->i_size);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
bh = ERR_PTR(-EIO);
else
@@ -277,9 +305,9 @@ static struct dx_frame *dx_probe(struct ext4_filename *fname,
struct dx_hash_info *hinfo,
struct dx_frame *frame);
static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
- unsigned blocksize, struct dx_hash_info *hinfo,
- struct dx_map_entry map[]);
+static int dx_make_map(struct inode *dir, struct buffer_head *bh,
+ struct dx_hash_info *hinfo,
+ struct dx_map_entry *map_tail);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from,
char *to, struct dx_map_entry *offsets,
@@ -777,12 +805,14 @@ static struct dx_frame *
dx_probe(struct ext4_filename *fname, struct inode *dir,
struct dx_hash_info *hinfo, struct dx_frame *frame_in)
{
- unsigned count, indirect;
+ unsigned count, indirect, level, i;
struct dx_entry *at, *entries, *p, *q, *m;
struct dx_root *root;
struct dx_frame *frame = frame_in;
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
u32 hash;
+ ext4_lblk_t block;
+ ext4_lblk_t blocks[EXT4_HTREE_LEVEL];
memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
frame->bh = ext4_read_dirblock(dir, 0, INDEX);
@@ -854,6 +884,8 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
}
dxtrace(printk("Look up %x", hash));
+ level = 0;
+ blocks[0] = 0;
while (1) {
count = dx_get_count(entries);
if (!count || count > dx_get_limit(entries)) {
@@ -882,15 +914,27 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
dx_get_block(at)));
frame->entries = entries;
frame->at = at;
- if (!indirect--)
+
+ block = dx_get_block(at);
+ for (i = 0; i <= level; i++) {
+ if (blocks[i] == block) {
+ ext4_warning_inode(dir,
+ "dx entry: tree cycle block %u points back to block %u",
+ blocks[level], block);
+ goto fail;
+ }
+ }
+ if (++level > indirect)
return frame;
+ blocks[level] = block;
frame++;
- frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+ frame->bh = ext4_read_dirblock(dir, block, INDEX);
if (IS_ERR(frame->bh)) {
ret_err = (struct dx_frame *) frame->bh;
frame->bh = NULL;
goto fail;
}
+
entries = ((struct dx_node *) frame->bh->b_data)->entries;
if (dx_get_limit(entries) != dx_node_limit(dir)) {
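The blocks[] array turns dx_probe()'s descent into a cycle check: every index block visited on the path from the root is remembered, and revisiting any of them means the on-disk htree is corrupted. The core test, as a self-contained helper (illustrative name):

    #include <stdbool.h>

    /* Return true if @block already appears on the root-to-leaf path. */
    static bool seen_before(const unsigned int *blocks, unsigned int level,
                            unsigned int block)
    {
        unsigned int i;

        for (i = 0; i <= level; i++)
            if (blocks[i] == block)
                return true;    /* cycle: fail the lookup */
        return false;
    }

Since the walk is bounded by EXT4_HTREE_LEVEL, the quadratic cost of this scan is negligible.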
@@ -1249,15 +1293,23 @@ static inline int search_dirblock(struct buffer_head *bh,
* Create map of hash values, offsets, and sizes, stored at end of block.
* Returns number of entries mapped.
*/
-static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
- unsigned blocksize, struct dx_hash_info *hinfo,
+static int dx_make_map(struct inode *dir, struct buffer_head *bh,
+ struct dx_hash_info *hinfo,
struct dx_map_entry *map_tail)
{
int count = 0;
- char *base = (char *) de;
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)bh->b_data;
+ unsigned int buflen = bh->b_size;
+ char *base = bh->b_data;
struct dx_hash_info h = *hinfo;
- while ((char *) de < base + blocksize) {
+ if (ext4_has_metadata_csum(dir->i_sb))
+ buflen -= sizeof(struct ext4_dir_entry_tail);
+
+ while ((char *) de < base + buflen) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen,
+ ((char *)de) - base))
+ return -EFSCORRUPTED;
if (de->name_len && de->inode) {
if (ext4_hash_in_dirent(dir))
h.hash = EXT4_DIRENT_HASH(de);
@@ -1270,8 +1322,7 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
count++;
cond_resched();
}
- /* XXX: do we need to check rec_len == 0 case? -Chris */
- de = ext4_next_entry(de, blocksize);
+ de = ext4_next_entry(de, dir->i_sb->s_blocksize);
}
return count;
}
@@ -1466,10 +1517,10 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
de = (struct ext4_dir_entry_2 *)search_buf;
dlimit = search_buf + buf_size;
- while ((char *) de < dlimit) {
+ while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
- if ((char *) de + de->name_len <= dlimit &&
+ if (de->name + de->name_len <= dlimit &&
ext4_match(dir, fname, de)) {
/* found a match - just to be sure, do
* a full check */
@@ -1906,7 +1957,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
struct dx_hash_info *hinfo)
{
unsigned blocksize = dir->i_sb->s_blocksize;
- unsigned count, continued;
+ unsigned continued;
+ int count;
struct buffer_head *bh2;
ext4_lblk_t newblock;
u32 hash2;
@@ -1943,8 +1995,11 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
/* create map in the end of data2 block */
map = (struct dx_map_entry *) (data2 + blocksize);
- count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1,
- blocksize, hinfo, map);
+ count = dx_make_map(dir, *bh, hinfo, map);
+ if (count < 0) {
+ err = count;
+ goto journal_error;
+ }
map -= count;
dx_sort_map(map, count);
/* Ensure that neither split block is over half full */
@@ -2031,7 +2086,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
unsigned int offset = 0;
char *top;
- de = (struct ext4_dir_entry_2 *)buf;
+ de = buf;
top = buf + buf_size - reclen;
while ((char *) de <= top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
@@ -2587,7 +2642,7 @@ int ext4_generic_delete_entry(struct inode *dir,
i = 0;
pde = NULL;
- de = (struct ext4_dir_entry_2 *)entry_buf;
+ de = entry_buf;
while (i < buf_size - csum_size) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
entry_buf, buf_size, i))
@@ -3040,11 +3095,8 @@ bool ext4_empty_dir(struct inode *inode)
de = (struct ext4_dir_entry_2 *) (bh->b_data +
(offset & (sb->s_blocksize - 1)));
if (ext4_check_dir_entry(inode, NULL, de, bh,
- bh->b_data, bh->b_size, offset)) {
- offset = (offset | (sb->s_blocksize - 1)) + 1;
- continue;
- }
- if (le32_to_cpu(de->inode)) {
+ bh->b_data, bh->b_size, offset) ||
+ le32_to_cpu(de->inode)) {
brelse(bh);
return false;
}
@@ -3249,6 +3301,32 @@ out_trace:
return retval;
}
+static int ext4_init_symlink_block(handle_t *handle, struct inode *inode,
+ struct fscrypt_str *disk_link)
+{
+ struct buffer_head *bh;
+ char *kaddr;
+ int err = 0;
+
+ bh = ext4_bread(handle, inode, 0, EXT4_GET_BLOCKS_CREATE);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE);
+ if (err)
+ goto out;
+
+ kaddr = (char *)bh->b_data;
+ memcpy(kaddr, disk_link->name, disk_link->len);
+ inode->i_size = disk_link->len - 1;
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+out:
+ brelse(bh);
+ return err;
+}
+
static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, const char *symname)
{
@@ -3257,6 +3335,7 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir,
int err, len = strlen(symname);
int credits;
struct fscrypt_str disk_link;
+ int retries = 0;
if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
return -EIO;
@@ -3270,26 +3349,15 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
- /*
- * For non-fast symlinks, we just allocate inode and put it on
- * orphan list in the first transaction => we need bitmap,
- * group descriptor, sb, inode block, quota blocks, and
- * possibly selinux xattr blocks.
- */
- credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
- EXT4_XATTR_TRANS_BLOCKS;
- } else {
- /*
- * Fast symlink. We have to add entry to directory
- * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
- * allocate new inode (bitmap, group descriptor, inode block,
- * quota blocks, sb is already counted in previous macros).
- */
- credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
- }
-
+ /*
+ * EXT4_INDEX_EXTRA_TRANS_BLOCKS for addition of entry into the
+ * directory. +3 for inode, inode bitmap, group descriptor allocation.
+ * EXT4_DATA_TRANS_BLOCKS for the data block allocation and
+ * modification.
+ */
+ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
+retry:
inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFLNK|S_IRWXUGO,
&dentry->d_name, 0, NULL,
EXT4_HT_DIR, credits);
@@ -3297,7 +3365,8 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (IS_ERR(inode)) {
if (handle)
ext4_journal_stop(handle);
- return PTR_ERR(inode);
+ err = PTR_ERR(inode);
+ goto out_retry;
}
if (IS_ENCRYPTED(inode)) {
@@ -3305,75 +3374,44 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
goto err_drop_inode;
inode->i_op = &ext4_encrypted_symlink_inode_operations;
+ } else {
+ if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
+ inode->i_op = &ext4_symlink_inode_operations;
+ } else {
+ inode->i_op = &ext4_fast_symlink_inode_operations;
+ inode->i_link = (char *)&EXT4_I(inode)->i_data;
+ }
}
if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
- if (!IS_ENCRYPTED(inode))
- inode->i_op = &ext4_symlink_inode_operations;
- inode_nohighmem(inode);
- ext4_set_aops(inode);
- /*
- * We cannot call page_symlink() with transaction started
- * because it calls into ext4_write_begin() which can wait
- * for transaction commit if we are running out of space
- * and thus we deadlock. So we have to stop transaction now
- * and restart it when symlink contents is written.
- *
- * To keep fs consistent in case of crash, we have to put inode
- * to orphan list in the mean time.
- */
- drop_nlink(inode);
- err = ext4_orphan_add(handle, inode);
- if (handle)
- ext4_journal_stop(handle);
- handle = NULL;
- if (err)
- goto err_drop_inode;
- err = __page_symlink(inode, disk_link.name, disk_link.len, 1);
- if (err)
- goto err_drop_inode;
- /*
- * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
- * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
- */
- handle = ext4_journal_start(dir, EXT4_HT_DIR,
- EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- handle = NULL;
- goto err_drop_inode;
- }
- set_nlink(inode, 1);
- err = ext4_orphan_del(handle, inode);
+ /* alloc symlink block and fill it */
+ err = ext4_init_symlink_block(handle, inode, &disk_link);
if (err)
goto err_drop_inode;
} else {
/* clear the extent format for fast symlink */
ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
- if (!IS_ENCRYPTED(inode)) {
- inode->i_op = &ext4_fast_symlink_inode_operations;
- inode->i_link = (char *)&EXT4_I(inode)->i_data;
- }
memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name,
disk_link.len);
inode->i_size = disk_link.len - 1;
+ EXT4_I(inode)->i_disksize = inode->i_size;
}
- EXT4_I(inode)->i_disksize = inode->i_size;
err = ext4_add_nondir(handle, dentry, &inode);
if (handle)
ext4_journal_stop(handle);
- if (inode)
- iput(inode);
- goto out_free_encrypted_link;
+ iput(inode);
+ goto out_retry;
err_drop_inode:
- if (handle)
- ext4_journal_stop(handle);
clear_nlink(inode);
+ ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
+ if (handle)
+ ext4_journal_stop(handle);
iput(inode);
-out_free_encrypted_link:
+out_retry:
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
if (disk_link.name != (unsigned char *)symname)
kfree(disk_link.name);
return err;
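Symlink creation now follows the standard ext4 ENOSPC retry idiom: a failed attempt is fully torn down via the new out_retry path, and ext4_should_retry_alloc() decides (roughly, whether a journal commit might still free space) if the whole operation is worth re-running. A condensed model with a hypothetical create_symlink_once():

    static int create_symlink_with_retry(struct inode *dir,
                                         struct dentry *dentry,
                                         const char *symname)
    {
        int retries = 0;
        int err;

    retry:
        err = create_symlink_once(dir, dentry, symname);  /* hypothetical */
        if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
            goto retry;     /* retry while commits may free space */
        return err;
    }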
@@ -3455,6 +3493,9 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
struct buffer_head *bh;
if (!ext4_has_inline_data(inode)) {
+ struct ext4_dir_entry_2 *de;
+ unsigned int offset;
+
/* The first directory block must not be a hole, so
* treat it as DIRENT_HTREE
*/
@@ -3463,9 +3504,30 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
*retval = PTR_ERR(bh);
return NULL;
}
- *parent_de = ext4_next_entry(
- (struct ext4_dir_entry_2 *)bh->b_data,
- inode->i_sb->s_blocksize);
+
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
+ bh->b_size, 0) ||
+ le32_to_cpu(de->inode) != inode->i_ino ||
+ strcmp(".", de->name)) {
+ EXT4_ERROR_INODE(inode, "directory missing '.'");
+ brelse(bh);
+ *retval = -EFSCORRUPTED;
+ return NULL;
+ }
+ offset = ext4_rec_len_from_disk(de->rec_len,
+ inode->i_sb->s_blocksize);
+ de = ext4_next_entry(de, inode->i_sb->s_blocksize);
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
+ bh->b_size, offset) ||
+ le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
+ EXT4_ERROR_INODE(inode, "directory missing '..'");
+ brelse(bh);
+ *retval = -EFSCORRUPTED;
+ return NULL;
+ }
+ *parent_de = de;
+
return bh;
}
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 7de0612eb42d..69a9cf9137a6 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -181,8 +181,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
} else
brelse(iloc.bh);
- jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
- jbd_debug(4, "orphan inode %lu will point to %d\n",
+ ext4_debug("superblock will point to %lu\n", inode->i_ino);
+ ext4_debug("orphan inode %lu will point to %d\n",
inode->i_ino, NEXT_ORPHAN(inode));
out:
ext4_std_error(sb, err);
@@ -251,7 +251,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
}
mutex_lock(&sbi->s_orphan_lock);
- jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+ ext4_debug("remove inode %lu from orphan list\n", inode->i_ino);
prev = ei->i_orphan.prev;
list_del_init(&ei->i_orphan);
@@ -267,7 +267,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
ino_next = NEXT_ORPHAN(inode);
if (prev == &sbi->s_orphan) {
- jbd_debug(4, "superblock will point to %u\n", ino_next);
+ ext4_debug("superblock will point to %u\n", ino_next);
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, inode->i_sb,
sbi->s_sbh, EXT4_JTR_NONE);
@@ -286,7 +286,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
struct inode *i_prev =
&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
- jbd_debug(4, "orphan inode %lu will point to %u\n",
+ ext4_debug("orphan inode %lu will point to %u\n",
i_prev->i_ino, ino_next);
err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
if (err) {
@@ -332,8 +332,8 @@ static void ext4_process_orphan(struct inode *inode,
ext4_msg(sb, KERN_DEBUG,
"%s: truncating inode %lu to %lld bytes",
__func__, inode->i_ino, inode->i_size);
- jbd_debug(2, "truncating inode %lu to %lld bytes\n",
- inode->i_ino, inode->i_size);
+ ext4_debug("truncating inode %lu to %lld bytes\n",
+ inode->i_ino, inode->i_size);
inode_lock(inode);
truncate_inode_pages(inode->i_mapping, inode->i_size);
ret = ext4_truncate(inode);
@@ -353,8 +353,8 @@ static void ext4_process_orphan(struct inode *inode,
ext4_msg(sb, KERN_DEBUG,
"%s: deleting unreferenced inode %lu",
__func__, inode->i_ino);
- jbd_debug(2, "deleting unreferenced inode %lu\n",
- inode->i_ino);
+ ext4_debug("deleting unreferenced inode %lu\n",
+ inode->i_ino);
(*nr_orphans)++;
}
iput(inode); /* The delete magic happens here! */
@@ -391,7 +391,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
if (!es->s_last_orphan && !oi->of_blocks) {
- jbd_debug(4, "no orphan inodes to clean up\n");
+ ext4_debug("no orphan inodes to clean up\n");
return;
}
@@ -415,7 +415,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
"clearing orphan list.\n");
es->s_last_orphan = 0;
}
- jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+ ext4_debug("Skipping orphan recovery on fs with errors.\n");
return;
}
@@ -459,7 +459,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
* so, skip the rest.
*/
if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
- jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+ ext4_debug("Skipping orphan recovery on fs with errors.\n");
es->s_last_orphan = 0;
break;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 495ce59fb4ad..97fa7b4c645f 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -134,8 +134,10 @@ static void ext4_finish_bio(struct bio *bio)
continue;
}
clear_buffer_async_write(bh);
- if (bio->bi_status)
+ if (bio->bi_status) {
+ set_buffer_write_io_error(bh);
buffer_io_error(bh);
+ }
} while ((bh = bh->b_this_page) != head);
spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
if (!under_io) {
@@ -463,7 +465,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
/*
* In the first loop we prepare and mark buffers to submit. We have to
* mark all buffers in the page before submitting so that
- * end_page_writeback() cannot be called from ext4_bio_end_io() when IO
+ * end_page_writeback() cannot be called from ext4_end_bio() when IO
* on the first buffer finishes and we are still working on submitting
* the second buffer.
*/
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index af491e170c4a..3d21eae267fc 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -75,7 +75,7 @@ static void __read_end_io(struct bio *bio)
bio_for_each_segment_all(bv, bio, iter_all) {
page = bv->bv_page;
- /* PG_error was set if any post_read step failed */
+ /* PG_error was set if verity failed. */
if (bio->bi_status || PageError(page)) {
ClearPageUptodate(page);
/* will re-read again later */
@@ -96,10 +96,12 @@ static void decrypt_work(struct work_struct *work)
{
struct bio_post_read_ctx *ctx =
container_of(work, struct bio_post_read_ctx, work);
+ struct bio *bio = ctx->bio;
- fscrypt_decrypt_bio(ctx->bio);
-
- bio_post_read_processing(ctx);
+ if (fscrypt_decrypt_bio(bio))
+ bio_post_read_processing(ctx);
+ else
+ __read_end_io(bio);
}
static void verity_work(struct work_struct *work)
@@ -163,7 +165,7 @@ static bool bio_post_read_required(struct bio *bio)
*
* The mpage code never puts partial pages into a BIO (except for end-of-file).
* If a page does not map to a contiguous run of blocks then it simply falls
- * back to block_read_full_page().
+ * back to block_read_full_folio().
*
* Why is this? If a page's completion depends on a number of different BIOs
* which can complete in any order (or at the same time) then determining the
@@ -394,7 +396,7 @@ int ext4_mpage_readpages(struct inode *inode,
bio = NULL;
}
if (!PageUptodate(page))
- block_read_full_page(page, ext4_get_block);
+ block_read_full_folio(page_folio(page), ext4_get_block);
else
unlock_page(page);
next_page:
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 90a941d20dff..6dfe9ccae0c5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -54,6 +54,16 @@ int ext4_resize_begin(struct super_block *sb)
return -EPERM;
/*
+ * If the reserved GDT blocks is non-zero, the resize_inode feature
+ * should always be set.
+ */
+ if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks &&
+ !ext4_has_feature_resize_inode(sb)) {
+ ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero");
+ return -EFSCORRUPTED;
+ }
+
+ /*
* If we are not using the primary superblock/GDT copy don't resize,
* because the user tools have no way of handling this. Probably a
* bad time to do it anyways.
@@ -87,10 +97,13 @@ int ext4_resize_begin(struct super_block *sb)
return ret;
}
-void ext4_resize_end(struct super_block *sb)
+int ext4_resize_end(struct super_block *sb, bool update_backups)
{
clear_bit_unlock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags);
smp_mb__after_atomic();
+ if (update_backups)
+ return ext4_update_overhead(sb, true);
+ return 0;
}
static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
@@ -1370,6 +1383,17 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
return err;
}
+static void ext4_add_overhead(struct super_block *sb,
+ const ext4_fsblk_t overhead)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+
+ sbi->s_overhead += overhead;
+ es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
+ smp_wmb();
+}
+
/*
* ext4_update_super() updates the super block so that the newly added
* groups can be seen by the filesystem.
@@ -1471,9 +1495,18 @@ static void ext4_update_super(struct super_block *sb,
}
/*
- * Update the fs overhead information
+ * Update the fs overhead information.
+ *
+ * For bigalloc, if the superblock already has a properly calculated
+ * overhead, update it with a value based on numbers already computed
+ * above for the newly allocated capacity.
*/
- ext4_calculate_overhead(sb);
+ if (ext4_has_feature_bigalloc(sb) && (sbi->s_overhead != 0))
+ ext4_add_overhead(sb,
+ EXT4_NUM_B2C(sbi, blocks_count - free_blocks));
+ else
+ ext4_calculate_overhead(sb);
+ es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: added group %u:"
@@ -1978,6 +2011,16 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
}
brelse(bh);
+ /*
+ * For bigalloc, trim the requested size to the nearest cluster
+ * boundary to avoid creating an unusable filesystem. We do this
+ * silently, instead of returning an error, to avoid breaking
+ * callers that blindly resize the filesystem to the full size of
+ * the underlying block device.
+ */
+ if (ext4_has_feature_bigalloc(sb))
+ n_blocks_count &= ~((1 << EXT4_CLUSTER_BITS(sb)) - 1);
+
retry:
o_blocks_count = ext4_blocks_count(es);
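The mask rounds the requested size down to a whole number of clusters. A worked example with hypothetical numbers, assuming EXT4_CLUSTER_BITS(sb) == 4 (16 blocks per cluster):

    unsigned long long n_blocks_count = 1000010ULL; /* requested size in blocks */
    unsigned int cluster_bits = 4;

    n_blocks_count &= ~((1ULL << cluster_bits) - 1);
    /* n_blocks_count == 1000000: exactly 62500 clusters of 16 blocks */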
@@ -2079,7 +2122,7 @@ retry:
goto out;
}
- if (ext4_blocks_count(es) == n_blocks_count)
+ if (ext4_blocks_count(es) == n_blocks_count && n_blocks_count_retry == 0)
goto out;
err = ext4_alloc_flex_bg_array(sb, n_group + 1);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 81749eaddf4c..d733db8a0b02 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -87,7 +87,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
static int ext4_validate_options(struct fs_context *fc);
static int ext4_check_opt_consistency(struct fs_context *fc,
struct super_block *sb);
-static int ext4_apply_options(struct fs_context *fc, struct super_block *sb);
+static void ext4_apply_options(struct fs_context *fc, struct super_block *sb);
static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param);
static int ext4_get_tree(struct fs_context *fc);
static int ext4_reconfigure(struct fs_context *fc);
@@ -159,7 +159,7 @@ MODULE_ALIAS("ext3");
#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
-static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags,
+static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
bh_end_io_t *end_io)
{
/*
@@ -171,10 +171,10 @@ static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags,
bh->b_end_io = end_io ? end_io : end_buffer_read_sync;
get_bh(bh);
- submit_bh(REQ_OP_READ, op_flags, bh);
+ submit_bh(REQ_OP_READ | op_flags, bh);
}
-void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags,
+void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
bh_end_io_t *end_io)
{
BUG_ON(!buffer_locked(bh));
@@ -186,7 +186,7 @@ void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags,
__ext4_read_bh(bh, op_flags, end_io);
}
-int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io)
+int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io)
{
BUG_ON(!buffer_locked(bh));
@@ -203,21 +203,14 @@ int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io)
return -EIO;
}
-int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait)
+int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
- if (trylock_buffer(bh)) {
- if (wait)
- return ext4_read_bh(bh, op_flags, NULL);
+ lock_buffer(bh);
+ if (!wait) {
ext4_read_bh_nowait(bh, op_flags, NULL);
return 0;
}
- if (wait) {
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return 0;
- return -EIO;
- }
- return 0;
+ return ext4_read_bh(bh, op_flags, NULL);
}
/*
@@ -227,8 +220,8 @@ int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait)
* return.
*/
static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
- sector_t block, int op_flags,
- gfp_t gfp)
+ sector_t block,
+ blk_opf_t op_flags, gfp_t gfp)
{
struct buffer_head *bh;
int ret;
@@ -248,7 +241,7 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
}
struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
- int op_flags)
+ blk_opf_t op_flags)
{
return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE);
}
@@ -264,7 +257,8 @@ void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
if (likely(bh)) {
- ext4_read_bh_lock(bh, REQ_RAHEAD, false);
+ if (trylock_buffer(bh))
+ ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);
brelse(bh);
}
}
@@ -1199,20 +1193,28 @@ static void ext4_put_super(struct super_block *sb)
int aborted = 0;
int i, err;
- ext4_unregister_li_request(sb);
- ext4_quota_off_umount(sb);
-
- flush_work(&sbi->s_error_work);
- destroy_workqueue(sbi->rsv_conversion_wq);
- ext4_release_orphan_info(sb);
-
/*
* Unregister sysfs before destroying jbd2 journal.
* Since we could still access attr_journal_task attribute via sysfs
* path which could have sbi->s_journal->j_task as NULL
+ * Unregister sysfs before flushing sbi->s_error_work.
+ * Since a user may read /proc/fs/ext4/xx/mb_groups during umount, a
+ * failed metadata read there can queue error work;
+ * flush_stashed_error_work would then call start_this_handle and may
+ * trigger a BUG_ON.
*/
ext4_unregister_sysfs(sb);
+ if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount"))
+ ext4_msg(sb, KERN_INFO, "unmounting filesystem.");
+
+ ext4_unregister_li_request(sb);
+ ext4_quota_off_umount(sb);
+
+ flush_work(&sbi->s_error_work);
+ destroy_workqueue(sbi->rsv_conversion_wq);
+ ext4_release_orphan_info(sb);
+
if (sbi->s_journal) {
aborted = is_journal_aborted(sbi->s_journal);
err = jbd2_journal_destroy(sbi->s_journal);
@@ -1299,7 +1301,7 @@ static void ext4_put_super(struct super_block *sb)
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
@@ -1392,7 +1394,7 @@ static void ext4_destroy_inode(struct inode *inode)
static void init_once(void *foo)
{
- struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
+ struct ext4_inode_info *ei = foo;
INIT_LIST_HEAD(&ei->i_orphan);
init_rwsem(&ei->xattr_sem);
@@ -1487,128 +1489,6 @@ static int ext4_nfs_commit_metadata(struct inode *inode)
return ext4_write_inode(inode, &wbc);
}
-#ifdef CONFIG_FS_ENCRYPTION
-static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
-{
- return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
-}
-
-static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
- void *fs_data)
-{
- handle_t *handle = fs_data;
- int res, res2, credits, retries = 0;
-
- /*
- * Encrypting the root directory is not allowed because e2fsck expects
- * lost+found to exist and be unencrypted, and encrypting the root
- * directory would imply encrypting the lost+found directory as well as
- * the filename "lost+found" itself.
- */
- if (inode->i_ino == EXT4_ROOT_INO)
- return -EPERM;
-
- if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
- return -EINVAL;
-
- if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
- return -EOPNOTSUPP;
-
- res = ext4_convert_inline_data(inode);
- if (res)
- return res;
-
- /*
- * If a journal handle was specified, then the encryption context is
- * being set on a new inode via inheritance and is part of a larger
- * transaction to create the inode. Otherwise the encryption context is
- * being set on an existing inode in its own transaction. Only in the
- * latter case should the "retry on ENOSPC" logic be used.
- */
-
- if (handle) {
- res = ext4_xattr_set_handle(handle, inode,
- EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
- ctx, len, 0);
- if (!res) {
- ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
- ext4_clear_inode_state(inode,
- EXT4_STATE_MAY_INLINE_DATA);
- /*
- * Update inode->i_flags - S_ENCRYPTED will be enabled,
- * S_DAX may be disabled
- */
- ext4_set_inode_flags(inode, false);
- }
- return res;
- }
-
- res = dquot_initialize(inode);
- if (res)
- return res;
-retry:
- res = ext4_xattr_set_credits(inode, len, false /* is_create */,
- &credits);
- if (res)
- return res;
-
- handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
- ctx, len, 0);
- if (!res) {
- ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
- /*
- * Update inode->i_flags - S_ENCRYPTED will be enabled,
- * S_DAX may be disabled
- */
- ext4_set_inode_flags(inode, false);
- res = ext4_mark_inode_dirty(handle, inode);
- if (res)
- EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
- }
- res2 = ext4_journal_stop(handle);
-
- if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- if (!res)
- res = res2;
- return res;
-}
-
-static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
-{
- return EXT4_SB(sb)->s_dummy_enc_policy.policy;
-}
-
-static bool ext4_has_stable_inodes(struct super_block *sb)
-{
- return ext4_has_feature_stable_inodes(sb);
-}
-
-static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
- int *ino_bits_ret, int *lblk_bits_ret)
-{
- *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
- *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
-}
-
-static const struct fscrypt_operations ext4_cryptops = {
- .key_prefix = "ext4:",
- .get_context = ext4_get_context,
- .set_context = ext4_set_context,
- .get_dummy_policy = ext4_get_dummy_policy,
- .empty_dir = ext4_empty_dir,
- .has_stable_inodes = ext4_has_stable_inodes,
- .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
-};
-#endif
-
#ifdef CONFIG_QUOTA
static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])
@@ -1690,7 +1570,7 @@ enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
Opt_resgid, Opt_resuid, Opt_sb,
Opt_nouid32, Opt_debug, Opt_removed,
- Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+ Opt_user_xattr, Opt_acl,
Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
@@ -1699,7 +1579,7 @@ enum {
Opt_inlinecrypt,
Opt_usrjquota, Opt_grpjquota, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
+ Opt_usrquota, Opt_grpquota, Opt_prjquota,
Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize,
@@ -1776,9 +1656,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
fsparam_flag ("oldalloc", Opt_removed),
fsparam_flag ("orlov", Opt_removed),
fsparam_flag ("user_xattr", Opt_user_xattr),
- fsparam_flag ("nouser_xattr", Opt_nouser_xattr),
fsparam_flag ("acl", Opt_acl),
- fsparam_flag ("noacl", Opt_noacl),
fsparam_flag ("norecovery", Opt_noload),
fsparam_flag ("noload", Opt_noload),
fsparam_flag ("bh", Opt_removed),
@@ -1808,7 +1686,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
fsparam_flag ("barrier", Opt_barrier),
fsparam_u32 ("barrier", Opt_barrier),
fsparam_flag ("nobarrier", Opt_nobarrier),
- fsparam_flag ("i_version", Opt_i_version),
+ fsparam_flag ("i_version", Opt_removed),
fsparam_flag ("dax", Opt_dax),
fsparam_enum ("dax", Opt_dax_type, ext4_param_dax),
fsparam_u32 ("stripe", Opt_stripe),
@@ -1862,7 +1740,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
};
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
-#define DEFAULT_MB_OPTIMIZE_SCAN (-1)
static const char deprecated_msg[] =
"Mount option \"%s\" will be removed by %s\n"
@@ -1908,6 +1785,7 @@ static const struct mount_opts {
MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
{Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
+ {Opt_commit, 0, MOPT_NO_EXT2},
{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
@@ -1928,13 +1806,10 @@ static const struct mount_opts {
{Opt_journal_ioprio, 0, MOPT_NO_EXT2},
{Opt_data, 0, MOPT_NO_EXT2},
{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
- {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
- {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
#else
{Opt_acl, 0, MOPT_NOSUPPORT},
- {Opt_noacl, 0, MOPT_NOSUPPORT},
#endif
{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
@@ -1984,31 +1859,12 @@ ext4_sb_read_encoding(const struct ext4_super_block *es)
}
#endif
-static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg)
-{
-#ifdef CONFIG_FS_ENCRYPTION
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err;
-
- err = fscrypt_set_test_dummy_encryption(sb, arg,
- &sbi->s_dummy_enc_policy);
- if (err) {
- ext4_msg(sb, KERN_WARNING,
- "Error while setting test dummy encryption [%d]", err);
- return err;
- }
- ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
-#endif
- return 0;
-}
-
#define EXT4_SPEC_JQUOTA (1 << 0)
#define EXT4_SPEC_JQFMT (1 << 1)
#define EXT4_SPEC_DATAJ (1 << 2)
#define EXT4_SPEC_SB_BLOCK (1 << 3)
#define EXT4_SPEC_JOURNAL_DEV (1 << 4)
#define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5)
-#define EXT4_SPEC_DUMMY_ENCRYPTION (1 << 6)
#define EXT4_SPEC_s_want_extra_isize (1 << 7)
#define EXT4_SPEC_s_max_batch_time (1 << 8)
#define EXT4_SPEC_s_min_batch_time (1 << 9)
@@ -2025,7 +1881,7 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg)
struct ext4_fs_context {
char *s_qf_names[EXT4_MAXQUOTAS];
- char *test_dummy_enc_arg;
+ struct fscrypt_dummy_policy dummy_enc_policy;
int s_jquota_fmt; /* Format of quota to use */
#ifdef CONFIG_EXT4_DEBUG
int s_fc_debug_max_replay;
@@ -2067,7 +1923,7 @@ static void ext4_fc_free(struct fs_context *fc)
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(ctx->s_qf_names[i]);
- kfree(ctx->test_dummy_enc_arg);
+ fscrypt_free_dummy_policy(&ctx->dummy_enc_policy);
kfree(ctx);
}
@@ -2143,6 +1999,29 @@ static int unnote_qf_name(struct fs_context *fc, int qtype)
}
#endif
+static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param,
+ struct ext4_fs_context *ctx)
+{
+ int err;
+
+ if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "test_dummy_encryption option not supported");
+ return -EINVAL;
+ }
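+	/* fscrypt accepts both the bare flag form and the "v1"/"v2" argument forms. */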
+ err = fscrypt_parse_test_dummy_encryption(param,
+ &ctx->dummy_enc_policy);
+ if (err == -EINVAL) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Value of option \"%s\" is unrecognized", param->key);
+ } else if (err == -EEXIST) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Conflicting test_dummy_encryption options");
+ return -EINVAL;
+ }
+ return err;
+}
+
#define EXT4_SET_CTX(name) \
static inline void ctx_set_##name(struct ext4_fs_context *ctx, \
unsigned long flag) \
@@ -2230,10 +2109,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
else
return note_qf_name(fc, GRPQUOTA, param);
#endif
- case Opt_noacl:
- case Opt_nouser_xattr:
- ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "3.5");
- break;
case Opt_sb:
if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
ext4_msg(NULL, KERN_WARNING,
@@ -2250,11 +2125,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_abort:
ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED);
return 0;
- case Opt_i_version:
- ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "5.20");
- ext4_msg(NULL, KERN_WARNING, "Use iversion instead\n");
- ctx_set_flags(ctx, SB_I_VERSION);
- return 0;
case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
ctx_set_flags(ctx, SB_INLINECRYPT);
@@ -2405,28 +2275,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO;
return 0;
case Opt_test_dummy_encryption:
-#ifdef CONFIG_FS_ENCRYPTION
- if (param->type == fs_value_is_flag) {
- ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION;
- ctx->test_dummy_enc_arg = NULL;
- return 0;
- }
- if (*param->string &&
- !(!strcmp(param->string, "v1") ||
- !strcmp(param->string, "v2"))) {
- ext4_msg(NULL, KERN_WARNING,
- "Value of option \"%s\" is unrecognized",
- param->key);
- return -EINVAL;
- }
- ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION;
- ctx->test_dummy_enc_arg = kmemdup_nul(param->string, param->size,
- GFP_KERNEL);
-#else
- ext4_msg(NULL, KERN_WARNING,
- "Test dummy encryption mount option ignored");
-#endif
- return 0;
+ return ext4_parse_test_dummy_encryption(param, ctx);
case Opt_dax:
case Opt_dax_type:
#ifdef CONFIG_FS_DAX
@@ -2617,11 +2466,14 @@ parse_failed:
if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)
m_ctx->journal_ioprio = s_ctx->journal_ioprio;
- ret = ext4_apply_options(fc, sb);
+ ext4_apply_options(fc, sb);
+ ret = 0;
out_free:
- kfree(s_ctx);
- kfree(fc);
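+	/* ext4_fc_free() releases the per-context allocations (quota names, dummy policy). */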
+ if (fc) {
+ ext4_fc_free(fc);
+ kfree(fc);
+ }
kfree(s_mount_opts);
return ret;
}
@@ -2781,12 +2633,76 @@ err_jquota_specified:
#endif
}
+static int ext4_check_test_dummy_encryption(const struct fs_context *fc,
+ struct super_block *sb)
+{
+ const struct ext4_fs_context *ctx = fc->fs_private;
+ const struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy))
+ return 0;
+
+ if (!ext4_has_feature_encrypt(sb)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "test_dummy_encryption requires encrypt feature");
+ return -EINVAL;
+ }
+ /*
+ * This mount option is just for testing, and it's not worthwhile to
+ * implement the extra complexity (e.g. RCU protection) that would be
+ * needed to allow it to be set or changed during remount. We do allow
+ * it to be specified during remount, but only if there is no change.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
+ &ctx->dummy_enc_policy))
+ return 0;
+ ext4_msg(NULL, KERN_WARNING,
+ "Can't set or change test_dummy_encryption on remount");
+ return -EINVAL;
+ }
+ /* Also make sure s_mount_opts didn't contain a conflicting value. */
+ if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) {
+ if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
+ &ctx->dummy_enc_policy))
+ return 0;
+ ext4_msg(NULL, KERN_WARNING,
+ "Conflicting test_dummy_encryption options");
+ return -EINVAL;
+ }
+ /*
+ * fscrypt_add_test_dummy_key() technically changes the super_block, so
+	 * strictly speaking it should be deferred to ext4_apply_options() like the
+ * other changes. But since we never get here for remounts (see above),
+ * and this is the last chance to report errors, we do it here.
+ */
+ err = fscrypt_add_test_dummy_key(sb, &ctx->dummy_enc_policy);
+ if (err)
+ ext4_msg(NULL, KERN_WARNING,
+ "Error adding test dummy encryption key [%d]", err);
+ return err;
+}
+
+static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx,
+ struct super_block *sb)
+{
+ if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) ||
+ /* if already set, it was already verified to be the same */
+ fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy))
+ return;
+ EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy;
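+	/* Ownership moves to the superblock; zero the context copy so ext4_fc_free() won't free it. */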
+ memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy));
+ ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
+}
+
static int ext4_check_opt_consistency(struct fs_context *fc,
struct super_block *sb)
{
struct ext4_fs_context *ctx = fc->fs_private;
struct ext4_sb_info *sbi = fc->s_fs_info;
int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
+ int err;
if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
ext4_msg(NULL, KERN_ERR,
@@ -2816,20 +2732,9 @@ static int ext4_check_opt_consistency(struct fs_context *fc,
"for blocksize < PAGE_SIZE");
}
-#ifdef CONFIG_FS_ENCRYPTION
- /*
- * This mount option is just for testing, and it's not worthwhile to
- * implement the extra complexity (e.g. RCU protection) that would be
- * needed to allow it to be set or changed during remount. We do allow
- * it to be specified during remount, but only if there is no change.
- */
- if ((ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION) &&
- is_remount && !sbi->s_dummy_enc_policy.policy) {
- ext4_msg(NULL, KERN_WARNING,
- "Can't set test_dummy_encryption on remount");
- return -1;
- }
-#endif
+ err = ext4_check_test_dummy_encryption(fc, sb);
+ if (err)
+ return err;
if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) {
if (!sbi->s_journal) {
@@ -2875,11 +2780,10 @@ fail_dax_change_remount:
return ext4_check_quota_consistency(fc, sb);
}
-static int ext4_apply_options(struct fs_context *fc, struct super_block *sb)
+static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
{
struct ext4_fs_context *ctx = fc->fs_private;
struct ext4_sb_info *sbi = fc->s_fs_info;
- int ret = 0;
sbi->s_mount_opt &= ~ctx->mask_s_mount_opt;
sbi->s_mount_opt |= ctx->vals_s_mount_opt;
@@ -2890,14 +2794,6 @@ static int ext4_apply_options(struct fs_context *fc, struct super_block *sb)
sb->s_flags &= ~ctx->mask_s_flags;
sb->s_flags |= ctx->vals_s_flags;
- /*
- * i_version differs from common mount option iversion so we have
- * to let vfs know that it was set, otherwise it would get cleared
- * on remount
- */
- if (ctx->mask_s_flags & SB_I_VERSION)
- fc->sb_flags |= SB_I_VERSION;
-
#define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; })
APPLY(s_commit_interval);
APPLY(s_stripe);
@@ -2915,11 +2811,7 @@ static int ext4_apply_options(struct fs_context *fc, struct super_block *sb)
#endif
ext4_apply_quota_options(fc, sb);
-
- if (ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION)
- ret = ext4_set_test_dummy_encryption(sb, ctx->test_dummy_enc_arg);
-
- return ret;
+ ext4_apply_test_dummy_encryption(ctx, sb);
}
@@ -3050,8 +2942,6 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
- if (sb->s_flags & SB_I_VERSION)
- SEQ_OPTS_PUTS("i_version");
if (nodefs || sbi->s_stripe)
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
if (nodefs || EXT4_MOUNT_DATA_FLAGS &
@@ -3091,6 +2981,15 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
} else if (test_opt2(sb, DAX_INODE)) {
SEQ_OPTS_PUTS("dax=inode");
}
+
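+	/* Report mb_optimize_scan only when it differs from the groups-count based default. */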
+ if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
+ !test_opt2(sb, MB_OPTIMIZE_SCAN)) {
+ SEQ_OPTS_PUTS("mb_optimize_scan=0");
+ } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
+ test_opt2(sb, MB_OPTIMIZE_SCAN)) {
+ SEQ_OPTS_PUTS("mb_optimize_scan=1");
+ }
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -3832,12 +3731,13 @@ static struct task_struct *ext4_lazyinit_task;
*/
static int ext4_lazyinit_thread(void *arg)
{
- struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+ struct ext4_lazy_init *eli = arg;
struct list_head *pos, *n;
struct ext4_li_request *elr;
unsigned long next_wakeup, cur;
BUG_ON(NULL == eli);
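+	/* Mark the kthread freezable so it doesn't block system suspend. */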
+ set_freezable();
cont_thread:
while (true) {
@@ -4053,9 +3953,9 @@ int ext4_register_li_request(struct super_block *sb,
goto out;
}
- if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
- (first_not_zeroed == ngroups || sb_rdonly(sb) ||
- !test_opt(sb, INIT_INODE_TABLE)))
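+	/* Never register on read-only mounts; without bitmap prefetching, also skip when there is nothing left to init. */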
+ if (sb_rdonly(sb) ||
+ (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
+ (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE))))
goto out;
elr = ext4_li_request_new(sb, first_not_zeroed);
@@ -4172,9 +4072,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
ext4_fsblk_t first_block, last_block, b;
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int s, j, count = 0;
+ int has_super = ext4_bg_has_super(sb, grp);
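+	/* Resize-reserved GDT blocks exist only in groups carrying a superblock backup. */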
if (!ext4_has_feature_bigalloc(sb))
- return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+ return (has_super + ext4_bg_num_gdb(sb, grp) +
+ (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
sbi->s_itb_per_group + 2);
first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
@@ -4350,7 +4252,7 @@ static void ext4_free_sbi(struct ext4_sb_info *sbi)
return;
kfree(sbi->s_blockgroup_lock);
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
}
@@ -4362,7 +4264,8 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
if (!sbi)
return NULL;
- sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
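+	/* The extended DAX API takes a holder and holder_ops; ext4 registers neither here. */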
+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
+ NULL, NULL);
sbi->s_blockgroup_lock =
kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
@@ -4374,116 +4277,15 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
sbi->s_sb = sb;
return sbi;
err_out:
- fs_put_dax(sbi->s_daxdev);
+ fs_put_dax(sbi->s_daxdev, NULL);
kfree(sbi);
return NULL;
}
-static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
+static void ext4_set_def_opts(struct super_block *sb,
+ struct ext4_super_block *es)
{
- struct buffer_head *bh, **group_desc;
- struct ext4_super_block *es = NULL;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct flex_groups **flex_groups;
- ext4_fsblk_t block;
- ext4_fsblk_t logical_sb_block;
- unsigned long offset = 0;
unsigned long def_mount_opts;
- struct inode *root;
- int ret = -ENOMEM;
- int blocksize, clustersize;
- unsigned int db_count;
- unsigned int i;
- int needs_recovery, has_huge_files;
- __u64 blocks_count;
- int err = 0;
- ext4_group_t first_not_zeroed;
- struct ext4_fs_context *ctx = fc->fs_private;
- int silent = fc->sb_flags & SB_SILENT;
-
- /* Set defaults for the variables that will be set during parsing */
- ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
-
- sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
- sbi->s_sectors_written_start =
- part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
-
- /* -EINVAL is default */
- ret = -EINVAL;
- blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
- if (!blocksize) {
- ext4_msg(sb, KERN_ERR, "unable to set blocksize");
- goto out_fail;
- }
-
- /*
- * The ext4 superblock will not be buffer aligned for other than 1kB
- * block sizes. We need to calculate the offset from buffer start.
- */
- if (blocksize != EXT4_MIN_BLOCK_SIZE) {
- logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
- offset = do_div(logical_sb_block, blocksize);
- } else {
- logical_sb_block = sbi->s_sb_block;
- }
-
- bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
- if (IS_ERR(bh)) {
- ext4_msg(sb, KERN_ERR, "unable to read superblock");
- ret = PTR_ERR(bh);
- goto out_fail;
- }
- /*
- * Note: s_es must be initialized as soon as possible because
- * some ext4 macro-instructions depend on its value
- */
- es = (struct ext4_super_block *) (bh->b_data + offset);
- sbi->s_es = es;
- sb->s_magic = le16_to_cpu(es->s_magic);
- if (sb->s_magic != EXT4_SUPER_MAGIC)
- goto cantfind_ext4;
- sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
-
- /* Warn if metadata_csum and gdt_csum are both set. */
- if (ext4_has_feature_metadata_csum(sb) &&
- ext4_has_feature_gdt_csum(sb))
- ext4_warning(sb, "metadata_csum and uninit_bg are "
- "redundant flags; please run fsck.");
-
- /* Check for a known checksum algorithm */
- if (!ext4_verify_csum_type(sb, es)) {
- ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
- "unknown checksum algorithm.");
- silent = 1;
- goto cantfind_ext4;
- }
- ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
- ext4_orphan_file_block_trigger);
-
- /* Load the checksum driver */
- sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
- if (IS_ERR(sbi->s_chksum_driver)) {
- ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
- ret = PTR_ERR(sbi->s_chksum_driver);
- sbi->s_chksum_driver = NULL;
- goto failed_mount;
- }
-
- /* Check superblock checksum */
- if (!ext4_superblock_csum_verify(sb, es)) {
- ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
- "invalid superblock checksum. Run e2fsck?");
- silent = 1;
- ret = -EFSBADCRC;
- goto cantfind_ext4;
- }
-
- /* Precompute checksum seed for all metadata */
- if (ext4_has_feature_csum_seed(sb))
- sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
- else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
- sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
- sizeof(es->s_uuid));
/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -4512,9 +4314,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
set_opt(sb, WRITEBACK_DATA);
- if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
+ if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC)
set_opt(sb, ERRORS_PANIC);
- else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
+ else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE)
set_opt(sb, ERRORS_CONT);
else
set_opt(sb, ERRORS_RO);
@@ -4523,12 +4325,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (def_mount_opts & EXT4_DEFM_DISCARD)
set_opt(sb, DISCARD);
- sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
- sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
- sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
- sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
- sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
-
if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
set_opt(sb, BARRIER);
@@ -4540,31 +4336,96 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);
- /*
- * set default s_li_wait_mult for lazyinit, for the case there is
- * no mount option specified.
- */
- sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+ if (sb->s_blocksize == PAGE_SIZE)
+ set_opt(sb, DIOREAD_NOLOCK);
+}
- if (le32_to_cpu(es->s_log_block_size) >
- (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
- ext4_msg(sb, KERN_ERR,
- "Invalid log block size: %u",
- le32_to_cpu(es->s_log_block_size));
- goto failed_mount;
- }
- if (le32_to_cpu(es->s_log_cluster_size) >
- (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
- ext4_msg(sb, KERN_ERR,
- "Invalid log cluster size: %u",
- le32_to_cpu(es->s_log_cluster_size));
- goto failed_mount;
+static int ext4_handle_clustersize(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int clustersize;
+
+ /* Handle clustersize */
+ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
+ if (ext4_has_feature_bigalloc(sb)) {
+ if (clustersize < sb->s_blocksize) {
+ ext4_msg(sb, KERN_ERR,
+ "cluster size (%d) smaller than "
+ "block size (%lu)", clustersize, sb->s_blocksize);
+ return -EINVAL;
+ }
+ sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
+ le32_to_cpu(es->s_log_block_size);
+ sbi->s_clusters_per_group =
+ le32_to_cpu(es->s_clusters_per_group);
+ if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#clusters per group too big: %lu",
+ sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
+ if (sbi->s_blocks_per_group !=
+ (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
+ ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
+ "clusters per group (%lu) inconsistent",
+ sbi->s_blocks_per_group,
+ sbi->s_clusters_per_group);
+ return -EINVAL;
+ }
+ } else {
+ if (clustersize != sb->s_blocksize) {
+ ext4_msg(sb, KERN_ERR,
+ "fragment/cluster size (%d) != "
+ "block size (%lu)", clustersize, sb->s_blocksize);
+ return -EINVAL;
+ }
+ if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#blocks per group too big: %lu",
+ sbi->s_blocks_per_group);
+ return -EINVAL;
+ }
+ sbi->s_clusters_per_group = sbi->s_blocks_per_group;
+ sbi->s_cluster_bits = 0;
}
+ sbi->s_cluster_ratio = clustersize / sb->s_blocksize;
- blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+	/* Do we have the standard group size of clustersize * 8 blocks? */
+ if (sbi->s_blocks_per_group == clustersize << 3)
+ set_opt2(sb, STD_GROUP_SIZE);
- if (blocksize == PAGE_SIZE)
- set_opt(sb, DIOREAD_NOLOCK);
+ return 0;
+}
+
+static void ext4_fast_commit_init(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /* Initialize fast commit stuff */
+ atomic_set(&sbi->s_fc_subtid, 0);
+ INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
+ INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
+ INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
+ INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
+ sbi->s_fc_bytes = 0;
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ sbi->s_fc_ineligible_tid = 0;
+ spin_lock_init(&sbi->s_fc_lock);
+ memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
+ sbi->s_fc_replay_state.fc_regions = NULL;
+ sbi->s_fc_replay_state.fc_regions_size = 0;
+ sbi->s_fc_replay_state.fc_regions_used = 0;
+ sbi->s_fc_replay_state.fc_regions_valid = 0;
+ sbi->s_fc_replay_state.fc_modified_inodes = NULL;
+ sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
+ sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
+}
+
+static int ext4_inode_info_init(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -4575,16 +4436,16 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
sbi->s_first_ino);
- goto failed_mount;
+ return -EINVAL;
}
if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
(!is_power_of_2(sbi->s_inode_size)) ||
- (sbi->s_inode_size > blocksize)) {
+ (sbi->s_inode_size > sb->s_blocksize)) {
ext4_msg(sb, KERN_ERR,
"unsupported inode size: %d",
sbi->s_inode_size);
- ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
- goto failed_mount;
+ ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize);
+ return -EINVAL;
}
/*
* i_atime_extra is the last extra field available for
@@ -4602,6 +4463,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
}
sb->s_time_min = EXT4_TIMESTAMP_MIN;
}
+
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
@@ -4613,7 +4475,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (v > max) {
ext4_msg(sb, KERN_ERR,
"bad s_want_extra_isize: %d", v);
- goto failed_mount;
+ return -EINVAL;
}
if (sbi->s_want_extra_isize < v)
sbi->s_want_extra_isize = v;
@@ -4622,93 +4484,112 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (v > max) {
ext4_msg(sb, KERN_ERR,
"bad s_min_extra_isize: %d", v);
- goto failed_mount;
+ return -EINVAL;
}
if (sbi->s_want_extra_isize < v)
sbi->s_want_extra_isize = v;
}
}
- err = parse_apply_sb_mount_options(sb, ctx);
- if (err < 0)
- goto failed_mount;
+ return 0;
+}
- sbi->s_def_mount_opt = sbi->s_mount_opt;
+#if IS_ENABLED(CONFIG_UNICODE)
+static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
+{
+ const struct ext4_sb_encodings *encoding_info;
+ struct unicode_map *encoding;
+ __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags);
- err = ext4_check_opt_consistency(fc, sb);
- if (err < 0)
- goto failed_mount;
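+	/* Nothing to do without casefold, or if an encoding is already loaded (remount). */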
+ if (!ext4_has_feature_casefold(sb) || sb->s_encoding)
+ return 0;
- err = ext4_apply_options(fc, sb);
- if (err < 0)
- goto failed_mount;
+ encoding_info = ext4_sb_read_encoding(es);
+ if (!encoding_info) {
+ ext4_msg(sb, KERN_ERR,
+ "Encoding requested by superblock is unknown");
+ return -EINVAL;
+ }
-#if IS_ENABLED(CONFIG_UNICODE)
- if (ext4_has_feature_casefold(sb) && !sb->s_encoding) {
- const struct ext4_sb_encodings *encoding_info;
- struct unicode_map *encoding;
- __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags);
+ encoding = utf8_load(encoding_info->version);
+ if (IS_ERR(encoding)) {
+ ext4_msg(sb, KERN_ERR,
+ "can't mount with superblock charset: %s-%u.%u.%u "
+ "not supported by the kernel. flags: 0x%x.",
+ encoding_info->name,
+ unicode_major(encoding_info->version),
+ unicode_minor(encoding_info->version),
+ unicode_rev(encoding_info->version),
+ encoding_flags);
+ return -EINVAL;
+ }
+	ext4_msg(sb, KERN_INFO, "Using encoding defined by superblock: "
+ "%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
+ unicode_major(encoding_info->version),
+ unicode_minor(encoding_info->version),
+ unicode_rev(encoding_info->version),
+ encoding_flags);
- encoding_info = ext4_sb_read_encoding(es);
- if (!encoding_info) {
- ext4_msg(sb, KERN_ERR,
- "Encoding requested by superblock is unknown");
- goto failed_mount;
- }
+ sb->s_encoding = encoding;
+ sb->s_encoding_flags = encoding_flags;
- encoding = utf8_load(encoding_info->version);
- if (IS_ERR(encoding)) {
- ext4_msg(sb, KERN_ERR,
- "can't mount with superblock charset: %s-%u.%u.%u "
- "not supported by the kernel. flags: 0x%x.",
- encoding_info->name,
- unicode_major(encoding_info->version),
- unicode_minor(encoding_info->version),
- unicode_rev(encoding_info->version),
- encoding_flags);
- goto failed_mount;
- }
- ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
- "%s-%u.%u.%u with flags 0x%hx", encoding_info->name,
- unicode_major(encoding_info->version),
- unicode_minor(encoding_info->version),
- unicode_rev(encoding_info->version),
- encoding_flags);
+ return 0;
+}
+#else
+static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
+{
+ return 0;
+}
+#endif
+
+static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
- sb->s_encoding = encoding;
- sb->s_encoding_flags = encoding_flags;
+ /* Warn if metadata_csum and gdt_csum are both set. */
+ if (ext4_has_feature_metadata_csum(sb) &&
+ ext4_has_feature_gdt_csum(sb))
+ ext4_warning(sb, "metadata_csum and uninit_bg are "
+ "redundant flags; please run fsck.");
+
+ /* Check for a known checksum algorithm */
+ if (!ext4_verify_csum_type(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "unknown checksum algorithm.");
+ return -EINVAL;
}
-#endif
+ ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
+ ext4_orphan_file_block_trigger);
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n");
- /* can't mount with both data=journal and dioread_nolock. */
- clear_opt(sb, DIOREAD_NOLOCK);
- clear_opt2(sb, JOURNAL_FAST_COMMIT);
- if (test_opt2(sb, EXPLICIT_DELALLOC)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and delalloc");
- goto failed_mount;
- }
- if (test_opt(sb, DAX_ALWAYS)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dax");
- goto failed_mount;
- }
- if (ext4_has_feature_encrypt(sb)) {
- ext4_msg(sb, KERN_WARNING,
- "encrypted files will use data=ordered "
- "instead of data journaling mode");
- }
- if (test_opt(sb, DELALLOC))
- clear_opt(sb, DELALLOC);
- } else {
- sb->s_iflags |= SB_I_CGROUPWB;
+ /* Load the checksum driver */
+ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(sbi->s_chksum_driver)) {
+ int ret = PTR_ERR(sbi->s_chksum_driver);
+ ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
+ sbi->s_chksum_driver = NULL;
+ return ret;
}
- sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
- (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
+ /* Check superblock checksum */
+ if (!ext4_superblock_csum_verify(sb, es)) {
+ ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+ "invalid superblock checksum. Run e2fsck?");
+ return -EFSBADCRC;
+ }
+
+ /* Precompute checksum seed for all metadata */
+ if (ext4_has_feature_csum_seed(sb))
+ sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
+ else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
+ sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+ sizeof(es->s_uuid));
+ return 0;
+}
+static int ext4_check_feature_compatibility(struct super_block *sb,
+ struct ext4_super_block *es,
+ int silent)
+{
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
(ext4_has_compat_features(sb) ||
ext4_has_ro_compat_features(sb) ||
@@ -4722,7 +4603,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (ext4_has_feature_64bit(sb)) {
ext4_msg(sb, KERN_ERR,
"The Hurd can't support 64-bit file systems");
- goto failed_mount;
+ return -EINVAL;
}
/*
@@ -4732,7 +4613,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (ext4_has_feature_ea_inode(sb)) {
ext4_msg(sb, KERN_ERR,
"ea_inode feature is not supported for Hurd");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4746,10 +4627,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* it's actually an ext[34] filesystem.
*/
if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
- goto failed_mount;
+ return -EINVAL;
ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
"to feature incompatibilities");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4763,10 +4644,10 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* it's actually an ext4 filesystem.
*/
if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
- goto failed_mount;
+ return -EINVAL;
ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
"to feature incompatibilities");
- goto failed_mount;
+ return -EINVAL;
}
}
@@ -4776,9 +4657,463 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* so there is a chance incompat flags are set on a rev 0 filesystem.
*/
if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ext4_geometry_check(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ __u64 blocks_count;
+
+ /* check blocks count against device size */
+ blocks_count = sb_bdev_nr_blocks(sb);
+ if (blocks_count && ext4_blocks_count(es) > blocks_count) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
+ "exceeds size of device (%llu blocks)",
+ ext4_blocks_count(es), blocks_count);
+ return -EINVAL;
+ }
+
+ /*
+ * It makes no sense for the first data block to be beyond the end
+ * of the filesystem.
+ */
+ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
+ "block %u is beyond end of filesystem (%llu)",
+ le32_to_cpu(es->s_first_data_block),
+ ext4_blocks_count(es));
+ return -EINVAL;
+ }
+ if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
+ (sbi->s_cluster_ratio == 1)) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
+ "block is 0 with a 1k block and cluster size");
+ return -EINVAL;
+ }
+
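+	/* Round up: count the block groups covering everything past s_first_data_block. */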
+ blocks_count = (ext4_blocks_count(es) -
+ le32_to_cpu(es->s_first_data_block) +
+ EXT4_BLOCKS_PER_GROUP(sb) - 1);
+ do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+ if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
+ ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
+ "(block count %llu, first data block %u, "
+ "blocks per group %lu)", blocks_count,
+ ext4_blocks_count(es),
+ le32_to_cpu(es->s_first_data_block),
+ EXT4_BLOCKS_PER_GROUP(sb));
+ return -EINVAL;
+ }
+ sbi->s_groups_count = blocks_count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
+ if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
+ le32_to_cpu(es->s_inodes_count)) {
+ ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
+ le32_to_cpu(es->s_inodes_count),
+ ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void ext4_group_desc_free(struct ext4_sb_info *sbi)
+{
+ struct buffer_head **group_desc;
+ int i;
+
+ rcu_read_lock();
+ group_desc = rcu_dereference(sbi->s_group_desc);
+ for (i = 0; i < sbi->s_gdb_count; i++)
+ brelse(group_desc[i]);
+ kvfree(group_desc);
+ rcu_read_unlock();
+}
+
+static int ext4_group_desc_init(struct super_block *sb,
+ struct ext4_super_block *es,
+ ext4_fsblk_t logical_sb_block,
+ ext4_group_t *first_not_zeroed)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned int db_count;
+ ext4_fsblk_t block;
+ int ret;
+ int i;
+
+ db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+ EXT4_DESC_PER_BLOCK(sb);
+ if (ext4_has_feature_meta_bg(sb)) {
+ if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
+ ext4_msg(sb, KERN_WARNING,
+ "first meta block group too large: %u "
+ "(group descriptor block count %u)",
+ le32_to_cpu(es->s_first_meta_bg), db_count);
+ return -EINVAL;
+ }
+ }
+ rcu_assign_pointer(sbi->s_group_desc,
+ kvmalloc_array(db_count,
+ sizeof(struct buffer_head *),
+ GFP_KERNEL));
+ if (sbi->s_group_desc == NULL) {
+ ext4_msg(sb, KERN_ERR, "not enough memory");
+ return -ENOMEM;
+ }
+
+ bgl_lock_init(sbi->s_blockgroup_lock);
+
+ /* Pre-read the descriptors into the buffer cache */
+ for (i = 0; i < db_count; i++) {
+ block = descriptor_loc(sb, logical_sb_block, i);
+ ext4_sb_breadahead_unmovable(sb, block);
+ }
+
+ for (i = 0; i < db_count; i++) {
+ struct buffer_head *bh;
+
+ block = descriptor_loc(sb, logical_sb_block, i);
+ bh = ext4_sb_bread_unmovable(sb, block);
+ if (IS_ERR(bh)) {
+ ext4_msg(sb, KERN_ERR,
+ "can't read group descriptor %d", i);
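+			/* Only the first i descriptor buffers are valid; free just those on error. */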
+ sbi->s_gdb_count = i;
+ ret = PTR_ERR(bh);
+ goto out;
+ }
+ rcu_read_lock();
+ rcu_dereference(sbi->s_group_desc)[i] = bh;
+ rcu_read_unlock();
+ }
+ sbi->s_gdb_count = db_count;
+ if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) {
+ ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
+ return 0;
+out:
+ ext4_group_desc_free(sbi);
+ return ret;
+}
+
+static int ext4_load_and_init_journal(struct super_block *sb,
+ struct ext4_super_block *es,
+ struct ext4_fs_context *ctx)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ err = ext4_load_journal(sb, es, ctx->journal_devnum);
+ if (err)
+ return err;
+
+ if (ext4_has_feature_64bit(sb) &&
+ !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_64BIT)) {
+ ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
+ goto out;
+ }
+
+ if (!set_journal_csum_feature_set(sb)) {
+ ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
+ "feature set");
+ goto out;
+ }
+
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
+ !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
+ ext4_msg(sb, KERN_ERR,
+ "Failed to set fast commit journal feature");
+ goto out;
+ }
+
+ /* We have now updated the journal if required, so we can
+ * validate the data journaling mode. */
+ switch (test_opt(sb, DATA_FLAGS)) {
+ case 0:
+ /* No mode set, assume a default based on the journal
+ * capabilities: ORDERED_DATA if the journal can
+ * cope, else JOURNAL_DATA
+ */
+ if (jbd2_journal_check_available_features
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+ set_opt(sb, ORDERED_DATA);
+ sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+ } else {
+ set_opt(sb, JOURNAL_DATA);
+ sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+ }
+ break;
+
+ case EXT4_MOUNT_ORDERED_DATA:
+ case EXT4_MOUNT_WRITEBACK_DATA:
+ if (!jbd2_journal_check_available_features
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+ ext4_msg(sb, KERN_ERR, "Journal does not support "
+ "requested data journaling mode");
+ goto out;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
+ test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_async_commit in data=ordered mode");
+ goto out;
+ }
+
+ set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
+
+ sbi->s_journal->j_submit_inode_data_buffers =
+ ext4_journal_submit_inode_data_buffers;
+ sbi->s_journal->j_finish_inode_data_buffers =
+ ext4_journal_finish_inode_data_buffers;
+
+ return 0;
+
+out:
+ /* flush s_error_work before journal destroy. */
+ flush_work(&sbi->s_error_work);
+ jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
+ return err;
+}
+
+static int ext4_journal_data_mode_check(struct super_block *sb)
+{
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with "
+ "data=journal disables delayed allocation, "
+ "dioread_nolock, O_DIRECT and fast_commit support!\n");
+ /* can't mount with both data=journal and dioread_nolock. */
+ clear_opt(sb, DIOREAD_NOLOCK);
+ clear_opt2(sb, JOURNAL_FAST_COMMIT);
+ if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ return -EINVAL;
+ }
+ if (test_opt(sb, DAX_ALWAYS)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ return -EINVAL;
+ }
+ if (ext4_has_feature_encrypt(sb)) {
+ ext4_msg(sb, KERN_WARNING,
+ "encrypted files will use data=ordered "
+ "instead of data journaling mode");
+ }
+ if (test_opt(sb, DELALLOC))
+ clear_opt(sb, DELALLOC);
+ } else {
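+		/* cgroup-aware writeback is only supported outside data=journal mode. */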
+ sb->s_iflags |= SB_I_CGROUPWB;
+ }
+
+ return 0;
+}
+
+static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
+ int silent)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es;
+ ext4_fsblk_t logical_sb_block;
+ unsigned long offset = 0;
+ struct buffer_head *bh;
+ int ret = -EINVAL;
+ int blocksize;
+
+ blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
+ if (!blocksize) {
+ ext4_msg(sb, KERN_ERR, "unable to set blocksize");
+ return -EINVAL;
+ }
+
+ /*
+ * The ext4 superblock will not be buffer aligned for other than 1kB
+ * block sizes. We need to calculate the offset from buffer start.
+ */
+ if (blocksize != EXT4_MIN_BLOCK_SIZE) {
+ logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(logical_sb_block, blocksize);
+ } else {
+ logical_sb_block = sbi->s_sb_block;
+ }
+
+ bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+ if (IS_ERR(bh)) {
+ ext4_msg(sb, KERN_ERR, "unable to read superblock");
+ return PTR_ERR(bh);
+ }
+ /*
+ * Note: s_es must be initialized as soon as possible because
+	 * some ext4 macros depend on its value
+ */
+ es = (struct ext4_super_block *) (bh->b_data + offset);
+ sbi->s_es = es;
+ sb->s_magic = le16_to_cpu(es->s_magic);
+ if (sb->s_magic != EXT4_SUPER_MAGIC) {
+ if (!silent)
+ ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
+ goto out;
+ }
+
+ if (le32_to_cpu(es->s_log_block_size) >
+ (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid log block size: %u",
+ le32_to_cpu(es->s_log_block_size));
+ goto out;
+ }
+ if (le32_to_cpu(es->s_log_cluster_size) >
+ (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid log cluster size: %u",
+ le32_to_cpu(es->s_log_cluster_size));
+ goto out;
+ }
+
+ blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+
+ /*
+	 * If the block size recorded in the superblock differs from the one
+	 * used for the initial read, the superblock must be reread at the real size.
+ */
+ if (sb->s_blocksize == blocksize) {
+ *lsb = logical_sb_block;
+ sbi->s_sbh = bh;
+ return 0;
+ }
+
+ /*
+ * bh must be released before kill_bdev(), otherwise
+ * it won't be freed and its page also. kill_bdev()
+ * is called by sb_set_blocksize().
+ */
+ brelse(bh);
+ /* Validate the filesystem blocksize */
+ if (!sb_set_blocksize(sb, blocksize)) {
+ ext4_msg(sb, KERN_ERR, "bad block size %d",
+ blocksize);
+ bh = NULL;
+ goto out;
+ }
+
+ logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(logical_sb_block, blocksize);
+ bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+ if (IS_ERR(bh)) {
+ ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try");
+ ret = PTR_ERR(bh);
+ bh = NULL;
+ goto out;
+ }
+ es = (struct ext4_super_block *)(bh->b_data + offset);
+ sbi->s_es = es;
+ if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
+ ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!");
+ goto out;
+ }
+ *lsb = logical_sb_block;
+ sbi->s_sbh = bh;
+ return 0;
+out:
+ brelse(bh);
+ return ret;
+}
+
+static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
+{
+ struct ext4_super_block *es = NULL;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct flex_groups **flex_groups;
+ ext4_fsblk_t block;
+ ext4_fsblk_t logical_sb_block;
+ struct inode *root;
+ int ret = -ENOMEM;
+ unsigned int i;
+ int needs_recovery, has_huge_files;
+ int err = 0;
+ ext4_group_t first_not_zeroed;
+ struct ext4_fs_context *ctx = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
+
+ /* Set defaults for the variables that will be set during parsing */
+ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
+ ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+
+ sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
+ sbi->s_sectors_written_start =
+ part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
+
+ /* -EINVAL is default */
+ ret = -EINVAL;
+ err = ext4_load_super(sb, &logical_sb_block, silent);
+ if (err)
+ goto out_fail;
+
+ es = sbi->s_es;
+ sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
+
+ err = ext4_init_metadata_csum(sb, es);
+ if (err)
+ goto failed_mount;
+
+ ext4_set_def_opts(sb, es);
+
+ sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+ sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
+ sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+ sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+ sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+
+ /*
+ * set default s_li_wait_mult for lazyinit, for the case there is
+ * no mount option specified.
+ */
+ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+
+ if (ext4_inode_info_init(sb, es))
+ goto failed_mount;
+
+ err = parse_apply_sb_mount_options(sb, ctx);
+ if (err < 0)
+ goto failed_mount;
+
+ sbi->s_def_mount_opt = sbi->s_mount_opt;
+
+ err = ext4_check_opt_consistency(fc, sb);
+ if (err < 0)
goto failed_mount;
- if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
+ ext4_apply_options(fc, sb);
+
+ if (ext4_encoding_init(sb, es))
+ goto failed_mount;
+
+ if (ext4_journal_data_mode_check(sb))
+ goto failed_mount;
+
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
+ (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
+
+ /* i_version is always enabled now */
+ sb->s_flags |= SB_I_VERSION;
+
+ if (ext4_check_feature_compatibility(sb, es, silent))
+ goto failed_mount;
+
+ if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) {
ext4_msg(sb, KERN_ERR,
"Number of reserved GDT blocks insanely large: %d",
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
@@ -4786,7 +5121,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
}
if (sbi->s_daxdev) {
- if (blocksize == PAGE_SIZE)
+ if (sb->s_blocksize == PAGE_SIZE)
set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
else
ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
@@ -4811,40 +5146,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount;
}
- if (sb->s_blocksize != blocksize) {
- /*
- * bh must be released before kill_bdev(), otherwise
- * it won't be freed and its page also. kill_bdev()
- * is called by sb_set_blocksize().
- */
- brelse(bh);
- /* Validate the filesystem blocksize */
- if (!sb_set_blocksize(sb, blocksize)) {
- ext4_msg(sb, KERN_ERR, "bad block size %d",
- blocksize);
- bh = NULL;
- goto failed_mount;
- }
-
- logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
- offset = do_div(logical_sb_block, blocksize);
- bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
- if (IS_ERR(bh)) {
- ext4_msg(sb, KERN_ERR,
- "Can't read superblock on 2nd try");
- ret = PTR_ERR(bh);
- bh = NULL;
- goto failed_mount;
- }
- es = (struct ext4_super_block *)(bh->b_data + offset);
- sbi->s_es = es;
- if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
- ext4_msg(sb, KERN_ERR,
- "Magic mismatch, very weird!");
- goto failed_mount;
- }
- }
-
has_huge_files = ext4_has_feature_huge_file(sb);
sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
has_huge_files);
@@ -4866,20 +5167,22 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
- sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
- if (sbi->s_inodes_per_block == 0)
- goto cantfind_ext4;
+ sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb);
+ if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) {
+ if (!silent)
+ ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
+ goto failed_mount;
+ }
if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
- sbi->s_inodes_per_group > blocksize * 8) {
+ sbi->s_inodes_per_group > sb->s_blocksize * 8) {
ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
sbi->s_inodes_per_group);
goto failed_mount;
}
sbi->s_itb_per_group = sbi->s_inodes_per_group /
sbi->s_inodes_per_block;
- sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
- sbi->s_sbh = bh;
- sbi->s_mount_state = le16_to_cpu(es->s_state);
+ sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb);
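+	/* EXT4_FC_REPLAY is an in-memory flag; never inherit it from the on-disk state. */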
+ sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY;
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
@@ -4904,54 +5207,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
}
}
- /* Handle clustersize */
- clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
- if (ext4_has_feature_bigalloc(sb)) {
- if (clustersize < blocksize) {
- ext4_msg(sb, KERN_ERR,
- "cluster size (%d) smaller than "
- "block size (%d)", clustersize, blocksize);
- goto failed_mount;
- }
- sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
- le32_to_cpu(es->s_log_block_size);
- sbi->s_clusters_per_group =
- le32_to_cpu(es->s_clusters_per_group);
- if (sbi->s_clusters_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#clusters per group too big: %lu",
- sbi->s_clusters_per_group);
- goto failed_mount;
- }
- if (sbi->s_blocks_per_group !=
- (sbi->s_clusters_per_group * (clustersize / blocksize))) {
- ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
- "clusters per group (%lu) inconsistent",
- sbi->s_blocks_per_group,
- sbi->s_clusters_per_group);
- goto failed_mount;
- }
- } else {
- if (clustersize != blocksize) {
- ext4_msg(sb, KERN_ERR,
- "fragment/cluster size (%d) != "
- "block size (%d)", clustersize, blocksize);
- goto failed_mount;
- }
- if (sbi->s_blocks_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#blocks per group too big: %lu",
- sbi->s_blocks_per_group);
- goto failed_mount;
- }
- sbi->s_clusters_per_group = sbi->s_blocks_per_group;
- sbi->s_cluster_bits = 0;
- }
- sbi->s_cluster_ratio = clustersize / blocksize;
-
- /* Do we have standard group size of clustersize * 8 blocks ? */
- if (sbi->s_blocks_per_group == clustersize << 3)
- set_opt2(sb, STD_GROUP_SIZE);
+ if (ext4_handle_clustersize(sb))
+ goto failed_mount;
/*
* Test whether we have more sectors than will fit in sector_t,
@@ -4965,111 +5222,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount;
}
- if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
- goto cantfind_ext4;
-
- /* check blocks count against device size */
- blocks_count = sb_bdev_nr_blocks(sb);
- if (blocks_count && ext4_blocks_count(es) > blocks_count) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
- "exceeds size of device (%llu blocks)",
- ext4_blocks_count(es), blocks_count);
- goto failed_mount;
- }
-
- /*
- * It makes no sense for the first data block to be beyond the end
- * of the filesystem.
- */
- if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
- "block %u is beyond end of filesystem (%llu)",
- le32_to_cpu(es->s_first_data_block),
- ext4_blocks_count(es));
- goto failed_mount;
- }
- if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
- (sbi->s_cluster_ratio == 1)) {
- ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
- "block is 0 with a 1k block and cluster size");
+ if (ext4_geometry_check(sb, es))
goto failed_mount;
- }
- blocks_count = (ext4_blocks_count(es) -
- le32_to_cpu(es->s_first_data_block) +
- EXT4_BLOCKS_PER_GROUP(sb) - 1);
- do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
- if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
- ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
- "(block count %llu, first data block %u, "
- "blocks per group %lu)", blocks_count,
- ext4_blocks_count(es),
- le32_to_cpu(es->s_first_data_block),
- EXT4_BLOCKS_PER_GROUP(sb));
- goto failed_mount;
- }
- sbi->s_groups_count = blocks_count;
- sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
- (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
- if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
- le32_to_cpu(es->s_inodes_count)) {
- ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
- le32_to_cpu(es->s_inodes_count),
- ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
- ret = -EINVAL;
- goto failed_mount;
- }
- db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
- EXT4_DESC_PER_BLOCK(sb);
- if (ext4_has_feature_meta_bg(sb)) {
- if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
- ext4_msg(sb, KERN_WARNING,
- "first meta block group too large: %u "
- "(group descriptor block count %u)",
- le32_to_cpu(es->s_first_meta_bg), db_count);
- goto failed_mount;
- }
- }
- rcu_assign_pointer(sbi->s_group_desc,
- kvmalloc_array(db_count,
- sizeof(struct buffer_head *),
- GFP_KERNEL));
- if (sbi->s_group_desc == NULL) {
- ext4_msg(sb, KERN_ERR, "not enough memory");
- ret = -ENOMEM;
+ err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
+ if (err)
goto failed_mount;
- }
-
- bgl_lock_init(sbi->s_blockgroup_lock);
-
- /* Pre-read the descriptors into the buffer cache */
- for (i = 0; i < db_count; i++) {
- block = descriptor_loc(sb, logical_sb_block, i);
- ext4_sb_breadahead_unmovable(sb, block);
- }
-
- for (i = 0; i < db_count; i++) {
- struct buffer_head *bh;
-
- block = descriptor_loc(sb, logical_sb_block, i);
- bh = ext4_sb_bread_unmovable(sb, block);
- if (IS_ERR(bh)) {
- ext4_msg(sb, KERN_ERR,
- "can't read group descriptor %d", i);
- db_count = i;
- ret = PTR_ERR(bh);
- goto failed_mount2;
- }
- rcu_read_lock();
- rcu_dereference(sbi->s_group_desc)[i] = bh;
- rcu_read_unlock();
- }
- sbi->s_gdb_count = db_count;
- if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
- ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
- ret = -EFSCORRUPTED;
- goto failed_mount2;
- }
timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
spin_lock_init(&sbi->s_error_lock);
@@ -5107,24 +5265,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
- /* Initialize fast commit stuff */
- atomic_set(&sbi->s_fc_subtid, 0);
- INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
- INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
- INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
- INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
- sbi->s_fc_bytes = 0;
- ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- sbi->s_fc_ineligible_tid = 0;
- spin_lock_init(&sbi->s_fc_lock);
- memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
- sbi->s_fc_replay_state.fc_regions = NULL;
- sbi->s_fc_replay_state.fc_regions_size = 0;
- sbi->s_fc_replay_state.fc_regions_used = 0;
- sbi->s_fc_replay_state.fc_regions_valid = 0;
- sbi->s_fc_replay_state.fc_modified_inodes = NULL;
- sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
- sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
+ ext4_fast_commit_init(sb);
sb->s_root = NULL;
@@ -5141,37 +5282,37 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* root first: it may be modified in the journal!
*/
if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
- err = ext4_load_journal(sb, es, ctx->journal_devnum);
+ err = ext4_load_and_init_journal(sb, es, ctx);
if (err)
goto failed_mount3a;
} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
ext4_has_feature_journal_needs_recovery(sb)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
"suppressed and not mounted read-only");
- goto failed_mount_wq;
+ goto failed_mount3a;
} else {
/* Nojournal mode, all journal mount options are illegal */
if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"journal_checksum, fs mounted w/o journal");
- goto failed_mount_wq;
+ goto failed_mount3a;
}
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"journal_async_commit, fs mounted w/o journal");
- goto failed_mount_wq;
+ goto failed_mount3a;
}
if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"commit=%lu, fs mounted w/o journal",
sbi->s_commit_interval / HZ);
- goto failed_mount_wq;
+ goto failed_mount3a;
}
if (EXT4_MOUNT_DATA_FLAGS &
(sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"data=, fs mounted w/o journal");
- goto failed_mount_wq;
+ goto failed_mount3a;
}
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
@@ -5179,76 +5320,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
clear_opt2(sb, JOURNAL_FAST_COMMIT);
sbi->s_journal = NULL;
needs_recovery = 0;
- goto no_journal;
- }
-
- if (ext4_has_feature_64bit(sb) &&
- !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
- JBD2_FEATURE_INCOMPAT_64BIT)) {
- ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
- goto failed_mount_wq;
- }
-
- if (!set_journal_csum_feature_set(sb)) {
- ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
- "feature set");
- goto failed_mount_wq;
- }
-
- if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
- !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
- JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
- ext4_msg(sb, KERN_ERR,
- "Failed to set fast commit journal feature");
- goto failed_mount_wq;
}
- /* We have now updated the journal if required, so we can
- * validate the data journaling mode. */
- switch (test_opt(sb, DATA_FLAGS)) {
- case 0:
- /* No mode set, assume a default based on the journal
- * capabilities: ORDERED_DATA if the journal can
- * cope, else JOURNAL_DATA
- */
- if (jbd2_journal_check_available_features
- (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
- set_opt(sb, ORDERED_DATA);
- sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
- } else {
- set_opt(sb, JOURNAL_DATA);
- sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
- }
- break;
-
- case EXT4_MOUNT_ORDERED_DATA:
- case EXT4_MOUNT_WRITEBACK_DATA:
- if (!jbd2_journal_check_available_features
- (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
- ext4_msg(sb, KERN_ERR, "Journal does not support "
- "requested data journaling mode");
- goto failed_mount_wq;
- }
- break;
- default:
- break;
- }
-
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
- test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_async_commit in data=ordered mode");
- goto failed_mount_wq;
- }
-
- set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
-
- sbi->s_journal->j_submit_inode_data_buffers =
- ext4_journal_submit_inode_data_buffers;
- sbi->s_journal->j_finish_inode_data_buffers =
- ext4_journal_finish_inode_data_buffers;
-
-no_journal:
if (!test_opt(sb, NO_MBCACHE)) {
sbi->s_ea_block_cache = ext4_xattr_create_cache();
if (!sbi->s_ea_block_cache) {
@@ -5267,24 +5340,27 @@ no_journal:
}
}
- if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
+ if (ext4_has_feature_verity(sb) && sb->s_blocksize != PAGE_SIZE) {
ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
goto failed_mount_wq;
}
- if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) &&
- !ext4_has_feature_encrypt(sb)) {
- ext4_set_feature_encrypt(sb);
- ext4_commit_super(sb);
- }
-
/*
* Get the # of file system overhead blocks from the
* superblock if present.
*/
- if (es->s_overhead_clusters)
- sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
- else {
+ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
+ /* ignore the precalculated value if it is ridiculous */
+ if (sbi->s_overhead > ext4_blocks_count(es))
+ sbi->s_overhead = 0;
+ /*
+ * If the bigalloc feature is not enabled recalculating the
+ * overhead doesn't take long, so we might as well just redo
+ * it to make sure we are using the correct value.
+ */
+ if (!ext4_has_feature_bigalloc(sb))
+ sbi->s_overhead = 0;
+ if (sbi->s_overhead == 0) {
err = ext4_calculate_overhead(sb);
if (err)
goto failed_mount_wq;
@@ -5386,14 +5462,6 @@ no_journal:
err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
GFP_KERNEL);
}
- /*
- * Update the checksum after updating free space/inode
- * counters. Otherwise the superblock can have an incorrect
- * checksum in the buffer cache until it is written out and
- * e2fsprogs programs trying to open a file system immediately
- * after it is mounted can fail.
- */
- ext4_superblock_csum_set(sb);
if (!err)
err = percpu_counter_init(&sbi->s_dirs_counter,
ext4_count_dirs(sb), GFP_KERNEL);
@@ -5451,6 +5519,14 @@ no_journal:
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
+ /*
+ * Update the checksum after updating free space/inode counters and
+ * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect
+ * checksum in the buffer cache until it is written out and
+ * e2fsprogs programs trying to open a file system immediately
+ * after it is mounted can fail.
+ */
+ ext4_superblock_csum_set(sb);
if (needs_recovery) {
ext4_msg(sb, KERN_INFO, "recovery complete");
err = ext4_mark_recovery_complete(sb, es);
@@ -5458,13 +5534,9 @@ no_journal:
goto failed_mount9;
}
- if (test_opt(sb, DISCARD)) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
- if (!blk_queue_discard(q))
- ext4_msg(sb, KERN_WARNING,
- "mounting with \"discard\" option, but "
- "the device does not support discard");
- }
+ if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
+ ext4_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but the device does not support discard");
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@@ -5478,11 +5550,6 @@ no_journal:
return 0;
-cantfind_ext4:
- if (!silent)
- ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
- goto failed_mount;
-
failed_mount9:
ext4_release_orphan_info(sb);
failed_mount8:
@@ -5536,13 +5603,7 @@ failed_mount3:
flush_work(&sbi->s_error_work);
del_timer_sync(&sbi->s_err_report);
ext4_stop_mmpd(sbi);
-failed_mount2:
- rcu_read_lock();
- group_desc = rcu_dereference(sbi->s_group_desc);
- for (i = 0; i < db_count; i++)
- brelse(group_desc[i]);
- kvfree(group_desc);
- rcu_read_unlock();
+ ext4_group_desc_free(sbi);
failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
@@ -5557,7 +5618,7 @@ failed_mount:
#endif
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
/* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
- brelse(bh);
+ brelse(sbi->s_sbh);
ext4_blkdev_remove(sbi);
out_fail:
sb->s_fs_info = NULL;
@@ -5602,6 +5663,8 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
"Quota mode: %s.", descr, ext4_quota_mode(sb));
+ /* Update the s_overhead_clusters if necessary */
+ ext4_update_overhead(sb, false);
return 0;
free_sbi:
@@ -5663,7 +5726,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
return NULL;
}
- jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
+ ext4_debug("Journal inode found at %p: %lld bytes\n",
journal_inode, journal_inode->i_size);
if (!S_ISREG(journal_inode->i_mode)) {
ext4_msg(sb, KERN_ERR, "invalid journal inode");
@@ -5984,7 +6047,6 @@ static void ext4_update_super(struct super_block *sb)
static int ext4_commit_super(struct super_block *sb)
{
struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
- int error = 0;
if (!sbh)
return -EINVAL;
@@ -5993,6 +6055,13 @@ static int ext4_commit_super(struct super_block *sb)
ext4_update_super(sb);
+ lock_buffer(sbh);
+ /* Buffer got discarded which means block device got invalidated */
+ if (!buffer_mapped(sbh)) {
+ unlock_buffer(sbh);
+ return -EIO;
+ }
+
if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
/*
* Oh, dear. A previous attempt to write the
@@ -6007,17 +6076,21 @@ static int ext4_commit_super(struct super_block *sb)
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
}
- BUFFER_TRACE(sbh, "marking dirty");
- mark_buffer_dirty(sbh);
- error = __sync_dirty_buffer(sbh,
- REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
+ get_bh(sbh);
+ /* Clear a potential dirty bit left over from a journalled update */
+ clear_buffer_dirty(sbh);
+ sbh->b_end_io = end_buffer_write_sync;
+ submit_bh(REQ_OP_WRITE | REQ_SYNC |
+ (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
+ wait_on_buffer(sbh);
if (buffer_write_io_error(sbh)) {
ext4_msg(sb, KERN_ERR, "I/O error while writing "
"superblock");
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
+ return -EIO;
}
- return error;
+ return 0;
}
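Note: the rewritten ext4_commit_super() no longer goes through mark_buffer_dirty() plus __sync_dirty_buffer(); it issues the write itself, which lets it catch a buffer torn down by device invalidation and avoid racing with a dirty bit owned by a journalled update. Condensed from the hunk above (error paths trimmed):

    lock_buffer(sbh);
    if (!buffer_mapped(sbh)) {		/* bdev was invalidated under us */
    	unlock_buffer(sbh);
    	return -EIO;
    }
    get_bh(sbh);			/* reference for the in-flight I/O */
    clear_buffer_dirty(sbh);		/* dirty bit may belong to a journalled update */
    sbh->b_end_io = end_buffer_write_sync;
    submit_bh(REQ_OP_WRITE | REQ_SYNC, sbh);
    wait_on_buffer(sbh);		/* end_buffer_write_sync() drops the ref */
    return buffer_write_io_error(sbh) ? -EIO : 0;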
/*
@@ -6258,7 +6331,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
char *to_free[EXT4_MAXQUOTAS];
#endif
- ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
/* Store the original options */
old_sb_flags = sb->s_flags;
@@ -6284,9 +6356,14 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
} else
old_opts.s_qf_names[i] = NULL;
#endif
- if (sbi->s_journal && sbi->s_journal->j_task->io_context)
- ctx->journal_ioprio =
- sbi->s_journal->j_task->io_context->ioprio;
+ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) {
+ if (sbi->s_journal && sbi->s_journal->j_task->io_context)
+ ctx->journal_ioprio =
+ sbi->s_journal->j_task->io_context->ioprio;
+ else
+ ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+
+ }
ext4_apply_options(fc, sb);
@@ -6427,7 +6504,8 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
if (err)
goto restore_opts;
}
- sbi->s_mount_state = le16_to_cpu(es->s_state);
+ sbi->s_mount_state = (le16_to_cpu(es->s_state) &
+ ~EXT4_FC_REPLAY);
err = ext4_setup_super(sb, es, 0);
if (err)
@@ -6706,7 +6784,7 @@ static int ext4_write_info(struct super_block *sb, int type)
handle_t *handle;
/* Data block + inode block */
- handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit_info(sb, type);
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 69109746e6e2..3d3ed3c38f56 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -27,7 +27,7 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct page *cpage = NULL;
+ struct buffer_head *bh = NULL;
const void *caddr;
unsigned int max_size;
const char *paddr;
@@ -39,16 +39,19 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
caddr = EXT4_I(inode)->i_data;
max_size = sizeof(EXT4_I(inode)->i_data);
} else {
- cpage = read_mapping_page(inode->i_mapping, 0, NULL);
- if (IS_ERR(cpage))
- return ERR_CAST(cpage);
- caddr = page_address(cpage);
+ bh = ext4_bread(NULL, inode, 0, 0);
+ if (IS_ERR(bh))
+ return ERR_CAST(bh);
+ if (!bh) {
+ EXT4_ERROR_INODE(inode, "bad symlink.");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+ caddr = bh->b_data;
max_size = inode->i_sb->s_blocksize;
}
paddr = fscrypt_get_symlink(inode, caddr, max_size, done);
- if (cpage)
- put_page(cpage);
+ brelse(bh);
return paddr;
}
@@ -62,6 +65,53 @@ static int ext4_encrypted_symlink_getattr(struct user_namespace *mnt_userns,
return fscrypt_symlink_getattr(path, stat);
}
+static void ext4_free_link(void *bh)
+{
+ brelse(bh);
+}
+
+static const char *ext4_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
+{
+ struct buffer_head *bh;
+ char *inline_link;
+
+ /*
+ * Creating a new inline symlink is not supported; this only provides
+ * a way to read inline symlinks left over on existing filesystems.
+ */
+ if (ext4_has_inline_data(inode)) {
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ inline_link = ext4_read_inline_link(inode);
+ if (!IS_ERR(inline_link))
+ set_delayed_call(callback, kfree_link, inline_link);
+ return inline_link;
+ }
+
+ if (!dentry) {
+ bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT);
+ if (IS_ERR(bh))
+ return ERR_CAST(bh);
+ if (!bh || !ext4_buffer_uptodate(bh))
+ return ERR_PTR(-ECHILD);
+ } else {
+ bh = ext4_bread(NULL, inode, 0, 0);
+ if (IS_ERR(bh))
+ return ERR_CAST(bh);
+ if (!bh) {
+ EXT4_ERROR_INODE(inode, "bad symlink.");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+ }
+
+ set_delayed_call(callback, ext4_free_link, bh);
+ nd_terminate_link(bh->b_data, inode->i_size,
+ inode->i_sb->s_blocksize - 1);
+ return bh->b_data;
+}
+
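Note: for context, this is how a ->get_link() result is consumed; the delayed_call machinery is what keeps the returned buffer (here a buffer_head) alive until the caller is done. A hedged caller-side sketch, not taken from this patch:

    DEFINE_DELAYED_CALL(done);
    const char *link = inode->i_op->get_link(dentry, inode, &done);

    if (!IS_ERR(link)) {
    	/* ... walk or copy the symlink body pointed to by `link` ... */
    }
    do_delayed_call(&done);	/* runs ext4_free_link() -> brelse(bh) */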
const struct inode_operations ext4_encrypted_symlink_inode_operations = {
.get_link = ext4_encrypted_get_link,
.setattr = ext4_setattr,
@@ -70,7 +120,7 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = {
};
const struct inode_operations ext4_symlink_inode_operations = {
- .get_link = page_get_link,
+ .get_link = ext4_get_link,
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index eacbd489e3bf..20cadfb740dc 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -69,6 +69,9 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count,
static int pagecache_write(struct inode *inode, const void *buf, size_t count,
loff_t pos)
{
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+
if (pos + count > inode->i_sb->s_maxbytes)
return -EFBIG;
@@ -79,15 +82,13 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
void *fsdata;
int res;
- res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
- &page, &fsdata);
+ res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
if (res)
return res;
memcpy_to_page(page, offset_in_page(pos), buf, n);
- res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
- page, fsdata);
+ res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata);
if (res < 0)
return res;
if (res != n)
@@ -297,16 +298,14 @@ static int ext4_get_verity_descriptor_location(struct inode *inode,
last_extent = path[path->p_depth].p_ext;
if (!last_extent) {
EXT4_ERROR_INODE(inode, "verity file has no extents");
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
return -EFSCORRUPTED;
}
end_lblk = le32_to_cpu(last_extent->ee_block) +
ext4_ext_get_actual_len(last_extent);
desc_size_pos = (u64)end_lblk << inode->i_blkbits;
- ext4_ext_drop_refs(path);
- kfree(path);
+ ext4_free_ext_path(path);
if (desc_size_pos < sizeof(desc_size_disk))
goto bad;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 042325349098..36d6ba7190b6 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -436,6 +436,21 @@ error:
return err;
}
+/* Remove entry from mbcache when EA inode is getting evicted */
+void ext4_evict_ea_inode(struct inode *inode)
+{
+ struct mb_cache_entry *oe;
+
+ if (!EA_INODE_CACHE(inode))
+ return;
+ /* Wait for entry to get unused so that we can remove it */
+ while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode),
+ ext4_xattr_inode_get_hash(inode), inode->i_ino))) {
+ mb_cache_entry_wait_unused(oe);
+ mb_cache_entry_put(EA_INODE_CACHE(inode), oe);
+ }
+}
+
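Note: the loop above is the mbcache "delete or wait" idiom this series leans on: mb_cache_entry_delete_or_get() unhashes the entry if it is idle, otherwise it returns the busy entry with a reference held. The generic shape (cache/hash/key are placeholders):

    struct mb_cache_entry *oe;

    while ((oe = mb_cache_entry_delete_or_get(cache, hash, key))) {
    	mb_cache_entry_wait_unused(oe);	/* sleep until the last user drops it */
    	mb_cache_entry_put(cache, oe);	/* drop our reference, then retry */
    }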
static int
ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
struct ext4_xattr_entry *entry, void *buffer,
@@ -976,10 +991,8 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
int ref_change)
{
- struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
struct ext4_iloc iloc;
s64 ref_count;
- u32 hash;
int ret;
inode_lock(ea_inode);
@@ -1002,14 +1015,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
set_nlink(ea_inode, 1);
ext4_orphan_del(handle, ea_inode);
-
- if (ea_inode_cache) {
- hash = ext4_xattr_inode_get_hash(ea_inode);
- mb_cache_entry_create(ea_inode_cache,
- GFP_NOFS, hash,
- ea_inode->i_ino,
- true /* reusable */);
- }
}
} else {
WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
@@ -1022,12 +1027,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
clear_nlink(ea_inode);
ext4_orphan_add(handle, ea_inode);
-
- if (ea_inode_cache) {
- hash = ext4_xattr_inode_get_hash(ea_inode);
- mb_cache_entry_delete(ea_inode_cache, hash,
- ea_inode->i_ino);
- }
}
}
@@ -1237,6 +1236,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
if (error)
goto out;
+retry_ref:
lock_buffer(bh);
hash = le32_to_cpu(BHDR(bh)->h_hash);
ref = le32_to_cpu(BHDR(bh)->h_refcount);
@@ -1246,9 +1246,18 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
* This must happen under buffer lock for
* ext4_xattr_block_set() to reliably detect freed block
*/
- if (ea_block_cache)
- mb_cache_entry_delete(ea_block_cache, hash,
- bh->b_blocknr);
+ if (ea_block_cache) {
+ struct mb_cache_entry *oe;
+
+ oe = mb_cache_entry_delete_or_get(ea_block_cache, hash,
+ bh->b_blocknr);
+ if (oe) {
+ unlock_buffer(bh);
+ mb_cache_entry_wait_unused(oe);
+ mb_cache_entry_put(ea_block_cache, oe);
+ goto retry_ref;
+ }
+ }
get_bh(bh);
unlock_buffer(bh);
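Note the lock ordering in the retry_ref loop above: the buffer lock is dropped before mb_cache_entry_wait_unused() sleeps, presumably because the entry's current user may itself need the buffer lock to finish. Reduced to a pattern:

    retry:
    	lock_buffer(bh);
    	oe = mb_cache_entry_delete_or_get(cache, hash, bh->b_blocknr);
    	if (oe) {
    		unlock_buffer(bh);		/* never sleep holding the lock */
    		mb_cache_entry_wait_unused(oe);
    		mb_cache_entry_put(cache, oe);
    		goto retry;
    	}
    	/* proceed with the buffer locked and the entry unhashed */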
@@ -1858,6 +1867,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
#define header(x) ((struct ext4_xattr_header *)(x))
if (s->base) {
+ int offset = (char *)s->here - bs->bh->b_data;
+
BUFFER_TRACE(bs->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, sb, bs->bh,
EXT4_JTR_NONE);
@@ -1873,9 +1884,20 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
* ext4_xattr_block_set() to reliably detect modified
* block
*/
- if (ea_block_cache)
- mb_cache_entry_delete(ea_block_cache, hash,
- bs->bh->b_blocknr);
+ if (ea_block_cache) {
+ struct mb_cache_entry *oe;
+
+ oe = mb_cache_entry_delete_or_get(ea_block_cache,
+ hash, bs->bh->b_blocknr);
+ if (oe) {
+ /*
+ * Xattr block is getting reused. Leave
+ * it alone.
+ */
+ mb_cache_entry_put(ea_block_cache, oe);
+ goto clone_block;
+ }
+ }
ea_bdebug(bs->bh, "modifying in-place");
error = ext4_xattr_set_entry(i, s, handle, inode,
true /* is_block */);
@@ -1890,50 +1912,47 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (error)
goto cleanup;
goto inserted;
- } else {
- int offset = (char *)s->here - bs->bh->b_data;
+ }
+clone_block:
+ unlock_buffer(bs->bh);
+ ea_bdebug(bs->bh, "cloning");
+ s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
+ error = -ENOMEM;
+ if (s->base == NULL)
+ goto cleanup;
+ s->first = ENTRY(header(s->base)+1);
+ header(s->base)->h_refcount = cpu_to_le32(1);
+ s->here = ENTRY(s->base + offset);
+ s->end = s->base + bs->bh->b_size;
- unlock_buffer(bs->bh);
- ea_bdebug(bs->bh, "cloning");
- s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
- error = -ENOMEM;
- if (s->base == NULL)
+ /*
+ * If existing entry points to an xattr inode, we need
+ * to prevent ext4_xattr_set_entry() from decrementing
+ * ref count on it because the reference belongs to the
+ * original block. In this case, make the entry look
+ * like it has an empty value.
+ */
+ if (!s->not_found && s->here->e_value_inum) {
+ ea_ino = le32_to_cpu(s->here->e_value_inum);
+ error = ext4_xattr_inode_iget(inode, ea_ino,
+ le32_to_cpu(s->here->e_hash),
+ &tmp_inode);
+ if (error)
goto cleanup;
- memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
- s->first = ENTRY(header(s->base)+1);
- header(s->base)->h_refcount = cpu_to_le32(1);
- s->here = ENTRY(s->base + offset);
- s->end = s->base + bs->bh->b_size;
-
- /*
- * If existing entry points to an xattr inode, we need
- * to prevent ext4_xattr_set_entry() from decrementing
- * ref count on it because the reference belongs to the
- * original block. In this case, make the entry look
- * like it has an empty value.
- */
- if (!s->not_found && s->here->e_value_inum) {
- ea_ino = le32_to_cpu(s->here->e_value_inum);
- error = ext4_xattr_inode_iget(inode, ea_ino,
- le32_to_cpu(s->here->e_hash),
- &tmp_inode);
- if (error)
- goto cleanup;
-
- if (!ext4_test_inode_state(tmp_inode,
- EXT4_STATE_LUSTRE_EA_INODE)) {
- /*
- * Defer quota free call for previous
- * inode until success is guaranteed.
- */
- old_ea_inode_quota = le32_to_cpu(
- s->here->e_value_size);
- }
- iput(tmp_inode);
- s->here->e_value_inum = 0;
- s->here->e_value_size = 0;
+ if (!ext4_test_inode_state(tmp_inode,
+ EXT4_STATE_LUSTRE_EA_INODE)) {
+ /*
+ * Defer quota free call for previous
+ * inode until success is guaranteed.
+ */
+ old_ea_inode_quota = le32_to_cpu(
+ s->here->e_value_size);
}
+ iput(tmp_inode);
+
+ s->here->e_value_inum = 0;
+ s->here->e_value_size = 0;
}
} else {
/* Allocate a buffer where we construct the new block. */
@@ -2000,18 +2019,13 @@ inserted:
lock_buffer(new_bh);
/*
* We have to be careful about races with
- * freeing, rehashing or adding references to
- * xattr block. Once we hold buffer lock xattr
- * block's state is stable so we can check
- * whether the block got freed / rehashed or
- * not. Since we unhash mbcache entry under
- * buffer lock when freeing / rehashing xattr
- * block, checking whether entry is still
- * hashed is reliable. Same rules hold for
- * e_reusable handling.
+ * adding references to xattr block. Once we
+ * hold buffer lock xattr block's state is
+ * stable so we can check the additional
+ * reference fits.
*/
- if (hlist_bl_unhashed(&ce->e_hash_list) ||
- !ce->e_reusable) {
+ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
+ if (ref > EXT4_XATTR_REFCOUNT_MAX) {
/*
* Undo everything and check mbcache
* again.
@@ -2026,9 +2040,8 @@ inserted:
new_bh = NULL;
goto inserted;
}
- ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
- if (ref >= EXT4_XATTR_REFCOUNT_MAX)
+ if (ref == EXT4_XATTR_REFCOUNT_MAX)
ce->e_reusable = 0;
ea_bdebug(new_bh, "reusing; refcount now=%d",
ref);
@@ -2176,8 +2189,9 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
struct ext4_inode *raw_inode;
int error;
- if (EXT4_I(inode)->i_extra_isize == 0)
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
return 0;
+
raw_inode = ext4_raw_inode(&is->iloc);
header = IHDR(inode, raw_inode);
is->s.base = is->s.first = IFIRST(header);
@@ -2205,8 +2219,9 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_search *s = &is->s;
int error;
- if (EXT4_I(inode)->i_extra_isize == 0)
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
return -ENOSPC;
+
error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
if (error)
return error;
@@ -2397,6 +2412,7 @@ retry_inode:
if (!error) {
ext4_xattr_update_super_block(handle, inode->i_sb);
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
if (!value)
no_expand = 0;
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 77efb9a627ad..824faf0b15a8 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -84,7 +84,7 @@ struct ext4_xattr_entry {
/*
* The minimum size of EA value when you start storing it in an external inode
* size of block - size of header - size of 1 entry - 4 null bytes
-*/
+ */
#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \
((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)
@@ -95,6 +95,19 @@ struct ext4_xattr_entry {
#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
+/*
+ * If we want to add an xattr to the inode, we should make sure that
+ * i_extra_isize is not 0 and that the inode size is not less than
+ * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + header + pad.
+ * EXT4_GOOD_OLD_INODE_SIZE extra_isize header entry pad data
+ * |--------------------------|------------|------|---------|---|-------|
+ */
+#define EXT4_INODE_HAS_XATTR_SPACE(inode) \
+ ((EXT4_I(inode)->i_extra_isize != 0) && \
+ (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize + \
+ sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <= \
+ EXT4_INODE_SIZE((inode)->i_sb)))
+
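A worked example of the macro, assuming a 256-byte on-disk inode with i_extra_isize = 32 (the ibody header is the 4-byte h_magic, and EXT4_XATTR_PAD is 4):

    /* 128 (GOOD_OLD) + 32 (extra_isize) + 4 (header) + 4 (pad) = 168
     * 168 <= 256 -> the inode has in-body xattr space.
     * A legacy 128-byte inode always fails: i_extra_isize must be 0 there. */
    bool has_space = (32 != 0) && (128 + 32 + 4 + 4 <= 256);	/* true */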
struct ext4_xattr_info {
const char *name;
const void *value;
@@ -178,6 +191,7 @@ extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
+extern void ext4_evict_ea_inode(struct inode *inode);
extern const struct xattr_handler *ext4_xattr_handlers[];
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index f5366feea82d..8259e0fa97e1 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -98,13 +98,7 @@ repeat:
}
if (unlikely(!PageUptodate(page))) {
- if (page->index == sbi->metapage_eio_ofs &&
- sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) {
- set_ckpt_flags(sbi, CP_ERROR_FLAG);
- } else {
- sbi->metapage_eio_ofs = page->index;
- sbi->metapage_eio_cnt = 0;
- }
+ f2fs_handle_page_eio(sbi, page->index, META);
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
@@ -158,7 +152,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
blkaddr, exist);
set_sbi_flag(sbi, SBI_NEED_FSCK);
- WARN_ON(1);
+ dump_stack();
}
return exist;
}
@@ -196,7 +190,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
f2fs_warn(sbi, "access invalid blkaddr:%u",
blkaddr);
set_sbi_flag(sbi, SBI_NEED_FSCK);
- WARN_ON(1);
+ dump_stack();
return false;
} else {
return __is_bitmap_valid(sbi, blkaddr, type);
@@ -468,10 +462,8 @@ const struct address_space_operations f2fs_meta_aops = {
.writepages = f2fs_write_meta_pages,
.dirty_folio = f2fs_dirty_meta_folio,
.invalidate_folio = f2fs_invalidate_folio,
- .releasepage = f2fs_release_page,
-#ifdef CONFIG_MIGRATION
- .migratepage = f2fs_migrate_page,
-#endif
+ .release_folio = f2fs_release_folio,
+ .migrate_folio = filemap_migrate_folio,
};
static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
@@ -1010,9 +1002,7 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type)
return;
set_inode_flag(inode, flag);
- if (!f2fs_is_volatile_file(inode))
- list_add_tail(&F2FS_I(inode)->dirty_list,
- &sbi->inode_list[type]);
+ list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]);
stat_inc_dirty_inode(sbi, type);
}
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 12a56f9e1572..70e97075e535 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -729,14 +729,19 @@ out:
return ret;
}
-void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
+static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic,
+ bool pre_alloc);
+static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic,
+ bool bypass_destroy_callback, bool pre_alloc);
+
+void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
struct f2fs_inode_info *fi = F2FS_I(dic->inode);
const struct f2fs_compress_ops *cops =
f2fs_cops[fi->i_compress_algorithm];
+ bool bypass_callback = false;
int ret;
- int i;
trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx,
dic->cluster_size, fi->i_compress_algorithm);
@@ -746,41 +751,10 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
goto out_end_io;
}
- dic->tpages = page_array_alloc(dic->inode, dic->cluster_size);
- if (!dic->tpages) {
- ret = -ENOMEM;
- goto out_end_io;
- }
-
- for (i = 0; i < dic->cluster_size; i++) {
- if (dic->rpages[i]) {
- dic->tpages[i] = dic->rpages[i];
- continue;
- }
-
- dic->tpages[i] = f2fs_compress_alloc_page();
- if (!dic->tpages[i]) {
- ret = -ENOMEM;
- goto out_end_io;
- }
- }
-
- if (cops->init_decompress_ctx) {
- ret = cops->init_decompress_ctx(dic);
- if (ret)
- goto out_end_io;
- }
-
- dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size);
- if (!dic->rbuf) {
- ret = -ENOMEM;
- goto out_destroy_decompress_ctx;
- }
-
- dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages);
- if (!dic->cbuf) {
- ret = -ENOMEM;
- goto out_vunmap_rbuf;
+ ret = f2fs_prepare_decomp_mem(dic, false);
+ if (ret) {
+ bypass_callback = true;
+ goto out_release;
}
dic->clen = le32_to_cpu(dic->cbuf->clen);
@@ -788,7 +762,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) {
ret = -EFSCORRUPTED;
- goto out_vunmap_cbuf;
+ goto out_release;
}
ret = cops->decompress_pages(dic);
@@ -809,17 +783,13 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
}
}
-out_vunmap_cbuf:
- vm_unmap_ram(dic->cbuf, dic->nr_cpages);
-out_vunmap_rbuf:
- vm_unmap_ram(dic->rbuf, dic->cluster_size);
-out_destroy_decompress_ctx:
- if (cops->destroy_decompress_ctx)
- cops->destroy_decompress_ctx(dic);
+out_release:
+ f2fs_release_decomp_mem(dic, bypass_callback, false);
+
out_end_io:
trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
dic->clen, ret);
- f2fs_decompress_end_io(dic, ret);
+ f2fs_decompress_end_io(dic, ret, in_task);
}
/*
@@ -829,7 +799,7 @@ out_end_io:
* (or in the case of a failure, cleans up without actually decompressing).
*/
void f2fs_end_read_compressed_page(struct page *page, bool failed,
- block_t blkaddr)
+ block_t blkaddr, bool in_task)
{
struct decompress_io_ctx *dic =
(struct decompress_io_ctx *)page_private(page);
@@ -839,12 +809,12 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed,
if (failed)
WRITE_ONCE(dic->failed, true);
- else if (blkaddr)
+ else if (blkaddr && in_task)
f2fs_cache_compressed_page(sbi, page,
dic->inode->i_ino, blkaddr);
if (atomic_dec_and_test(&dic->remaining_pages))
- f2fs_decompress_cluster(dic);
+ f2fs_decompress_cluster(dic, in_task);
}
static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
@@ -871,19 +841,26 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index)
return is_page_in_cluster(cc, index);
}
-bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec,
- int index, int nr_pages)
+bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages,
+ int index, int nr_pages, bool uptodate)
{
- unsigned long pgidx;
- int i;
+ unsigned long pgidx = pages[index]->index;
+ int i = uptodate ? 0 : 1;
- if (nr_pages - index < cc->cluster_size)
+ /*
+ * When uptodate is true, also check that every page in the cluster is
+ * uptodate.
+ */
+ if (uptodate && (pgidx % cc->cluster_size))
return false;
- pgidx = pvec->pages[index]->index;
+ if (nr_pages - index < cc->cluster_size)
+ return false;
- for (i = 1; i < cc->cluster_size; i++) {
- if (pvec->pages[index + i]->index != pgidx + i)
+ for (; i < cc->cluster_size; i++) {
+ if (pages[index + i]->index != pgidx + i)
+ return false;
+ if (uptodate && !PageUptodate(pages[index + i]))
return false;
}
@@ -1552,16 +1529,85 @@ destroy_out:
return err;
}
-static void f2fs_free_dic(struct decompress_io_ctx *dic);
+static inline bool allow_memalloc_for_decomp(struct f2fs_sb_info *sbi,
+ bool pre_alloc)
+{
+ return pre_alloc ^ f2fs_low_mem_mode(sbi);
+}
+
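Note: the XOR in allow_memalloc_for_decomp() encodes when the decompression buffers are set up: eagerly in f2fs_alloc_dic() (pre_alloc == true) on normal systems, lazily in f2fs_decompress_cluster() in low-memory mode. Spelled out:

    /* pre_alloc ^ low_mem:
     *   1 ^ 0 = 1  normal mode:  allocate up front, at dic creation
     *   0 ^ 1 = 1  low-mem mode: allocate late, at decompression time
     *   1 ^ 1 = 0  low-mem mode: skip the eager allocation
     *   0 ^ 0 = 0  normal mode:  nothing left to do at decompress time
     */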
+static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic,
+ bool pre_alloc)
+{
+ const struct f2fs_compress_ops *cops =
+ f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm];
+ int i;
+
+ if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc))
+ return 0;
+
+ dic->tpages = page_array_alloc(dic->inode, dic->cluster_size);
+ if (!dic->tpages)
+ return -ENOMEM;
+
+ for (i = 0; i < dic->cluster_size; i++) {
+ if (dic->rpages[i]) {
+ dic->tpages[i] = dic->rpages[i];
+ continue;
+ }
+
+ dic->tpages[i] = f2fs_compress_alloc_page();
+ if (!dic->tpages[i])
+ return -ENOMEM;
+ }
+
+ dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size);
+ if (!dic->rbuf)
+ return -ENOMEM;
+
+ dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages);
+ if (!dic->cbuf)
+ return -ENOMEM;
+
+ if (cops->init_decompress_ctx) {
+ int ret = cops->init_decompress_ctx(dic);
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic,
+ bool bypass_destroy_callback, bool pre_alloc)
+{
+ const struct f2fs_compress_ops *cops =
+ f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm];
+
+ if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc))
+ return;
+
+ if (!bypass_destroy_callback && cops->destroy_decompress_ctx)
+ cops->destroy_decompress_ctx(dic);
+
+ if (dic->cbuf)
+ vm_unmap_ram(dic->cbuf, dic->nr_cpages);
+
+ if (dic->rbuf)
+ vm_unmap_ram(dic->rbuf, dic->cluster_size);
+}
+
+static void f2fs_free_dic(struct decompress_io_ctx *dic,
+ bool bypass_destroy_callback);
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
{
struct decompress_io_ctx *dic;
pgoff_t start_idx = start_idx_of_cluster(cc);
- int i;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
+ int i, ret;
- dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO,
- false, F2FS_I_SB(cc->inode));
+ dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, false, sbi);
if (!dic)
return ERR_PTR(-ENOMEM);
@@ -1587,32 +1633,43 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->nr_rpages = cc->cluster_size;
dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages);
- if (!dic->cpages)
+ if (!dic->cpages) {
+ ret = -ENOMEM;
goto out_free;
+ }
for (i = 0; i < dic->nr_cpages; i++) {
struct page *page;
page = f2fs_compress_alloc_page();
- if (!page)
+ if (!page) {
+ ret = -ENOMEM;
goto out_free;
+ }
f2fs_set_compressed_page(page, cc->inode,
start_idx + i + 1, dic);
dic->cpages[i] = page;
}
+ ret = f2fs_prepare_decomp_mem(dic, true);
+ if (ret)
+ goto out_free;
+
return dic;
out_free:
- f2fs_free_dic(dic);
- return ERR_PTR(-ENOMEM);
+ f2fs_free_dic(dic, true);
+ return ERR_PTR(ret);
}
-static void f2fs_free_dic(struct decompress_io_ctx *dic)
+static void f2fs_free_dic(struct decompress_io_ctx *dic,
+ bool bypass_destroy_callback)
{
int i;
+ f2fs_release_decomp_mem(dic, bypass_destroy_callback, true);
+
if (dic->tpages) {
for (i = 0; i < dic->cluster_size; i++) {
if (dic->rpages[i])
@@ -1637,17 +1694,33 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic)
kmem_cache_free(dic_entry_slab, dic);
}
-static void f2fs_put_dic(struct decompress_io_ctx *dic)
+static void f2fs_late_free_dic(struct work_struct *work)
{
- if (refcount_dec_and_test(&dic->refcnt))
- f2fs_free_dic(dic);
+ struct decompress_io_ctx *dic =
+ container_of(work, struct decompress_io_ctx, free_work);
+
+ f2fs_free_dic(dic, false);
+}
+
+static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task)
+{
+ if (refcount_dec_and_test(&dic->refcnt)) {
+ if (in_task) {
+ f2fs_free_dic(dic, false);
+ } else {
+ INIT_WORK(&dic->free_work, f2fs_late_free_dic);
+ queue_work(F2FS_I_SB(dic->inode)->post_read_wq,
+ &dic->free_work);
+ }
+ }
}
/*
* Update and unlock the cluster's pagecache pages, and release the reference to
* the decompress_io_ctx that was being held for I/O completion.
*/
-static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
+static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
+ bool in_task)
{
int i;
@@ -1668,7 +1741,7 @@ static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
unlock_page(rpage);
}
- f2fs_put_dic(dic);
+ f2fs_put_dic(dic, in_task);
}
static void f2fs_verify_cluster(struct work_struct *work)
@@ -1685,14 +1758,15 @@ static void f2fs_verify_cluster(struct work_struct *work)
SetPageError(rpage);
}
- __f2fs_decompress_end_io(dic, false);
+ __f2fs_decompress_end_io(dic, false, true);
}
/*
* This is called when a compressed cluster has been decompressed
* (or failed to be read and/or decompressed).
*/
-void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
+void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
+ bool in_task)
{
if (!failed && dic->need_verity) {
/*
@@ -1704,7 +1778,7 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
INIT_WORK(&dic->verity_work, f2fs_verify_cluster);
fsverity_enqueue_verify_work(&dic->verity_work);
} else {
- __f2fs_decompress_end_io(dic, failed);
+ __f2fs_decompress_end_io(dic, failed, in_task);
}
}
@@ -1713,12 +1787,12 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
*
* This is called when the page is no longer needed and can be freed.
*/
-void f2fs_put_page_dic(struct page *page)
+void f2fs_put_page_dic(struct page *page, bool in_task)
{
struct decompress_io_ctx *dic =
(struct decompress_io_ctx *)page_private(page);
- f2fs_put_dic(dic);
+ f2fs_put_dic(dic, in_task);
}
/*
@@ -1746,7 +1820,7 @@ unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn)
}
const struct address_space_operations f2fs_compress_aops = {
- .releasepage = f2fs_release_page,
+ .release_folio = f2fs_release_folio,
.invalidate_folio = f2fs_invalidate_folio,
};
@@ -1832,45 +1906,40 @@ bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino)
{
struct address_space *mapping = sbi->compress_inode->i_mapping;
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t index = 0;
pgoff_t end = MAX_BLKADDR(sbi);
if (!mapping->nrpages)
return;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
do {
- unsigned int nr_pages;
- int i;
+ unsigned int nr, i;
- nr_pages = pagevec_lookup_range(&pvec, mapping,
- &index, end - 1);
- if (!nr_pages)
+ nr = filemap_get_folios(mapping, &index, end - 1, &fbatch);
+ if (!nr)
break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < nr; i++) {
+ struct folio *folio = fbatch.folios[i];
- if (page->index > end)
- break;
-
- lock_page(page);
- if (page->mapping != mapping) {
- unlock_page(page);
+ folio_lock(folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
continue;
}
- if (ino != get_page_private_data(page)) {
- unlock_page(page);
+ if (ino != get_page_private_data(&folio->page)) {
+ folio_unlock(folio);
continue;
}
- generic_error_remove_page(mapping, page);
- unlock_page(page);
+ generic_error_remove_page(mapping, &folio->page);
+ folio_unlock(folio);
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
} while (index < end);
}
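Note: the pagevec-to-folio_batch conversion above follows the standard walk; as a generic sketch:

    struct folio_batch fbatch;
    pgoff_t index = 0;
    unsigned int i, nr;

    folio_batch_init(&fbatch);
    while ((nr = filemap_get_folios(mapping, &index, end - 1, &fbatch))) {
    	for (i = 0; i < nr; i++) {
    		struct folio *folio = fbatch.folios[i];
    		/* folio_lock(), recheck folio->mapping, do the work ... */
    	}
    	folio_batch_release(&fbatch);	/* drops the batch's references */
    	cond_resched();
    }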
@@ -1908,6 +1977,9 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi)
dev_t dev = sbi->sb->s_bdev->bd_dev;
char slab_name[32];
+ if (!f2fs_sb_has_compression(sbi))
+ return 0;
+
sprintf(slab_name, "f2fs_page_array_entry-%u:%u", MAJOR(dev), MINOR(dev));
sbi->page_array_slab_size = sizeof(struct page *) <<
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 8e0c2e773c8d..93cc2ec51c2a 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -69,8 +69,7 @@ static bool __is_cp_guaranteed(struct page *page)
if (f2fs_is_compressed_page(page))
return false;
- if ((S_ISREG(inode->i_mode) &&
- (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) ||
+ if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) ||
page_private_gcing(page))
return true;
return false;
@@ -120,7 +119,7 @@ struct bio_post_read_ctx {
block_t fs_blkaddr;
};
-static void f2fs_finish_read_bio(struct bio *bio)
+static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
@@ -134,12 +133,13 @@ static void f2fs_finish_read_bio(struct bio *bio)
if (f2fs_is_compressed_page(page)) {
if (bio->bi_status)
- f2fs_end_read_compressed_page(page, true, 0);
- f2fs_put_page_dic(page);
+ f2fs_end_read_compressed_page(page, true, 0,
+ in_task);
+ f2fs_put_page_dic(page, in_task);
continue;
}
- /* PG_error was set if decryption or verity failed. */
+ /* PG_error was set if verity failed. */
if (bio->bi_status || PageError(page)) {
ClearPageUptodate(page);
/* will re-read again later */
@@ -185,14 +185,14 @@ static void f2fs_verify_bio(struct work_struct *work)
struct page *page = bv->bv_page;
if (!f2fs_is_compressed_page(page) &&
- !PageError(page) && !fsverity_verify_page(page))
+ !fsverity_verify_page(page))
SetPageError(page);
}
} else {
fsverity_verify_bio(bio);
}
- f2fs_finish_read_bio(bio);
+ f2fs_finish_read_bio(bio, true);
}
/*
@@ -204,7 +204,7 @@ static void f2fs_verify_bio(struct work_struct *work)
* can involve reading verity metadata pages from the file, and these verity
* metadata pages may be encrypted and/or compressed.
*/
-static void f2fs_verify_and_finish_bio(struct bio *bio)
+static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task)
{
struct bio_post_read_ctx *ctx = bio->bi_private;
@@ -212,7 +212,7 @@ static void f2fs_verify_and_finish_bio(struct bio *bio)
INIT_WORK(&ctx->work, f2fs_verify_bio);
fsverity_enqueue_verify_work(&ctx->work);
} else {
- f2fs_finish_read_bio(bio);
+ f2fs_finish_read_bio(bio, in_task);
}
}
@@ -225,7 +225,8 @@ static void f2fs_verify_and_finish_bio(struct bio *bio)
* that the bio includes at least one compressed page. The actual decompression
* is done on a per-cluster basis, not a per-bio basis.
*/
-static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx)
+static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx,
+ bool in_task)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
@@ -235,10 +236,9 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx)
bio_for_each_segment_all(bv, ctx->bio, iter_all) {
struct page *page = bv->bv_page;
- /* PG_error was set if decryption failed. */
if (f2fs_is_compressed_page(page))
- f2fs_end_read_compressed_page(page, PageError(page),
- blkaddr);
+ f2fs_end_read_compressed_page(page, false, blkaddr,
+ in_task);
else
all_compressed = false;
@@ -258,20 +258,24 @@ static void f2fs_post_read_work(struct work_struct *work)
{
struct bio_post_read_ctx *ctx =
container_of(work, struct bio_post_read_ctx, work);
+ struct bio *bio = ctx->bio;
- if (ctx->enabled_steps & STEP_DECRYPT)
- fscrypt_decrypt_bio(ctx->bio);
+ if ((ctx->enabled_steps & STEP_DECRYPT) && !fscrypt_decrypt_bio(bio)) {
+ f2fs_finish_read_bio(bio, true);
+ return;
+ }
if (ctx->enabled_steps & STEP_DECOMPRESS)
- f2fs_handle_step_decompress(ctx);
+ f2fs_handle_step_decompress(ctx, true);
- f2fs_verify_and_finish_bio(ctx->bio);
+ f2fs_verify_and_finish_bio(bio, true);
}
static void f2fs_read_end_io(struct bio *bio)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio));
struct bio_post_read_ctx *ctx;
+ bool intask = in_task();
iostat_update_and_unbind_ctx(bio, 0);
ctx = bio->bi_private;
@@ -282,16 +286,29 @@ static void f2fs_read_end_io(struct bio *bio)
}
if (bio->bi_status) {
- f2fs_finish_read_bio(bio);
+ f2fs_finish_read_bio(bio, intask);
return;
}
- if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) {
- INIT_WORK(&ctx->work, f2fs_post_read_work);
- queue_work(ctx->sbi->post_read_wq, &ctx->work);
- } else {
- f2fs_verify_and_finish_bio(bio);
+ if (ctx) {
+ unsigned int enabled_steps = ctx->enabled_steps &
+ (STEP_DECRYPT | STEP_DECOMPRESS);
+
+ /*
+ * If decompression is the only enabled post-read step, run it
+ * inline here instead of bouncing to the post-read workqueue.
+ */
+ if (enabled_steps == STEP_DECOMPRESS &&
+ !f2fs_low_mem_mode(sbi)) {
+ f2fs_handle_step_decompress(ctx, intask);
+ } else if (enabled_steps) {
+ INIT_WORK(&ctx->work, f2fs_post_read_work);
+ queue_work(ctx->sbi->post_read_wq, &ctx->work);
+ return;
+ }
}
+
+ f2fs_verify_and_finish_bio(bio, intask);
}
static void f2fs_write_end_io(struct bio *bio)
@@ -388,11 +405,23 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
return 0;
}
-static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag)
+static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio)
{
unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1;
- unsigned int fua_flag = io_flag & temp_mask;
- unsigned int meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask;
+ unsigned int fua_flag, meta_flag, io_flag;
+ blk_opf_t op_flags = 0;
+
+ if (fio->op != REQ_OP_WRITE)
+ return 0;
+ if (fio->type == DATA)
+ io_flag = fio->sbi->data_io_flag;
+ else if (fio->type == NODE)
+ io_flag = fio->sbi->node_io_flag;
+ else
+ return 0;
+
+ fua_flag = io_flag & temp_mask;
+ meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask;
/*
* data/node io flag bits per temp:
@@ -401,9 +430,10 @@ static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag)
* Cold | Warm | Hot | Cold | Warm | Hot |
*/
if ((1 << fio->temp) & meta_flag)
- fio->op_flags |= REQ_META;
+ op_flags |= REQ_META;
if ((1 << fio->temp) & fua_flag)
- fio->op_flags |= REQ_FUA;
+ op_flags |= REQ_FUA;
+ return op_flags;
}
static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
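Note: per the layout comment in the hunk, the io_flag word packs two groups of NR_TEMP_TYPE (== 3) bits: FUA bits in the low group, META bits in the high group, one bit per temperature with Hot at bit 0. A worked example with an assumed value:

    unsigned int io_flag   = 0x09;				/* 0b001001 */
    unsigned int temp_mask = (1 << 3) - 1;
    unsigned int fua_flag  = io_flag & temp_mask;		/* 0b001: Hot */
    unsigned int meta_flag = (io_flag >> 3) & temp_mask;	/* 0b001: Hot */
    /* => a HOT write gets REQ_META | REQ_FUA; WARM and COLD get neither */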
@@ -413,14 +443,10 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
sector_t sector;
struct bio *bio;
- if (fio->type == DATA)
- __attach_io_flag(fio, sbi->data_io_flag);
- else if (fio->type == NODE)
- __attach_io_flag(fio, sbi->node_io_flag);
-
bdev = f2fs_target_device(sbi, fio->new_blkaddr, &sector);
- bio = bio_alloc_bioset(bdev, npages, fio->op | fio->op_flags, GFP_NOIO,
- &f2fs_bioset);
+ bio = bio_alloc_bioset(bdev, npages,
+ fio->op | fio->op_flags | f2fs_io_flags(fio),
+ GFP_NOIO, &f2fs_bioset);
bio->bi_iter.bi_sector = sector;
if (is_read_io(fio->op)) {
bio->bi_end_io = f2fs_read_end_io;
@@ -576,6 +602,34 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode,
return false;
}
+int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi)
+{
+ int i;
+
+ for (i = 0; i < NR_PAGE_TYPE; i++) {
+ int n = (i == META) ? 1 : NR_TEMP_TYPE;
+ int j;
+
+ sbi->write_io[i] = f2fs_kmalloc(sbi,
+ array_size(n, sizeof(struct f2fs_bio_info)),
+ GFP_KERNEL);
+ if (!sbi->write_io[i])
+ return -ENOMEM;
+
+ for (j = HOT; j < n; j++) {
+ init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem);
+ sbi->write_io[i][j].sbi = sbi;
+ sbi->write_io[i][j].bio = NULL;
+ spin_lock_init(&sbi->write_io[i][j].io_lock);
+ INIT_LIST_HEAD(&sbi->write_io[i][j].io_list);
+ INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list);
+ init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock);
+ }
+ }
+
+ return 0;
+}
+
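Note: the per-type write_io arrays are allocated with array_size(), which saturates at SIZE_MAX on multiplication overflow so the allocation fails cleanly instead of silently shrinking. The idiom in isolation:

    buf = f2fs_kmalloc(sbi, array_size(n, sizeof(struct f2fs_bio_info)),
    		       GFP_KERNEL);
    if (!buf)
    	return -ENOMEM;	/* also taken when n * size would overflow */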
static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
enum page_type type, enum temp_type temp)
{
@@ -963,7 +1017,7 @@ out:
}
static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
- unsigned nr_pages, unsigned op_flag,
+ unsigned nr_pages, blk_opf_t op_flag,
pgoff_t first_idx, bool for_write)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -1011,7 +1065,8 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
/* This can handle encryption stuffs */
static int f2fs_submit_page_read(struct inode *inode, struct page *page,
- block_t blkaddr, int op_flags, bool for_write)
+ block_t blkaddr, blk_opf_t op_flags,
+ bool for_write)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
@@ -1145,7 +1200,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
}
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
- int op_flags, bool for_write)
+ blk_opf_t op_flags, bool for_write)
{
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
@@ -1645,8 +1700,6 @@ sync_out:
*/
f2fs_wait_on_block_writeback_range(inode,
map->m_pblk, map->m_len);
- invalidate_mapping_pages(META_MAPPING(sbi),
- map->m_pblk, map->m_pblk);
if (map->m_multidev_dio) {
block_t blk_addr = map->m_pblk;
@@ -2186,7 +2239,7 @@ skip_reading_dnode:
if (f2fs_load_compressed_page(sbi, page, blkaddr)) {
if (atomic_dec_and_test(&dic->remaining_pages))
- f2fs_decompress_cluster(dic);
+ f2fs_decompress_cluster(dic, true);
continue;
}
@@ -2204,7 +2257,7 @@ submit_and_realloc:
page->index, for_write);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
- f2fs_decompress_end_io(dic, ret);
+ f2fs_decompress_end_io(dic, ret, true);
f2fs_put_dnode(&dn);
*bio_ret = NULL;
return ret;
@@ -2363,8 +2416,9 @@ next_page:
return ret;
}
-static int f2fs_read_data_page(struct file *file, struct page *page)
+static int f2fs_read_data_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page_file_mapping(page)->host;
int ret = -EAGAIN;
@@ -2554,7 +2608,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
bool ipu_force = false;
int err = 0;
- set_new_dnode(&dn, inode, NULL, NULL, 0);
+ /* Use COW inode to make dnode_of_data for atomic write */
+ if (f2fs_is_atomic_file(inode))
+ set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0);
+ else
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+
if (need_inplace_update(fio) &&
f2fs_lookup_extent_cache(inode, page->index, &ei)) {
fio->old_blkaddr = ei.blk + page->index - ei.fofs;
@@ -2591,6 +2650,7 @@ got_it:
err = -EFSCORRUPTED;
goto out_writepage;
}
+
/*
* If current allocation needs SSR,
* it had better in-place writes for updated data.
@@ -2687,6 +2747,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
.submitted = false,
.compr_blocks = compr_blocks,
.need_lock = LOCK_RETRY,
+ .post_read = f2fs_post_read_required(inode),
.io_type = io_type,
.io_wbc = wbc,
.bio = bio,
@@ -2727,11 +2788,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
write:
if (f2fs_is_drop_cache(inode))
goto out;
- /* we should not write 0'th page having journal header */
- if (f2fs_is_volatile_file(inode) && (!page->index ||
- (!wbc->for_reclaim &&
- f2fs_available_free_memory(sbi, BASE_CHECK))))
- goto redirty_out;
/* Dentry/quota blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) {
@@ -2863,7 +2919,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
{
int ret = 0;
int done = 0, retry = 0;
- struct pagevec pvec;
+ struct page *pages[F2FS_ONSTACK_PAGES];
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
struct bio *bio = NULL;
sector_t last_block;
@@ -2894,8 +2950,6 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
int submitted = 0;
int i;
- pagevec_init(&pvec);
-
if (get_dirty_pages(mapping->host) <=
SM_I(F2FS_M_SB(mapping))->min_hot_blocks)
set_inode_flag(mapping->host, FI_HOT_DATA);
@@ -2921,13 +2975,13 @@ retry:
tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && !retry && (index <= end)) {
- nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
- tag);
+ nr_pages = find_get_pages_range_tag(mapping, &index, end,
+ tag, F2FS_ONSTACK_PAGES, pages);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
+ struct page *page = pages[i];
bool need_readd;
readd:
need_readd = false;
@@ -2958,6 +3012,10 @@ readd:
if (!f2fs_cluster_is_empty(&cc))
goto lock_page;
+ if (f2fs_all_cluster_page_ready(&cc,
+ pages, i, nr_pages, true))
+ goto lock_page;
+
ret2 = f2fs_prepare_compress_overwrite(
inode, &pagep,
page->index, &fsdata);
@@ -2968,8 +3026,8 @@ readd:
} else if (ret2 &&
(!f2fs_compress_write_end(inode,
fsdata, page->index, 1) ||
- !f2fs_all_cluster_page_loaded(&cc,
- &pvec, i, nr_pages))) {
+ !f2fs_all_cluster_page_ready(&cc,
+ pages, i, nr_pages, false))) {
retry = 1;
break;
}
@@ -3059,7 +3117,7 @@ next:
if (need_readd)
goto readd;
}
- pagevec_release(&pvec);
+ release_pages(pages, nr_pages);
cond_resched();
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -3304,33 +3362,119 @@ unlock_out:
return err;
}
+static int __find_data_block(struct inode *inode, pgoff_t index,
+ block_t *blk_addr)
+{
+ struct dnode_of_data dn;
+ struct page *ipage;
+ struct extent_info ei = {0, };
+ int err = 0;
+
+ ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
+
+ set_new_dnode(&dn, inode, ipage, ipage, 0);
+
+ if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ dn.data_blkaddr = ei.blk + index - ei.fofs;
+ } else {
+ /* hole case */
+ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ if (err) {
+ dn.data_blkaddr = NULL_ADDR;
+ err = 0;
+ }
+ }
+ *blk_addr = dn.data_blkaddr;
+ f2fs_put_dnode(&dn);
+ return err;
+}
+
+static int __reserve_data_block(struct inode *inode, pgoff_t index,
+ block_t *blk_addr, bool *node_changed)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct dnode_of_data dn;
+ struct page *ipage;
+ int err = 0;
+
+ f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+
+ ipage = f2fs_get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto unlock_out;
+ }
+ set_new_dnode(&dn, inode, ipage, ipage, 0);
+
+ err = f2fs_get_block(&dn, index);
+
+ *blk_addr = dn.data_blkaddr;
+ *node_changed = dn.node_changed;
+ f2fs_put_dnode(&dn);
+
+unlock_out:
+ f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+ return err;
+}
+
+static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi,
+ struct page *page, loff_t pos, unsigned int len,
+ block_t *blk_addr, bool *node_changed)
+{
+ struct inode *inode = page->mapping->host;
+ struct inode *cow_inode = F2FS_I(inode)->cow_inode;
+ pgoff_t index = page->index;
+ int err = 0;
+ block_t ori_blk_addr = NULL_ADDR;
+
+ /* If pos is beyond the end of file, reserve a new block in COW inode */
+ if ((pos & PAGE_MASK) >= i_size_read(inode))
+ goto reserve_block;
+
+ /* Look for the block in COW inode first */
+ err = __find_data_block(cow_inode, index, blk_addr);
+ if (err)
+ return err;
+ else if (*blk_addr != NULL_ADDR)
+ return 0;
+
+ /* Look for the block in the original inode */
+ err = __find_data_block(inode, index, &ori_blk_addr);
+ if (err)
+ return err;
+
+reserve_block:
+ /* Finally, we should reserve a new block in COW inode for the update */
+ err = __reserve_data_block(cow_inode, index, blk_addr, node_changed);
+ if (err)
+ return err;
+ inc_atomic_write_cnt(inode);
+
+ if (ori_blk_addr != NULL_ADDR)
+ *blk_addr = ori_blk_addr;
+ return 0;
+}
+
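Note: condensed, the block-lookup order prepare_atomic_write_begin() implements for COW-based atomic writes is:

    /* 1. pos beyond i_size            -> reserve a fresh block in the COW
     *                                    inode (nothing to copy up)
     * 2. block already in the COW file -> write to it directly
     * 3. otherwise                     -> reserve in the COW inode, but
     *    report the original block address so the existing data is read
     *    in (copied up) before a partial overwrite lands on it.
     */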
static int f2fs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page = NULL;
pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT;
- bool need_balance = false, drop_atomic = false;
+ bool need_balance = false;
block_t blkaddr = NULL_ADDR;
int err = 0;
- trace_f2fs_write_begin(inode, pos, len, flags);
+ trace_f2fs_write_begin(inode, pos, len);
if (!f2fs_is_checkpoint_ready(sbi)) {
err = -ENOSPC;
goto fail;
}
- if ((f2fs_is_atomic_file(inode) &&
- !f2fs_available_free_memory(sbi, INMEM_PAGES)) ||
- is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) {
- err = -ENOMEM;
- drop_atomic = true;
- goto fail;
- }
-
/*
* We should check this at this moment to avoid deadlock on inode page
* and #0 page. The locking rule for inline_data conversion should be:
@@ -3378,7 +3522,11 @@ repeat:
*pagep = page;
- err = prepare_write_begin(sbi, page, pos, len,
+ if (f2fs_is_atomic_file(inode))
+ err = prepare_atomic_write_begin(sbi, page, pos, len,
+ &blkaddr, &need_balance);
+ else
+ err = prepare_write_begin(sbi, page, pos, len,
&blkaddr, &need_balance);
if (err)
goto fail;
@@ -3434,8 +3582,6 @@ repeat:
fail:
f2fs_put_page(page, 1);
f2fs_write_failed(inode, pos + len);
- if (drop_atomic)
- f2fs_drop_inmem_pages_all(sbi, false);
return err;
}
@@ -3479,8 +3625,12 @@ static int f2fs_write_end(struct file *file,
set_page_dirty(page);
if (pos + copied > i_size_read(inode) &&
- !f2fs_verity_in_progress(inode))
+ !f2fs_verity_in_progress(inode)) {
f2fs_i_size_write(inode, pos + copied);
+ if (f2fs_is_atomic_file(inode))
+ f2fs_i_size_write(F2FS_I(inode)->cow_inode,
+ pos + copied);
+ }
unlock_out:
f2fs_put_page(page, 1);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
@@ -3513,34 +3663,29 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
inode->i_ino == F2FS_COMPRESS_INO(sbi))
clear_page_private_data(&folio->page);
- if (page_private_atomic(&folio->page))
- return f2fs_drop_inmem_page(inode, &folio->page);
-
folio_detach_private(folio);
}
-int f2fs_release_page(struct page *page, gfp_t wait)
+bool f2fs_release_folio(struct folio *folio, gfp_t wait)
{
- /* If this is dirty page, keep PagePrivate */
- if (PageDirty(page))
- return 0;
+ struct f2fs_sb_info *sbi;
- /* This is atomic written page, keep Private */
- if (page_private_atomic(page))
- return 0;
+ /* If this is dirty folio, keep private data */
+ if (folio_test_dirty(folio))
+ return false;
- if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) {
- struct inode *inode = page->mapping->host;
+ sbi = F2FS_M_SB(folio->mapping);
+ if (test_opt(sbi, COMPRESS_CACHE)) {
+ struct inode *inode = folio->mapping->host;
- if (inode->i_ino == F2FS_COMPRESS_INO(F2FS_I_SB(inode)))
- clear_page_private_data(page);
+ if (inode->i_ino == F2FS_COMPRESS_INO(sbi))
+ clear_page_private_data(&folio->page);
}
- clear_page_private_gcing(page);
+ clear_page_private_gcing(&folio->page);
- detach_page_private(page);
- set_page_private(page, 0);
- return 1;
+ folio_detach_private(folio);
+ return true;
}
static bool f2fs_dirty_data_folio(struct address_space *mapping,
@@ -3554,18 +3699,6 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping,
folio_mark_uptodate(folio);
BUG_ON(folio_test_swapcache(folio));
- if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
- if (!page_private_atomic(&folio->page)) {
- f2fs_register_inmem_page(inode, &folio->page);
- return true;
- }
- /*
- * Previously, this page has been registered, we just
- * return here.
- */
- return false;
- }
-
if (!folio_test_dirty(folio)) {
filemap_dirty_folio(mapping, folio);
f2fs_update_dirty_folio(inode, folio);
@@ -3639,70 +3772,6 @@ out:
return blknr;
}
-#ifdef CONFIG_MIGRATION
-#include <linux/migrate.h>
-
-int f2fs_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
-{
- int rc, extra_count;
- struct f2fs_inode_info *fi = F2FS_I(mapping->host);
- bool atomic_written = page_private_atomic(page);
-
- BUG_ON(PageWriteback(page));
-
- /* migrating an atomic written page is safe with the inmem_lock hold */
- if (atomic_written) {
- if (mode != MIGRATE_SYNC)
- return -EBUSY;
- if (!mutex_trylock(&fi->inmem_lock))
- return -EAGAIN;
- }
-
- /* one extra reference was held for atomic_write page */
- extra_count = atomic_written ? 1 : 0;
- rc = migrate_page_move_mapping(mapping, newpage,
- page, extra_count);
- if (rc != MIGRATEPAGE_SUCCESS) {
- if (atomic_written)
- mutex_unlock(&fi->inmem_lock);
- return rc;
- }
-
- if (atomic_written) {
- struct inmem_pages *cur;
-
- list_for_each_entry(cur, &fi->inmem_pages, list)
- if (cur->page == page) {
- cur->page = newpage;
- break;
- }
- mutex_unlock(&fi->inmem_lock);
- put_page(page);
- get_page(newpage);
- }
-
- /* guarantee to start from no stale private field */
- set_page_private(newpage, 0);
- if (PagePrivate(page)) {
- set_page_private(newpage, page_private(page));
- SetPagePrivate(newpage);
- get_page(newpage);
-
- set_page_private(page, 0);
- ClearPagePrivate(page);
- put_page(page);
- }
-
- if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
- else
- migrate_page_states(newpage, page);
-
- return MIGRATEPAGE_SUCCESS;
-}
-#endif
-
#ifdef CONFIG_SWAP
static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
unsigned int blkcnt)
@@ -3927,22 +3996,20 @@ static void f2fs_swap_deactivate(struct file *file)
#endif
const struct address_space_operations f2fs_dblock_aops = {
- .readpage = f2fs_read_data_page,
+ .read_folio = f2fs_read_data_folio,
.readahead = f2fs_readahead,
.writepage = f2fs_write_data_page,
.writepages = f2fs_write_data_pages,
.write_begin = f2fs_write_begin,
.write_end = f2fs_write_end,
.dirty_folio = f2fs_dirty_data_folio,
+ .migrate_folio = filemap_migrate_folio,
.invalidate_folio = f2fs_invalidate_folio,
- .releasepage = f2fs_release_page,
+ .release_folio = f2fs_release_folio,
.direct_IO = noop_direct_IO,
.bmap = f2fs_bmap,
.swap_activate = f2fs_swap_activate,
.swap_deactivate = f2fs_swap_deactivate,
-#ifdef CONFIG_MIGRATION
- .migratepage = f2fs_migrate_page,
-#endif
};
void f2fs_clear_page_cache_dirty_tag(struct page *page)
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index fcdf253cd211..c01471573977 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -39,7 +39,7 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi)
bimodal = 0;
total_vblocks = 0;
- blks_per_sec = BLKS_PER_SEC(sbi);
+ blks_per_sec = CAP_BLKS_PER_SEC(sbi);
hblks_per_sec = blks_per_sec / 2;
for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
vblocks = get_valid_blocks(sbi, segno, true);
@@ -91,11 +91,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
si->nquota_files = sbi->nquota_files;
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
- si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
si->aw_cnt = sbi->atomic_files;
- si->vw_cnt = atomic_read(&sbi->vw_cnt);
si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt);
- si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt);
si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ);
si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE);
si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA);
@@ -167,8 +164,6 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID];
si->io_skip_bggc = sbi->io_skip_bggc;
si->other_skip_bggc = sbi->other_skip_bggc;
- si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC];
- si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC];
si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
/ 2;
@@ -296,7 +291,6 @@ get_cache:
sizeof(struct nat_entry);
si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] *
sizeof(struct nat_entry_set);
- si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
for (i = 0; i < MAX_INO_ENTRY; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
si->cache_mem += atomic_read(&sbi->total_ext_tree) *
@@ -491,10 +485,6 @@ static int stat_show(struct seq_file *s, void *v)
si->bg_data_blks);
seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks,
si->bg_node_blks);
- seq_printf(s, "Skipped : atomic write %llu (%llu)\n",
- si->skipped_atomic_files[BG_GC] +
- si->skipped_atomic_files[FG_GC],
- si->skipped_atomic_files[BG_GC]);
seq_printf(s, "BG skip : IO: %u, Other: %u\n",
si->io_skip_bggc, si->other_skip_bggc);
seq_puts(s, "\nExtent Cache:\n");
@@ -519,10 +509,8 @@ static int stat_show(struct seq_file *s, void *v)
si->flush_list_empty,
si->nr_discarding, si->nr_discarded,
si->nr_discard_cmd, si->undiscard_blks);
- seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), "
- "volatile IO: %4d (Max. %4d)\n",
- si->inmem_pages, si->aw_cnt, si->max_aw_cnt,
- si->vw_cnt, si->max_vw_cnt);
+ seq_printf(s, " - atomic IO: %4d (Max. %4d)\n",
+ si->aw_cnt, si->max_aw_cnt);
seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit);
seq_printf(s, " - nodes: %4d in %4d\n",
si->ndirty_node, si->node_pages);
@@ -623,9 +611,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
for (i = META_CP; i < META_MAX; i++)
atomic_set(&sbi->meta_count[i], 0);
- atomic_set(&sbi->vw_cnt, 0);
atomic_set(&sbi->max_aw_cnt, 0);
- atomic_set(&sbi->max_vw_cnt, 0);
raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_add_tail(&si->stat_list, &f2fs_stat_list);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index a0e51937d92e..d5bd7932fb64 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -82,7 +82,8 @@ int f2fs_init_casefolded_name(const struct inode *dir,
#if IS_ENABLED(CONFIG_UNICODE)
struct super_block *sb = dir->i_sb;
- if (IS_CASEFOLDED(dir)) {
+ if (IS_CASEFOLDED(dir) &&
+ !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) {
fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab,
GFP_NOFS, false, F2FS_SB(sb));
if (!fname->cf_name.name)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index cd1e65bcf0b0..aea816a133a8 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -18,6 +18,7 @@
#include <linux/kobject.h>
#include <linux/sched.h>
#include <linux/cred.h>
+#include <linux/sched/mm.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -154,11 +155,11 @@ struct f2fs_mount_info {
int s_jquota_fmt; /* Format of quota to use */
#endif
/* For which write hints are passed down to block layer */
- int whint_mode;
int alloc_mode; /* segment allocation policy */
int fsync_mode; /* fsync policy */
int fs_mode; /* fs mode: LFS or ADAPTIVE */
int bggc_mode; /* bggc mode: off, on or sync */
+ int memory_mode; /* memory mode: normal or low */
int discard_unit; /*
* discard command's offset/size should
* be aligned to this unit: block,
@@ -229,7 +230,6 @@ enum {
#define CP_PAUSE 0x00000040
#define CP_RESIZE 0x00000080
-#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi)
#define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */
#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */
#define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */
@@ -509,11 +509,11 @@ struct f2fs_filename {
#if IS_ENABLED(CONFIG_UNICODE)
/*
* For casefolded directories: the casefolded name, but it's left NULL
- * if the original name is not valid Unicode, if the directory is both
- * casefolded and encrypted and its encryption key is unavailable, or if
- * the filesystem is doing an internal operation where usr_fname is also
- * NULL. In all these cases we fall back to treating the name as an
- * opaque byte sequence.
+ * if the original name is not valid Unicode, if the original name is
+ * "." or "..", if the directory is both casefolded and encrypted and
+ * its encryption key is unavailable, or if the filesystem is doing an
+ * internal operation where usr_fname is also NULL. In all these cases
+ * we fall back to treating the name as an opaque byte sequence.
*/
struct fscrypt_str cf_name;
#endif
@@ -579,8 +579,8 @@ enum {
/* maximum retry quota flush count */
#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8
-/* maximum retry of EIO'ed meta page */
-#define MAX_RETRY_META_PAGE_EIO 100
+/* maximum retry of EIO'ed page */
+#define MAX_RETRY_PAGE_EIO 100
#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */
@@ -598,6 +598,8 @@ enum {
#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS
#define RECOVERY_MIN_RA_BLOCKS 1
+#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */
+
struct rb_entry {
struct rb_node rb_node; /* rb node located in rb-tree */
union {
@@ -717,7 +719,6 @@ enum {
enum {
GC_FAILURE_PIN,
- GC_FAILURE_ATOMIC,
MAX_GC_FAILURE
};
@@ -739,8 +740,6 @@ enum {
FI_UPDATE_WRITE, /* inode has in-place-update data */
FI_NEED_IPU, /* used for ipu per file */
FI_ATOMIC_FILE, /* indicate atomic file */
- FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */
- FI_VOLATILE_FILE, /* indicate volatile file */
FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
FI_DROP_CACHE, /* drop dirty page cache */
FI_DATA_EXIST, /* indicate data exists */
@@ -753,7 +752,6 @@ enum {
FI_EXTRA_ATTR, /* indicate file has extra attribute */
FI_PROJ_INHERIT, /* indicate file inherits projectid */
FI_PIN_FILE, /* indicate file should not be gced */
- FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
FI_COMPRESSED_FILE, /* indicate file's data can be compressed */
FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */
@@ -761,6 +759,7 @@ enum {
FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */
FI_COMPRESS_RELEASED, /* compressed blocks were released */
FI_ALIGNED_WRITE, /* enable aligned write */
+ FI_COW_FILE, /* indicate COW file */
FI_MAX, /* max flag, never be used */
};
@@ -795,11 +794,9 @@ struct f2fs_inode_info {
#endif
struct list_head dirty_list; /* dirty list for dirs and files */
struct list_head gdirty_list; /* linked in global dirty list */
- struct list_head inmem_ilist; /* list for inmem inodes */
- struct list_head inmem_pages; /* inmemory pages managed by f2fs */
- struct task_struct *inmem_task; /* store inmemory task */
- struct mutex inmem_lock; /* lock for inmemory pages */
+ struct task_struct *atomic_write_task; /* store atomic write task */
struct extent_tree *extent_tree; /* cached extent_tree entry */
+ struct inode *cow_inode; /* copy-on-write inode for atomic write */
/* avoid racing between foreground op and gc */
struct f2fs_rwsem i_gc_rwsem[2];
@@ -818,6 +815,8 @@ struct f2fs_inode_info {
unsigned char i_compress_level; /* compress level (lz4hc,zstd) */
unsigned short i_compress_flag; /* compress flag */
unsigned int i_cluster_size; /* cluster size */
+
+ unsigned int atomic_write_cnt; /* # of blocks in the current atomic write */
};
static inline void get_extent_info(struct extent_info *ext,
@@ -1093,7 +1092,6 @@ enum count_type {
F2FS_DIRTY_QDATA,
F2FS_DIRTY_NODES,
F2FS_DIRTY_META,
- F2FS_INMEM_PAGES,
F2FS_DIRTY_IMETA,
F2FS_WB_CP_DATA,
F2FS_WB_DATA,
@@ -1118,16 +1116,12 @@ enum count_type {
*/
#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type))
enum page_type {
- DATA,
- NODE,
+ DATA = 0,
+ NODE = 1, /* should not change this */
META,
NR_PAGE_TYPE,
META_FLUSH,
- INMEM, /* the below types are used by tracepoints only. */
- INMEM_DROP,
- INMEM_INVALIDATE,
- INMEM_REVOKE,
- IPU,
+ IPU, /* the below types are used by tracepoints only. */
OPU,
};
@@ -1194,8 +1188,8 @@ struct f2fs_io_info {
nid_t ino; /* inode number */
enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
enum temp_type temp; /* contains HOT/WARM/COLD */
- int op; /* contains REQ_OP_ */
- int op_flags; /* req_flag_bits */
+ enum req_op op; /* contains REQ_OP_ */
+ blk_opf_t op_flags; /* req_flag_bits */
block_t new_blkaddr; /* new block address to be written */
block_t old_blkaddr; /* old block address before Cow */
struct page *page; /* page to be written */
@@ -1209,6 +1203,7 @@ struct f2fs_io_info {
bool retry; /* need to reallocate block address */
int compr_blocks; /* # of compressed block addresses */
bool encrypted; /* indicate file is encrypted */
+ bool post_read; /* require post read */
enum iostat_type io_type; /* io type */
struct writeback_control *io_wbc; /* writeback control */
struct bio **bio; /* bio for ipu */
@@ -1245,7 +1240,6 @@ struct f2fs_dev_info {
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int nr_blkz; /* Total number of zones */
unsigned long *blkz_seq; /* Bitmap indicating sequential zones */
- block_t *zone_capacity_blocks; /* Array of zone capacity in blks */
#endif
};
@@ -1277,6 +1271,15 @@ struct atgc_management {
unsigned long long age_threshold; /* age threshold */
};
+struct f2fs_gc_control {
+ unsigned int victim_segno; /* target victim segment number */
+ int init_gc_type; /* FG_GC or BG_GC */
+ bool no_bg_gc; /* check the space and stop bg_gc */
+ bool should_migrate_blocks; /* should migrate blocks */
+ bool err_gc_skipped; /* return EAGAIN if GC skipped */
+ unsigned int nr_free_secs; /* # of free sections to do GC */
+};
+
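The new struct replaces the four positional arguments (sync, background, force, segno) that f2fs_gc() used to take, so each call site names only the knobs it cares about. A minimal caller sketch, with values mirroring the f2fs_ioc_gc() hunk later in this patch (assumption: locking and error mapping trimmed for brevity):

    /* Sketch only: in the real call sites the caller takes sbi->gc_lock
     * first, and f2fs_gc() drops it before returning. */
    struct f2fs_gc_control gc_control = {
            .victim_segno = NULL_SEGNO,     /* let GC pick the victim */
            .init_gc_type = FG_GC,          /* urgent foreground pass */
            .no_bg_gc = false,
            .should_migrate_blocks = false,
            .err_gc_skipped = true,         /* surface skipped GC as -EAGAIN */
            .nr_free_secs = 0,
    };
    int err = f2fs_gc(sbi, &gc_control);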
/* For s_flag in struct f2fs_sb_info */
enum {
SBI_IS_DIRTY, /* dirty flag for checkpoint */
@@ -1334,12 +1337,6 @@ enum {
};
enum {
- WHINT_MODE_OFF, /* not pass down write hints */
- WHINT_MODE_USER, /* try to pass down hints given by users */
- WHINT_MODE_FS, /* pass down hints with F2FS policy */
-};
-
-enum {
ALLOC_MODE_DEFAULT, /* stay default */
ALLOC_MODE_REUSE, /* reuse segments as much as possible */
};
@@ -1368,6 +1365,13 @@ enum {
DISCARD_UNIT_SECTION, /* basic discard unit is section */
};
+enum {
+ MEMORY_MODE_NORMAL, /* memory mode for normal devices */
+ MEMORY_MODE_LOW, /* memory mode for low memory devices */
+};
+
+
+
static inline int f2fs_test_bit(unsigned int nr, char *addr);
static inline void f2fs_set_bit(unsigned int nr, char *addr);
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -1588,6 +1592,7 @@ struct decompress_io_ctx {
void *private; /* payload buffer for specified decompression algorithm */
void *private2; /* extra payload buffer */
struct work_struct verity_work; /* work to verify the decompressed pages */
+ struct work_struct free_work; /* work for late free this structure itself */
};
#define NULL_CLUSTER ((unsigned int)(~0))
@@ -1621,8 +1626,8 @@ struct f2fs_sb_info {
/* keep migration IO order for LFS mode */
struct f2fs_rwsem io_order_lock;
mempool_t *write_io_dummy; /* Dummy pages */
- pgoff_t metapage_eio_ofs; /* EIO page offset */
- int metapage_eio_cnt; /* EIO count */
+ pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */
+ int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
@@ -1672,6 +1677,7 @@ struct f2fs_sb_info {
unsigned int meta_ino_num; /* meta inode number*/
unsigned int log_blocks_per_seg; /* log2 blocks per segment */
unsigned int blocks_per_seg; /* blocks per segment */
+ unsigned int unusable_blocks_per_sec; /* unusable blocks per section */
unsigned int segs_per_sec; /* segments per section */
unsigned int secs_per_zone; /* sections per zone */
unsigned int total_sections; /* total section count */
@@ -1725,7 +1731,6 @@ struct f2fs_sb_info {
/* for skip statistic */
unsigned int atomic_files; /* # of opened atomic file */
- unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */
unsigned long long skipped_gc_rwsem; /* FG_GC only */
/* threshold for gc trials on pinned files */
@@ -1756,9 +1761,7 @@ struct f2fs_sb_info {
atomic_t inline_dir; /* # of inline_dentry inodes */
atomic_t compr_inode; /* # of compressed inodes */
atomic64_t compr_blocks; /* # of compressed blocks */
- atomic_t vw_cnt; /* # of volatile writes */
atomic_t max_aw_cnt; /* max # of atomic writes */
- atomic_t max_vw_cnt; /* max # of volatile writes */
unsigned int io_skip_bggc; /* skip background gc for in-flight IO */
unsigned int other_skip_bggc; /* skip background gc for other reasons */
unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */
@@ -1769,7 +1772,7 @@ struct f2fs_sb_info {
unsigned int data_io_flag;
unsigned int node_io_flag;
- /* For sysfs suppport */
+ /* For sysfs support */
struct kobject s_kobj; /* /sys/fs/f2fs/<devname> */
struct completion s_kobj_unregister;
@@ -1815,6 +1818,12 @@ struct f2fs_sb_info {
int max_fragment_chunk; /* max chunk size for block fragmentation mode */
int max_fragment_hole; /* max hole size for block fragmentation mode */
+ /* For atomic write statistics */
+ atomic64_t current_atomic_write;
+ s64 peak_atomic_write;
+ u64 committed_atomic_block;
+ u64 revoked_atomic_block;
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct kmem_cache *page_array_slab; /* page array entry */
unsigned int page_array_slab_size; /* default page array slab size */
@@ -2429,6 +2438,28 @@ static inline void inode_dec_dirty_pages(struct inode *inode)
dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA);
}
+static inline void inc_atomic_write_cnt(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ u64 current_write;
+
+ fi->atomic_write_cnt++;
+ atomic64_inc(&sbi->current_atomic_write);
+ current_write = atomic64_read(&sbi->current_atomic_write);
+ if (current_write > sbi->peak_atomic_write)
+ sbi->peak_atomic_write = current_write;
+}
+
+static inline void release_atomic_write_cnt(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+
+ atomic64_sub(fi->atomic_write_cnt, &sbi->current_atomic_write);
+ fi->atomic_write_cnt = 0;
+}
+
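inc_atomic_write_cnt() runs once per block that goes through the COW inode, and release_atomic_write_cnt() once when the transaction commits or aborts, so current_atomic_write tracks in-flight blocks while peak_atomic_write is a monotone high-water mark. A standalone model of that arithmetic (assumption: plain integers, single-threaded, purely illustrative):

    static long current_atomic_write, peak_atomic_write;

    static void inc_one(void)
    {
            /* raise the high-water mark only on the way up */
            if (++current_atomic_write > peak_atomic_write)
                    peak_atomic_write = current_atomic_write;
    }

    static void release_all(long atomic_write_cnt)
    {
            /* commit/abort: drop the in-flight count, keep the peak */
            current_atomic_write -= atomic_write_cnt;
    }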
static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type)
{
return atomic_read(&sbi->nr_pages[count_type]);
@@ -2612,11 +2643,17 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi, !sbi->total_valid_block_count);
- f2fs_bug_on(sbi, !sbi->total_valid_node_count);
+ if (unlikely(!sbi->total_valid_block_count ||
+ !sbi->total_valid_node_count)) {
+ f2fs_warn(sbi, "dec_valid_node_count: inconsistent block counts, total_valid_block:%u, total_valid_node:%u",
+ sbi->total_valid_block_count,
+ sbi->total_valid_node_count);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ } else {
+ sbi->total_valid_block_count--;
+ sbi->total_valid_node_count--;
+ }
- sbi->total_valid_node_count--;
- sbi->total_valid_block_count--;
if (sbi->reserved_blocks &&
sbi->current_reserved_blocks < sbi->reserved_blocks)
sbi->current_reserved_blocks++;
@@ -2661,6 +2698,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
pgoff_t index, bool for_write)
{
struct page *page;
+ unsigned int flags;
if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) {
if (!for_write)
@@ -2680,7 +2718,12 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
if (!for_write)
return grab_cache_page(mapping, index);
- return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+
+ flags = memalloc_nofs_save();
+ page = grab_cache_page_write_begin(mapping, index);
+ memalloc_nofs_restore(flags);
+
+ return page;
}
static inline struct page *f2fs_pagecache_get_page(
@@ -2695,16 +2738,6 @@ static inline struct page *f2fs_pagecache_get_page(
return pagecache_get_page(mapping, index, fgp_flags, gfp_mask);
}
-static inline void f2fs_copy_page(struct page *src, struct page *dst)
-{
- char *src_kaddr = kmap(src);
- char *dst_kaddr = kmap(dst);
-
- memcpy(dst_kaddr, src_kaddr, PAGE_SIZE);
- kunmap(dst);
- kunmap(src);
-}
-
static inline void f2fs_put_page(struct page *page, int unlock)
{
if (!page)
@@ -3173,6 +3206,10 @@ static inline int inline_xattr_size(struct inode *inode)
return 0;
}
+/*
+ * Notice: checking the inline_data flag without holding the inode page
+ * lock is unsafe; f2fs_convert_inline_page() can change it at any time.
+ */
static inline int f2fs_has_inline_data(struct inode *inode)
{
return is_inode_flag_set(inode, FI_INLINE_DATA);
@@ -3203,14 +3240,9 @@ static inline bool f2fs_is_atomic_file(struct inode *inode)
return is_inode_flag_set(inode, FI_ATOMIC_FILE);
}
-static inline bool f2fs_is_commit_atomic_write(struct inode *inode)
+static inline bool f2fs_is_cow_file(struct inode *inode)
{
- return is_inode_flag_set(inode, FI_ATOMIC_COMMIT);
-}
-
-static inline bool f2fs_is_volatile_file(struct inode *inode)
-{
- return is_inode_flag_set(inode, FI_VOLATILE_FILE);
+ return is_inode_flag_set(inode, FI_COW_FILE);
}
static inline bool f2fs_is_first_block_written(struct inode *inode)
@@ -3445,6 +3477,8 @@ void f2fs_handle_failed_inode(struct inode *inode);
int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
bool hot, bool set);
struct dentry *f2fs_get_parent(struct dentry *child);
+int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct inode **new_inode);
/*
* dir.c
@@ -3580,11 +3614,8 @@ void f2fs_destroy_node_manager_caches(void);
* segment.c
*/
bool f2fs_need_SSR(struct f2fs_sb_info *sbi);
-void f2fs_register_inmem_page(struct inode *inode, struct page *page);
-void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure);
-void f2fs_drop_inmem_pages(struct inode *inode);
-void f2fs_drop_inmem_page(struct inode *inode, struct page *page);
-int f2fs_commit_inmem_pages(struct inode *inode);
+int f2fs_commit_atomic_write(struct inode *inode);
+void f2fs_abort_atomic_write(struct inode *inode, bool clean);
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need);
void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg);
int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino);
@@ -3657,8 +3688,6 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi);
int __init f2fs_create_segment_manager_caches(void);
void f2fs_destroy_segment_manager_caches(void);
int f2fs_rw_hint_to_seg_type(enum rw_hint hint);
-enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
- enum page_type type, enum temp_type temp);
unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
unsigned int segno);
unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
@@ -3728,6 +3757,7 @@ int f2fs_init_bio_entry_cache(void);
void f2fs_destroy_bio_entry_cache(void);
void f2fs_submit_bio(struct f2fs_sb_info *sbi,
struct bio *bio, enum page_type type);
+int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi);
void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type);
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
struct inode *inode, struct page *page,
@@ -3748,7 +3778,7 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn);
int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index);
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
- int op_flags, bool for_write);
+ blk_opf_t op_flags, bool for_write);
struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index);
struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
bool for_write);
@@ -3770,11 +3800,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
int compr_blocks, bool allow_balance);
void f2fs_write_failed(struct inode *inode, loff_t to);
void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
-int f2fs_release_page(struct page *page, gfp_t wait);
-#ifdef CONFIG_MIGRATION
-int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
- struct page *page, enum migrate_mode mode);
-#endif
+bool f2fs_release_folio(struct folio *folio, gfp_t wait);
bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len);
void f2fs_clear_page_cache_dirty_tag(struct page *page);
int f2fs_init_post_read_processing(void);
@@ -3789,8 +3815,7 @@ extern const struct iomap_ops f2fs_iomap_ops;
int f2fs_start_gc_thread(struct f2fs_sb_info *sbi);
void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi);
block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode);
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force,
- unsigned int segno);
+int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control);
void f2fs_build_gc_manager(struct f2fs_sb_info *sbi);
int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count);
int __init f2fs_create_garbage_collection_cache(void);
@@ -3818,7 +3843,6 @@ struct f2fs_stat_info {
int ext_tree, zombie_tree, ext_node;
int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta;
int ndirty_data, ndirty_qdata;
- int inmem_pages;
unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all;
int nats, dirty_nats, sits, dirty_sits;
int free_nids, avail_nids, alloc_nids;
@@ -3836,7 +3860,7 @@ struct f2fs_stat_info {
int inline_xattr, inline_inode, inline_dir, append, update, orphans;
int compr_inode;
unsigned long long compr_blocks;
- int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt;
+ int aw_cnt, max_aw_cnt;
unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
unsigned int bimodal, avg_vblocks;
int util_free, util_valid, util_invalid;
@@ -3848,7 +3872,6 @@ struct f2fs_stat_info {
int bg_node_segs, bg_data_segs;
int tot_blks, data_blks, node_blks;
int bg_data_blks, bg_node_blks;
- unsigned long long skipped_atomic_files[2];
int curseg[NR_CURSEG_TYPE];
int cursec[NR_CURSEG_TYPE];
int curzone[NR_CURSEG_TYPE];
@@ -3948,17 +3971,6 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
if (cur > max) \
atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \
} while (0)
-#define stat_inc_volatile_write(inode) \
- (atomic_inc(&F2FS_I_SB(inode)->vw_cnt))
-#define stat_dec_volatile_write(inode) \
- (atomic_dec(&F2FS_I_SB(inode)->vw_cnt))
-#define stat_update_max_volatile_write(inode) \
- do { \
- int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \
- int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \
- if (cur > max) \
- atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \
- } while (0)
#define stat_inc_seg_count(sbi, type, gc_type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
@@ -4020,9 +4032,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi);
#define stat_add_compr_blocks(inode, blocks) do { } while (0)
#define stat_sub_compr_blocks(inode, blocks) do { } while (0)
#define stat_update_max_atomic_write(inode) do { } while (0)
-#define stat_inc_volatile_write(inode) do { } while (0)
-#define stat_dec_volatile_write(inode) do { } while (0)
-#define stat_update_max_volatile_write(inode) do { } while (0)
#define stat_inc_meta_count(sbi, blkaddr) do { } while (0)
#define stat_inc_seg_type(sbi, curseg) do { } while (0)
#define stat_inc_block_count(sbi, curseg) do { } while (0)
@@ -4055,6 +4064,7 @@ extern struct kmem_cache *f2fs_inode_entry_slab;
* inline.c
*/
bool f2fs_may_inline_data(struct inode *inode);
+bool f2fs_sanity_check_inline_data(struct inode *inode);
bool f2fs_may_inline_dentry(struct inode *inode);
void f2fs_do_read_inline_data(struct page *page, struct page *ipage);
void f2fs_truncate_inline_inode(struct inode *inode,
@@ -4181,13 +4191,13 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
bool f2fs_is_compress_backend_ready(struct inode *inode);
int f2fs_init_compress_mempool(void);
void f2fs_destroy_compress_mempool(void);
-void f2fs_decompress_cluster(struct decompress_io_ctx *dic);
+void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task);
void f2fs_end_read_compressed_page(struct page *page, bool failed,
- block_t blkaddr);
+ block_t blkaddr, bool in_task);
bool f2fs_cluster_is_empty(struct compress_ctx *cc);
bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
-bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec,
- int index, int nr_pages);
+bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages,
+ int index, int nr_pages, bool uptodate);
bool f2fs_sanity_check_cluster(struct dnode_of_data *dn);
void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
int f2fs_write_multi_pages(struct compress_ctx *cc,
@@ -4202,8 +4212,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
bool is_readahead, bool for_write);
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
-void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed);
-void f2fs_put_page_dic(struct page *page);
+void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
+ bool in_task);
+void f2fs_put_page_dic(struct page *page, bool in_task);
unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn);
int f2fs_init_compress_ctx(struct compress_ctx *cc);
void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse);
@@ -4249,13 +4260,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page)
}
static inline int f2fs_init_compress_mempool(void) { return 0; }
static inline void f2fs_destroy_compress_mempool(void) { }
-static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { }
+static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic,
+ bool in_task) { }
static inline void f2fs_end_read_compressed_page(struct page *page,
- bool failed, block_t blkaddr)
+ bool failed, block_t blkaddr, bool in_task)
{
WARN_ON_ONCE(1);
}
-static inline void f2fs_put_page_dic(struct page *page)
+static inline void f2fs_put_page_dic(struct page *page, bool in_task)
{
WARN_ON_ONCE(1);
}
@@ -4281,8 +4293,9 @@ static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode,
unsigned int c_len) { }
#endif
-static inline void set_compress_context(struct inode *inode)
+static inline int set_compress_context(struct inode *inode)
{
+#ifdef CONFIG_F2FS_FS_COMPRESSION
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
F2FS_I(inode)->i_compress_algorithm =
@@ -4305,6 +4318,10 @@ static inline void set_compress_context(struct inode *inode)
stat_inc_compr_inode(inode);
inc_compr_inode_stat(inode);
f2fs_mark_inode_dirty_sync(inode, true);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
}
static inline bool f2fs_disable_compressed_file(struct inode *inode)
@@ -4381,8 +4398,7 @@ static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi)
static inline bool f2fs_bdev_support_discard(struct block_device *bdev)
{
- return blk_queue_discard(bdev_get_queue(bdev)) ||
- bdev_is_zoned(bdev);
+ return bdev_max_discard_sectors(bdev) || bdev_is_zoned(bdev);
}
static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi)
@@ -4422,11 +4438,15 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
}
+static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi)
+{
+ return F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW;
+}
+
static inline bool f2fs_may_compress(struct inode *inode)
{
if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) ||
- f2fs_is_atomic_file(inode) ||
- f2fs_is_volatile_file(inode))
+ f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode))
return false;
return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode);
}
@@ -4434,8 +4454,8 @@ static inline bool f2fs_may_compress(struct inode *inode)
static inline void f2fs_i_compr_blocks_update(struct inode *inode,
u64 blocks, bool add)
{
- int diff = F2FS_I(inode)->i_cluster_size - blocks;
struct f2fs_inode_info *fi = F2FS_I(inode);
+ int diff = fi->i_cluster_size - blocks;
/* don't update i_compr_blocks if saved blocks were released */
if (!add && !atomic_read(&fi->i_compr_blocks))
@@ -4451,17 +4471,6 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode,
f2fs_mark_inode_dirty_sync(inode, true);
}
-static inline int block_unaligned_IO(struct inode *inode,
- struct kiocb *iocb, struct iov_iter *iter)
-{
- unsigned int i_blkbits = READ_ONCE(inode->i_blkbits);
- unsigned int blocksize_mask = (1 << i_blkbits) - 1;
- loff_t offset = iocb->ki_pos;
- unsigned long align = offset | iov_iter_alignment(iter);
-
- return align & blocksize_mask;
-}
-
static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
int flag)
{
@@ -4472,40 +4481,6 @@ static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
return sbi->aligned_blksize;
}
-static inline bool f2fs_force_buffered_io(struct inode *inode,
- struct kiocb *iocb, struct iov_iter *iter)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- int rw = iov_iter_rw(iter);
-
- if (!fscrypt_dio_supported(iocb, iter))
- return true;
- if (fsverity_active(inode))
- return true;
- if (f2fs_compressed_file(inode))
- return true;
-
- /* disallow direct IO if any of devices has unaligned blksize */
- if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
- return true;
- /*
- * for blkzoned device, fallback direct IO to buffered IO, so
- * all IOs can be serialized by log-structured write.
- */
- if (f2fs_sb_has_blkzoned(sbi))
- return true;
- if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
- if (block_unaligned_IO(inode, iocb, iter))
- return true;
- if (F2FS_IO_ALIGNED(sbi))
- return true;
- }
- if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
- return true;
-
- return false;
-}
-
static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
{
return fsverity_active(inode) &&
@@ -4543,6 +4518,21 @@ static inline void f2fs_io_schedule_timeout(long timeout)
io_schedule_timeout(timeout);
}
+static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs,
+ enum page_type type)
+{
+ if (unlikely(f2fs_cp_error(sbi)))
+ return;
+
+ if (ofs == sbi->page_eio_ofs[type]) {
+ if (sbi->page_eio_cnt[type]++ == MAX_RETRY_PAGE_EIO)
+ set_ckpt_flags(sbi, CP_ERROR_FLAG);
+ } else {
+ sbi->page_eio_ofs[type] = ofs;
+ sbi->page_eio_cnt[type] = 0;
+ }
+}
+
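The retry policy is per page type: repeated EIO at the same offset eventually sets CP_ERROR_FLAG and stops checkpointing, while EIO that moves to a different offset only restarts the count. A standalone model of the counter (assumption: userspace ints, one page type, illustrative only):

    #define MAX_RETRY_PAGE_EIO 100

    static long eio_ofs = -1;
    static int eio_cnt, cp_error;

    static void handle_page_eio(long ofs)
    {
            if (ofs == eio_ofs) {
                    /* the 101st consecutive failure at one offset gives up */
                    if (eio_cnt++ == MAX_RETRY_PAGE_EIO)
                            cp_error = 1;
            } else {
                    /* a different offset failed: restart the count there */
                    eio_ofs = ofs;
                    eio_cnt = 0;
            }
    }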
#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5b89af0f27f0..791770507328 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -372,7 +372,8 @@ sync_nodes:
f2fs_remove_ino_entry(sbi, ino, APPEND_INO);
clear_inode_flag(inode, FI_APPEND_WRITE);
flush_out:
- if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER)
+ if ((!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) ||
+ (atomic && !test_opt(sbi, NOBARRIER) && f2fs_sb_has_blkzoned(sbi)))
ret = f2fs_issue_flush(sbi, inode->i_ino);
if (!ret) {
f2fs_remove_ino_entry(sbi, ino, UPDATE_INO);
@@ -807,6 +808,29 @@ int f2fs_truncate(struct inode *inode)
return 0;
}
+static bool f2fs_force_buffered_io(struct inode *inode, int rw)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!fscrypt_dio_supported(inode))
+ return true;
+ if (fsverity_active(inode))
+ return true;
+ if (f2fs_compressed_file(inode))
+ return true;
+
+ /* disallow direct IO if any of devices has unaligned blksize */
+ if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
+ return true;
+
+ if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi))
+ return true;
+ if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
+ return true;
+
+ return false;
+}
+
int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
@@ -823,6 +847,24 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->btime.tv_nsec = fi->i_crtime.tv_nsec;
}
+ /*
+ * Return the DIO alignment restrictions if requested. We only return
+ * this information when requested, since on encrypted files it might
+ * take a fair bit of work to get if the file wasn't opened recently.
+ *
+ * f2fs sometimes supports DIO reads but not DIO writes. STATX_DIOALIGN
+ * cannot represent that, so in that case we report no DIO support.
+ */
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+ unsigned int bsize = i_blocksize(inode);
+
+ stat->result_mask |= STATX_DIOALIGN;
+ if (!f2fs_force_buffered_io(inode, WRITE)) {
+ stat->dio_mem_align = bsize;
+ stat->dio_offset_align = bsize;
+ }
+ }
+
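From userspace this surfaces through statx(2). A hedged probe, not part of this patch (assumptions: glibc 2.28+ for the statx() wrapper, 6.1+ kernel headers for STATX_DIOALIGN):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    int main(int argc, char **argv)
    {
            struct statx stx;
            const char *path = argc > 1 ? argv[1] : ".";

            if (statx(AT_FDCWD, path, 0, STATX_DIOALIGN, &stx) != 0) {
                    perror("statx");
                    return 1;
            }
            /* stx_mask says whether the kernel filled the DIO fields; both
             * alignments reading zero means DIO is unsupported on this file. */
            if (stx.stx_mask & STATX_DIOALIGN)
                    printf("dio_mem_align=%u dio_offset_align=%u\n",
                           stx.stx_dio_mem_align, stx.stx_dio_offset_align);
            else
                    puts("kernel does not report STATX_DIOALIGN");
            return 0;
    }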
flags = fi->i_flags;
if (flags & F2FS_COMPR_FL)
stat->attributes |= STATX_ATTR_COMPRESSED;
@@ -860,10 +902,8 @@ static void __setattr_copy(struct user_namespace *mnt_userns,
{
unsigned int ia_valid = attr->ia_valid;
- if (ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(mnt_userns, attr, inode);
+ i_gid_update(mnt_userns, attr, inode);
if (ia_valid & ATTR_ATIME)
inode->i_atime = attr->ia_atime;
if (ia_valid & ATTR_MTIME)
@@ -916,17 +956,15 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (err)
return err;
- if (is_quota_modification(inode, attr)) {
+ if (is_quota_modification(mnt_userns, inode, attr)) {
err = f2fs_dquot_initialize(inode);
if (err)
return err;
}
- if ((attr->ia_valid & ATTR_UID &&
- !uid_eq(attr->ia_uid, inode->i_uid)) ||
- (attr->ia_valid & ATTR_GID &&
- !gid_eq(attr->ia_gid, inode->i_gid))) {
+ if (i_uid_needs_update(mnt_userns, attr, inode) ||
+ i_gid_needs_update(mnt_userns, attr, inode)) {
f2fs_lock_op(F2FS_I_SB(inode));
- err = dquot_transfer(inode, attr);
+ err = dquot_transfer(mnt_userns, inode, attr);
if (err) {
set_sbi_flag(F2FS_I_SB(inode),
SBI_QUOTA_NEED_REPAIR);
@@ -937,10 +975,8 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* update uid/gid under lock_op(), so that dquot and inode can
* be updated atomically.
*/
- if (attr->ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (attr->ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(mnt_userns, attr, inode);
+ i_gid_update(mnt_userns, attr, inode);
f2fs_mark_inode_dirty_sync(inode, true);
f2fs_unlock_op(F2FS_I_SB(inode));
}
@@ -1277,7 +1313,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
f2fs_put_page(psrc, 1);
return PTR_ERR(pdst);
}
- f2fs_copy_page(psrc, pdst);
+ memcpy_page(pdst, 0, psrc, 0, PAGE_SIZE);
set_page_dirty(pdst);
f2fs_put_page(pdst, 1);
f2fs_put_page(psrc, 1);
@@ -1437,11 +1473,19 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
ret = -ENOSPC;
break;
}
- if (dn->data_blkaddr != NEW_ADDR) {
- f2fs_invalidate_blocks(sbi, dn->data_blkaddr);
- dn->data_blkaddr = NEW_ADDR;
- f2fs_set_data_blkaddr(dn);
+
+ if (dn->data_blkaddr == NEW_ADDR)
+ continue;
+
+ if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr,
+ DATA_GENERIC_ENHANCE)) {
+ ret = -EFSCORRUPTED;
+ break;
}
+
+ f2fs_invalidate_blocks(sbi, dn->data_blkaddr);
+ dn->data_blkaddr = NEW_ADDR;
+ f2fs_set_data_blkaddr(dn);
}
f2fs_update_extent_cache_range(dn, start, 0, index - start);
@@ -1638,6 +1682,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
struct f2fs_map_blocks map = { .m_next_pgofs = NULL,
.m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE,
.m_may_create = true };
+ struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO,
+ .init_gc_type = FG_GC,
+ .should_migrate_blocks = false,
+ .err_gc_skipped = true,
+ .nr_free_secs = 0 };
pgoff_t pg_start, pg_end;
loff_t new_size = i_size_read(inode);
loff_t off_end;
@@ -1667,7 +1716,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
return 0;
if (f2fs_is_pinned_file(inode)) {
- block_t sec_blks = BLKS_PER_SEC(sbi);
+ block_t sec_blks = CAP_BLKS_PER_SEC(sbi);
block_t sec_len = roundup(map.m_len, sec_blks);
map.m_len = sec_blks;
@@ -1675,8 +1724,8 @@ next_alloc:
if (has_not_enough_free_secs(sbi, 0,
GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
f2fs_down_write(&sbi->gc_lock);
- err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
- if (err && err != -ENODATA && err != -EAGAIN)
+ err = f2fs_gc(sbi, &gc_control);
+ if (err && err != -ENODATA)
goto out_err;
}
@@ -1766,6 +1815,10 @@ static long f2fs_fallocate(struct file *file, int mode,
inode_lock(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
if (offset >= inode->i_size)
goto out;
@@ -1804,16 +1857,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
atomic_read(&inode->i_writecount) != 1)
return 0;
- /* some remained atomic pages should discarded */
- if (f2fs_is_atomic_file(inode))
- f2fs_drop_inmem_pages(inode);
- if (f2fs_is_volatile_file(inode)) {
- set_inode_flag(inode, FI_DROP_CACHE);
- filemap_fdatawrite(inode->i_mapping);
- clear_inode_flag(inode, FI_DROP_CACHE);
- clear_inode_flag(inode, FI_VOLATILE_FILE);
- stat_dec_volatile_write(inode);
- }
+ f2fs_abort_atomic_write(inode, true);
return 0;
}
@@ -1827,9 +1871,8 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id)
* until all the writers close its file. Since this should be done
* before dropping the file lock, it needs to be done in ->flush.
*/
- if (f2fs_is_atomic_file(inode) &&
- F2FS_I(inode)->inmem_task == current)
- f2fs_drop_inmem_pages(inode);
+ if (F2FS_I(inode)->atomic_write_task == current)
+ f2fs_abort_atomic_write(inode, true);
return 0;
}
@@ -1863,22 +1906,15 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
if (masked_flags & F2FS_COMPR_FL) {
if (!f2fs_disable_compressed_file(inode))
return -EINVAL;
- }
- if (iflags & F2FS_NOCOMP_FL)
- return -EINVAL;
- if (iflags & F2FS_COMPR_FL) {
+ } else {
if (!f2fs_may_compress(inode))
return -EINVAL;
- if (S_ISREG(inode->i_mode) && inode->i_size)
+ if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))
return -EINVAL;
-
- set_compress_context(inode);
+ if (set_compress_context(inode))
+ return -EOPNOTSUPP;
}
}
- if ((iflags ^ masked_flags) & F2FS_NOCOMP_FL) {
- if (masked_flags & F2FS_COMPR_FL)
- return -EINVAL;
- }
fi->i_flags = iflags | (fi->i_flags & ~mask);
f2fs_bug_on(F2FS_I_SB(inode), (fi->i_flags & F2FS_COMPR_FL) &&
@@ -1992,6 +2028,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct inode *pinode;
int ret;
if (!inode_owner_or_capable(mnt_userns, inode))
@@ -2014,45 +2051,57 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
goto out;
}
- if (f2fs_is_atomic_file(inode)) {
- if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST))
- ret = -EINVAL;
+ if (f2fs_is_atomic_file(inode))
goto out;
- }
ret = f2fs_convert_inline_inode(inode);
if (ret)
goto out;
- f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
/*
* Wait for end_io so that F2FS_WB_CP_DATA is counted correctly,
* based on f2fs_is_atomic_file().
*/
if (get_dirty_pages(inode))
- f2fs_warn(F2FS_I_SB(inode), "Unexpected flush for atomic writes: ino=%lu, npages=%u",
+ f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u",
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
if (ret) {
- f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
goto out;
}
+ /* Create a COW inode for atomic write */
+ pinode = f2fs_iget(inode->i_sb, fi->i_pino);
+ if (IS_ERR(pinode)) {
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
+ ret = PTR_ERR(pinode);
+ goto out;
+ }
+
+ ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode);
+ iput(pinode);
+ if (ret) {
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
+ goto out;
+ }
+ f2fs_i_size_write(fi->cow_inode, i_size_read(inode));
+
spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- if (list_empty(&fi->inmem_ilist))
- list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]);
sbi->atomic_files++;
spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
- /* add inode in inmem_list first and set atomic_file */
set_inode_flag(inode, FI_ATOMIC_FILE);
- clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
- f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ set_inode_flag(fi->cow_inode, FI_COW_FILE);
+ clear_inode_flag(fi->cow_inode, FI_INLINE_DATA);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
- f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- F2FS_I(inode)->inmem_task = current;
+ f2fs_update_time(sbi, REQ_TIME);
+ fi->atomic_write_task = current;
stat_update_max_atomic_write(inode);
+ fi->atomic_write_cnt = 0;
out:
inode_unlock(inode);
mnt_drop_write_file(filp);
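The rework replaces the old in-memory page list with a copy-on-write file: F2FS_IOC_START_ATOMIC_WRITE creates a hidden tmpfile under the parent inode, staged writes land in that COW inode, and commit or abort replays or discards its blocks. A sketch of the lifecycle from the ioctl consumer's side (assumptions: ioctl names as defined by the f2fs headers, error handling and <sys/ioctl.h> boilerplate trimmed):

    int fd = open("db/file", O_RDWR);

    ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE);   /* COW inode created */
    pwrite(fd, buf, len, off);                /* staged via the COW inode */
    ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE);  /* replayed and synced */
    /* or, to throw the staged data away: */
    ioctl(fd, F2FS_IOC_ABORT_ATOMIC_WRITE);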
@@ -2076,99 +2125,24 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
inode_lock(inode);
- if (f2fs_is_volatile_file(inode)) {
- ret = -EINVAL;
- goto err_out;
- }
-
if (f2fs_is_atomic_file(inode)) {
- ret = f2fs_commit_inmem_pages(inode);
+ ret = f2fs_commit_atomic_write(inode);
if (ret)
- goto err_out;
+ goto unlock_out;
ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
if (!ret)
- f2fs_drop_inmem_pages(inode);
+ f2fs_abort_atomic_write(inode, false);
} else {
ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false);
}
-err_out:
- if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) {
- clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
- ret = -EINVAL;
- }
- inode_unlock(inode);
- mnt_drop_write_file(filp);
- return ret;
-}
-
-static int f2fs_ioc_start_volatile_write(struct file *filp)
-{
- struct inode *inode = file_inode(filp);
- struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
- int ret;
-
- if (!inode_owner_or_capable(mnt_userns, inode))
- return -EACCES;
-
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
-
- ret = mnt_want_write_file(filp);
- if (ret)
- return ret;
-
- inode_lock(inode);
-
- if (f2fs_is_volatile_file(inode))
- goto out;
-
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- goto out;
-
- stat_inc_volatile_write(inode);
- stat_update_max_volatile_write(inode);
-
- set_inode_flag(inode, FI_VOLATILE_FILE);
- f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
-out:
- inode_unlock(inode);
- mnt_drop_write_file(filp);
- return ret;
-}
-
-static int f2fs_ioc_release_volatile_write(struct file *filp)
-{
- struct inode *inode = file_inode(filp);
- struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
- int ret;
-
- if (!inode_owner_or_capable(mnt_userns, inode))
- return -EACCES;
-
- ret = mnt_want_write_file(filp);
- if (ret)
- return ret;
-
- inode_lock(inode);
-
- if (!f2fs_is_volatile_file(inode))
- goto out;
-
- if (!f2fs_is_first_block_written(inode)) {
- ret = truncate_partial_data_page(inode, 0, true);
- goto out;
- }
-
- ret = punch_hole(inode, 0, F2FS_BLKSIZE);
-out:
+unlock_out:
inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
}
-static int f2fs_ioc_abort_volatile_write(struct file *filp)
+static int f2fs_ioc_abort_atomic_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
@@ -2183,15 +2157,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
inode_lock(inode);
- if (f2fs_is_atomic_file(inode))
- f2fs_drop_inmem_pages(inode);
- if (f2fs_is_volatile_file(inode)) {
- clear_inode_flag(inode, FI_VOLATILE_FILE);
- stat_dec_volatile_write(inode);
- ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
- }
-
- clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
+ f2fs_abort_atomic_write(inode, true);
inode_unlock(inode);
@@ -2285,7 +2251,6 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret;
@@ -2304,7 +2269,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
return ret;
range.minlen = max((unsigned int)range.minlen,
- q->limits.discard_granularity);
+ bdev_discard_granularity(sb->s_bdev));
ret = f2fs_trim_fs(F2FS_SB(sb), &range);
mnt_drop_write_file(filp);
if (ret < 0)
@@ -2438,6 +2403,10 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO,
+ .no_bg_gc = false,
+ .should_migrate_blocks = false,
+ .nr_free_secs = 0 };
__u32 sync;
int ret;
@@ -2463,7 +2432,9 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
f2fs_down_write(&sbi->gc_lock);
}
- ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO);
+ gc_control.init_gc_type = sync ? FG_GC : BG_GC;
+ gc_control.err_gc_skipped = sync;
+ ret = f2fs_gc(sbi, &gc_control);
out:
mnt_drop_write_file(filp);
return ret;
@@ -2472,6 +2443,12 @@ out:
static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp));
+ struct f2fs_gc_control gc_control = {
+ .init_gc_type = range->sync ? FG_GC : BG_GC,
+ .no_bg_gc = false,
+ .should_migrate_blocks = false,
+ .err_gc_skipped = range->sync,
+ .nr_free_secs = 0 };
u64 end;
int ret;
@@ -2499,14 +2476,14 @@ do_more:
f2fs_down_write(&sbi->gc_lock);
}
- ret = f2fs_gc(sbi, range->sync, true, false,
- GET_SEGNO(sbi, range->start));
+ gc_control.victim_segno = GET_SEGNO(sbi, range->start);
+ ret = f2fs_gc(sbi, &gc_control);
if (ret) {
if (ret == -EBUSY)
ret = -EAGAIN;
goto out;
}
- range->start += BLKS_PER_SEC(sbi);
+ range->start += CAP_BLKS_PER_SEC(sbi);
if (range->start <= end)
goto do_more;
out:
@@ -2631,7 +2608,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
goto out;
}
- sec_num = DIV_ROUND_UP(total, BLKS_PER_SEC(sbi));
+ sec_num = DIV_ROUND_UP(total, CAP_BLKS_PER_SEC(sbi));
/*
* make sure there are enough free section for LFS allocation, this can
@@ -2675,6 +2652,7 @@ do_map:
}
set_page_dirty(page);
+ set_page_private_gcing(page);
f2fs_put_page(page, 1);
idx++;
@@ -2914,6 +2892,11 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
unsigned int start_segno = 0, end_segno = 0;
unsigned int dev_start_segno = 0, dev_end_segno = 0;
struct f2fs_flush_device range;
+ struct f2fs_gc_control gc_control = {
+ .init_gc_type = FG_GC,
+ .should_migrate_blocks = true,
+ .err_gc_skipped = true,
+ .nr_free_secs = 0 };
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -2957,7 +2940,9 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
sm->last_victim[GC_CB] = end_segno + 1;
sm->last_victim[GC_GREEDY] = end_segno + 1;
sm->last_victim[ALLOC_NEXT] = end_segno + 1;
- ret = f2fs_gc(sbi, true, true, true, start_segno);
+
+ gc_control.victim_segno = start_segno;
+ ret = f2fs_gc(sbi, &gc_control);
if (ret == -EAGAIN)
ret = 0;
else if (ret < 0)
@@ -3018,7 +3003,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
kprojid = make_kprojid(&init_user_ns, (projid_t)projid);
- if (projid_eq(kprojid, F2FS_I(inode)->i_projid))
+ if (projid_eq(kprojid, fi->i_projid))
return 0;
err = -EPERM;
@@ -3038,7 +3023,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
if (err)
goto out_unlock;
- F2FS_I(inode)->i_projid = kprojid;
+ fi->i_projid = kprojid;
inode->i_ctime = current_time(inode);
f2fs_mark_inode_dirty_sync(inode, true);
out_unlock:
@@ -3686,18 +3671,18 @@ out:
static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode,
pgoff_t off, block_t block, block_t len, u32 flags)
{
- struct request_queue *q = bdev_get_queue(bdev);
sector_t sector = SECTOR_FROM_BLOCK(block);
sector_t nr_sects = SECTOR_FROM_BLOCK(len);
int ret = 0;
- if (!q)
- return -ENXIO;
-
- if (flags & F2FS_TRIM_FILE_DISCARD)
- ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS,
- blk_queue_secure_erase(q) ?
- BLKDEV_DISCARD_SECURE : 0);
+ if (flags & F2FS_TRIM_FILE_DISCARD) {
+ if (bdev_max_secure_erase_sectors(bdev))
+ ret = blkdev_issue_secure_erase(bdev, sector, nr_sects,
+ GFP_NOFS);
+ else
+ ret = blkdev_issue_discard(bdev, sector, nr_sects,
+ GFP_NOFS);
+ }
if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) {
if (IS_ENCRYPTED(inode))
@@ -3969,10 +3954,10 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
for (i = 0; i < page_len; i++, redirty_idx++) {
page = find_lock_page(mapping, redirty_idx);
- if (!page) {
- ret = -ENOMEM;
- break;
- }
+
+ /* It will never fail, since the page was pinned above */
+ f2fs_bug_on(F2FS_I_SB(inode), !page);
+
set_page_dirty(page);
f2fs_put_page(page, 1);
f2fs_put_page(page, 0);
@@ -3988,7 +3973,7 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg)
struct f2fs_inode_info *fi = F2FS_I(inode);
pgoff_t page_idx = 0, last_idx;
unsigned int blk_per_seg = sbi->blocks_per_seg;
- int cluster_size = F2FS_I(inode)->i_cluster_size;
+ int cluster_size = fi->i_cluster_size;
int count, ret;
if (!f2fs_sb_has_compression(sbi) ||
@@ -4011,8 +3996,8 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg)
goto out;
}
- if (f2fs_is_mmap_file(inode)) {
- ret = -EBUSY;
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ ret = -EINVAL;
goto out;
}
@@ -4083,8 +4068,8 @@ static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg)
goto out;
}
- if (f2fs_is_mmap_file(inode)) {
- ret = -EBUSY;
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ ret = -EINVAL;
goto out;
}
@@ -4136,12 +4121,11 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_start_atomic_write(filp);
case F2FS_IOC_COMMIT_ATOMIC_WRITE:
return f2fs_ioc_commit_atomic_write(filp);
+ case F2FS_IOC_ABORT_ATOMIC_WRITE:
+ return f2fs_ioc_abort_atomic_write(filp);
case F2FS_IOC_START_VOLATILE_WRITE:
- return f2fs_ioc_start_volatile_write(filp);
case F2FS_IOC_RELEASE_VOLATILE_WRITE:
- return f2fs_ioc_release_volatile_write(filp);
- case F2FS_IOC_ABORT_VOLATILE_WRITE:
- return f2fs_ioc_abort_volatile_write(filp);
+ return -EOPNOTSUPP;
case F2FS_IOC_SHUTDOWN:
return f2fs_ioc_shutdown(filp, arg);
case FITRIM:
@@ -4239,7 +4223,7 @@ static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb,
if (!(iocb->ki_flags & IOCB_DIRECT))
return false;
- if (f2fs_force_buffered_io(inode, iocb, iter))
+ if (f2fs_force_buffered_io(inode, iov_iter_rw(iter)))
return false;
/*
@@ -4309,7 +4293,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
*/
inc_page_count(sbi, F2FS_DIO_READ);
dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops,
- &f2fs_iomap_dio_read_ops, 0, 0);
+ &f2fs_iomap_dio_read_ops, 0, NULL, 0);
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
if (ret != -EIOCBQUEUED)
@@ -4329,17 +4313,39 @@ out:
static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
+ const loff_t pos = iocb->ki_pos;
ssize_t ret;
if (!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
- if (f2fs_should_use_dio(inode, iocb, to))
- return f2fs_dio_read_iter(iocb, to);
+ if (trace_f2fs_dataread_start_enabled()) {
+ char *p = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL);
+ char *path;
+
+ if (!p)
+ goto skip_read_trace;
+
+ path = dentry_path_raw(file_dentry(iocb->ki_filp), p, PATH_MAX);
+ if (IS_ERR(path)) {
+ kfree(p);
+ goto skip_read_trace;
+ }
- ret = filemap_read(iocb, to, 0);
- if (ret > 0)
- f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret);
+ trace_f2fs_dataread_start(inode, pos, iov_iter_count(to),
+ current->pid, path, current->comm);
+ kfree(p);
+ }
+skip_read_trace:
+ if (f2fs_should_use_dio(inode, iocb, to)) {
+ ret = f2fs_dio_read_iter(iocb, to);
+ } else {
+ ret = filemap_read(iocb, to, 0);
+ if (ret > 0)
+ f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret);
+ }
+ if (trace_f2fs_dataread_end_enabled())
+ trace_f2fs_dataread_end(inode, pos, ret);
return ret;
}
@@ -4527,7 +4533,7 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
if (pos + count > inode->i_size)
dio_flags |= IOMAP_DIO_FORCE_WAIT;
dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops,
- &f2fs_iomap_dio_write_ops, dio_flags, 0);
+ &f2fs_iomap_dio_write_ops, dio_flags, NULL, 0);
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
if (ret == -ENOTBLK)
@@ -4631,14 +4637,36 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
/* Possibly preallocate the blocks for the write. */
target_size = iocb->ki_pos + iov_iter_count(from);
preallocated = f2fs_preallocate_blocks(iocb, from, dio);
- if (preallocated < 0)
+ if (preallocated < 0) {
ret = preallocated;
- else
+ } else {
+ if (trace_f2fs_datawrite_start_enabled()) {
+ char *p = f2fs_kmalloc(F2FS_I_SB(inode),
+ PATH_MAX, GFP_KERNEL);
+ char *path;
+
+ if (!p)
+ goto skip_write_trace;
+ path = dentry_path_raw(file_dentry(iocb->ki_filp),
+ p, PATH_MAX);
+ if (IS_ERR(path)) {
+ kfree(p);
+ goto skip_write_trace;
+ }
+ trace_f2fs_datawrite_start(inode, orig_pos, orig_count,
+ current->pid, path, current->comm);
+ kfree(p);
+ }
+skip_write_trace:
/* Do the actual write. */
ret = dio ?
f2fs_dio_write_iter(iocb, from, &may_need_sync):
f2fs_buffered_write_iter(iocb, from);
+ if (trace_f2fs_datawrite_end_enabled())
+ trace_f2fs_datawrite_end(inode, orig_pos, ret);
+ }
+
/* Don't leave any preallocated blocks around past i_size. */
if (preallocated && i_size_read(inode) < target_size) {
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -4765,7 +4793,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_COMMIT_ATOMIC_WRITE:
case F2FS_IOC_START_VOLATILE_WRITE:
case F2FS_IOC_RELEASE_VOLATILE_WRITE:
- case F2FS_IOC_ABORT_VOLATILE_WRITE:
+ case F2FS_IOC_ABORT_ATOMIC_WRITE:
case F2FS_IOC_SHUTDOWN:
case FITRIM:
case FS_IOC_SET_ENCRYPTION_POLICY:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index ea5b93b689cd..6da21d405ce1 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -35,6 +35,10 @@ static int gc_thread_func(void *data)
wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq;
unsigned int wait_ms;
+ struct f2fs_gc_control gc_control = {
+ .victim_segno = NULL_SEGNO,
+ .should_migrate_blocks = false,
+ .err_gc_skipped = false };
wait_ms = gc_th->min_sleep_time;
@@ -141,9 +145,16 @@ do_gc:
if (foreground)
sync_mode = false;
+ gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC;
+ gc_control.no_bg_gc = foreground;
+ gc_control.nr_free_secs = foreground ? 1 : 0;
+
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO))
- wait_ms = gc_th->no_gc_sleep_time;
+ if (f2fs_gc(sbi, &gc_control)) {
+ /* don't let foreground gc inflate wait_ms */
+ if (!foreground)
+ wait_ms = gc_th->no_gc_sleep_time;
+ }
if (foreground)
wake_up_all(&gc_th->fggc_wq);
@@ -479,7 +490,7 @@ static void atgc_lookup_victim(struct f2fs_sb_info *sbi,
unsigned long long age, u, accu;
unsigned long long max_mtime = sit_i->dirty_max_mtime;
unsigned long long min_mtime = sit_i->dirty_min_mtime;
- unsigned int sec_blocks = BLKS_PER_SEC(sbi);
+ unsigned int sec_blocks = CAP_BLKS_PER_SEC(sbi);
unsigned int vblocks;
unsigned int dirty_threshold = max(am->max_candidate_count,
am->candidate_ratio *
@@ -646,6 +657,54 @@ static void release_victim_entry(struct f2fs_sb_info *sbi)
f2fs_bug_on(sbi, !list_empty(&am->victim_list));
}
+static bool f2fs_pin_section(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+
+ if (!dirty_i->enable_pin_section)
+ return false;
+ if (!test_and_set_bit(secno, dirty_i->pinned_secmap))
+ dirty_i->pinned_secmap_cnt++;
+ return true;
+}
+
+static bool f2fs_pinned_section_exists(struct dirty_seglist_info *dirty_i)
+{
+ return dirty_i->pinned_secmap_cnt;
+}
+
+static bool f2fs_section_is_pinned(struct dirty_seglist_info *dirty_i,
+ unsigned int secno)
+{
+ return dirty_i->enable_pin_section &&
+ f2fs_pinned_section_exists(dirty_i) &&
+ test_bit(secno, dirty_i->pinned_secmap);
+}
+
+static void f2fs_unpin_all_sections(struct f2fs_sb_info *sbi, bool enable)
+{
+ unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
+
+ if (f2fs_pinned_section_exists(DIRTY_I(sbi))) {
+ memset(DIRTY_I(sbi)->pinned_secmap, 0, bitmap_size);
+ DIRTY_I(sbi)->pinned_secmap_cnt = 0;
+ }
+ DIRTY_I(sbi)->enable_pin_section = enable;
+}
+
+static int f2fs_gc_pinned_control(struct inode *inode, int gc_type,
+ unsigned int segno)
+{
+ if (!f2fs_is_pinned_file(inode))
+ return 0;
+ if (gc_type != FG_GC)
+ return -EBUSY;
+ if (!f2fs_pin_section(F2FS_I_SB(inode), segno))
+ f2fs_pin_file_control(inode, true);
+ return -EAGAIN;
+}
+
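Taken together, the helpers above give f2fs_gc_pinned_control() three outcomes; as a reading aid (derived from the code above, not new behavior):

    /*
     * file not pinned              -> 0       migrate the block normally
     * pinned && gc_type != FG_GC   -> -EBUSY  background GC backs off
     * pinned && FG_GC, pin ok      -> -EAGAIN section recorded in
     *                                         pinned_secmap and skipped
     * pinned && FG_GC, pin refused -> -EAGAIN f2fs_pin_file_control() may
     *                                         unpin the file instead
     */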
/*
* This function is called from two paths.
* One is garbage collection and the other is SSR segment selection.
@@ -787,6 +846,9 @@ retry:
if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
goto next;
+ if (gc_type == FG_GC && f2fs_section_is_pinned(dirty_i, secno))
+ goto next;
+
if (is_atgc) {
add_victim_entry(sbi, &p, segno);
goto next;
@@ -1194,18 +1256,9 @@ static int move_data_block(struct inode *inode, block_t bidx,
goto out;
}
- if (f2fs_is_atomic_file(inode)) {
- F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
- F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
- err = -EAGAIN;
- goto out;
- }
-
- if (f2fs_is_pinned_file(inode)) {
- f2fs_pin_file_control(inode, true);
- err = -EAGAIN;
+ err = f2fs_gc_pinned_control(inode, gc_type, segno);
+ if (err)
goto out;
- }
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
@@ -1344,18 +1397,9 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
goto out;
}
- if (f2fs_is_atomic_file(inode)) {
- F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
- F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
- err = -EAGAIN;
- goto out;
- }
- if (f2fs_is_pinned_file(inode)) {
- if (gc_type == FG_GC)
- f2fs_pin_file_control(inode, true);
- err = -EAGAIN;
+ err = f2fs_gc_pinned_control(inode, gc_type, segno);
+ if (err)
goto out;
- }
if (gc_type == BG_GC) {
if (PageWriteback(page)) {
@@ -1446,7 +1490,7 @@ next_step:
*/
if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
(!force_migrate && get_valid_blocks(sbi, segno, true) ==
- BLKS_PER_SEC(sbi)))
+ CAP_BLKS_PER_SEC(sbi)))
return submitted;
if (check_valid_map(sbi, segno, off) == 0)
@@ -1475,11 +1519,19 @@ next_step:
ofs_in_node = le16_to_cpu(entry->ofs_in_node);
if (phase == 3) {
+ int err;
+
inode = f2fs_iget(sb, dni.ino);
if (IS_ERR(inode) || is_bad_inode(inode) ||
special_file(inode->i_mode))
continue;
+ err = f2fs_gc_pinned_control(inode, gc_type, segno);
+ if (err == -EAGAIN) {
+ iput(inode);
+ return submitted;
+ }
+
if (!f2fs_down_write_trylock(
&F2FS_I(inode)->i_gc_rwsem[WRITE])) {
iput(inode);
@@ -1699,23 +1751,21 @@ skip:
return seg_freed;
}
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
- bool background, bool force, unsigned int segno)
+int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control)
{
- int gc_type = sync ? FG_GC : BG_GC;
+ int gc_type = gc_control->init_gc_type;
+ unsigned int segno = gc_control->victim_segno;
int sec_freed = 0, seg_freed = 0, total_freed = 0;
int ret = 0;
struct cp_control cpc;
- unsigned int init_segno = segno;
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
};
- unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC];
- unsigned long long first_skipped;
unsigned int skipped_round = 0, round = 0;
- trace_f2fs_gc_begin(sbi->sb, sync, background,
+ trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc,
+ gc_control->nr_free_secs,
get_pages(sbi, F2FS_DIRTY_NODES),
get_pages(sbi, F2FS_DIRTY_DENTS),
get_pages(sbi, F2FS_DIRTY_IMETA),
@@ -1726,7 +1776,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
cpc.reason = __get_cp_reason(sbi);
sbi->skipped_gc_rwsem = 0;
- first_skipped = last_skipped;
gc_more:
if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
ret = -EINVAL;
@@ -1743,8 +1792,7 @@ gc_more:
* threshold, we can make them free by checkpoint. Then, we
* secure free segments which doesn't need fggc any more.
*/
- if (prefree_segments(sbi) &&
- !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
+ if (prefree_segments(sbi)) {
ret = f2fs_write_checkpoint(sbi, &cpc);
if (ret)
goto stop;
@@ -1754,54 +1802,69 @@ gc_more:
}
/* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
- if (gc_type == BG_GC && !background) {
+ if (gc_type == BG_GC && gc_control->no_bg_gc) {
ret = -EINVAL;
goto stop;
}
+retry:
ret = __get_victim(sbi, &segno, gc_type);
- if (ret)
+ if (ret) {
+ /* allow searching for victims in sections that have pinned data */
+ if (ret == -ENODATA && gc_type == FG_GC &&
+ f2fs_pinned_section_exists(DIRTY_I(sbi))) {
+ f2fs_unpin_all_sections(sbi, false);
+ goto retry;
+ }
goto stop;
+ }
- seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force);
- if (gc_type == FG_GC &&
- seg_freed == f2fs_usable_segs_in_sec(sbi, segno))
- sec_freed++;
+ seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type,
+ gc_control->should_migrate_blocks);
total_freed += seg_freed;
- if (gc_type == FG_GC) {
- if (sbi->skipped_atomic_files[FG_GC] > last_skipped ||
- sbi->skipped_gc_rwsem)
- skipped_round++;
- last_skipped = sbi->skipped_atomic_files[FG_GC];
- round++;
- }
+ if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno))
+ sec_freed++;
if (gc_type == FG_GC)
sbi->cur_victim_sec = NULL_SEGNO;
- if (sync)
+ if (gc_control->init_gc_type == FG_GC ||
+ !has_not_enough_free_secs(sbi,
+ (gc_type == FG_GC) ? sec_freed : 0, 0)) {
+ if (gc_type == FG_GC && sec_freed < gc_control->nr_free_secs)
+ goto go_gc_more;
goto stop;
+ }
- if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
- if (skipped_round <= MAX_SKIP_GC_COUNT ||
- skipped_round * 2 < round) {
- segno = NULL_SEGNO;
- goto gc_more;
+ /* FG_GC gives up when too many rounds were skipped */
+ if (gc_type == FG_GC) {
+ if (sbi->skipped_gc_rwsem)
+ skipped_round++;
+ round++;
+ if (skipped_round > MAX_SKIP_GC_COUNT &&
+ skipped_round * 2 >= round) {
+ ret = f2fs_write_checkpoint(sbi, &cpc);
+ goto stop;
}
+ }
- if (first_skipped < last_skipped &&
- (last_skipped - first_skipped) >
- sbi->skipped_gc_rwsem) {
- f2fs_drop_inmem_pages_all(sbi, true);
- segno = NULL_SEGNO;
- goto gc_more;
- }
- if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))
- ret = f2fs_write_checkpoint(sbi, &cpc);
+ /* Write checkpoint to reclaim prefree segments */
+ if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE &&
+ prefree_segments(sbi)) {
+ ret = f2fs_write_checkpoint(sbi, &cpc);
+ if (ret)
+ goto stop;
}
+go_gc_more:
+ segno = NULL_SEGNO;
+ goto gc_more;
+
stop:
SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
- SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
+ SIT_I(sbi)->last_victim[FLUSH_DEVICE] = gc_control->victim_segno;
+
+ if (gc_type == FG_GC)
+ f2fs_unpin_all_sections(sbi, true);
trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed,
get_pages(sbi, F2FS_DIRTY_NODES),
@@ -1816,7 +1879,7 @@ stop:
put_gc_inode(&gc_list);
- if (sync && !ret)
+ if (gc_control->err_gc_skipped && !ret)
ret = sec_freed ? 0 : -EAGAIN;
return ret;
}
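
With the boolean/segno parameter list gone, every caller now states its policy in one struct f2fs_gc_control. A sketch of the new call pattern, using the field names from the hunks above (the f2fs_balance_fs() hunk later in this patch fills the struct the same way):

	struct f2fs_gc_control gc_control = {
		.victim_segno = NULL_SEGNO,	/* let __get_victim() choose */
		.init_gc_type = FG_GC,		/* reclaim synchronously */
		.no_bg_gc = false,
		.should_migrate_blocks = false,
		.err_gc_skipped = true,		/* -EAGAIN if nothing freed */
		.nr_free_secs = 1,		/* stop after one freed section */
	};
	int err = f2fs_gc(sbi, &gc_control);
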
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 3fe145e8e594..19b956c2d697 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -120,15 +120,13 @@ static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
return free_blks - ovp_blks;
}
-static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi)
+static inline block_t limit_invalid_user_blocks(block_t user_block_count)
{
- return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100;
+ return (long)(user_block_count * LIMIT_INVALID_BLOCK) / 100;
}
-static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
+static inline block_t limit_free_user_blocks(block_t reclaimable_user_blocks)
{
- block_t reclaimable_user_blocks = sbi->user_block_count -
- written_block_count(sbi);
return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
}
@@ -163,15 +161,16 @@ static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th,
static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
{
- block_t invalid_user_blocks = sbi->user_block_count -
- written_block_count(sbi);
+ block_t user_block_count = sbi->user_block_count;
+ block_t invalid_user_blocks = user_block_count -
+ written_block_count(sbi);
/*
* Background GC is triggered with the following conditions.
* 1. There are a number of invalid blocks.
* 2. There is not enough free space.
*/
- if (invalid_user_blocks > limit_invalid_user_blocks(sbi) &&
- free_user_blocks(sbi) < limit_free_user_blocks(sbi))
- return true;
- return false;
+ return (invalid_user_blocks >
+ limit_invalid_user_blocks(user_block_count) &&
+ free_user_blocks(sbi) <
+ limit_free_user_blocks(invalid_user_blocks));
}
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index 3cb1e7a24740..049ce50cec9b 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -91,7 +91,7 @@ static u32 TEA_hash_name(const u8 *p, size_t len)
/*
* Compute @fname->hash. For all directories, @fname->disk_name must be set.
* For casefolded directories, @fname->usr_fname must be set, and also
- * @fname->cf_name if the filename is valid Unicode.
+ * @fname->cf_name if the filename is valid Unicode and is not "." or "..".
*/
void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname)
{
@@ -110,10 +110,11 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname)
/*
* If the casefolded name is provided, hash it instead of the
* on-disk name. If the casefolded name is *not* provided, that
- * should only be because the name wasn't valid Unicode, so fall
- * back to treating the name as an opaque byte sequence. Note
- * that to handle encrypted directories, the fallback must use
- * usr_fname (plaintext) rather than disk_name (ciphertext).
+ * should only be because the name wasn't valid Unicode or was
+ * "." or "..", so fall back to treating the name as an opaque
+ * byte sequence. Note that to handle encrypted directories,
+ * the fallback must use usr_fname (plaintext) rather than
+ * disk_name (ciphertext).
*/
WARN_ON_ONCE(!fname->usr_fname->name);
if (fname->cf_name.name) {
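
The updated comment boils down to a single selection, sketched here with the field names used in f2fs_hash_filename() (the chosen bytes then feed the TEA hash):

	/* casefolded name if one was produced, else opaque plaintext bytes */
	const u8 *name = fname->cf_name.name ?
			 fname->cf_name.name :		/* valid Unicode, not "."/".." */
			 fname->usr_fname->name;	/* fallback; never disk_name */
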
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index a578bf83b803..bf46a7dfbea2 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -14,21 +14,40 @@
#include "node.h"
#include <trace/events/f2fs.h>
-bool f2fs_may_inline_data(struct inode *inode)
+static bool support_inline_data(struct inode *inode)
{
if (f2fs_is_atomic_file(inode))
return false;
-
if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))
return false;
-
if (i_size_read(inode) > MAX_INLINE_DATA(inode))
return false;
+ return true;
+}
- if (f2fs_post_read_required(inode))
+bool f2fs_may_inline_data(struct inode *inode)
+{
+ if (!support_inline_data(inode))
return false;
- return true;
+ return !f2fs_post_read_required(inode);
+}
+
+bool f2fs_sanity_check_inline_data(struct inode *inode)
+{
+ if (!f2fs_has_inline_data(inode))
+ return false;
+
+ if (!support_inline_data(inode))
+ return true;
+
+ /*
+ * used by sanity_check_inode(), when disk layout fields have not
+ * been synchronized to in-memory fields yet.
+ */
+ return (S_ISREG(inode->i_mode) &&
+ (file_is_encrypt(inode) || file_is_verity(inode) ||
+ (F2FS_I(inode)->i_flags & F2FS_COMPR_FL)));
}
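
f2fs_may_inline_data() answers "may this inode create inline data now", while f2fs_sanity_check_inline_data() answers "is an on-disk inode that claims inline data plausible"; the latter must test the raw i_flags because sanity_check_inode() runs before the in-memory flags are synchronized. A compact userspace model of the verdict:

	#include <stdbool.h>

	/* Illustrative only; every flag is a simplified boolean input. */
	static bool inline_data_is_corrupt(bool has_inline, bool supports_inline,
					   bool is_reg, bool encrypted,
					   bool verity, bool compressed)
	{
		if (!has_inline)
			return false;	/* nothing claimed, nothing to check */
		if (!supports_inline)
			return true;	/* e.g. atomic file or i_size too large */
		/* raw disk flags are tested: inmem flags aren't synced yet */
		return is_reg && (encrypted || verity || compressed);
	}
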
bool f2fs_may_inline_dentry(struct inode *inode)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 71f232dcf3c2..6d11c365d7b4 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -260,8 +260,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
- if (F2FS_I(inode)->extent_tree) {
- struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest;
+ if (fi->extent_tree) {
+ struct extent_info *ei = &fi->extent_tree->largest;
if (ei->len &&
(!f2fs_is_valid_blkaddr(sbi, ei->blk,
@@ -276,8 +276,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
}
}
- if (f2fs_has_inline_data(inode) &&
- (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) {
+ if (f2fs_sanity_check_inline_data(inode)) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix",
__func__, inode->i_ino, inode->i_mode);
@@ -466,10 +465,10 @@ static int do_read_inode(struct inode *inode)
}
}
- F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
- F2FS_I(inode)->i_disk_time[1] = inode->i_ctime;
- F2FS_I(inode)->i_disk_time[2] = inode->i_mtime;
- F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime;
+ fi->i_disk_time[0] = inode->i_atime;
+ fi->i_disk_time[1] = inode->i_ctime;
+ fi->i_disk_time[2] = inode->i_mtime;
+ fi->i_disk_time[3] = fi->i_crtime;
f2fs_put_page(node_page, 1);
stat_inc_inline_xattr(inode);
@@ -550,7 +549,8 @@ make_now:
}
f2fs_set_inode_flags(inode);
- if (file_should_truncate(inode)) {
+ if (file_should_truncate(inode) &&
+ !is_sbi_flag_set(sbi, SBI_POR_DOING)) {
ret = f2fs_truncate(inode);
if (ret)
goto bad_inode;
@@ -744,9 +744,7 @@ void f2fs_evict_inode(struct inode *inode)
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
int err = 0;
- /* some remained atomic pages should discarded */
- if (f2fs_is_atomic_file(inode))
- f2fs_drop_inmem_pages(inode);
+ f2fs_abort_atomic_write(inode, true);
trace_f2fs_evict_inode(inode);
truncate_inode_pages_final(&inode->i_data);
@@ -795,8 +793,22 @@ retry:
f2fs_lock_op(sbi);
err = f2fs_remove_inode_page(inode);
f2fs_unlock_op(sbi);
- if (err == -ENOENT)
+ if (err == -ENOENT) {
err = 0;
+
+ /*
+ * in a fuzzed image, another node may have the same
+ * block address as this inode's; if that node was
+ * truncated previously, truncating the inode's node
+ * will fail.
+ */
+ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) {
+ f2fs_warn(F2FS_I_SB(inode),
+ "f2fs_evict_inode: inconsistent node id, ino:%lu",
+ inode->i_ino);
+ f2fs_inode_synced(inode);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ }
+ }
}
/* give more chances, if ENOMEM case */
diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c
index be599f31d3c4..d84c5f6cc09d 100644
--- a/fs/f2fs/iostat.c
+++ b/fs/f2fs/iostat.c
@@ -91,8 +91,9 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
unsigned int cnt;
struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE];
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
+ unsigned long flags;
- spin_lock_bh(&sbi->iostat_lat_lock);
+ spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
for (idx = 0; idx < MAX_IO_TYPE; idx++) {
for (io = 0; io < NR_PAGE_TYPE; io++) {
cnt = io_lat->bio_cnt[idx][io];
@@ -106,7 +107,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
io_lat->bio_cnt[idx][io] = 0;
}
}
- spin_unlock_bh(&sbi->iostat_lat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);
trace_f2fs_iostat_latency(sbi, iostat_lat);
}
@@ -115,14 +116,15 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
{
unsigned long long iostat_diff[NR_IO_TYPE];
int i;
+ unsigned long flags;
if (time_is_after_jiffies(sbi->iostat_next_period))
return;
/* Need double check under the lock */
- spin_lock_bh(&sbi->iostat_lock);
+ spin_lock_irqsave(&sbi->iostat_lock, flags);
if (time_is_after_jiffies(sbi->iostat_next_period)) {
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lock, flags);
return;
}
sbi->iostat_next_period = jiffies +
@@ -133,7 +135,7 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
sbi->prev_rw_iostat[i];
sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
}
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lock, flags);
trace_f2fs_iostat(sbi, iostat_diff);
@@ -145,25 +147,27 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
int i;
- spin_lock_bh(&sbi->iostat_lock);
+ spin_lock_irq(&sbi->iostat_lock);
for (i = 0; i < NR_IO_TYPE; i++) {
sbi->rw_iostat[i] = 0;
sbi->prev_rw_iostat[i] = 0;
}
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irq(&sbi->iostat_lock);
- spin_lock_bh(&sbi->iostat_lat_lock);
+ spin_lock_irq(&sbi->iostat_lat_lock);
memset(io_lat, 0, sizeof(struct iostat_lat_info));
- spin_unlock_bh(&sbi->iostat_lat_lock);
+ spin_unlock_irq(&sbi->iostat_lat_lock);
}
void f2fs_update_iostat(struct f2fs_sb_info *sbi,
enum iostat_type type, unsigned long long io_bytes)
{
+ unsigned long flags;
+
if (!sbi->iostat_enable)
return;
- spin_lock_bh(&sbi->iostat_lock);
+ spin_lock_irqsave(&sbi->iostat_lock, flags);
sbi->rw_iostat[type] += io_bytes;
if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO)
@@ -172,7 +176,7 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi,
if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO)
sbi->rw_iostat[APP_READ_IO] += io_bytes;
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lock, flags);
f2fs_record_iostat(sbi);
}
@@ -185,6 +189,7 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
struct f2fs_sb_info *sbi = iostat_ctx->sbi;
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
int idx;
+ unsigned long flags;
if (!sbi->iostat_enable)
return;
@@ -202,12 +207,12 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
idx = WRITE_ASYNC_IO;
}
- spin_lock_bh(&sbi->iostat_lat_lock);
+ spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
io_lat->sum_lat[idx][iotype] += ts_diff;
io_lat->bio_cnt[idx][iotype]++;
if (ts_diff > io_lat->peak_lat[idx][iotype])
io_lat->peak_lat[idx][iotype] = ts_diff;
- spin_unlock_bh(&sbi->iostat_lat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);
}
void iostat_update_and_unbind_ctx(struct bio *bio, int rw)
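
The _bh variants only mask softirqs; the switch to irqsave/irqrestore suggests these counters can also be reached with interrupts involved (e.g. from bio completion), so the full IRQ state is saved and restored. The standard pattern, for reference:

	unsigned long flags;

	spin_lock_irqsave(&sbi->iostat_lock, flags);	/* masks local IRQs */
	/* ... update counters an IRQ-context path may also touch ... */
	spin_unlock_irqrestore(&sbi->iostat_lock, flags); /* restores state */
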
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 5ed79b29999f..bf00d5057abb 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -37,13 +37,10 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
if (!inode)
return ERR_PTR(-ENOMEM);
- f2fs_lock_op(sbi);
if (!f2fs_alloc_nid(sbi, &ino)) {
- f2fs_unlock_op(sbi);
err = -ENOSPC;
goto fail;
}
- f2fs_unlock_op(sbi);
nid_free = true;
@@ -92,8 +89,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
if (test_opt(sbi, INLINE_XATTR))
set_inode_flag(inode, FI_INLINE_XATTR);
- if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
- set_inode_flag(inode, FI_INLINE_DATA);
if (f2fs_may_inline_dentry(inode))
set_inode_flag(inode, FI_INLINE_DENTRY);
@@ -110,10 +105,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
f2fs_init_extent_tree(inode, NULL);
- stat_inc_inline_xattr(inode);
- stat_inc_inline_inode(inode);
- stat_inc_inline_dir(inode);
-
F2FS_I(inode)->i_flags =
f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED);
@@ -130,6 +121,14 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
set_compress_context(inode);
}
+ /* enable inline_data only after the compression flag has been decided */
+ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
+ set_inode_flag(inode, FI_INLINE_DATA);
+
+ stat_inc_inline_xattr(inode);
+ stat_inc_inline_inode(inode);
+ stat_inc_inline_dir(inode);
+
f2fs_set_inode_flags(inode);
trace_f2fs_new_inode(inode, 0);
@@ -328,6 +327,9 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
if (!is_extension_exist(name, ext[i], false))
continue;
+ /* Do not use inline_data with compression */
+ stat_dec_inline_inode(inode);
+ clear_inode_flag(inode, FI_INLINE_DATA);
set_compress_context(inode);
return;
}
@@ -461,6 +463,13 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
return 0;
}
+ if (!S_ISDIR(dir->i_mode)) {
+ f2fs_err(sbi, "inconsistent inode status, skip recovering inline_dots inode (ino:%lu, i_mode:%u, pino:%u)",
+ dir->i_ino, dir->i_mode, pino);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ return -ENOTDIR;
+ }
+
err = f2fs_dquot_initialize(dir);
if (err)
return err;
@@ -836,8 +845,8 @@ out:
}
static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode,
- struct inode **whiteout)
+ struct dentry *dentry, umode_t mode, bool is_whiteout,
+ struct inode **new_inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
@@ -851,7 +860,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (whiteout) {
+ if (is_whiteout) {
init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
inode->i_op = &f2fs_special_inode_operations;
} else {
@@ -876,21 +885,25 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
f2fs_add_orphan_inode(inode);
f2fs_alloc_nid_done(sbi, inode->i_ino);
- if (whiteout) {
+ if (is_whiteout) {
f2fs_i_links_write(inode, false);
spin_lock(&inode->i_lock);
inode->i_state |= I_LINKABLE;
spin_unlock(&inode->i_lock);
-
- *whiteout = inode;
} else {
- d_tmpfile(dentry, inode);
+ if (dentry)
+ d_tmpfile(dentry, inode);
+ else
+ f2fs_i_links_write(inode, false);
}
/* link_count was changed by d_tmpfile as well. */
f2fs_unlock_op(sbi);
unlock_new_inode(inode);
+ if (new_inode)
+ *new_inode = inode;
+
f2fs_balance_fs(sbi, true);
return 0;
@@ -911,7 +924,7 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, NULL);
+ return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, false, NULL);
}
static int f2fs_create_whiteout(struct user_namespace *mnt_userns,
@@ -921,7 +934,13 @@ static int f2fs_create_whiteout(struct user_namespace *mnt_userns,
return -EIO;
return __f2fs_tmpfile(mnt_userns, dir, NULL,
- S_IFCHR | WHITEOUT_MODE, whiteout);
+ S_IFCHR | WHITEOUT_MODE, true, whiteout);
+}
+
+int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct inode **new_inode)
+{
+ return __f2fs_tmpfile(mnt_userns, dir, NULL, S_IFREG, false, new_inode);
}
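
The refactor separates two things __f2fs_tmpfile() used to conflate: whether the inode is a whiteout, and whether the caller wants the inode handed back. f2fs_get_tmpfile() can now return an unlinked regular inode with no dentry attached, which the atomic-write rework elsewhere in this series presumably uses for the COW inode. A hypothetical caller:

	struct inode *cow_inode;
	int err;

	err = f2fs_get_tmpfile(mnt_userns, dir, &cow_inode);
	if (err)
		return err;
	/* i_nlink was already dropped inside __f2fs_tmpfile(); release the
	 * reference with iput() when done */
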
static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index c45d341dcf6e..e06a0c478b39 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -90,10 +90,6 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
atomic_read(&sbi->total_ext_node) *
sizeof(struct extent_node)) >> PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
- } else if (type == INMEM_PAGES) {
- /* it allows 20% / total_ram for inmemory pages */
- mem_size = get_pages(sbi, F2FS_INMEM_PAGES);
- res = mem_size < (val.totalram / 5);
} else if (type == DISCARD_CACHE) {
mem_size = (atomic_read(&dcc->discard_cmd_cnt) *
sizeof(struct discard_cmd)) >> PAGE_SHIFT;
@@ -1296,7 +1292,11 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
dec_valid_node_count(sbi, dn->inode, !ofs);
goto fail;
}
- f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR);
+ if (unlikely(new_ni.blk_addr != NULL_ADDR)) {
+ err = -EFSCORRUPTED;
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ goto fail;
+ }
#endif
new_ni.nid = dn->nid;
new_ni.ino = dn->inode->i_ino;
@@ -1331,7 +1331,7 @@ fail:
* 0: f2fs_put_page(page, 0)
* LOCKED_PAGE or error: f2fs_put_page(page, 1)
*/
-static int read_node_page(struct page *page, int op_flags)
+static int read_node_page(struct page *page, blk_opf_t op_flags)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
struct node_info ni;
@@ -1416,8 +1416,7 @@ repeat:
err = read_node_page(page, 0);
if (err < 0) {
- f2fs_put_page(page, 1);
- return ERR_PTR(err);
+ goto out_put_err;
} else if (err == LOCKED_PAGE) {
err = 0;
goto page_hit;
@@ -1443,19 +1442,23 @@ repeat:
goto out_err;
}
page_hit:
- if (unlikely(nid != nid_of_node(page))) {
- f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
+ if (likely(nid == nid_of_node(page)))
+ return page;
+
+ f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
nid, nid_of_node(page), ino_of_node(page),
ofs_of_node(page), cpver_of_node(page),
next_blkaddr_of_node(page));
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- err = -EINVAL;
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ err = -EINVAL;
out_err:
- ClearPageUptodate(page);
- f2fs_put_page(page, 1);
- return ERR_PTR(err);
- }
- return page;
+ ClearPageUptodate(page);
+out_put_err:
+ /* ENOENT returned by read_node_page() is not an error. */
+ if (err != -ENOENT)
+ f2fs_handle_page_eio(sbi, page->index, NODE);
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
}
struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
@@ -1631,7 +1634,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
goto redirty_out;
}
- if (atomic && !test_opt(sbi, NOBARRIER))
+ if (atomic && !test_opt(sbi, NOBARRIER) && !f2fs_sb_has_blkzoned(sbi))
fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
/* should add to global list before clearing PAGECACHE status */
@@ -1946,7 +1949,6 @@ next_step:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
bool submitted = false;
- bool may_dirty = true;
/* give a priority to WB_SYNC threads */
if (atomic_read(&sbi->wb_sync_req[NODE]) &&
@@ -1999,11 +2001,8 @@ continue_unlock:
}
/* flush dirty inode */
- if (IS_INODE(page) && may_dirty) {
- may_dirty = false;
- if (flush_dirty_inode(page))
- goto lock_node;
- }
+ if (IS_INODE(page) && flush_dirty_inode(page))
+ goto lock_node;
write_node:
f2fs_wait_on_page_writeback(page, NODE, true, true);
@@ -2165,10 +2164,8 @@ const struct address_space_operations f2fs_node_aops = {
.writepages = f2fs_write_node_pages,
.dirty_folio = f2fs_dirty_node_folio,
.invalidate_folio = f2fs_invalidate_folio,
- .releasepage = f2fs_release_page,
-#ifdef CONFIG_MIGRATION
- .migratepage = f2fs_migrate_page,
-#endif
+ .release_folio = f2fs_release_folio,
+ .migrate_folio = filemap_migrate_folio,
};
static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 4c1d34bfea78..3c09cae058b0 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -147,7 +147,6 @@ enum mem_type {
DIRTY_DENTS, /* indicates dirty dentry pages */
INO_ENTRIES, /* indicates inode entries */
EXTENT_CACHE, /* indicates extent cache */
- INMEM_PAGES, /* indicates inmemory pages */
DISCARD_CACHE, /* indicates memory of cached discard cmds */
COMPRESS_PAGE, /* indicates memory of cached compressed pages */
BASE_CHECK, /* check kernel status */
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 3cb7f8a43b4d..dcd0a1e35095 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -255,18 +255,18 @@ static int recover_quota_data(struct inode *inode, struct page *page)
memset(&attr, 0, sizeof(attr));
- attr.ia_uid = make_kuid(inode->i_sb->s_user_ns, i_uid);
- attr.ia_gid = make_kgid(inode->i_sb->s_user_ns, i_gid);
+ attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid));
+ attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid));
- if (!uid_eq(attr.ia_uid, inode->i_uid))
+ if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&init_user_ns, inode)))
attr.ia_valid |= ATTR_UID;
- if (!gid_eq(attr.ia_gid, inode->i_gid))
+ if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&init_user_ns, inode)))
attr.ia_valid |= ATTR_GID;
if (!attr.ia_valid)
return 0;
- err = dquot_transfer(inode, &attr);
+ err = dquot_transfer(&init_user_ns, inode, &attr);
if (err)
set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR);
return err;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 22dfeb991529..0de21f82d7bc 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -30,7 +30,7 @@
static struct kmem_cache *discard_entry_slab;
static struct kmem_cache *discard_cmd_slab;
static struct kmem_cache *sit_entry_set_slab;
-static struct kmem_cache *inmem_entry_slab;
+static struct kmem_cache *revoke_entry_slab;
static unsigned long __reverse_ulong(unsigned char *str)
{
@@ -185,301 +185,182 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi)
SM_I(sbi)->min_ssr_sections + reserved_sections(sbi));
}
-void f2fs_register_inmem_page(struct inode *inode, struct page *page)
+void f2fs_abort_atomic_write(struct inode *inode, bool clean)
{
- struct inmem_pages *new;
-
- set_page_private_atomic(page);
-
- new = f2fs_kmem_cache_alloc(inmem_entry_slab,
- GFP_NOFS, true, NULL);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
- /* add atomic page indices to the list */
- new->page = page;
- INIT_LIST_HEAD(&new->list);
+ if (!f2fs_is_atomic_file(inode))
+ return;
- /* increase reference count with clean state */
- get_page(page);
- mutex_lock(&F2FS_I(inode)->inmem_lock);
- list_add_tail(&new->list, &F2FS_I(inode)->inmem_pages);
- inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
- mutex_unlock(&F2FS_I(inode)->inmem_lock);
+ if (clean)
+ truncate_inode_pages_final(inode->i_mapping);
+ clear_inode_flag(fi->cow_inode, FI_COW_FILE);
+ iput(fi->cow_inode);
+ fi->cow_inode = NULL;
+ release_atomic_write_cnt(inode);
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
- trace_f2fs_register_inmem_page(page, INMEM);
+ spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+ sbi->atomic_files--;
+ spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
}
-static int __revoke_inmem_pages(struct inode *inode,
- struct list_head *head, bool drop, bool recover,
- bool trylock)
+static int __replace_atomic_write_block(struct inode *inode, pgoff_t index,
+ block_t new_addr, block_t *old_addr, bool recover)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct inmem_pages *cur, *tmp;
- int err = 0;
-
- list_for_each_entry_safe(cur, tmp, head, list) {
- struct page *page = cur->page;
-
- if (drop)
- trace_f2fs_commit_inmem_page(page, INMEM_DROP);
-
- if (trylock) {
- /*
- * to avoid deadlock in between page lock and
- * inmem_lock.
- */
- if (!trylock_page(page))
- continue;
- } else {
- lock_page(page);
- }
-
- f2fs_wait_on_page_writeback(page, DATA, true, true);
-
- if (recover) {
- struct dnode_of_data dn;
- struct node_info ni;
+ struct dnode_of_data dn;
+ struct node_info ni;
+ int err;
- trace_f2fs_commit_inmem_page(page, INMEM_REVOKE);
retry:
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = f2fs_get_dnode_of_data(&dn, page->index,
- LOOKUP_NODE);
- if (err) {
- if (err == -ENOMEM) {
- memalloc_retry_wait(GFP_NOFS);
- goto retry;
- }
- err = -EAGAIN;
- goto next;
- }
-
- err = f2fs_get_node_info(sbi, dn.nid, &ni, false);
- if (err) {
- f2fs_put_dnode(&dn);
- return err;
- }
-
- if (cur->old_addr == NEW_ADDR) {
- f2fs_invalidate_blocks(sbi, dn.data_blkaddr);
- f2fs_update_data_blkaddr(&dn, NEW_ADDR);
- } else
- f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
- cur->old_addr, ni.version, true, true);
- f2fs_put_dnode(&dn);
- }
-next:
- /* we don't need to invalidate this in the sccessful status */
- if (drop || recover) {
- ClearPageUptodate(page);
- clear_page_private_gcing(page);
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE_RA);
+ if (err) {
+ if (err == -ENOMEM) {
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ goto retry;
}
- detach_page_private(page);
- set_page_private(page, 0);
- f2fs_put_page(page, 1);
-
- list_del(&cur->list);
- kmem_cache_free(inmem_entry_slab, cur);
- dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+ return err;
}
- return err;
-}
-void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure)
-{
- struct list_head *head = &sbi->inode_list[ATOMIC_FILE];
- struct inode *inode;
- struct f2fs_inode_info *fi;
- unsigned int count = sbi->atomic_files;
- unsigned int looped = 0;
-next:
- spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- if (list_empty(head)) {
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
- return;
+ err = f2fs_get_node_info(sbi, dn.nid, &ni, false);
+ if (err) {
+ f2fs_put_dnode(&dn);
+ return err;
}
- fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist);
- inode = igrab(&fi->vfs_inode);
- if (inode)
- list_move_tail(&fi->inmem_ilist, head);
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
- if (inode) {
- if (gc_failure) {
- if (!fi->i_gc_failures[GC_FAILURE_ATOMIC])
- goto skip;
+ if (recover) {
+ /* dn.data_blkaddr is always valid */
+ if (!__is_valid_data_blkaddr(new_addr)) {
+ if (new_addr == NULL_ADDR)
+ dec_valid_block_count(sbi, inode, 1);
+ f2fs_invalidate_blocks(sbi, dn.data_blkaddr);
+ f2fs_update_data_blkaddr(&dn, new_addr);
+ } else {
+ f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
+ new_addr, ni.version, true, true);
}
- set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
- f2fs_drop_inmem_pages(inode);
-skip:
- iput(inode);
- }
- f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
- if (gc_failure) {
- if (++looped >= count)
- return;
- }
- goto next;
-}
-
-void f2fs_drop_inmem_pages(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_inode_info *fi = F2FS_I(inode);
+ } else {
+ blkcnt_t count = 1;
- do {
- mutex_lock(&fi->inmem_lock);
- if (list_empty(&fi->inmem_pages)) {
- fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
-
- spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
- if (!list_empty(&fi->inmem_ilist))
- list_del_init(&fi->inmem_ilist);
- if (f2fs_is_atomic_file(inode)) {
- clear_inode_flag(inode, FI_ATOMIC_FILE);
- sbi->atomic_files--;
- }
- spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+ *old_addr = dn.data_blkaddr;
+ f2fs_truncate_data_blocks_range(&dn, 1);
+ dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count);
+ inc_valid_block_count(sbi, inode, &count);
+ f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
+ ni.version, true, false);
+ }
- mutex_unlock(&fi->inmem_lock);
- break;
- }
- __revoke_inmem_pages(inode, &fi->inmem_pages,
- true, false, true);
- mutex_unlock(&fi->inmem_lock);
- } while (1);
+ f2fs_put_dnode(&dn);
+ return 0;
}
-void f2fs_drop_inmem_page(struct inode *inode, struct page *page)
+static void __complete_revoke_list(struct inode *inode, struct list_head *head,
+ bool revoke)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct list_head *head = &fi->inmem_pages;
- struct inmem_pages *cur = NULL;
-
- f2fs_bug_on(sbi, !page_private_atomic(page));
+ struct revoke_entry *cur, *tmp;
- mutex_lock(&fi->inmem_lock);
- list_for_each_entry(cur, head, list) {
- if (cur->page == page)
- break;
+ list_for_each_entry_safe(cur, tmp, head, list) {
+ if (revoke)
+ __replace_atomic_write_block(inode, cur->index,
+ cur->old_addr, NULL, true);
+ list_del(&cur->list);
+ kmem_cache_free(revoke_entry_slab, cur);
}
-
- f2fs_bug_on(sbi, list_empty(head) || cur->page != page);
- list_del(&cur->list);
- mutex_unlock(&fi->inmem_lock);
-
- dec_page_count(sbi, F2FS_INMEM_PAGES);
- kmem_cache_free(inmem_entry_slab, cur);
-
- ClearPageUptodate(page);
- clear_page_private_atomic(page);
- f2fs_put_page(page, 0);
-
- detach_page_private(page);
- set_page_private(page, 0);
-
- trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE);
}
-static int __f2fs_commit_inmem_pages(struct inode *inode)
+static int __f2fs_commit_atomic_write(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
- struct inmem_pages *cur, *tmp;
- struct f2fs_io_info fio = {
- .sbi = sbi,
- .ino = inode->i_ino,
- .type = DATA,
- .op = REQ_OP_WRITE,
- .op_flags = REQ_SYNC | REQ_PRIO,
- .io_type = FS_DATA_IO,
- };
+ struct inode *cow_inode = fi->cow_inode;
+ struct revoke_entry *new;
struct list_head revoke_list;
- bool submit_bio = false;
- int err = 0;
+ block_t blkaddr;
+ struct dnode_of_data dn;
+ pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ pgoff_t off = 0, blen, index;
+ int ret = 0, i;
INIT_LIST_HEAD(&revoke_list);
- list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
- struct page *page = cur->page;
+ while (len) {
+ blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len);
- lock_page(page);
- if (page->mapping == inode->i_mapping) {
- trace_f2fs_commit_inmem_page(page, INMEM);
+ set_new_dnode(&dn, cow_inode, NULL, NULL, 0);
+ ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA);
+ if (ret && ret != -ENOENT) {
+ goto out;
+ } else if (ret == -ENOENT) {
+ ret = 0;
+ if (dn.max_level == 0)
+ goto out;
+ goto next;
+ }
- f2fs_wait_on_page_writeback(page, DATA, true, true);
+ blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode),
+ len);
+ index = off;
+ for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) {
+ blkaddr = f2fs_data_blkaddr(&dn);
- set_page_dirty(page);
- if (clear_page_dirty_for_io(page)) {
- inode_dec_dirty_pages(inode);
- f2fs_remove_dirty_inode(inode);
+ if (!__is_valid_data_blkaddr(blkaddr)) {
+ continue;
+ } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
+ DATA_GENERIC_ENHANCE)) {
+ f2fs_put_dnode(&dn);
+ ret = -EFSCORRUPTED;
+ goto out;
}
-retry:
- fio.page = page;
- fio.old_blkaddr = NULL_ADDR;
- fio.encrypted_page = NULL;
- fio.need_lock = LOCK_DONE;
- err = f2fs_do_write_data_page(&fio);
- if (err) {
- if (err == -ENOMEM) {
- memalloc_retry_wait(GFP_NOFS);
- goto retry;
- }
- unlock_page(page);
- break;
+
+ new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS,
+ true, NULL);
+
+ ret = __replace_atomic_write_block(inode, index, blkaddr,
+ &new->old_addr, false);
+ if (ret) {
+ f2fs_put_dnode(&dn);
+ kmem_cache_free(revoke_entry_slab, new);
+ goto out;
}
- /* record old blkaddr for revoking */
- cur->old_addr = fio.old_blkaddr;
- submit_bio = true;
+
+ f2fs_update_data_blkaddr(&dn, NULL_ADDR);
+ new->index = index;
+ list_add_tail(&new->list, &revoke_list);
}
- unlock_page(page);
- list_move_tail(&cur->list, &revoke_list);
+ f2fs_put_dnode(&dn);
+next:
+ off += blen;
+ len -= blen;
}
- if (submit_bio)
- f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA);
-
- if (err) {
- /*
- * try to revoke all committed pages, but still we could fail
- * due to no memory or other reason, if that happened, EAGAIN
- * will be returned, which means in such case, transaction is
- * already not integrity, caller should use journal to do the
- * recovery or rewrite & commit last transaction. For other
- * error number, revoking was done by filesystem itself.
- */
- err = __revoke_inmem_pages(inode, &revoke_list,
- false, true, false);
+out:
+ if (ret)
+ sbi->revoked_atomic_block += fi->atomic_write_cnt;
+ else
+ sbi->committed_atomic_block += fi->atomic_write_cnt;
- /* drop all uncommitted pages */
- __revoke_inmem_pages(inode, &fi->inmem_pages,
- true, false, false);
- } else {
- __revoke_inmem_pages(inode, &revoke_list,
- false, false, false);
- }
+ __complete_revoke_list(inode, &revoke_list, ret ? true : false);
- return err;
+ return ret;
}
-int f2fs_commit_inmem_pages(struct inode *inode)
+int f2fs_commit_atomic_write(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
int err;
- f2fs_balance_fs(sbi, true);
+ err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
+ if (err)
+ return err;
f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
-
f2fs_lock_op(sbi);
- set_inode_flag(inode, FI_ATOMIC_COMMIT);
-
- mutex_lock(&fi->inmem_lock);
- err = __f2fs_commit_inmem_pages(inode);
- mutex_unlock(&fi->inmem_lock);
- clear_inode_flag(inode, FI_ATOMIC_COMMIT);
+ err = __f2fs_commit_atomic_write(inode);
f2fs_unlock_op(sbi);
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
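
The commit path above is an apply-with-undo-log: after flushing the file's dirty pages, it walks the COW inode block by block, splices each valid COW block into the original file, and records the displaced address in a revoke_entry; if anything fails midway, __complete_revoke_list() replays the log to restore the old blocks. A self-contained userspace sketch of the pattern (plain arrays stand in for block maps, 0 means "no block"):

	#include <errno.h>
	#include <stdlib.h>

	struct revoke_entry {
		struct revoke_entry *next;
		long index;
		long old_addr;
	};

	/* Apply cow[] over file[], logging displaced addresses for undo. */
	int commit_all(long *file, const long *cow, long n,
		       struct revoke_entry **log)
	{
		for (long i = 0; i < n; i++) {
			struct revoke_entry *e;

			if (cow[i] == 0)	/* no COW block at this index */
				continue;
			e = malloc(sizeof(*e));
			if (!e)
				return -ENOMEM;	/* caller must replay *log */
			e->index = i;
			e->old_addr = file[i];
			e->next = *log;
			*log = e;
			file[i] = cow[i];	/* the "replace block" step */
		}
		return 0;
	}

	/* revoke != 0: put old blocks back; either way, free the log. */
	void complete_revoke(long *file, struct revoke_entry *log, int revoke)
	{
		while (log) {
			struct revoke_entry *e = log;

			if (revoke)
				file[e->index] = e->old_addr;
			log = e->next;
			free(e);
		}
	}
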
@@ -520,8 +401,15 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
io_schedule();
finish_wait(&sbi->gc_thread->fggc_wq, &wait);
} else {
+ struct f2fs_gc_control gc_control = {
+ .victim_segno = NULL_SEGNO,
+ .init_gc_type = BG_GC,
+ .no_bg_gc = true,
+ .should_migrate_blocks = false,
+ .err_gc_skipped = false,
+ .nr_free_secs = 1 };
f2fs_down_write(&sbi->gc_lock);
- f2fs_gc(sbi, false, false, false, NULL_SEGNO);
+ f2fs_gc(sbi, &gc_control);
}
}
}
@@ -847,7 +735,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
get_valid_blocks(sbi, segno, true);
f2fs_bug_on(sbi, unlikely(!valid_blocks ||
- valid_blocks == BLKS_PER_SEC(sbi)));
+ valid_blocks == CAP_BLKS_PER_SEC(sbi)));
if (!IS_CURSEC(sbi, secno))
set_bit(secno, dirty_i->dirty_secmap);
@@ -883,7 +771,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
if (!valid_blocks ||
- valid_blocks == BLKS_PER_SEC(sbi)) {
+ valid_blocks == CAP_BLKS_PER_SEC(sbi)) {
clear_bit(secno, dirty_i->dirty_secmap);
return;
}
@@ -1196,13 +1084,12 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
unsigned int *issued)
{
struct block_device *bdev = dc->bdev;
- struct request_queue *q = bdev_get_queue(bdev);
unsigned int max_discard_blocks =
- SECTOR_TO_BLOCK(q->limits.max_discard_sectors);
+ SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev));
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
&(dcc->fstrim_list) : &(dcc->wait_list);
- int flag = dpolicy->sync ? REQ_SYNC : 0;
+ blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0;
block_t lstart, start, len, total_len;
int err = 0;
@@ -1245,7 +1132,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
err = __blkdev_issue_discard(bdev,
SECTOR_FROM_BLOCK(start),
SECTOR_FROM_BLOCK(len),
- GFP_NOFS, 0, &bio);
+ GFP_NOFS, &bio);
submit:
if (err) {
spin_lock_irqsave(&dc->lock, flags);
@@ -1375,9 +1262,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
struct discard_cmd *dc;
struct discard_info di = {0};
struct rb_node **insert_p = NULL, *insert_parent = NULL;
- struct request_queue *q = bdev_get_queue(bdev);
unsigned int max_discard_blocks =
- SECTOR_TO_BLOCK(q->limits.max_discard_sectors);
+ SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev));
block_t end = lstart + len;
dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root,
@@ -1666,33 +1552,32 @@ static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi,
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
&(dcc->fstrim_list) : &(dcc->wait_list);
- struct discard_cmd *dc, *tmp;
- bool need_wait;
+ struct discard_cmd *dc = NULL, *iter, *tmp;
unsigned int trimmed = 0;
next:
- need_wait = false;
+ dc = NULL;
mutex_lock(&dcc->cmd_lock);
- list_for_each_entry_safe(dc, tmp, wait_list, list) {
- if (dc->lstart + dc->len <= start || end <= dc->lstart)
+ list_for_each_entry_safe(iter, tmp, wait_list, list) {
+ if (iter->lstart + iter->len <= start || end <= iter->lstart)
continue;
- if (dc->len < dpolicy->granularity)
+ if (iter->len < dpolicy->granularity)
continue;
- if (dc->state == D_DONE && !dc->ref) {
- wait_for_completion_io(&dc->wait);
- if (!dc->error)
- trimmed += dc->len;
- __remove_discard_cmd(sbi, dc);
+ if (iter->state == D_DONE && !iter->ref) {
+ wait_for_completion_io(&iter->wait);
+ if (!iter->error)
+ trimmed += iter->len;
+ __remove_discard_cmd(sbi, iter);
} else {
- dc->ref++;
- need_wait = true;
+ iter->ref++;
+ dc = iter;
break;
}
}
mutex_unlock(&dcc->cmd_lock);
- if (need_wait) {
+ if (dc) {
trimmed += __wait_one_discard_bio(sbi, dc);
goto next;
}
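
Beyond the rename, this hunk fixes a list-iterator misuse: once list_for_each_entry_safe() runs to completion, the iterator no longer points at a real element, so it must not be carried out of the loop. The rewrite publishes the match through a separate pointer, the same idiom the adjust_sit_entry_set() change later in this patch applies. In outline (match()/wait_on() are placeholders):

	struct discard_cmd *dc = NULL, *iter, *tmp;

	list_for_each_entry_safe(iter, tmp, wait_list, list) {
		if (!match(iter))
			continue;
		dc = iter;	/* publish; iter is dead after the loop */
		break;
	}
	if (dc)
		wait_on(dc);	/* safe: dc is NULL or a live entry */
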
@@ -3243,101 +3128,6 @@ int f2fs_rw_hint_to_seg_type(enum rw_hint hint)
}
}
-/* This returns write hints for each segment type. This hints will be
- * passed down to block layer. There are mapping tables which depend on
- * the mount option 'whint_mode'.
- *
- * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET.
- *
- * 2) whint_mode=user-based. F2FS tries to pass down hints given by users.
- *
- * User F2FS Block
- * ---- ---- -----
- * META WRITE_LIFE_NOT_SET
- * HOT_NODE "
- * WARM_NODE "
- * COLD_NODE "
- * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
- * extension list " "
- *
- * -- buffered io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE " "
- * WRITE_LIFE_MEDIUM " "
- * WRITE_LIFE_LONG " "
- *
- * -- direct io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE " WRITE_LIFE_NONE
- * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
- * WRITE_LIFE_LONG " WRITE_LIFE_LONG
- *
- * 3) whint_mode=fs-based. F2FS passes down hints with its policy.
- *
- * User F2FS Block
- * ---- ---- -----
- * META WRITE_LIFE_MEDIUM;
- * HOT_NODE WRITE_LIFE_NOT_SET
- * WARM_NODE "
- * COLD_NODE WRITE_LIFE_NONE
- * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
- * extension list " "
- *
- * -- buffered io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG
- * WRITE_LIFE_NONE " "
- * WRITE_LIFE_MEDIUM " "
- * WRITE_LIFE_LONG " "
- *
- * -- direct io
- * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
- * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
- * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
- * WRITE_LIFE_NONE " WRITE_LIFE_NONE
- * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
- * WRITE_LIFE_LONG " WRITE_LIFE_LONG
- */
-
-enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
- enum page_type type, enum temp_type temp)
-{
- if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) {
- if (type == DATA) {
- if (temp == WARM)
- return WRITE_LIFE_NOT_SET;
- else if (temp == HOT)
- return WRITE_LIFE_SHORT;
- else if (temp == COLD)
- return WRITE_LIFE_EXTREME;
- } else {
- return WRITE_LIFE_NOT_SET;
- }
- } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) {
- if (type == DATA) {
- if (temp == WARM)
- return WRITE_LIFE_LONG;
- else if (temp == HOT)
- return WRITE_LIFE_SHORT;
- else if (temp == COLD)
- return WRITE_LIFE_EXTREME;
- } else if (type == NODE) {
- if (temp == WARM || temp == HOT)
- return WRITE_LIFE_NOT_SET;
- else if (temp == COLD)
- return WRITE_LIFE_NONE;
- } else if (type == META) {
- return WRITE_LIFE_MEDIUM;
- }
- }
- return WRITE_LIFE_NOT_SET;
-}
-
static int __get_segment_type_2(struct f2fs_io_info *fio)
{
if (fio->type == DATA)
@@ -3383,8 +3173,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
return CURSEG_COLD_DATA;
if (file_is_hot(inode) ||
is_inode_flag_set(inode, FI_HOT_DATA) ||
- f2fs_is_atomic_file(inode) ||
- f2fs_is_volatile_file(inode))
+ f2fs_is_cow_file(inode))
return CURSEG_HOT_DATA;
return f2fs_rw_hint_to_seg_type(inode->i_write_hint);
} else {
@@ -3651,7 +3440,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
goto drop_bio;
}
- invalidate_mapping_pages(META_MAPPING(sbi),
+ if (fio->post_read)
+ invalidate_mapping_pages(META_MAPPING(sbi),
fio->new_blkaddr, fio->new_blkaddr);
stat_inc_inplace_blocks(fio->sbi);
@@ -3834,10 +3624,16 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr)
void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
block_t len)
{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
block_t i;
+ if (!f2fs_post_read_required(inode))
+ return;
+
for (i = 0; i < len; i++)
f2fs_wait_on_block_writeback(inode, blkaddr + i);
+
+ invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr + len - 1);
}
static int read_compacted_summaries(struct f2fs_sb_info *sbi)
@@ -4181,10 +3977,12 @@ static void adjust_sit_entry_set(struct sit_entry_set *ses,
return;
list_for_each_entry_continue(next, head, set_list)
- if (ses->entry_cnt <= next->entry_cnt)
- break;
+ if (ses->entry_cnt <= next->entry_cnt) {
+ list_move_tail(&ses->set_list, &next->set_list);
+ return;
+ }
- list_move_tail(&ses->set_list, &next->set_list);
+ list_move_tail(&ses->set_list, head);
}
static void add_sit_entry(unsigned int segno, struct list_head *head)
@@ -4552,7 +4350,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
unsigned int i, start, end;
unsigned int readed, start_blk = 0;
int err = 0;
- block_t total_node_blocks = 0;
+ block_t sit_valid_blocks[2] = {0, 0};
do {
readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS,
@@ -4577,8 +4375,14 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
if (err)
return err;
seg_info_from_raw_sit(se, &sit);
- if (IS_NODESEG(se->type))
- total_node_blocks += se->valid_blocks;
+
+ if (se->type >= NR_PERSISTENT_LOG) {
+ f2fs_err(sbi, "Invalid segment type: %u, segno: %u",
+ se->type, start);
+ return -EFSCORRUPTED;
+ }
+
+ sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks;
if (f2fs_block_unit_discard(sbi)) {
/* build discard map only one time */
@@ -4618,15 +4422,22 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
sit = sit_in_journal(journal, i);
old_valid_blocks = se->valid_blocks;
- if (IS_NODESEG(se->type))
- total_node_blocks -= old_valid_blocks;
+
+ sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks;
err = check_block_count(sbi, start, &sit);
if (err)
break;
seg_info_from_raw_sit(se, &sit);
- if (IS_NODESEG(se->type))
- total_node_blocks += se->valid_blocks;
+
+ if (se->type >= NR_PERSISTENT_LOG) {
+ f2fs_err(sbi, "Invalid segment type: %u, segno: %u",
+ se->type, start);
+ err = -EFSCORRUPTED;
+ break;
+ }
+
+ sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks;
if (f2fs_block_unit_discard(sbi)) {
if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
@@ -4648,13 +4459,24 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
}
up_read(&curseg->journal_rwsem);
- if (!err && total_node_blocks != valid_node_count(sbi)) {
+ if (err)
+ return err;
+
+ if (sit_valid_blocks[NODE] != valid_node_count(sbi)) {
f2fs_err(sbi, "SIT is corrupted node# %u vs %u",
- total_node_blocks, valid_node_count(sbi));
- err = -EFSCORRUPTED;
+ sit_valid_blocks[NODE], valid_node_count(sbi));
+ return -EFSCORRUPTED;
}
- return err;
+ if (sit_valid_blocks[DATA] + sit_valid_blocks[NODE] >
+ valid_user_blocks(sbi)) {
+ f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u",
+ sit_valid_blocks[DATA], sit_valid_blocks[NODE],
+ valid_user_blocks(sbi));
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
}
static void init_free_segmap(struct f2fs_sb_info *sbi)
@@ -4688,7 +4510,6 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int segno = 0, offset = 0, secno;
block_t valid_blocks, usable_blks_in_seg;
- block_t blks_per_sec = BLKS_PER_SEC(sbi);
while (1) {
/* find dirty segment based on free segmap */
@@ -4717,7 +4538,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
valid_blocks = get_valid_blocks(sbi, segno, true);
secno = GET_SEC_FROM_SEG(sbi, segno);
- if (!valid_blocks || valid_blocks == blks_per_sec)
+ if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi))
continue;
if (IS_CURSEC(sbi, secno))
continue;
@@ -4734,6 +4555,13 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi)
dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
if (!dirty_i->victim_secmap)
return -ENOMEM;
+
+ dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
+ if (!dirty_i->pinned_secmap)
+ return -ENOMEM;
+
+ dirty_i->pinned_secmap_cnt = 0;
+ dirty_i->enable_pin_section = true;
return 0;
}
@@ -5093,7 +4921,7 @@ static unsigned int get_zone_idx(struct f2fs_sb_info *sbi, unsigned int secno,
static inline unsigned int f2fs_usable_zone_segs_in_sec(
struct f2fs_sb_info *sbi, unsigned int segno)
{
- unsigned int dev_idx, zone_idx, unusable_segs_in_sec;
+ unsigned int dev_idx, zone_idx;
dev_idx = f2fs_target_device_index(sbi, START_BLOCK(sbi, segno));
zone_idx = get_zone_idx(sbi, GET_SEC_FROM_SEG(sbi, segno), dev_idx);
@@ -5102,18 +4930,12 @@ static inline unsigned int f2fs_usable_zone_segs_in_sec(
if (is_conv_zone(sbi, zone_idx, dev_idx))
return sbi->segs_per_sec;
- /*
- * If the zone_capacity_blocks array is NULL, then zone capacity
- * is equal to the zone size for all zones
- */
- if (!FDEV(dev_idx).zone_capacity_blocks)
+ if (!sbi->unusable_blocks_per_sec)
return sbi->segs_per_sec;
/* Get the segment count beyond zone capacity block */
- unusable_segs_in_sec = (sbi->blocks_per_blkz -
- FDEV(dev_idx).zone_capacity_blocks[zone_idx]) >>
- sbi->log_blocks_per_seg;
- return sbi->segs_per_sec - unusable_segs_in_sec;
+ return sbi->segs_per_sec - (sbi->unusable_blocks_per_sec >>
+ sbi->log_blocks_per_seg);
}
/*
@@ -5142,12 +4964,11 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg(
if (is_conv_zone(sbi, zone_idx, dev_idx))
return sbi->blocks_per_seg;
- if (!FDEV(dev_idx).zone_capacity_blocks)
+ if (!sbi->unusable_blocks_per_sec)
return sbi->blocks_per_seg;
sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno));
- sec_cap_blkaddr = sec_start_blkaddr +
- FDEV(dev_idx).zone_capacity_blocks[zone_idx];
+ sec_cap_blkaddr = sec_start_blkaddr + CAP_BLKS_PER_SEC(sbi);
/*
* If segment starts before zone capacity and spans beyond
@@ -5322,6 +5143,7 @@ static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ kvfree(dirty_i->pinned_secmap);
kvfree(dirty_i->victim_secmap);
}
@@ -5432,9 +5254,9 @@ int __init f2fs_create_segment_manager_caches(void)
if (!sit_entry_set_slab)
goto destroy_discard_cmd;
- inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry",
- sizeof(struct inmem_pages));
- if (!inmem_entry_slab)
+ revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry",
+ sizeof(struct revoke_entry));
+ if (!revoke_entry_slab)
goto destroy_sit_entry_set;
return 0;
@@ -5453,5 +5275,5 @@ void f2fs_destroy_segment_manager_caches(void)
kmem_cache_destroy(sit_entry_set_slab);
kmem_cache_destroy(discard_cmd_slab);
kmem_cache_destroy(discard_entry_slab);
- kmem_cache_destroy(inmem_entry_slab);
+ kmem_cache_destroy(revoke_entry_slab);
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 5c94caf0c0a1..d1d63766f2c7 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -24,6 +24,7 @@
#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA)
#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE)
+#define SE_PAGETYPE(se) ((IS_NODESEG((se)->type) ? NODE : DATA))
static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
unsigned short seg_type)
@@ -100,6 +101,9 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
#define BLKS_PER_SEC(sbi) \
((sbi)->segs_per_sec * (sbi)->blocks_per_seg)
+#define CAP_BLKS_PER_SEC(sbi) \
+ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg - \
+ (sbi)->unusable_blocks_per_sec)
#define GET_SEC_FROM_SEG(sbi, segno) \
(((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec)
#define GET_SEG_FROM_SEC(sbi, secno) \
@@ -224,10 +228,10 @@ struct segment_allocation {
#define MAX_SKIP_GC_COUNT 16
-struct inmem_pages {
+struct revoke_entry {
struct list_head list;
- struct page *page;
block_t old_addr; /* for revoking when fail to commit */
+ pgoff_t index;
};
struct sit_info {
@@ -294,6 +298,9 @@ struct dirty_seglist_info {
struct mutex seglist_lock; /* lock for segment bitmaps */
int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */
unsigned long *victim_secmap; /* background GC victims */
+ unsigned long *pinned_secmap; /* pinned victims from foreground GC */
+ unsigned int pinned_secmap_cnt; /* count of victims which have pinned data */
+ bool enable_pin_section; /* enable pinning section */
};
/* victim selection function for cleaning and SSR */
@@ -572,11 +579,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi));
}
-static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
+static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
+ unsigned int node_blocks, unsigned int dent_blocks)
{
- unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) +
- get_pages(sbi, F2FS_DIRTY_DENTS);
- unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS);
+
unsigned int segno, left_blocks;
int i;
@@ -602,19 +608,28 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
int freed, int needed)
{
- int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
- int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
- int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
+ unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) +
+ get_pages(sbi, F2FS_DIRTY_DENTS) +
+ get_pages(sbi, F2FS_DIRTY_IMETA);
+ unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS);
+ unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi);
+ unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi);
+ unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi);
+ unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi);
+ unsigned int free, need_lower, need_upper;
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return false;
- if (free_sections(sbi) + freed == reserved_sections(sbi) + needed &&
- has_curseg_enough_space(sbi))
+ free = free_sections(sbi) + freed;
+ need_lower = node_secs + dent_secs + reserved_sections(sbi) + needed;
+ need_upper = need_lower + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0);
+
+ if (free > need_upper)
return false;
- return (free_sections(sbi) + freed) <=
- (node_secs + 2 * dent_secs + imeta_secs +
- reserved_sections(sbi) + needed);
+ else if (free <= need_lower)
+ return true;
+ return !has_curseg_enough_space(sbi, node_blocks, dent_blocks);
}
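
Instead of one exact figure, the rewrite brackets the answer: dirty node/dentry pages are split into whole sections plus a partial-section remainder, yielding a bound that is certainly enough (need_upper) and one that is certainly not (need_lower); only the gap between them pays for the has_curseg_enough_space() walk. A worked example, with CAP_BLKS_PER_SEC assumed to be 512:

	/*
	 * Assumed: reserved_sections() = 10, needed = 0.
	 * nodes + dents + imeta = 1300 blocks -> node_secs = 2, node_blocks = 276
	 * dents                 =  100 blocks -> dent_secs = 0, dent_blocks = 100
	 *
	 *   need_lower = 2 + 0 + 10 + 0 = 12
	 *   need_upper = 12 + 1 + 1    = 14
	 *
	 * free > 14        -> enough, return false at once
	 * free <= 12       -> short, return true at once
	 * free in {13, 14} -> consult has_curseg_enough_space(276, 100)
	 */
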
static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index ea939db18f88..26817b5aeac7 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
#include <linux/sched/mm.h>
#include <linux/statfs.h>
#include <linux/buffer_head.h>
@@ -138,7 +139,6 @@ enum {
Opt_jqfmt_vfsold,
Opt_jqfmt_vfsv0,
Opt_jqfmt_vfsv1,
- Opt_whint,
Opt_alloc,
Opt_fsync,
Opt_test_dummy_encryption,
@@ -160,6 +160,7 @@ enum {
Opt_gc_merge,
Opt_nogc_merge,
Opt_discard_unit,
+ Opt_memory_mode,
Opt_err,
};
@@ -214,7 +215,6 @@ static match_table_t f2fs_tokens = {
{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
- {Opt_whint, "whint_mode=%s"},
{Opt_alloc, "alloc_mode=%s"},
{Opt_fsync, "fsync_mode=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
@@ -237,6 +237,7 @@ static match_table_t f2fs_tokens = {
{Opt_gc_merge, "gc_merge"},
{Opt_nogc_merge, "nogc_merge"},
{Opt_discard_unit, "discard_unit=%s"},
+ {Opt_memory_mode, "memory=%s"},
{Opt_err, NULL},
};
@@ -494,9 +495,19 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
bool is_remount)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
-#ifdef CONFIG_FS_ENCRYPTION
+ struct fs_parameter param = {
+ .type = fs_value_is_string,
+ .string = arg->from ? arg->from : "",
+ };
+ struct fscrypt_dummy_policy *policy =
+ &F2FS_OPTION(sbi).dummy_enc_policy;
int err;
+ if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
+ f2fs_warn(sbi, "test_dummy_encryption option not supported");
+ return -EINVAL;
+ }
+
if (!f2fs_sb_has_encrypt(sbi)) {
f2fs_err(sbi, "Encrypt feature is off");
return -EINVAL;
@@ -508,12 +519,12 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
* needed to allow it to be set or changed during remount. We do allow
* it to be specified during remount, but only if there is no change.
*/
- if (is_remount && !F2FS_OPTION(sbi).dummy_enc_policy.policy) {
+ if (is_remount && !fscrypt_is_dummy_policy_set(policy)) {
f2fs_warn(sbi, "Can't set test_dummy_encryption on remount");
return -EINVAL;
}
- err = fscrypt_set_test_dummy_encryption(
- sb, arg->from, &F2FS_OPTION(sbi).dummy_enc_policy);
+
+ err = fscrypt_parse_test_dummy_encryption(&param, policy);
if (err) {
if (err == -EEXIST)
f2fs_warn(sbi,
@@ -526,10 +537,13 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
opt, err);
return -EINVAL;
}
+ err = fscrypt_add_test_dummy_key(sb, policy);
+ if (err) {
+ f2fs_warn(sbi, "Error adding test dummy encryption key [%d]",
+ err);
+ return err;
+ }
f2fs_warn(sbi, "Test dummy encryption mode enabled");
-#else
- f2fs_warn(sbi, "Test dummy encryption mount option ignored");
-#endif
return 0;
}
@@ -975,22 +989,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
f2fs_info(sbi, "quota operations not supported");
break;
#endif
- case Opt_whint:
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "user-based")) {
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER;
- } else if (!strcmp(name, "off")) {
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
- } else if (!strcmp(name, "fs-based")) {
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
case Opt_alloc:
name = match_strdup(&args[0]);
if (!name)
@@ -1239,6 +1237,22 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
}
kfree(name);
break;
+ case Opt_memory_mode:
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+ if (!strcmp(name, "normal")) {
+ F2FS_OPTION(sbi).memory_mode =
+ MEMORY_MODE_NORMAL;
+ } else if (!strcmp(name, "low")) {
+ F2FS_OPTION(sbi).memory_mode =
+ MEMORY_MODE_LOW;
+ } else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
default:
f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
p);
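The new option is selected at mount time; a minimal sketch using mount(2), with placeholder device and mountpoint:

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* hypothetical device and mountpoint; the option string is
             * what exercises the Opt_memory_mode parsing above */
            if (mount("/dev/vdb", "/mnt/f2fs", "f2fs", 0, "memory=low"))
                    perror("mount");
            return 0;
    }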
@@ -1328,12 +1342,6 @@ default_check:
return -EINVAL;
}
- /* Not pass down write hints if the number of active logs is lesser
- * than NR_CURSEG_PERSIST_TYPE.
- */
- if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_PERSIST_TYPE)
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
-
if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) {
f2fs_err(sbi, "Allow to mount readonly mode only");
return -EROFS;
@@ -1363,9 +1371,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
spin_lock_init(&fi->i_size_lock);
INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->gdirty_list);
- INIT_LIST_HEAD(&fi->inmem_ilist);
- INIT_LIST_HEAD(&fi->inmem_pages);
- mutex_init(&fi->inmem_lock);
init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
init_f2fs_rwsem(&fi->i_xattr_sem);
@@ -1406,9 +1411,7 @@ static int f2fs_drop_inode(struct inode *inode)
atomic_inc(&inode->i_count);
spin_unlock(&inode->i_lock);
- /* some remained atomic pages should discarded */
- if (f2fs_is_atomic_file(inode))
- f2fs_drop_inmem_pages(inode);
+ f2fs_abort_atomic_write(inode, true);
/* should remain fi->extent_tree for writepage */
f2fs_destroy_extent_node(inode);
@@ -1518,7 +1521,6 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
blkdev_put(FDEV(i).bdev, FMODE_EXCL);
#ifdef CONFIG_BLK_DEV_ZONED
kvfree(FDEV(i).blkz_seq);
- kfree(FDEV(i).zone_capacity_blocks);
#endif
}
kvfree(sbi->devs);
@@ -1731,18 +1733,23 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
block_t total_count, user_block_count, start_count;
u64 avail_node_count;
+ unsigned int total_valid_node_count;
total_count = le64_to_cpu(sbi->raw_super->block_count);
- user_block_count = sbi->user_block_count;
start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr);
buf->f_type = F2FS_SUPER_MAGIC;
buf->f_bsize = sbi->blocksize;
buf->f_blocks = total_count - start_count;
+
+ spin_lock(&sbi->stat_lock);
+
+ user_block_count = sbi->user_block_count;
+ total_valid_node_count = valid_node_count(sbi);
+ avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
buf->f_bfree = user_block_count - valid_user_blocks(sbi) -
sbi->current_reserved_blocks;
- spin_lock(&sbi->stat_lock);
if (unlikely(buf->f_bfree <= sbi->unusable_block_count))
buf->f_bfree = 0;
else
@@ -1755,14 +1762,12 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
else
buf->f_bavail = 0;
- avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
-
if (avail_node_count > user_block_count) {
buf->f_files = user_block_count;
buf->f_ffree = buf->f_bavail;
} else {
buf->f_files = avail_node_count;
- buf->f_ffree = min(avail_node_count - valid_node_count(sbi),
+ buf->f_ffree = min(avail_node_count - total_valid_node_count,
buf->f_bavail);
}
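The snapshot now taken under stat_lock surfaces to userspace through statfs(2)/statvfs(3); a small probe, assuming an f2fs mount at /mnt/f2fs:

    #include <stdio.h>
    #include <sys/statvfs.h>

    int main(void)
    {
            struct statvfs st;

            if (statvfs("/mnt/f2fs", &st))      /* placeholder mountpoint */
                    return perror("statvfs"), 1;
            printf("bfree=%llu files=%llu ffree=%llu\n",
                   (unsigned long long)st.f_bfree,
                   (unsigned long long)st.f_files,
                   (unsigned long long)st.f_ffree);
            return 0;
    }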
@@ -1978,10 +1983,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",prjquota");
#endif
f2fs_show_quota_options(seq, sbi->sb);
- if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER)
- seq_printf(seq, ",whint_mode=%s", "user-based");
- else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS)
- seq_printf(seq, ",whint_mode=%s", "fs-based");
fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb);
@@ -2021,6 +2022,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
seq_printf(seq, ",discard_unit=%s", "section");
+ if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_NORMAL)
+ seq_printf(seq, ",memory=%s", "normal");
+ else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW)
+ seq_printf(seq, ",memory=%s", "low");
+
return 0;
}
@@ -2033,7 +2039,6 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE;
F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
- F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF;
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
@@ -2043,6 +2048,7 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).compress_ext_cnt = 0;
F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS;
F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
+ F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL;
sbi->sb->s_flags &= ~SB_INLINECRYPT;
@@ -2084,7 +2090,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
{
unsigned int s_flags = sbi->sb->s_flags;
struct cp_control cpc;
- unsigned int gc_mode;
+ unsigned int gc_mode = sbi->gc_mode;
int err = 0;
int ret;
block_t unusable;
@@ -2095,14 +2101,25 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
}
sbi->sb->s_flags |= SB_ACTIVE;
+ /* check if we need more GC first */
+ unusable = f2fs_get_unusable_blocks(sbi);
+ if (!f2fs_disable_cp_again(sbi, unusable))
+ goto skip_gc;
+
f2fs_update_time(sbi, DISABLE_TIME);
- gc_mode = sbi->gc_mode;
sbi->gc_mode = GC_URGENT_HIGH;
while (!f2fs_time_over(sbi, DISABLE_TIME)) {
+ struct f2fs_gc_control gc_control = {
+ .victim_segno = NULL_SEGNO,
+ .init_gc_type = FG_GC,
+ .should_migrate_blocks = false,
+ .err_gc_skipped = true,
+ .nr_free_secs = 1 };
+
f2fs_down_write(&sbi->gc_lock);
- err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
+ err = f2fs_gc(sbi, &gc_control);
if (err == -ENODATA) {
err = 0;
break;
@@ -2123,6 +2140,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
goto restore_flag;
}
+skip_gc:
f2fs_down_write(&sbi->gc_lock);
cpc.reason = CP_PAUSE;
set_sbi_flag(sbi, SBI_CP_DISABLED);
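f2fs_gc() now takes a control struct rather than a growing list of boolean arguments, which keeps call sites readable and the signature stable as options are added. The designated-initializer pattern in miniature (field names modeled on f2fs_gc_control; the body is a placeholder):

    #include <stdbool.h>

    struct gc_control {                     /* modeled on f2fs_gc_control */
            unsigned int victim_segno;
            bool should_migrate_blocks;
            bool err_gc_skipped;
            unsigned int nr_free_secs;
    };

    static int do_gc(const struct gc_control *gc)
    {
            return gc->nr_free_secs ? 0 : -1;       /* placeholder body */
    }

    int main(void)
    {
            struct gc_control gc = {
                    .victim_segno = (unsigned int)-1, /* NULL_SEGNO stand-in */
                    .err_gc_skipped = true,
                    .nr_free_secs = 1,  /* unnamed fields default to zero */
            };

            return do_gc(&gc);
    }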
@@ -2314,8 +2332,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
need_stop_gc = true;
}
- if (*flags & SB_RDONLY ||
- F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) {
+ if (*flags & SB_RDONLY) {
sync_inodes_sb(sb);
set_sbi_flag(sbi, SBI_IS_DIRTY);
@@ -2513,7 +2530,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
tocopy = min_t(unsigned long, sb->s_blocksize - offset,
towrite);
retry:
- err = a_ops->write_begin(NULL, mapping, off, tocopy, 0,
+ err = a_ops->write_begin(NULL, mapping, off, tocopy,
&page, &fsdata);
if (unlikely(err)) {
if (err == -ENOMEM) {
@@ -2714,7 +2731,8 @@ int f2fs_quota_sync(struct super_block *sb, int type)
if (!sb_has_quota_active(sb, cnt))
continue;
- inode_lock(dqopt->files[cnt]);
+ if (!f2fs_sb_has_quota_ino(sbi))
+ inode_lock(dqopt->files[cnt]);
/*
* do_quotactl
@@ -2733,7 +2751,8 @@ int f2fs_quota_sync(struct super_block *sb, int type)
f2fs_up_read(&sbi->quota_sem);
f2fs_unlock_op(sbi);
- inode_unlock(dqopt->files[cnt]);
+ if (!f2fs_sb_has_quota_ino(sbi))
+ inode_unlock(dqopt->files[cnt]);
if (ret)
break;
@@ -3020,23 +3039,24 @@ static void f2fs_get_ino_and_lblk_bits(struct super_block *sb,
*lblk_bits_ret = 8 * sizeof(block_t);
}
-static int f2fs_get_num_devices(struct super_block *sb)
+static struct block_device **f2fs_get_devices(struct super_block *sb,
+ unsigned int *num_devs)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct block_device **devs;
+ int i;
- if (f2fs_is_multi_device(sbi))
- return sbi->s_ndevs;
- return 1;
-}
+ if (!f2fs_is_multi_device(sbi))
+ return NULL;
-static void f2fs_get_devices(struct super_block *sb,
- struct request_queue **devs)
-{
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
- int i;
+ devs = kmalloc_array(sbi->s_ndevs, sizeof(*devs), GFP_KERNEL);
+ if (!devs)
+ return ERR_PTR(-ENOMEM);
for (i = 0; i < sbi->s_ndevs; i++)
- devs[i] = bdev_get_queue(FDEV(i).bdev);
+ devs[i] = FDEV(i).bdev;
+ *num_devs = sbi->s_ndevs;
+ return devs;
}
static const struct fscrypt_operations f2fs_cryptops = {
@@ -3047,7 +3067,6 @@ static const struct fscrypt_operations f2fs_cryptops = {
.empty_dir = f2fs_empty_dir,
.has_stable_inodes = f2fs_has_stable_inodes,
.get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits,
- .get_num_devices = f2fs_get_num_devices,
.get_devices = f2fs_get_devices,
};
#endif
@@ -3595,6 +3614,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE;
sbi->max_fragment_hole = DEF_FRAGMENT_SIZE;
spin_lock_init(&sbi->gc_urgent_high_lock);
+ atomic64_set(&sbi->current_atomic_write, 0);
sbi->dir_level = DEF_DIR_LEVEL;
sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
@@ -3652,24 +3672,29 @@ err_valid_block:
#ifdef CONFIG_BLK_DEV_ZONED
struct f2fs_report_zones_args {
+ struct f2fs_sb_info *sbi;
struct f2fs_dev_info *dev;
- bool zone_cap_mismatch;
};
static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx,
void *data)
{
struct f2fs_report_zones_args *rz_args = data;
+ block_t unusable_blocks = (zone->len - zone->capacity) >>
+ F2FS_LOG_SECTORS_PER_BLOCK;
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
return 0;
set_bit(idx, rz_args->dev->blkz_seq);
- rz_args->dev->zone_capacity_blocks[idx] = zone->capacity >>
- F2FS_LOG_SECTORS_PER_BLOCK;
- if (zone->len != zone->capacity && !rz_args->zone_cap_mismatch)
- rz_args->zone_cap_mismatch = true;
-
+ if (!rz_args->sbi->unusable_blocks_per_sec) {
+ rz_args->sbi->unusable_blocks_per_sec = unusable_blocks;
+ return 0;
+ }
+ if (rz_args->sbi->unusable_blocks_per_sec != unusable_blocks) {
+ f2fs_err(rz_args->sbi, "F2FS supports single zone capacity\n");
+ return -EINVAL;
+ }
return 0;
}
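A worked example of the per-zone arithmetic above, assuming 512-byte sectors and 4 KiB blocks (F2FS_LOG_SECTORS_PER_BLOCK == 3): a 256 MiB zone with 248 MiB capacity leaves 16384 unusable sectors, i.e. 2048 blocks, and every sequential zone must now report the same value:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long zone_len = 0x80000;  /* 256 MiB in sectors */
            unsigned long long zone_cap = 0x7c000;  /* 248 MiB usable */
            unsigned long long unusable = (zone_len - zone_cap) >> 3;

            printf("unusable blocks per zone: %llu\n", unusable);   /* 2048 */
            return 0;
    }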
@@ -3678,22 +3703,29 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
struct block_device *bdev = FDEV(devi).bdev;
sector_t nr_sectors = bdev_nr_sectors(bdev);
struct f2fs_report_zones_args rep_zone_arg;
+ u64 zone_sectors;
int ret;
if (!f2fs_sb_has_blkzoned(sbi))
return 0;
+ zone_sectors = bdev_zone_sectors(bdev);
+ if (!is_power_of_2(zone_sectors)) {
+ f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n");
+ return -EINVAL;
+ }
+
if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
- SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)))
+ SECTOR_TO_BLOCK(zone_sectors))
return -EINVAL;
- sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev));
+ sbi->blocks_per_blkz = SECTOR_TO_BLOCK(zone_sectors);
if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz !=
__ilog2_u32(sbi->blocks_per_blkz))
return -EINVAL;
sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz);
FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
sbi->log_blocks_per_blkz;
- if (nr_sectors & (bdev_zone_sectors(bdev) - 1))
+ if (nr_sectors & (zone_sectors - 1))
FDEV(devi).nr_blkz++;
FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi,
@@ -3703,26 +3735,13 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
if (!FDEV(devi).blkz_seq)
return -ENOMEM;
- /* Get block zones type and zone-capacity */
- FDEV(devi).zone_capacity_blocks = f2fs_kzalloc(sbi,
- FDEV(devi).nr_blkz * sizeof(block_t),
- GFP_KERNEL);
- if (!FDEV(devi).zone_capacity_blocks)
- return -ENOMEM;
-
+ rep_zone_arg.sbi = sbi;
rep_zone_arg.dev = &FDEV(devi);
- rep_zone_arg.zone_cap_mismatch = false;
ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, f2fs_report_zone_cb,
&rep_zone_arg);
if (ret < 0)
return ret;
-
- if (!rep_zone_arg.zone_cap_mismatch) {
- kfree(FDEV(devi).zone_capacity_blocks);
- FDEV(devi).zone_capacity_blocks = NULL;
- }
-
return 0;
}
#endif
@@ -4100,30 +4119,9 @@ try_onemore:
set_sbi_flag(sbi, SBI_POR_DOING);
spin_lock_init(&sbi->stat_lock);
- for (i = 0; i < NR_PAGE_TYPE; i++) {
- int n = (i == META) ? 1 : NR_TEMP_TYPE;
- int j;
-
- sbi->write_io[i] =
- f2fs_kmalloc(sbi,
- array_size(n,
- sizeof(struct f2fs_bio_info)),
- GFP_KERNEL);
- if (!sbi->write_io[i]) {
- err = -ENOMEM;
- goto free_bio_info;
- }
-
- for (j = HOT; j < n; j++) {
- init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem);
- sbi->write_io[i][j].sbi = sbi;
- sbi->write_io[i][j].bio = NULL;
- spin_lock_init(&sbi->write_io[i][j].io_lock);
- INIT_LIST_HEAD(&sbi->write_io[i][j].io_list);
- INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list);
- init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock);
- }
- }
+ err = f2fs_init_write_merge_io(sbi);
+ if (err)
+ goto free_bio_info;
init_f2fs_rwsem(&sbi->cp_rwsem);
init_f2fs_rwsem(&sbi->quota_sem);
@@ -4609,7 +4607,7 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_sysfs();
if (err)
goto free_garbage_collection_cache;
- err = register_shrinker(&f2fs_shrinker_info);
+ err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker");
if (err)
goto free_sysfs;
err = register_filesystem(&f2fs_fs_type);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 4c50aedd5144..eba5fb1629d7 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -339,6 +339,21 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
sbi->gc_reclaimed_segs[sbi->gc_segment_mode]);
}
+ if (!strcmp(a->attr.name, "current_atomic_write")) {
+ s64 current_write = atomic64_read(&sbi->current_atomic_write);
+
+ return sysfs_emit(buf, "%lld\n", current_write);
+ }
+
+ if (!strcmp(a->attr.name, "peak_atomic_write"))
+ return sysfs_emit(buf, "%lld\n", sbi->peak_atomic_write);
+
+ if (!strcmp(a->attr.name, "committed_atomic_block"))
+ return sysfs_emit(buf, "%llu\n", sbi->committed_atomic_block);
+
+ if (!strcmp(a->attr.name, "revoked_atomic_block"))
+ return sysfs_emit(buf, "%llu\n", sbi->revoked_atomic_block);
+
ui = (unsigned int *)(ptr + a->offset);
return sprintf(buf, "%u\n", *ui);
@@ -608,6 +623,27 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "peak_atomic_write")) {
+ if (t != 0)
+ return -EINVAL;
+ sbi->peak_atomic_write = 0;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "committed_atomic_block")) {
+ if (t != 0)
+ return -EINVAL;
+ sbi->committed_atomic_block = 0;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "revoked_atomic_block")) {
+ if (t != 0)
+ return -EINVAL;
+ sbi->revoked_atomic_block = 0;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
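The new counters surface under /sys/fs/f2fs/<disk>/; a sketch that polls current_atomic_write, assuming a disk named sda (writing 0 to the three writable counters resets them, per the store hook above):

    #include <stdio.h>

    int main(void)
    {
            long long cur;
            /* path assumes the backing disk is sda */
            FILE *f = fopen("/sys/fs/f2fs/sda/current_atomic_write", "r");

            if (f && fscanf(f, "%lld", &cur) == 1)
                    printf("in-flight atomic write blocks: %lld\n", cur);
            if (f)
                    fclose(f);
            return 0;
    }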
@@ -713,6 +749,11 @@ static struct f2fs_attr f2fs_attr_##_name = { \
.offset = _offset \
}
+#define F2FS_RO_ATTR(struct_type, struct_name, name, elname) \
+ F2FS_ATTR_OFFSET(struct_type, name, 0444, \
+ f2fs_sbi_show, NULL, \
+ offsetof(struct struct_name, elname))
+
#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \
F2FS_ATTR_OFFSET(struct_type, name, 0644, \
f2fs_sbi_show, f2fs_sbi_store, \
@@ -811,6 +852,8 @@ F2FS_FEATURE_RO_ATTR(encrypted_casefold);
#endif /* CONFIG_FS_ENCRYPTION */
#ifdef CONFIG_BLK_DEV_ZONED
F2FS_FEATURE_RO_ATTR(block_zoned);
+F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, unusable_blocks_per_sec,
+ unusable_blocks_per_sec);
#endif
F2FS_FEATURE_RO_ATTR(atomic_write);
F2FS_FEATURE_RO_ATTR(extra_attr);
@@ -848,6 +891,12 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole);
+/* For atomic write */
+F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, current_atomic_write, current_atomic_write);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block);
+
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_urgent_sleep_time),
@@ -919,6 +968,9 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(moved_blocks_background),
ATTR_LIST(avg_vblocks),
#endif
+#ifdef CONFIG_BLK_DEV_ZONED
+ ATTR_LIST(unusable_blocks_per_sec),
+#endif
#ifdef CONFIG_F2FS_FS_COMPRESSION
ATTR_LIST(compr_written_block),
ATTR_LIST(compr_saved_block),
@@ -934,6 +986,10 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_reclaimed_segments),
ATTR_LIST(max_fragment_chunk),
ATTR_LIST(max_fragment_hole),
+ ATTR_LIST(current_atomic_write),
+ ATTR_LIST(peak_atomic_write),
+ ATTR_LIST(committed_atomic_block),
+ ATTR_LIST(revoked_atomic_block),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 3d793202cc9f..7b8f2b41c29b 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -74,6 +74,9 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count,
static int pagecache_write(struct inode *inode, const void *buf, size_t count,
loff_t pos)
{
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+
if (pos + count > inode->i_sb->s_maxbytes)
return -EFBIG;
@@ -85,8 +88,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
void *addr;
int res;
- res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
- &page, &fsdata);
+ res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
if (res)
return res;
@@ -94,8 +96,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
memcpy(addr + offset_in_page(pos), buf, n);
kunmap_atomic(addr);
- res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
- page, fsdata);
+ res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata);
if (res < 0)
return res;
if (res != n)
@@ -128,7 +129,7 @@ static int f2fs_begin_enable_verity(struct file *filp)
if (f2fs_verity_in_progress(inode))
return -EBUSY;
- if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+ if (f2fs_is_atomic_file(inode))
return -EOPNOTSUPP;
/*
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 249825017da7..00235b8a1823 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -705,7 +705,7 @@ static int fat_readdir(struct file *file, struct dir_context *ctx)
}
#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \
-static int func(struct dir_context *ctx, const char *name, int name_len, \
+static bool func(struct dir_context *ctx, const char *name, int name_len, \
loff_t offset, u64 ino, unsigned int d_type) \
{ \
struct fat_ioctl_filldir_callback *buf = \
@@ -714,7 +714,7 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \
struct dirent_type __user *d2 = d1 + 1; \
\
if (buf->result) \
- return -EINVAL; \
+ return false; \
buf->result++; \
\
if (name != NULL) { \
@@ -750,10 +750,10 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \
put_user(short_len, &d1->d_reclen)) \
goto efault; \
} \
- return 0; \
+ return true; \
efault: \
buf->result = -EFAULT; \
- return -EFAULT; \
+ return false; \
}
FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent)
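Under the new convention a filldir actor returns bool: true to continue iteration, false to stop, with any error stashed in the caller's buffer instead of the return value. A toy actor in that shape (the real callbacks take a struct dir_context plus offset/ino/type arguments, elided here):

    #include <stdbool.h>
    #include <stdio.h>

    struct dir_ctx {
            int count;
    };

    /* Return true to keep iterating, false to stop early. */
    static bool count_entry(struct dir_ctx *ctx, const char *name, int len)
    {
            (void)name; (void)len;
            return ++ctx->count < 1024;     /* stop once the buffer is "full" */
    }

    int main(void)
    {
            struct dir_ctx ctx = { 0 };

            printf("%d\n", count_entry(&ctx, "foo", 3));    /* 1: continue */
            return 0;
    }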
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 02d4d4234956..a415c02ede39 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -126,6 +126,7 @@ struct msdos_inode_info {
struct hlist_node i_fat_hash; /* hash by i_location */
struct hlist_node i_dir_hash; /* hash by i_logstart */
struct rw_semaphore truncate_lock; /* protect bmap against truncate */
+ struct timespec64 i_crtime; /* File creation (birth) time */
struct inode vfs_inode;
};
@@ -433,8 +434,15 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...);
__fat_fs_error(sb, 1, fmt , ## args)
#define fat_fs_error_ratelimit(sb, fmt, args...) \
__fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args)
+
+#define FAT_PRINTK_PREFIX "%sFAT-fs (%s): "
+#define fat_msg(sb, level, fmt, args...) \
+do { \
+ printk_index_subsys_emit(FAT_PRINTK_PREFIX, level, fmt, ##args);\
+ _fat_msg(sb, level, fmt, ##args); \
+} while (0)
__printf(3, 4) __cold
-void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...);
+void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...);
#define fat_msg_ratelimit(sb, level, fmt, args...) \
do { \
if (__ratelimit(&MSDOS_SB(sb)->ratelimit)) \
@@ -446,6 +454,10 @@ extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 __time, __le16 __date, u8 time_cs);
extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
__le16 *time, __le16 *date, u8 *time_cs);
+extern struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi,
+ const struct timespec64 *ts);
+extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi,
+ const struct timespec64 *ts);
extern int fat_truncate_time(struct inode *inode, struct timespec64 *now,
int flags);
extern int fat_update_time(struct inode *inode, struct timespec64 *now,
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 978ac6751aeb..1db348f8f887 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -94,7 +94,8 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
err_brelse:
brelse(bhs[0]);
err:
- fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr);
+ fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
+ (llu)blocknr);
return -EIO;
}
@@ -107,8 +108,8 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
fatent->bhs[0] = sb_bread(sb, blocknr);
if (!fatent->bhs[0]) {
- fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
- (llu)blocknr);
+ fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
+ (llu)blocknr);
return -EIO;
}
fatent->nr_bhs = 1;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a5a309fcc7fa..8a6b493b5b5f 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -90,7 +90,8 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
* out the RO attribute for checking by the security
* module, just because it maps to a file mode.
*/
- err = security_inode_setattr(file->f_path.dentry, &ia);
+ err = security_inode_setattr(file_mnt_user_ns(file),
+ file->f_path.dentry, &ia);
if (err)
goto out_unlock_inode;
@@ -127,13 +128,12 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg)
struct super_block *sb = inode->i_sb;
struct fstrim_range __user *user_range;
struct fstrim_range range;
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
int err;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(sb->s_bdev))
return -EOPNOTSUPP;
user_range = (struct fstrim_range __user *)arg;
@@ -141,7 +141,7 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg)
return -EFAULT;
range.minlen = max_t(unsigned int, range.minlen,
- q->limits.discard_granularity);
+ bdev_discard_granularity(sb->s_bdev));
err = fat_trim_fs(inode, &range);
if (err < 0)
@@ -399,13 +399,21 @@ int fat_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct inode *inode = d_inode(path->dentry);
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+
generic_fillattr(mnt_userns, inode, stat);
- stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
+ stat->blksize = sbi->cluster_size;
- if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) {
+ if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) {
/* Use i_pos for ino. This is used as fileid of nfs. */
- stat->ino = fat_i_pos_read(MSDOS_SB(inode->i_sb), inode);
+ stat->ino = fat_i_pos_read(sbi, inode);
}
+
+ if (sbi->options.isvfat && request_mask & STATX_BTIME) {
+ stat->result_mask |= STATX_BTIME;
+ stat->btime = MSDOS_I(inode)->i_crtime;
+ }
+
return 0;
}
EXPORT_SYMBOL_GPL(fat_getattr);
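With the creation time kept in msdos_inode_info, userspace can now request it through statx(2); a minimal reader, with a placeholder path on a vfat mount (glibc 2.28+):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    int main(void)
    {
            struct statx stx;

            if (statx(AT_FDCWD, "/mnt/vfat/file", 0, STATX_BTIME, &stx))
                    return perror("statx"), 1;
            if (stx.stx_mask & STATX_BTIME)         /* only set on vfat */
                    printf("btime: %lld\n", (long long)stx.stx_btime.tv_sec);
            else
                    puts("btime not reported");
            return 0;
    }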
@@ -453,8 +461,9 @@ static int fat_allow_set_time(struct user_namespace *mnt_userns,
{
umode_t allow_utime = sbi->options.allow_utime;
- if (!uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) {
- if (in_group_p(i_gid_into_mnt(mnt_userns, inode)))
+ if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode),
+ current_fsuid())) {
+ if (vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)))
allow_utime >>= 3;
if (allow_utime & MAY_WRITE)
return 1;
@@ -509,9 +518,11 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
if (((attr->ia_valid & ATTR_UID) &&
- (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) ||
+ (!uid_eq(from_vfsuid(mnt_userns, i_user_ns(inode), attr->ia_vfsuid),
+ sbi->options.fs_uid))) ||
((attr->ia_valid & ATTR_GID) &&
- (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) ||
+ (!gid_eq(from_vfsgid(mnt_userns, i_user_ns(inode), attr->ia_vfsgid),
+ sbi->options.fs_gid))) ||
((attr->ia_valid & ATTR_MODE) &&
(attr->ia_mode & ~FAT_VALID_MODE)))
error = -EPERM;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index bf6051bdf1d1..a38238d75c08 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -205,9 +205,9 @@ static int fat_writepages(struct address_space *mapping,
return mpage_writepages(mapping, wbc, fat_get_block);
}
-static int fat_readpage(struct file *file, struct page *page)
+static int fat_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, fat_get_block);
+ return mpage_read_folio(folio, fat_get_block);
}
static void fat_readahead(struct readahead_control *rac)
@@ -226,13 +226,13 @@ static void fat_write_failed(struct address_space *mapping, loff_t to)
}
static int fat_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int err;
*pagep = NULL;
- err = cont_write_begin(file, mapping, pos, len, flags,
+ err = cont_write_begin(file, mapping, pos, len,
pagep, fsdata, fat_get_block,
&MSDOS_I(mapping->host)->mmu_private);
if (err < 0)
@@ -344,7 +344,7 @@ int fat_block_truncate_page(struct inode *inode, loff_t from)
static const struct address_space_operations fat_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = fat_readpage,
+ .read_folio = fat_read_folio,
.readahead = fat_readahead,
.writepage = fat_writepage,
.writepages = fat_writepages,
@@ -567,12 +567,13 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
& ~((loff_t)sbi->cluster_size - 1)) >> 9;
fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
+ inode->i_ctime = inode->i_mtime;
if (sbi->options.isvfat) {
- fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime,
- de->cdate, de->ctime_cs);
fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
+ fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime,
+ de->cdate, de->ctime_cs);
} else
- fat_truncate_time(inode, &inode->i_mtime, S_ATIME|S_CTIME);
+ inode->i_atime = fat_truncate_atime(sbi, &inode->i_mtime);
return 0;
}
@@ -757,6 +758,8 @@ static struct inode *fat_alloc_inode(struct super_block *sb)
ei->i_logstart = 0;
ei->i_attrs = 0;
ei->i_pos = 0;
+ ei->i_crtime.tv_sec = 0;
+ ei->i_crtime.tv_nsec = 0;
return &ei->vfs_inode;
}
@@ -888,10 +891,10 @@ retry:
&raw_entry->date, NULL);
if (sbi->options.isvfat) {
__le16 atime;
- fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime,
- &raw_entry->cdate, &raw_entry->ctime_cs);
fat_time_unix2fat(sbi, &inode->i_atime, &atime,
&raw_entry->adate, NULL);
+ fat_time_unix2fat(sbi, &MSDOS_I(inode)->i_crtime, &raw_entry->ctime,
+ &raw_entry->cdate, &raw_entry->ctime_cs);
}
spin_unlock(&sbi->inode_hash_lock);
mark_buffer_dirty(bh);
@@ -1872,13 +1875,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
goto out_fail;
}
- if (sbi->options.discard) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
- if (!blk_queue_discard(q))
- fat_msg(sb, KERN_WARNING,
- "mounting with \"discard\" option, but "
- "the device does not support discard");
- }
+ if (sbi->options.discard && !bdev_max_discard_sectors(sb->s_bdev))
+ fat_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but the device does not support discard");
fat_set_state(sb, 1, 0);
return 0;
@@ -1889,10 +1888,8 @@ out_invalid:
fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem");
out_fail:
- if (fsinfo_inode)
- iput(fsinfo_inode);
- if (fat_inode)
- iput(fat_inode);
+ iput(fsinfo_inode);
+ iput(fat_inode);
unload_nls(sbi->nls_io);
unload_nls(sbi->nls_disk);
fat_reset_iocharset(&sbi->options);
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 91ca3c304211..7e5d6ae305f2 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -42,10 +42,16 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
EXPORT_SYMBOL_GPL(__fat_fs_error);
/**
- * fat_msg() - print preformated FAT specific messages. Every thing what is
- * not fat_fs_error() should be fat_msg().
+ * _fat_msg() - Print a preformatted FAT message based on a superblock.
+ * @sb: A pointer to a &struct super_block
+ * @level: A Kernel printk level constant
+ * @fmt: The printf-style format string to print.
+ *
+ * Everything that is not fat_fs_error() should be fat_msg().
+ *
+ * fat_msg() wraps _fat_msg() for printk indexing.
*/
-void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
+void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -53,7 +59,7 @@ void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf);
+ _printk(FAT_PRINTK_PREFIX "%pV\n", level, sb->s_id, &vaf);
va_end(args);
}
@@ -187,7 +193,7 @@ static long days_in_year[] = {
0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
};
-static inline int fat_tz_offset(struct msdos_sb_info *sbi)
+static inline int fat_tz_offset(const struct msdos_sb_info *sbi)
{
return (sbi->options.tz_set ?
-sbi->options.time_offset :
@@ -275,23 +281,35 @@ static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts)
return (struct timespec64){ ts.tv_sec & ~1ULL, 0 };
}
-static inline struct timespec64 fat_timespec64_trunc_10ms(struct timespec64 ts)
+/*
+ * truncate atime to 24 hour granularity (00:00:00 in local timezone)
+ */
+struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi,
+ const struct timespec64 *ts)
+{
+ /* to localtime */
+ time64_t seconds = ts->tv_sec - fat_tz_offset(sbi);
+ s32 remainder;
+
+ div_s64_rem(seconds, SECS_PER_DAY, &remainder);
+ /* to day boundary, and back to unix time */
+ seconds = seconds + fat_tz_offset(sbi) - remainder;
+
+ return (struct timespec64){ seconds, 0 };
+}
+
+/*
+ * truncate mtime to 2 second granularity
+ */
+struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi,
+ const struct timespec64 *ts)
{
- if (ts.tv_nsec)
- ts.tv_nsec -= ts.tv_nsec % 10000000UL;
- return ts;
+ return fat_timespec64_trunc_2secs(*ts);
}
/*
* truncate the various times with appropriate granularity:
- * root inode:
- * all times always 0
- * all other inodes:
- * mtime - 2 seconds
- * ctime
- * msdos - 2 seconds
- * vfat - 10 milliseconds
- * atime - 24 hours (00:00:00 in local timezone)
+ * all times in root node are always 0
*/
int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
{
@@ -306,25 +324,15 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
ts = current_time(inode);
}
- if (flags & S_ATIME) {
- /* to localtime */
- time64_t seconds = now->tv_sec - fat_tz_offset(sbi);
- s32 remainder;
-
- div_s64_rem(seconds, SECS_PER_DAY, &remainder);
- /* to day boundary, and back to unix time */
- seconds = seconds + fat_tz_offset(sbi) - remainder;
-
- inode->i_atime = (struct timespec64){ seconds, 0 };
- }
- if (flags & S_CTIME) {
- if (sbi->options.isvfat)
- inode->i_ctime = fat_timespec64_trunc_10ms(*now);
- else
- inode->i_ctime = fat_timespec64_trunc_2secs(*now);
- }
+ if (flags & S_ATIME)
+ inode->i_atime = fat_truncate_atime(sbi, now);
+ /*
+ * ctime and mtime share the same on-disk field, and should be
+ * identical in memory. all mtime updates will be applied to ctime,
+ * but ctime updates are ignored.
+ */
if (flags & S_MTIME)
- inode->i_mtime = fat_timespec64_trunc_2secs(*now);
+ inode->i_mtime = inode->i_ctime = fat_truncate_mtime(sbi, now);
return 0;
}
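The atime rule in fat_truncate_atime() is: shift to local time, drop the remainder modulo one day, shift back. Worked in plain C under the assumption of non-negative local times (the kernel uses div_s64_rem() rather than %, whose behavior differs for negative dividends):

    #include <stdio.h>

    #define SECS_PER_DAY 86400LL

    int main(void)
    {
            long long tz_offset = -9 * 3600;  /* e.g. tz_set for UTC+9 */
            long long atime = 1650000000;     /* arbitrary unix time */
            long long local = atime - tz_offset;
            long long rem = local % SECS_PER_DAY;   /* assumes local >= 0 */

            /* back to the day boundary, then back to unix time */
            printf("truncated atime: %lld\n", local - rem + tz_offset);
            return 0;
    }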
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 5369d82e0bfb..21620054e1c4 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -780,8 +780,6 @@ static int vfat_create(struct user_namespace *mnt_userns, struct inode *dir,
goto out;
}
inode_inc_iversion(inode);
- fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
- /* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
out:
@@ -878,8 +876,6 @@ static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
}
inode_inc_iversion(inode);
set_nlink(inode, 2);
- fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME);
- /* timestamp is already written, so mark_inode_dirty() is unneeded. */
d_instantiate(dentry, inode);
@@ -893,22 +889,57 @@ out:
return err;
}
-static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
- struct dentry *old_dentry, struct inode *new_dir,
- struct dentry *new_dentry, unsigned int flags)
+static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh,
+ struct msdos_dir_entry **de)
+{
+ if (S_ISDIR(inode->i_mode)) {
+ if (fat_get_dotdot_entry(inode, bh, de))
+ return -EIO;
+ }
+ return 0;
+}
+
+static int vfat_sync_ipos(struct inode *dir, struct inode *inode)
+{
+ if (IS_DIRSYNC(dir))
+ return fat_sync_inode(inode);
+ mark_inode_dirty(inode);
+ return 0;
+}
+
+static int vfat_update_dotdot_de(struct inode *dir, struct inode *inode,
+ struct buffer_head *dotdot_bh,
+ struct msdos_dir_entry *dotdot_de)
+{
+ fat_set_start(dotdot_de, MSDOS_I(dir)->i_logstart);
+ mark_buffer_dirty_inode(dotdot_bh, inode);
+ if (IS_DIRSYNC(dir))
+ return sync_dirty_buffer(dotdot_bh);
+ return 0;
+}
+
+static void vfat_update_dir_metadata(struct inode *dir, struct timespec64 *ts)
+{
+ inode_inc_iversion(dir);
+ fat_truncate_time(dir, ts, S_CTIME | S_MTIME);
+ if (IS_DIRSYNC(dir))
+ (void)fat_sync_inode(dir);
+ else
+ mark_inode_dirty(dir);
+}
+
+static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
{
struct buffer_head *dotdot_bh;
- struct msdos_dir_entry *dotdot_de;
+ struct msdos_dir_entry *dotdot_de = NULL;
struct inode *old_inode, *new_inode;
struct fat_slot_info old_sinfo, sinfo;
struct timespec64 ts;
loff_t new_i_pos;
- int err, is_dir, update_dotdot, corrupt = 0;
+ int err, is_dir, corrupt = 0;
struct super_block *sb = old_dir->i_sb;
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
old_inode = d_inode(old_dentry);
new_inode = d_inode(new_dentry);
@@ -917,15 +948,13 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
if (err)
goto out;
- is_dir = S_ISDIR(old_inode->i_mode);
- update_dotdot = (is_dir && old_dir != new_dir);
- if (update_dotdot) {
- if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de)) {
- err = -EIO;
+ if (old_dir != new_dir) {
+ err = vfat_get_dotdot_de(old_inode, &dotdot_bh, &dotdot_de);
+ if (err)
goto out;
- }
}
+ is_dir = S_ISDIR(old_inode->i_mode);
ts = current_time(old_dir);
if (new_inode) {
if (is_dir) {
@@ -946,21 +975,15 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
fat_detach(old_inode);
fat_attach(old_inode, new_i_pos);
- if (IS_DIRSYNC(new_dir)) {
- err = fat_sync_inode(old_inode);
- if (err)
- goto error_inode;
- } else
- mark_inode_dirty(old_inode);
+ err = vfat_sync_ipos(new_dir, old_inode);
+ if (err)
+ goto error_inode;
- if (update_dotdot) {
- fat_set_start(dotdot_de, MSDOS_I(new_dir)->i_logstart);
- mark_buffer_dirty_inode(dotdot_bh, old_inode);
- if (IS_DIRSYNC(new_dir)) {
- err = sync_dirty_buffer(dotdot_bh);
- if (err)
- goto error_dotdot;
- }
+ if (dotdot_de) {
+ err = vfat_update_dotdot_de(new_dir, old_inode, dotdot_bh,
+ dotdot_de);
+ if (err)
+ goto error_dotdot;
drop_nlink(old_dir);
if (!new_inode)
inc_nlink(new_dir);
@@ -970,12 +993,7 @@ static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
old_sinfo.bh = NULL;
if (err)
goto error_dotdot;
- inode_inc_iversion(old_dir);
- fat_truncate_time(old_dir, &ts, S_CTIME|S_MTIME);
- if (IS_DIRSYNC(old_dir))
- (void)fat_sync_inode(old_dir);
- else
- mark_inode_dirty(old_dir);
+ vfat_update_dir_metadata(old_dir, &ts);
if (new_inode) {
drop_nlink(new_inode);
@@ -995,10 +1013,9 @@ error_dotdot:
/* data cluster is shared, serious corruption */
corrupt = 1;
- if (update_dotdot) {
- fat_set_start(dotdot_de, MSDOS_I(old_dir)->i_logstart);
- mark_buffer_dirty_inode(dotdot_bh, old_inode);
- corrupt |= sync_dirty_buffer(dotdot_bh);
+ if (dotdot_de) {
+ corrupt |= vfat_update_dotdot_de(old_dir, old_inode, dotdot_bh,
+ dotdot_de);
}
error_inode:
fat_detach(old_inode);
@@ -1025,13 +1042,145 @@ error_inode:
goto out;
}
+static void vfat_exchange_ipos(struct inode *old_inode, struct inode *new_inode,
+ loff_t old_i_pos, loff_t new_i_pos)
+{
+ fat_detach(old_inode);
+ fat_detach(new_inode);
+ fat_attach(old_inode, new_i_pos);
+ fat_attach(new_inode, old_i_pos);
+}
+
+static void vfat_move_nlink(struct inode *src, struct inode *dst)
+{
+ drop_nlink(src);
+ inc_nlink(dst);
+}
+
+static int vfat_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct buffer_head *old_dotdot_bh = NULL, *new_dotdot_bh = NULL;
+ struct msdos_dir_entry *old_dotdot_de = NULL, *new_dotdot_de = NULL;
+ struct inode *old_inode, *new_inode;
+ struct timespec64 ts = current_time(old_dir);
+ loff_t old_i_pos, new_i_pos;
+ int err, corrupt = 0;
+ struct super_block *sb = old_dir->i_sb;
+
+ old_inode = d_inode(old_dentry);
+ new_inode = d_inode(new_dentry);
+
+ /* Acquire super block lock for the operation to be atomic */
+ mutex_lock(&MSDOS_SB(sb)->s_lock);
+
+ /* if directories are not the same, get ".." info to update */
+ if (old_dir != new_dir) {
+ err = vfat_get_dotdot_de(old_inode, &old_dotdot_bh,
+ &old_dotdot_de);
+ if (err)
+ goto out;
+
+ err = vfat_get_dotdot_de(new_inode, &new_dotdot_bh,
+ &new_dotdot_de);
+ if (err)
+ goto out;
+ }
+
+ old_i_pos = MSDOS_I(old_inode)->i_pos;
+ new_i_pos = MSDOS_I(new_inode)->i_pos;
+
+ vfat_exchange_ipos(old_inode, new_inode, old_i_pos, new_i_pos);
+
+ err = vfat_sync_ipos(old_dir, new_inode);
+ if (err)
+ goto error_exchange;
+ err = vfat_sync_ipos(new_dir, old_inode);
+ if (err)
+ goto error_exchange;
+
+ /* update ".." directory entry info */
+ if (old_dotdot_de) {
+ err = vfat_update_dotdot_de(new_dir, old_inode, old_dotdot_bh,
+ old_dotdot_de);
+ if (err)
+ goto error_old_dotdot;
+ }
+ if (new_dotdot_de) {
+ err = vfat_update_dotdot_de(old_dir, new_inode, new_dotdot_bh,
+ new_dotdot_de);
+ if (err)
+ goto error_new_dotdot;
+ }
+
+ /* if cross directory and only one is a directory, adjust nlink */
+ if (!old_dotdot_de != !new_dotdot_de) {
+ if (old_dotdot_de)
+ vfat_move_nlink(old_dir, new_dir);
+ else
+ vfat_move_nlink(new_dir, old_dir);
+ }
+
+ vfat_update_dir_metadata(old_dir, &ts);
+ /* if directories are not the same, update new_dir as well */
+ if (old_dir != new_dir)
+ vfat_update_dir_metadata(new_dir, &ts);
+
+out:
+ brelse(old_dotdot_bh);
+ brelse(new_dotdot_bh);
+ mutex_unlock(&MSDOS_SB(sb)->s_lock);
+
+ return err;
+
+error_new_dotdot:
+ if (new_dotdot_de) {
+ corrupt |= vfat_update_dotdot_de(new_dir, new_inode,
+ new_dotdot_bh, new_dotdot_de);
+ }
+
+error_old_dotdot:
+ if (old_dotdot_de) {
+ corrupt |= vfat_update_dotdot_de(old_dir, old_inode,
+ old_dotdot_bh, old_dotdot_de);
+ }
+
+error_exchange:
+ vfat_exchange_ipos(old_inode, new_inode, new_i_pos, old_i_pos);
+ corrupt |= vfat_sync_ipos(new_dir, new_inode);
+ corrupt |= vfat_sync_ipos(old_dir, old_inode);
+
+ if (corrupt < 0) {
+ fat_fs_error(new_dir->i_sb,
+ "%s: Filesystem corrupted (i_pos %lld, %lld)",
+ __func__, old_i_pos, new_i_pos);
+ }
+ goto out;
+}
+
+static int vfat_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
+{
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (flags & RENAME_EXCHANGE) {
+ return vfat_rename_exchange(old_dir, old_dentry,
+ new_dir, new_dentry);
+ }
+
+ /* VFS already handled RENAME_NOREPLACE, handle it as a normal rename */
+ return vfat_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
static const struct inode_operations vfat_dir_inode_operations = {
.create = vfat_create,
.lookup = vfat_lookup,
.unlink = vfat_unlink,
.mkdir = vfat_mkdir,
.rmdir = vfat_rmdir,
- .rename = vfat_rename,
+ .rename = vfat_rename2,
.setattr = fat_setattr,
.getattr = fat_getattr,
.update_time = fat_update_time,
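With RENAME_EXCHANGE wired up in vfat_rename2(), two names on a FAT volume can be swapped atomically via renameat2(2); a minimal caller with placeholder paths (glibc 2.28+):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>

    int main(void)
    {
            /* both paths are placeholders on the same vfat mount */
            if (renameat2(AT_FDCWD, "/mnt/vfat/a", AT_FDCWD, "/mnt/vfat/b",
                          RENAME_EXCHANGE))
                    return perror("renameat2"), 1;
            return 0;
    }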
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f15d885b9796..146c9ab0cd4b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -56,11 +56,10 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
arg |= O_NONBLOCK;
/* Pipe packetized mode is controlled by O_DIRECT flag */
- if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) {
- if (!filp->f_mapping || !filp->f_mapping->a_ops ||
- !filp->f_mapping->a_ops->direct_IO)
- return -EINVAL;
- }
+ if (!S_ISFIFO(inode->i_mode) &&
+ (arg & O_DIRECT) &&
+ !(filp->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
if (filp->f_op->check_flags)
error = filp->f_op->check_flags(arg);
@@ -79,6 +78,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
}
spin_lock(&filp->f_lock);
filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
+ filp->f_iocb_flags = iocb_flags(filp);
spin_unlock(&filp->f_lock);
out:
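After this change, F_SETFL keys the O_DIRECT check off FMODE_CAN_ODIRECT instead of probing a_ops->direct_IO. A minimal caller that toggles the flag on an open descriptor (placeholder path):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>

    int main(void)
    {
            int fd = open("/mnt/f2fs/data", O_RDWR);   /* placeholder path */

            if (fd < 0)
                    return perror("open"), 1;
            if (fcntl(fd, F_SETFL, O_DIRECT))
                    perror("fcntl");    /* EINVAL when O_DIRECT unsupported */
            return 0;
    }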
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 6630c69c23a2..f2bc27d1975e 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -14,7 +14,7 @@
#include "internal.h"
#include "mount.h"
-static long do_sys_name_to_handle(struct path *path,
+static long do_sys_name_to_handle(const struct path *path,
struct file_handle __user *ufh,
int __user *mnt_id)
{
diff --git a/fs/file.c b/fs/file.c
index ee9317346702..3bcc1ecc314a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -630,32 +630,23 @@ EXPORT_SYMBOL(fd_install);
* @files: file struct to retrieve file from
* @fd: file descriptor to retrieve file for
*
- * If this functions returns an EINVAL error pointer the fd was beyond the
- * current maximum number of file descriptors for that fdtable.
+ * Context: files_lock must be held.
*
- * Returns: The file associated with @fd, on error returns an error pointer.
+ * Returns: The file associated with @fd (NULL if @fd is not open)
*/
static struct file *pick_file(struct files_struct *files, unsigned fd)
{
+ struct fdtable *fdt = files_fdtable(files);
struct file *file;
- struct fdtable *fdt;
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (fd >= fdt->max_fds) {
- file = ERR_PTR(-EINVAL);
- goto out_unlock;
- }
+ if (fd >= fdt->max_fds)
+ return NULL;
+
file = fdt->fd[fd];
- if (!file) {
- file = ERR_PTR(-EBADF);
- goto out_unlock;
+ if (file) {
+ rcu_assign_pointer(fdt->fd[fd], NULL);
+ __put_unused_fd(files, fd);
}
- rcu_assign_pointer(fdt->fd[fd], NULL);
- __put_unused_fd(files, fd);
-
-out_unlock:
- spin_unlock(&files->file_lock);
return file;
}
@@ -664,8 +655,10 @@ int close_fd(unsigned fd)
struct files_struct *files = current->files;
struct file *file;
+ spin_lock(&files->file_lock);
file = pick_file(files, fd);
- if (IS_ERR(file))
+ spin_unlock(&files->file_lock);
+ if (!file)
return -EBADF;
return filp_close(file, files);
@@ -702,20 +695,25 @@ static inline void __range_cloexec(struct files_struct *cur_fds,
static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
unsigned int max_fd)
{
+ unsigned n;
+
+ rcu_read_lock();
+ n = last_fd(files_fdtable(cur_fds));
+ rcu_read_unlock();
+ max_fd = min(max_fd, n);
+
while (fd <= max_fd) {
struct file *file;
+ spin_lock(&cur_fds->file_lock);
file = pick_file(cur_fds, fd++);
- if (!IS_ERR(file)) {
+ spin_unlock(&cur_fds->file_lock);
+
+ if (file) {
/* found a valid file to close */
filp_close(file, cur_fds);
cond_resched();
- continue;
}
-
- /* beyond the last fd in that table */
- if (PTR_ERR(file) == -EINVAL)
- return;
}
}
@@ -795,43 +793,25 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
* See close_fd_get_file() below, this variant assumes current->files->file_lock
* is held.
*/
-int __close_fd_get_file(unsigned int fd, struct file **res)
+struct file *__close_fd_get_file(unsigned int fd)
{
- struct files_struct *files = current->files;
- struct file *file;
- struct fdtable *fdt;
-
- fdt = files_fdtable(files);
- if (fd >= fdt->max_fds)
- goto out_err;
- file = fdt->fd[fd];
- if (!file)
- goto out_err;
- rcu_assign_pointer(fdt->fd[fd], NULL);
- __put_unused_fd(files, fd);
- get_file(file);
- *res = file;
- return 0;
-out_err:
- *res = NULL;
- return -ENOENT;
+ return pick_file(current->files, fd);
}
/*
* variant of close_fd that gets a ref on the file for later fput.
- * The caller must ensure that filp_close() called on the file, and then
- * an fput().
+ * The caller must ensure that filp_close() called on the file.
*/
-int close_fd_get_file(unsigned int fd, struct file **res)
+struct file *close_fd_get_file(unsigned int fd)
{
struct files_struct *files = current->files;
- int ret;
+ struct file *file;
spin_lock(&files->file_lock);
- ret = __close_fd_get_file(fd, res);
+ file = pick_file(files, fd);
spin_unlock(&files->file_lock);
- return ret;
+ return file;
}
void do_close_on_exec(struct files_struct *files)
@@ -871,7 +851,7 @@ void do_close_on_exec(struct files_struct *files)
}
static inline struct file *__fget_files_rcu(struct files_struct *files,
- unsigned int fd, fmode_t mask, unsigned int refs)
+ unsigned int fd, fmode_t mask)
{
for (;;) {
struct file *file;
@@ -897,10 +877,9 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
* Such a race can take two forms:
*
* (a) the file ref already went down to zero,
- * and get_file_rcu_many() fails. Just try
- * again:
+ * and get_file_rcu() fails. Just try again:
*/
- if (unlikely(!get_file_rcu_many(file, refs)))
+ if (unlikely(!get_file_rcu(file)))
continue;
/*
@@ -909,11 +888,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
* pointer having changed, because it always goes
* hand-in-hand with 'fdt'.
*
- * If so, we need to put our refs and try again.
+ * If so, we need to put our ref and try again.
*/
if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
unlikely(rcu_dereference_raw(*fdentry) != file)) {
- fput_many(file, refs);
+ fput(file);
continue;
}
@@ -926,37 +905,31 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
}
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
- fmode_t mask, unsigned int refs)
+ fmode_t mask)
{
struct file *file;
rcu_read_lock();
- file = __fget_files_rcu(files, fd, mask, refs);
+ file = __fget_files_rcu(files, fd, mask);
rcu_read_unlock();
return file;
}
-static inline struct file *__fget(unsigned int fd, fmode_t mask,
- unsigned int refs)
-{
- return __fget_files(current->files, fd, mask, refs);
-}
-
-struct file *fget_many(unsigned int fd, unsigned int refs)
+static inline struct file *__fget(unsigned int fd, fmode_t mask)
{
- return __fget(fd, FMODE_PATH, refs);
+ return __fget_files(current->files, fd, mask);
}
struct file *fget(unsigned int fd)
{
- return __fget(fd, FMODE_PATH, 1);
+ return __fget(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fget);
struct file *fget_raw(unsigned int fd)
{
- return __fget(fd, 0, 1);
+ return __fget(fd, 0);
}
EXPORT_SYMBOL(fget_raw);
@@ -966,7 +939,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
task_lock(task);
if (task->files)
- file = __fget_files(task->files, fd, 0, 1);
+ file = __fget_files(task->files, fd, 0);
task_unlock(task);
return file;
@@ -1035,7 +1008,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
return 0;
return (unsigned long)file;
} else {
- file = __fget(fd, mask, 1);
+ file = __fget(fd, mask);
if (!file)
return 0;
return FDPUT_FPUT | (unsigned long)file;
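The pick_file() rework above is what backs close_range(2), whose __range_close path now clamps max_fd to the table size and takes file_lock per descriptor. A trivial caller (Linux 5.9+, glibc 2.34+):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* close every descriptor above stderr */
            if (close_range(3, ~0U, 0))
                    perror("close_range");
            return 0;
    }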
diff --git a/fs/file_table.c b/fs/file_table.c
index 7d2e692b66a9..dd88701e54a9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -45,7 +45,7 @@ static struct percpu_counter nr_files __cacheline_aligned_in_smp;
static void file_free_rcu(struct rcu_head *head)
{
- struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
+ struct file *f = container_of(head, struct file, f_rcuhead);
put_cred(f->f_cred);
kmem_cache_free(filp_cachep, f);
@@ -56,7 +56,7 @@ static inline void file_free(struct file *f)
security_file_free(f);
if (!(f->f_mode & FMODE_NOACCOUNT))
percpu_counter_dec(&nr_files);
- call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
+ call_rcu(&f->f_rcuhead, file_free_rcu);
}
/*
@@ -142,7 +142,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred)
f->f_cred = get_cred(cred);
error = security_file_alloc(f);
if (unlikely(error)) {
- file_free_rcu(&f->f_u.fu_rcuhead);
+ file_free_rcu(&f->f_rcuhead);
return ERR_PTR(error);
}
@@ -235,12 +235,15 @@ static struct file *alloc_file(const struct path *path, int flags,
file->f_mapping = path->dentry->d_inode->i_mapping;
file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
file->f_sb_err = file_sample_sb_err(file);
+ if (fop->llseek)
+ file->f_mode |= FMODE_LSEEK;
if ((file->f_mode & FMODE_READ) &&
likely(fop->read || fop->read_iter))
file->f_mode |= FMODE_CAN_READ;
if ((file->f_mode & FMODE_WRITE) &&
likely(fop->write || fop->write_iter))
file->f_mode |= FMODE_CAN_WRITE;
+ file->f_iocb_flags = iocb_flags(file);
file->f_mode |= FMODE_OPENED;
file->f_op = fop;
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
@@ -321,12 +324,7 @@ static void __fput(struct file *file)
}
fops_put(file->f_op);
put_pid(file->f_owner.pid);
- if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
- i_readcount_dec(inode);
- if (mode & FMODE_WRITER) {
- put_write_access(inode);
- __mnt_drop_write(mnt);
- }
+ put_file_access(file);
dput(dentry);
if (unlikely(mode & FMODE_NEED_UNMOUNT))
dissolve_on_fput(mnt);
@@ -341,13 +339,13 @@ static void delayed_fput(struct work_struct *unused)
struct llist_node *node = llist_del_all(&delayed_fput_list);
struct file *f, *t;
- llist_for_each_entry_safe(f, t, node, f_u.fu_llist)
+ llist_for_each_entry_safe(f, t, node, f_llist)
__fput(f);
}
static void ____fput(struct callback_head *work)
{
- __fput(container_of(work, struct file, f_u.fu_rcuhead));
+ __fput(container_of(work, struct file, f_rcuhead));
}
/*
@@ -368,14 +366,14 @@ EXPORT_SYMBOL_GPL(flush_delayed_fput);
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
-void fput_many(struct file *file, unsigned int refs)
+void fput(struct file *file)
{
- if (atomic_long_sub_and_test(refs, &file->f_count)) {
+ if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
- init_task_work(&file->f_u.fu_rcuhead, ____fput);
- if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME))
+ init_task_work(&file->f_rcuhead, ____fput);
+ if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME))
return;
/*
* After this task has run exit_task_work(),
@@ -384,16 +382,11 @@ void fput_many(struct file *file, unsigned int refs)
*/
}
- if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
+ if (llist_add(&file->f_llist, &delayed_fput_list))
schedule_delayed_work(&delayed_fput_work, 1);
}
}
-void fput(struct file *file)
-{
- fput_many(file, 1);
-}
-
/*
* synchronous analog of fput(); for kernel threads that might be needed
* in some umount() (and thus can't use flush_delayed_fput() without
@@ -412,6 +405,7 @@ void __fput_sync(struct file *file)
}
EXPORT_SYMBOL(fput);
+EXPORT_SYMBOL(__fput_sync);
void __init files_init(void)
{
diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h
index a41ea0ba6943..bffd156d6434 100644
--- a/fs/freevxfs/vxfs.h
+++ b/fs/freevxfs/vxfs.h
@@ -1,32 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
* Copyright (c) 2016 Krzysztof Blaszkowski
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
*/
#ifndef _VXFS_SUPER_H_
#define _VXFS_SUPER_H_
diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c
index 1fd41cf98b9f..de2a5bccb930 100644
--- a/fs/freevxfs/vxfs_bmap.c
+++ b/fs/freevxfs/vxfs_bmap.c
@@ -1,30 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h
index acc5477b3f23..fbcd603365ad 100644
--- a/fs/freevxfs/vxfs_dir.h
+++ b/fs/freevxfs/vxfs_dir.h
@@ -1,31 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
*/
#ifndef _VXFS_DIR_H_
#define _VXFS_DIR_H_
diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h
index f5c428e21024..3a2180c5e208 100644
--- a/fs/freevxfs/vxfs_extern.h
+++ b/fs/freevxfs/vxfs_extern.h
@@ -1,31 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
*/
#ifndef _VXFS_EXTERN_H_
#define _VXFS_EXTERN_H_
diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c
index a4610a77649e..c1174a3f8990 100644
--- a/fs/freevxfs/vxfs_fshead.c
+++ b/fs/freevxfs/vxfs_fshead.c
@@ -1,31 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
* Copyright (c) 2016 Krzysztof Blaszkowski
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
diff --git a/fs/freevxfs/vxfs_fshead.h b/fs/freevxfs/vxfs_fshead.h
index e026f0c49159..dfd2147599c4 100644
--- a/fs/freevxfs/vxfs_fshead.h
+++ b/fs/freevxfs/vxfs_fshead.h
@@ -1,32 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
* Copyright (c) 2016 Krzysztof Blaszkowski
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
*/
#ifndef _VXFS_FSHEAD_H_
#define _VXFS_FSHEAD_H_
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index bfc780c682fb..9b49ec36e667 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -1,30 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
@@ -37,45 +13,41 @@
#include "vxfs_extern.h"
#include "vxfs_inode.h"
-
-static int vxfs_immed_readpage(struct file *, struct page *);
-
-/*
- * Address space operations for immed files and directories.
- */
-const struct address_space_operations vxfs_immed_aops = {
- .readpage = vxfs_immed_readpage,
-};
-
/**
- * vxfs_immed_readpage - read part of an immed inode into pagecache
+ * vxfs_immed_read_folio - read part of an immed inode into pagecache
* @file: file context (unused)
- * @page: page frame to fill in.
+ * @folio: folio to fill in.
*
* Description:
- * vxfs_immed_readpage reads a part of the immed area of the
- * file that hosts @pp into the pagecache.
+ * vxfs_immed_read_folio reads a part of the immed area of the
+ * file that hosts @folio into the pagecache.
*
* Returns:
* Zero on success, else a negative error code.
*
* Locking status:
- * @page is locked and will be unlocked.
+ * @folio is locked and will be unlocked.
*/
-static int
-vxfs_immed_readpage(struct file *fp, struct page *pp)
+static int vxfs_immed_read_folio(struct file *fp, struct folio *folio)
{
- struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host);
- u_int64_t offset = (u_int64_t)pp->index << PAGE_SHIFT;
- caddr_t kaddr;
+ struct vxfs_inode_info *vip = VXFS_INO(folio->mapping->host);
+ void *src = vip->vii_immed.vi_immed + folio_pos(folio);
+ unsigned long i;
+
+ for (i = 0; i < folio_nr_pages(folio); i++) {
+ memcpy_to_page(folio_page(folio, i), 0, src, PAGE_SIZE);
+ src += PAGE_SIZE;
+ }
- kaddr = kmap(pp);
- memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_SIZE);
- kunmap(pp);
-
- flush_dcache_page(pp);
- SetPageUptodate(pp);
- unlock_page(pp);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
}
+
+/*
+ * Address space operations for immed files and directories.
+ */
+const struct address_space_operations vxfs_immed_aops = {
+ .read_folio = vxfs_immed_read_folio,
+};
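
/*
 * For reference, memcpy_to_page() bundles the kmap/memcpy/flush/kunmap
 * sequence the deleted readpage spelled out by hand. A rough sketch of
 * the helper, per include/linux/highmem.h (the _sketch suffix is
 * illustrative, not a kernel symbol):
 */
static inline void memcpy_to_page_sketch(struct page *page, size_t offset,
					 const char *from, size_t len)
{
	char *to = kmap_local_page(page);

	memcpy(to + offset, from, len);
	flush_dcache_page(page);
	kunmap_local(to);
}
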
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 1f41b25ef38b..ceb6a12649ba 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -1,31 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
* Copyright (c) 2016 Krzysztof Blaszkowski
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
diff --git a/fs/freevxfs/vxfs_inode.h b/fs/freevxfs/vxfs_inode.h
index f012abed125d..1e9e138d2b33 100644
--- a/fs/freevxfs/vxfs_inode.h
+++ b/fs/freevxfs/vxfs_inode.h
@@ -1,32 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
* Copyright (c) 2016 Krzysztof Blaszkowski
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
*/
#ifndef _VXFS_INODE_H_
#define _VXFS_INODE_H_
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index a51425634f65..f04ba2ed1e1a 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -1,31 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
* Copyright (c) 2016 Krzysztof Blaszkowski
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c
index 813da6685151..23f35187c289 100644
--- a/fs/freevxfs/vxfs_olt.c
+++ b/fs/freevxfs/vxfs_olt.c
@@ -1,30 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
diff --git a/fs/freevxfs/vxfs_olt.h b/fs/freevxfs/vxfs_olt.h
index 0c0b0c9fa557..53afba08d617 100644
--- a/fs/freevxfs/vxfs_olt.h
+++ b/fs/freevxfs/vxfs_olt.h
@@ -1,31 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
*/
#ifndef _VXFS_OLT_H_
#define _VXFS_OLT_H_
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index e806694d4145..c99282df7761 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -1,30 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
@@ -38,11 +14,11 @@
#include "vxfs_extern.h"
-static int vxfs_readpage(struct file *, struct page *);
+static int vxfs_read_folio(struct file *, struct folio *);
static sector_t vxfs_bmap(struct address_space *, sector_t);
const struct address_space_operations vxfs_aops = {
- .readpage = vxfs_readpage,
+ .read_folio = vxfs_read_folio,
.bmap = vxfs_bmap,
};
@@ -75,15 +51,9 @@ vxfs_get_page(struct address_space *mapping, u_long n)
kmap(pp);
/** if (!PageChecked(pp)) **/
/** vxfs_check_page(pp); **/
- if (PageError(pp))
- goto fail;
}
return (pp);
-
-fail:
- vxfs_put_page(pp);
- return ERR_PTR(-EIO);
}
/**
@@ -141,24 +111,23 @@ vxfs_getblk(struct inode *ip, sector_t iblock,
}
/**
- * vxfs_readpage - read one page synchronously into the pagecache
+ * vxfs_read_folio - read one page synchronously into the pagecache
* @file: file context (unused)
- * @page: page frame to fill in.
+ * @folio: folio to fill in.
*
* Description:
- * The vxfs_readpage routine reads @page synchronously into the
+ * The vxfs_read_folio routine reads @folio synchronously into the
* pagecache.
*
* Returns:
* Zero on success, else a negative error code.
*
* Locking status:
- * @page is locked and will be unlocked.
+ * @folio is locked and will be unlocked.
*/
-static int
-vxfs_readpage(struct file *file, struct page *page)
+static int vxfs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, vxfs_getblk);
+ return block_read_full_folio(folio, vxfs_getblk);
}
/**
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 22eed5a73ac2..c3b82f716f9a 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -1,31 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
* Copyright (c) 2016 Krzysztof Blaszkowski
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions, and the following disclaimer,
- * without modification.
- * 2. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL").
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
*/
/*
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 591fe9cf1659..443f83382b9b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -120,6 +120,7 @@ static bool inode_io_list_move_locked(struct inode *inode,
struct list_head *head)
{
assert_spin_locked(&wb->list_lock);
+ assert_spin_locked(&inode->i_lock);
list_move(&inode->i_io_list, head);
@@ -133,10 +134,10 @@ static bool inode_io_list_move_locked(struct inode *inode,
static void wb_wakeup(struct bdi_writeback *wb)
{
- spin_lock_bh(&wb->work_lock);
+ spin_lock_irq(&wb->work_lock);
if (test_bit(WB_registered, &wb->state))
mod_delayed_work(bdi_wq, &wb->dwork, 0);
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
}
static void finish_writeback_work(struct bdi_writeback *wb,
@@ -163,7 +164,7 @@ static void wb_queue_work(struct bdi_writeback *wb,
if (work->done)
atomic_inc(&work->done->cnt);
- spin_lock_bh(&wb->work_lock);
+ spin_lock_irq(&wb->work_lock);
if (test_bit(WB_registered, &wb->state)) {
list_add_tail(&work->list, &wb->work_list);
@@ -171,7 +172,7 @@ static void wb_queue_work(struct bdi_writeback *wb,
} else
finish_writeback_work(wb, work);
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
}
/**
@@ -738,7 +739,7 @@ EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
* incorrectly attributed).
*
* To resolve this issue, cgroup writeback detects the majority dirtier of
- * an inode and transfers the ownership to it. To avoid unnnecessary
+ * an inode and transfers the ownership to it. To avoid unnecessary
* oscillation, the detection mechanism keeps track of history and gives
* out the switch verdict only if the foreign usage pattern is stable over
* a certain amount of time and/or writeback attempts.
@@ -1365,9 +1366,9 @@ static int move_expired_inodes(struct list_head *delaying_queue,
inode = wb_inode(delaying_queue->prev);
if (inode_dirtied_after(inode, dirtied_before))
break;
+ spin_lock(&inode->i_lock);
list_move(&inode->i_io_list, &tmp);
moved++;
- spin_lock(&inode->i_lock);
inode->i_state |= I_SYNC_QUEUED;
spin_unlock(&inode->i_lock);
if (sb_is_blkdev_sb(inode->i_sb))
@@ -1383,7 +1384,12 @@ static int move_expired_inodes(struct list_head *delaying_queue,
goto out;
}
- /* Move inodes from one superblock together */
+ /*
+ * Although the inode's i_io_list is moved from 'tmp' to 'dispatch_queue',
+ * we don't take inode->i_lock here because that would be pointless
+ * overhead: the inode is already marked I_SYNC_QUEUED, so writeback list
+ * handling is fully under our control.
+ */
while (!list_empty(&tmp)) {
sb = wb_inode(tmp.prev)->i_sb;
list_for_each_prev_safe(pos, node, &tmp) {
@@ -1712,6 +1718,15 @@ static int writeback_single_inode(struct inode *inode,
*/
if (!(inode->i_state & I_DIRTY_ALL))
inode_cgwb_move_to_attached(inode, wb);
+ else if (!(inode->i_state & I_SYNC_QUEUED)) {
+ if ((inode->i_state & I_DIRTY))
+ redirty_tail_locked(inode, wb);
+ else if (inode->i_state & I_DIRTY_TIME) {
+ inode->dirtied_when = jiffies;
+ inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
+ }
+ }
+
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
out:
@@ -1775,11 +1790,12 @@ static long writeback_sb_inodes(struct super_block *sb,
};
unsigned long start_time = jiffies;
long write_chunk;
- long wrote = 0; /* count both pages and inodes */
+ long total_wrote = 0; /* count both pages and inodes */
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
struct bdi_writeback *tmp_wb;
+ long wrote;
if (inode->i_sb != sb) {
if (work->sb) {
@@ -1821,8 +1837,8 @@ static long writeback_sb_inodes(struct super_block *sb,
* We'll have another go at writing back this inode
* when we've completed a full scan of b_io.
*/
- spin_unlock(&inode->i_lock);
requeue_io(inode, wb);
+ spin_unlock(&inode->i_lock);
trace_writeback_sb_inodes_requeue(inode);
continue;
}
@@ -1855,7 +1871,9 @@ static long writeback_sb_inodes(struct super_block *sb,
wbc_detach_inode(&wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
- wrote += write_chunk - wbc.nr_to_write;
+ wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
+ wrote = wrote < 0 ? 0 : wrote;
+ total_wrote += wrote;
if (need_resched()) {
/*
@@ -1877,7 +1895,7 @@ static long writeback_sb_inodes(struct super_block *sb,
tmp_wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY_ALL))
- wrote++;
+ total_wrote++;
requeue_inode(inode, tmp_wb, &wbc);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
@@ -1891,14 +1909,14 @@ static long writeback_sb_inodes(struct super_block *sb,
* bail out to wb_writeback() often enough to check
* background threshold and other termination conditions.
*/
- if (wrote) {
+ if (total_wrote) {
if (time_is_before_jiffies(start_time + HZ / 10UL))
break;
if (work->nr_pages <= 0)
break;
}
}
- return wrote;
+ return total_wrote;
}
static long __writeback_inodes_wb(struct bdi_writeback *wb,
@@ -2069,13 +2087,13 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
{
struct wb_writeback_work *work = NULL;
- spin_lock_bh(&wb->work_lock);
+ spin_lock_irq(&wb->work_lock);
if (!list_empty(&wb->work_list)) {
work = list_entry(wb->work_list.next,
struct wb_writeback_work, list);
list_del_init(&work->list);
}
- spin_unlock_bh(&wb->work_lock);
+ spin_unlock_irq(&wb->work_lock);
return work;
}
@@ -2351,11 +2369,26 @@ void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
int dirtytime = 0;
+ struct bdi_writeback *wb = NULL;
trace_writeback_mark_inode_dirty(inode, flags);
if (flags & I_DIRTY_INODE) {
/*
+ * Inode timestamp updates will piggyback on this dirtying.
+ * We tell the ->dirty_inode callback that timestamps need to
+ * be updated by setting I_DIRTY_TIME in flags.
+ */
+ if (inode->i_state & I_DIRTY_TIME) {
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & I_DIRTY_TIME) {
+ inode->i_state &= ~I_DIRTY_TIME;
+ flags |= I_DIRTY_TIME;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+
+ /*
* Notify the filesystem about the inode being dirtied, so that
* (if needed) it can update on-disk fields and journal the
* inode. This is only needed when the inode itself is being
@@ -2364,7 +2397,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
trace_writeback_dirty_inode_start(inode, flags);
if (sb->s_op->dirty_inode)
- sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE);
+ sb->s_op->dirty_inode(inode,
+ flags & (I_DIRTY_INODE | I_DIRTY_TIME));
trace_writeback_dirty_inode(inode, flags);
/* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
@@ -2385,31 +2419,36 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
smp_mb();
- if (((inode->i_state & flags) == flags) ||
- (dirtytime && (inode->i_state & I_DIRTY_INODE)))
+ if ((inode->i_state & flags) == flags)
return;
spin_lock(&inode->i_lock);
- if (dirtytime && (inode->i_state & I_DIRTY_INODE))
- goto out_unlock_inode;
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;
inode_attach_wb(inode, NULL);
- /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
- if (flags & I_DIRTY_INODE)
- inode->i_state &= ~I_DIRTY_TIME;
inode->i_state |= flags;
/*
+ * Grab the inode's wb early because doing so requires dropping i_lock,
+ * and we need the following checks to happen atomically with dirty list
+ * handling so that we don't move inodes out from under the flush
+ * worker.
+ */
+ if (!was_dirty) {
+ wb = locked_inode_to_wb_and_lock_list(inode);
+ spin_lock(&inode->i_lock);
+ }
+
+ /*
* If the inode is queued for writeback by flush worker, just
* update its dirty state. Once the flush worker is done with
* the inode it will place it on the appropriate superblock
* list, based upon its state.
*/
if (inode->i_state & I_SYNC_QUEUED)
- goto out_unlock_inode;
+ goto out_unlock;
/*
* Only add valid (hashed) inodes to the superblock's
@@ -2417,22 +2456,19 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
if (!S_ISBLK(inode->i_mode)) {
if (inode_unhashed(inode))
- goto out_unlock_inode;
+ goto out_unlock;
}
if (inode->i_state & I_FREEING)
- goto out_unlock_inode;
+ goto out_unlock;
/*
* If the inode was already on b_dirty/b_io/b_more_io, don't
* reposition it (that would break b_dirty time-ordering).
*/
if (!was_dirty) {
- struct bdi_writeback *wb;
struct list_head *dirty_list;
bool wakeup_bdi = false;
- wb = locked_inode_to_wb_and_lock_list(inode);
-
inode->dirtied_when = jiffies;
if (dirtytime)
inode->dirtied_time_when = jiffies;
@@ -2446,6 +2482,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
dirty_list);
spin_unlock(&wb->list_lock);
+ spin_unlock(&inode->i_lock);
trace_writeback_dirty_inode_enqueue(inode);
/*
@@ -2460,7 +2497,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
return;
}
}
-out_unlock_inode:
+out_unlock:
+ if (wb)
+ spin_unlock(&wb->list_lock);
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);
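
/*
 * The i_lock assertion and the reordered lock/unlock sites above all
 * enforce one rule: an inode may only move between writeback lists with
 * both wb->list_lock and inode->i_lock held, in that order. A minimal
 * sketch of the rule (foo_ naming is illustrative):
 */
static void foo_move_to_dirty(struct bdi_writeback *wb, struct inode *inode)
{
	spin_lock(&wb->list_lock);
	spin_lock(&inode->i_lock);
	list_move(&inode->i_io_list, &wb->b_dirty);
	spin_unlock(&inode->i_lock);
	spin_unlock(&wb->list_lock);
}
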
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 76316c4a3fb7..b313a978ae0a 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -38,6 +38,3 @@ config FSCACHE_DEBUG
enabled by setting bits in /sys/module/fscache/parameters/debug.
See Documentation/filesystems/caching/fscache.rst for more information.
-
-config FSCACHE_OLD_API
- bool
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index 2749933852a9..d645f8b302a2 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -214,7 +214,7 @@ void fscache_relinquish_cache(struct fscache_cache *cache)
cache->ops = NULL;
cache->cache_priv = NULL;
- smp_store_release(&cache->state, FSCACHE_CACHE_IS_NOT_PRESENT);
+ fscache_set_cache_state(cache, FSCACHE_CACHE_IS_NOT_PRESENT);
fscache_put_cache(cache, where);
}
EXPORT_SYMBOL(fscache_relinquish_cache);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 9bb1ab5fe5ed..451d8a077e12 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -30,7 +30,7 @@ static DEFINE_SPINLOCK(fscache_cookie_lru_lock);
DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out);
static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker);
static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD";
-unsigned int fscache_lru_cookie_timeout = 10 * HZ;
+static unsigned int fscache_lru_cookie_timeout = 10 * HZ;
void fscache_print_cookie(struct fscache_cookie *cookie, char prefix)
{
@@ -263,6 +263,8 @@ void fscache_caching_failed(struct fscache_cookie *cookie)
{
clear_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags);
fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_FAILED);
+ trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref),
+ fscache_cookie_failed);
}
EXPORT_SYMBOL(fscache_caching_failed);
@@ -372,17 +374,22 @@ nomem:
return NULL;
}
+static inline bool fscache_cookie_is_dropped(struct fscache_cookie *cookie)
+{
+ return READ_ONCE(cookie->state) == FSCACHE_COOKIE_STATE_DROPPED;
+}
+
static void fscache_wait_on_collision(struct fscache_cookie *candidate,
struct fscache_cookie *wait_for)
{
enum fscache_cookie_state *statep = &wait_for->state;
- wait_var_event_timeout(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED,
+ wait_var_event_timeout(statep, fscache_cookie_is_dropped(wait_for),
20 * HZ);
- if (READ_ONCE(*statep) != FSCACHE_COOKIE_STATE_DROPPED) {
+ if (!fscache_cookie_is_dropped(wait_for)) {
pr_notice("Potential collision c=%08x old: c=%08x",
candidate->debug_id, wait_for->debug_id);
- wait_var_event(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED);
+ wait_var_event(statep, fscache_cookie_is_dropped(wait_for));
}
}
@@ -517,7 +524,14 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie)
}
fscache_see_cookie(cookie, fscache_cookie_see_active);
- fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE);
+ spin_lock(&cookie->lock);
+ if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags))
+ __fscache_set_cookie_state(cookie,
+ FSCACHE_COOKIE_STATE_INVALIDATING);
+ else
+ __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE);
+ spin_unlock(&cookie->lock);
+ wake_up_cookie_state(cookie);
trace = fscache_access_lookup_cookie_end;
out:
@@ -727,6 +741,9 @@ again_locked:
fallthrough;
case FSCACHE_COOKIE_STATE_FAILED:
+ if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags))
+ fscache_end_cookie_access(cookie, fscache_access_invalidate_cookie_end);
+
if (atomic_read(&cookie->n_accesses) != 0)
break;
if (test_bit(FSCACHE_COOKIE_DO_RELINQUISH, &cookie->flags)) {
@@ -752,6 +769,9 @@ again_locked:
spin_lock(&cookie->lock);
}
+ if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags))
+ fscache_end_cookie_access(cookie, fscache_access_invalidate_cookie_end);
+
switch (state) {
case FSCACHE_COOKIE_STATE_RELINQUISHING:
fscache_see_cookie(cookie, fscache_cookie_see_relinquish);
@@ -1048,6 +1068,9 @@ void __fscache_invalidate(struct fscache_cookie *cookie,
return;
case FSCACHE_COOKIE_STATE_LOOKING_UP:
+ if (!test_and_set_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags))
+ __fscache_begin_cookie_access(cookie, fscache_access_invalidate_cookie);
+ fallthrough;
case FSCACHE_COOKIE_STATE_CREATING:
spin_unlock(&cookie->lock);
_leave(" [look %x]", cookie->inval_counter);
@@ -1069,6 +1092,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie,
}
EXPORT_SYMBOL(__fscache_invalidate);
+#ifdef CONFIG_PROC_FS
/*
* Generate a list of extant cookies in /proc/fs/fscache/cookies
*/
@@ -1145,3 +1169,4 @@ const struct seq_operations fscache_cookies_seq_ops = {
.stop = fscache_cookies_seq_stop,
.show = fscache_cookies_seq_show,
};
+#endif
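
/*
 * Shape of the deferred-invalidation handoff added above, with foo_*
 * standing in for the fscache specifics: the requester sets a DO_ flag
 * and pins the object; whichever path later observes the flag clears it
 * and accounts for the deferred work exactly once.
 */
struct foo_object {
	unsigned long	flags;
#define FOO_DO_INVALIDATE	0	/* bit number, illustrative */
};

static void foo_request_invalidate(struct foo_object *obj)
{
	/* Object is mid-lookup: queue the invalidation rather than
	 * racing with it, and pin the object until it is serviced. */
	if (!test_and_set_bit(FOO_DO_INVALIDATE, &obj->flags))
		foo_begin_access(obj);	/* illustrative pin */
}

static void foo_service_deferred(struct foo_object *obj)
{
	/* Called from the state machine once the lookup settles. */
	if (test_and_clear_bit(FOO_DO_INVALIDATE, &obj->flags))
		foo_end_access(obj);	/* illustrative unpin */
}
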
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index ed1c9ed737f2..1336f517e9b1 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -56,7 +56,9 @@ static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache,
* cookie.c
*/
extern struct kmem_cache *fscache_cookie_jar;
+#ifdef CONFIG_PROC_FS
extern const struct seq_operations fscache_cookies_seq_ops;
+#endif
extern struct timer_list fscache_cookie_lru_timer;
extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
@@ -137,7 +139,9 @@ int fscache_stats_show(struct seq_file *m, void *v);
/*
* volume.c
*/
+#ifdef CONFIG_PROC_FS
extern const struct seq_operations fscache_volumes_seq_ops;
+#endif
struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
enum fscache_volume_trace where);
diff --git a/fs/fscache/io.c b/fs/fscache/io.c
index c8c7fe9e9a6e..3af3b08a9bb3 100644
--- a/fs/fscache/io.c
+++ b/fs/fscache/io.c
@@ -235,8 +235,7 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error,
{
struct fscache_write_request *wreq = priv;
- fscache_clear_page_bits(fscache_cres_cookie(&wreq->cache_resources),
- wreq->mapping, wreq->start, wreq->len,
+ fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len,
wreq->set_bits);
if (wreq->term_func)
@@ -296,7 +295,7 @@ abandon_end:
abandon_free:
kfree(wreq);
abandon:
- fscache_clear_page_bits(cookie, mapping, start, len, cond);
+ fscache_clear_page_bits(mapping, start, len, cond);
if (term_func)
term_func(term_func_priv, ret, false);
}
diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c
index f2aa7dbad766..a058e0136bfe 100644
--- a/fs/fscache/volume.c
+++ b/fs/fscache/volume.c
@@ -143,7 +143,7 @@ static void fscache_wait_on_volume_collision(struct fscache_volume *candidate,
{
wait_var_event_timeout(&candidate->flags,
!fscache_is_acquire_pending(candidate), 20 * HZ);
- if (!fscache_is_acquire_pending(candidate)) {
+ if (fscache_is_acquire_pending(candidate)) {
pr_notice("Potential volume collision new=%08x old=%08x",
candidate->debug_id, collidee_debug_id);
fscache_stat(&fscache_n_volumes_collision);
@@ -182,7 +182,7 @@ static bool fscache_hash_volume(struct fscache_volume *candidate)
hlist_bl_add_head(&candidate->hash_link, h);
hlist_bl_unlock(h);
- if (test_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &candidate->flags))
+ if (fscache_is_acquire_pending(candidate))
fscache_wait_on_volume_collision(candidate, collidee_debug_id);
return true;
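
/*
 * The sense of the collision check fixed above: report and keep waiting
 * only when the timed wait expired with the flag *still* set (the old
 * code warned on the already-cleared case instead). Generic shape,
 * foo_ naming being illustrative:
 */
static void foo_wait_until_settled(struct foo_volume *volume)
{
	wait_var_event_timeout(&volume->flags,
			       !foo_is_pending(volume), 20 * HZ);
	if (foo_is_pending(volume)) {
		pr_notice("foo: still pending after 20s\n");
		wait_var_event(&volume->flags, !foo_is_pending(volume));
	}
}
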
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 27a890aa493a..fc9d2d9fd234 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -119,7 +119,7 @@ SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags)
const char *fs_name;
int ret;
- if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ if (!may_mount())
return -EPERM;
if (flags & ~FSOPEN_CLOEXEC)
@@ -162,7 +162,7 @@ SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags
unsigned int lookup_flags;
int ret;
- if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ if (!may_mount())
return -EPERM;
if ((flags & ~(FSPICK_CLOEXEC |
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 7cede9a3bc96..247ef4f76761 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -258,7 +258,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
struct dentry *parent;
char name[32];
- if (!fuse_control_sb)
+ if (!fuse_control_sb || fc->no_control)
return 0;
parent = fuse_control_sb->s_root;
@@ -296,7 +296,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
{
int i;
- if (!fuse_control_sb)
+ if (!fuse_control_sb || fc->no_control)
return;
for (i = fc->ctl_ndents - 1; i >= 0; i--) {
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index d7d3a7f06862..e23e802a8013 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -138,9 +138,9 @@ static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
WARN_ON(fcd->nr_free_ranges <= 0);
fcd->nr_free_ranges--;
}
+ __kick_dmap_free_worker(fcd, 0);
spin_unlock(&fcd->lock);
- kick_dmap_free_worker(fcd, 0);
return dmap;
}
@@ -1241,8 +1241,8 @@ static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker);
id = dax_read_lock();
- nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), NULL,
- NULL);
+ nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size),
+ DAX_ACCESS, NULL, NULL);
dax_read_unlock(id);
if (nr_pages < 0) {
pr_debug("dax_direct_access() returned %ld\n", nr_pages);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 0e537e580dc1..51897427a534 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -730,14 +730,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
}
} else {
size_t off;
- err = iov_iter_get_pages(cs->iter, &page, PAGE_SIZE, 1, &off);
+ err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off);
if (err < 0)
return err;
BUG_ON(!err);
cs->len = err;
cs->offset = off;
cs->pg = page;
- iov_iter_advance(cs->iter, err);
}
return lock_request(cs->req);
@@ -1356,7 +1355,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
if (!fud)
return -EPERM;
- if (!iter_is_iovec(to))
+ if (!user_backed_iter(to))
return -EINVAL;
fuse_copy_init(&cs, 1, to);
@@ -1949,7 +1948,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
if (!fud)
return -EPERM;
- if (!iter_is_iovec(from))
+ if (!user_backed_iter(from))
return -EINVAL;
fuse_copy_init(&cs, 0, from);
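
/*
 * Both iov_iter hunks above rely on the iov_iter_get_pages2() contract:
 * unlike the old iov_iter_get_pages(), it advances the iterator itself
 * on success, so the explicit iov_iter_advance() calls are dropped.
 * Sketch (foo_ naming is illustrative):
 */
static ssize_t foo_grab_one_page(struct iov_iter *iter, struct page **page,
				 size_t *offp)
{
	ssize_t got = iov_iter_get_pages2(iter, page, PAGE_SIZE, 1, offp);

	/* On success, 'iter' has already consumed 'got' bytes. */
	return got;
}
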
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 9ff27b8a9782..b585b04e815e 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -11,6 +11,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/fs_context.h>
+#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/slab.h>
@@ -21,6 +22,11 @@
#include <linux/types.h>
#include <linux/kernel.h>
+static bool __read_mostly allow_sys_admin_access;
+module_param(allow_sys_admin_access, bool, 0644);
+MODULE_PARM_DESC(allow_sys_admin_access,
+ "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check");
+
static void fuse_advise_use_readdirplus(struct inode *dir)
{
struct fuse_inode *fi = get_fuse_inode(dir);
@@ -537,6 +543,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
struct fuse_file *ff;
void *security_ctx = NULL;
u32 security_ctxlen;
+ bool trunc = flags & O_TRUNC;
/* Userspace expects S_IFREG in create mode */
BUG_ON((mode & S_IFMT) != S_IFREG);
@@ -561,7 +568,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
inarg.mode = mode;
inarg.umask = current_umask();
- if (fm->fc->handle_killpriv_v2 && (flags & O_TRUNC) &&
+ if (fm->fc->handle_killpriv_v2 && trunc &&
!(flags & O_EXCL) && !capable(CAP_FSETID)) {
inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
}
@@ -623,6 +630,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
} else {
file->private_data = ff;
fuse_finish_open(inode, file);
+ if (fm->fc->atomic_o_trunc && trunc)
+ truncate_pagecache(inode, 0);
+ else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+ invalidate_inode_pages2(inode->i_mapping);
}
return err;
@@ -1224,6 +1235,9 @@ int fuse_allow_current_process(struct fuse_conn *fc)
{
const struct cred *cred;
+ if (allow_sys_admin_access && capable(CAP_SYS_ADMIN))
+ return 1;
+
if (fc->allow_other)
return current_in_userns(fc->user_ns);
@@ -1957,20 +1971,20 @@ void fuse_init_dir(struct inode *inode)
fi->rdc.version = 0;
}
-static int fuse_symlink_readpage(struct file *null, struct page *page)
+static int fuse_symlink_read_folio(struct file *null, struct folio *folio)
{
- int err = fuse_readlink_page(page->mapping->host, page);
+ int err = fuse_readlink_page(folio->mapping->host, &folio->page);
if (!err)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
static const struct address_space_operations fuse_symlink_aops = {
- .readpage = fuse_symlink_readpage,
+ .read_folio = fuse_symlink_read_folio,
};
void fuse_init_symlink(struct inode *inode)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f18d14d5fea1..1a3afd469e3a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -210,13 +210,9 @@ void fuse_finish_open(struct inode *inode, struct file *file)
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
spin_unlock(&fi->lock);
- truncate_pagecache(inode, 0);
file_update_time(file);
fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
- } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
- invalidate_inode_pages2(inode->i_mapping);
}
-
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
}
@@ -239,30 +235,38 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
if (err)
return err;
- if (is_wb_truncate || dax_truncate) {
+ if (is_wb_truncate || dax_truncate)
inode_lock(inode);
- fuse_set_nowrite(inode);
- }
if (dax_truncate) {
filemap_invalidate_lock(inode->i_mapping);
err = fuse_dax_break_layouts(inode, 0, 0);
if (err)
- goto out;
+ goto out_inode_unlock;
}
+ if (is_wb_truncate || dax_truncate)
+ fuse_set_nowrite(inode);
+
err = fuse_do_open(fm, get_node_id(inode), file, isdir);
if (!err)
fuse_finish_open(inode, file);
-out:
+ if (is_wb_truncate || dax_truncate)
+ fuse_release_nowrite(inode);
+ if (!err) {
+ struct fuse_file *ff = file->private_data;
+
+ if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC))
+ truncate_pagecache(inode, 0);
+ else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+ invalidate_inode_pages2(inode->i_mapping);
+ }
if (dax_truncate)
filemap_invalidate_unlock(inode->i_mapping);
-
- if (is_wb_truncate | dax_truncate) {
- fuse_release_nowrite(inode);
+out_inode_unlock:
+ if (is_wb_truncate || dax_truncate)
inode_unlock(inode);
- }
return err;
}
@@ -338,6 +342,15 @@ static int fuse_open(struct inode *inode, struct file *file)
static int fuse_release(struct inode *inode, struct file *file)
{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ /*
+ * Dirty pages might remain despite the write_inode_now() call from
+ * fuse_flush() due to writes racing with the close.
+ */
+ if (fc->writeback_cache)
+ write_inode_now(inode, 1);
+
fuse_release_common(file, false);
/* return value is ignored by VFS */
@@ -857,8 +870,9 @@ static int fuse_do_readpage(struct file *file, struct page *page)
return 0;
}
-static int fuse_readpage(struct file *file, struct page *page)
+static int fuse_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
int err;
@@ -1041,7 +1055,7 @@ static unsigned int fuse_write_flags(struct kiocb *iocb)
{
unsigned int flags = iocb->ki_filp->f_flags;
- if (iocb->ki_flags & IOCB_DSYNC)
+ if (iocb_is_dsync(iocb))
flags |= O_DSYNC;
if (iocb->ki_flags & IOCB_SYNC)
flags |= O_SYNC;
@@ -1174,7 +1188,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
break;
err = -ENOMEM;
- page = grab_cache_page_write_begin(mapping, index, 0);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
break;
@@ -1400,14 +1414,13 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
while (nbytes < *nbytesp && ap->num_pages < max_pages) {
unsigned npages;
size_t start;
- ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
+ ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages],
*nbytesp - nbytes,
max_pages - ap->num_pages,
&start);
if (ret < 0)
break;
- iov_iter_advance(ii, ret);
nbytes += ret;
ret += start;
@@ -1464,7 +1477,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
inode_unlock(inode);
}
- io->should_dirty = !write && iter_is_iovec(iter);
+ io->should_dirty = !write && user_backed_iter(iter);
while (count) {
ssize_t nres;
fl_owner_t owner = current->files;
@@ -2273,8 +2286,7 @@ out:
* but how to implement it without killing performance need more thinking.
*/
static int fuse_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
pgoff_t index = pos >> PAGE_SHIFT;
struct fuse_conn *fc = get_fuse_conn(file_inode(file));
@@ -2284,7 +2296,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
WARN_ON(!fc->writeback_cache);
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
goto error;
@@ -3175,7 +3187,7 @@ static const struct file_operations fuse_file_operations = {
};
static const struct address_space_operations fuse_file_aops = {
- .readpage = fuse_readpage,
+ .read_folio = fuse_read_folio,
.readahead = fuse_readahead,
.writepage = fuse_writepage,
.writepages = fuse_writepages,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 8c0665c5dff8..6b3beda16c1b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -180,6 +180,12 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_uid = make_kuid(fc->user_ns, attr->uid);
inode->i_gid = make_kgid(fc->user_ns, attr->gid);
inode->i_blocks = attr->blocks;
+
+ /* Sanitize nsecs */
+ attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1);
+ attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1);
+ attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1);
+
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
/* mtime from server may be stale due to local buffered write */
@@ -476,8 +482,14 @@ static void fuse_umount_begin(struct super_block *sb)
{
struct fuse_conn *fc = get_fuse_conn_super(sb);
- if (!fc->no_force_umount)
- fuse_abort_conn(fc);
+ if (fc->no_force_umount)
+ return;
+
+ fuse_abort_conn(fc);
+
+ // Only retire block-device-based superblocks.
+ if (sb->s_bdev != NULL)
+ retire_super(sb);
}
static void fuse_send_destroy(struct fuse_mount *fm)
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 33cde4bbccdc..61d8afcb10a3 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -9,6 +9,17 @@
#include <linux/compat.h>
#include <linux/fileattr.h>
+static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args)
+{
+ ssize_t ret = fuse_simple_request(fm, args);
+
+ /* Translate ENOSYS, which shouldn't be returned from fs */
+ if (ret == -ENOSYS)
+ ret = -ENOTTY;
+
+ return ret;
+}
+
/*
* CUSE servers compiled on 32bit broke on 64bit kernels because the
* ABI was defined to be 'struct iovec' which is different on 32bit
@@ -259,7 +270,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
ap.args.out_pages = true;
ap.args.out_argvar = true;
- transferred = fuse_simple_request(fm, &ap.args);
+ transferred = fuse_send_ioctl(fm, &ap.args);
err = transferred;
if (transferred < 0)
goto out;
@@ -393,7 +404,7 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff,
args.out_args[1].size = inarg.out_size;
args.out_args[1].value = ptr;
- err = fuse_simple_request(fm, &args);
+ err = fuse_send_ioctl(fm, &args);
if (!err) {
if (outarg.result < 0)
err = outarg.result;
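
/*
 * Why the wrapper translates the error: a FUSE server answers -ENOSYS
 * for any opcode it does not implement, but ioctl(2)'s contract for an
 * unsupported request is ENOTTY. Caller-visible effect (userspace
 * sketch needing <sys/ioctl.h>, <linux/fs.h>, <errno.h>, <assert.h>,
 * and assuming a server with no ioctl handler):
 */
static void check_unsupported_ioctl(int fd)
{
	long attr;

	if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == -1)
		assert(errno == ENOTTY);	/* not a raw -ENOSYS */
}
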
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 86b7dbb6a0d4..4d8d4f16c727 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -741,8 +741,7 @@ out:
}
/* Free virtqueues (device must already be reset) */
-static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
- struct virtio_fs *fs)
+static void virtio_fs_cleanup_vqs(struct virtio_device *vdev)
{
vdev->config->del_vqs(vdev);
}
@@ -752,11 +751,12 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
* offset.
*/
static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
- long nr_pages, void **kaddr, pfn_t *pfn)
+ long nr_pages, enum dax_access_mode mode,
+ void **kaddr, pfn_t *pfn)
{
struct virtio_fs *fs = dax_get_private(dax_dev);
phys_addr_t offset = PFN_PHYS(pgoff);
- size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff;
+ size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff;
if (kaddr)
*kaddr = fs->window_kaddr + offset;
@@ -772,7 +772,8 @@ static int virtio_fs_zero_page_range(struct dax_device *dax_dev,
long rc;
void *kaddr;
- rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL);
+ rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr,
+ NULL);
if (rc < 0)
return rc;
memset(kaddr, 0, nr_pages << PAGE_SHIFT);
@@ -893,7 +894,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
out_vqs:
virtio_reset_device(vdev);
- virtio_fs_cleanup_vqs(vdev, fs);
+ virtio_fs_cleanup_vqs(vdev);
kfree(fs->vqs);
out:
@@ -925,7 +926,7 @@ static void virtio_fs_remove(struct virtio_device *vdev)
virtio_fs_stop_all_queues(fs);
virtio_fs_drain_all_queues_locked(fs);
virtio_reset_device(vdev);
- virtio_fs_cleanup_vqs(vdev, fs);
+ virtio_fs_cleanup_vqs(vdev);
vdev->priv = NULL;
/* Put device reference on virtio_fs object */
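
/*
 * dax_direct_access() grew an access-mode argument (enum
 * dax_access_mode: DAX_ACCESS or DAX_RECOVERY_WRITE); ordinary mappings
 * pass DAX_ACCESS, as both callers above now do. Sketch (foo_ naming is
 * illustrative):
 */
static long foo_map_dax(struct dax_device *dax_dev, pgoff_t pgoff,
			long nr_pages, void **kaddr)
{
	return dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS,
				 kaddr, NULL);
}
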
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 72c9f31ce724..05bee80ac7de 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -82,31 +82,6 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
}
/**
- * gfs2_writepage - Write page for writeback mappings
- * @page: The page
- * @wbc: The writeback control
- */
-static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct inode *inode = page->mapping->host;
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct iomap_writepage_ctx wpc = { };
-
- if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
- goto out;
- if (current->journal_info)
- goto redirty;
- return iomap_writepage(page, wbc, &wpc, &gfs2_writeback_ops);
-
-redirty:
- redirty_page_for_writepage(wbc, page);
-out:
- unlock_page(page);
- return 0;
-}
-
-/**
* gfs2_write_jdata_page - gfs2 jdata-specific version of block_write_full_page
* @page: The page to write
* @wbc: The writeback control
@@ -464,22 +439,26 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
return 0;
}
-
-static int __gfs2_readpage(void *file, struct page *page)
+/**
+ * gfs2_read_folio - read a folio from a file
+ * @file: The file to read
+ * @folio: The folio in the file
+ */
+static int gfs2_read_folio(struct file *file, struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
int error;
if (!gfs2_is_jdata(ip) ||
- (i_blocksize(inode) == PAGE_SIZE && !page_has_buffers(page))) {
- error = iomap_readpage(page, &gfs2_iomap_ops);
+ (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) {
+ error = iomap_read_folio(folio, &gfs2_iomap_ops);
} else if (gfs2_is_stuffed(ip)) {
- error = stuffed_readpage(ip, page);
- unlock_page(page);
+ error = stuffed_readpage(ip, &folio->page);
+ folio_unlock(folio);
} else {
- error = mpage_readpage(page, gfs2_block_map);
+ error = mpage_read_folio(folio, gfs2_block_map);
}
if (unlikely(gfs2_withdrawn(sdp)))
@@ -489,17 +468,6 @@ static int __gfs2_readpage(void *file, struct page *page)
}
/**
- * gfs2_readpage - read a page of a file
- * @file: The file to read
- * @page: The page of the file
- */
-
-static int gfs2_readpage(struct file *file, struct page *page)
-{
- return __gfs2_readpage(file, page);
-}
-
-/**
* gfs2_internal_read - read an internal file
* @ip: The gfs2 inode
* @buf: The buffer to fill
@@ -523,7 +491,7 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
amt = size - copied;
if (offset + size > PAGE_SIZE)
amt = PAGE_SIZE - offset;
- page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
+ page = read_cache_page(mapping, index, gfs2_read_folio, NULL);
if (IS_ERR(page))
return PTR_ERR(page);
p = kmap_atomic(page);
@@ -698,38 +666,40 @@ out:
}
/**
- * gfs2_releasepage - free the metadata associated with a page
- * @page: the page that's being released
+ * gfs2_release_folio - free the metadata associated with a folio
+ * @folio: the folio that's being released
* @gfp_mask: passed from Linux VFS, ignored by us
*
- * Calls try_to_free_buffers() to free the buffers and put the page if the
+ * Calls try_to_free_buffers() to free the buffers and put the folio if the
* buffers can be released.
*
- * Returns: 1 if the page was put or else 0
+ * Returns: true if the folio was put or else false
*/
-int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
+bool gfs2_release_folio(struct folio *folio, gfp_t gfp_mask)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
struct buffer_head *bh, *head;
struct gfs2_bufdata *bd;
- if (!page_has_buffers(page))
- return 0;
+ head = folio_buffers(folio);
+ if (!head)
+ return false;
/*
- * From xfs_vm_releasepage: mm accommodates an old ext3 case where
- * clean pages might not have had the dirty bit cleared. Thus, it can
- * send actual dirty pages to ->releasepage() via shrink_active_list().
+ * mm accommodates an old ext3 case where clean folios might
+ * not have had the dirty bit cleared. Thus, it can send actual
+ * dirty folios to ->release_folio() via shrink_active_list().
*
- * As a workaround, we skip pages that contain dirty buffers below.
- * Once ->releasepage isn't called on dirty pages anymore, we can warn
- * on dirty buffers like we used to here again.
+ * As a workaround, we skip folios that contain dirty buffers
+ * below. Once ->release_folio isn't called on dirty folios
+ * anymore, we can warn on dirty buffers like we used to here
+ * again.
*/
gfs2_log_lock(sdp);
- head = bh = page_buffers(page);
+ bh = head;
do {
if (atomic_read(&bh->b_count))
goto cannot_release;
@@ -739,9 +709,9 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
if (buffer_dirty(bh) || WARN_ON(buffer_pinned(bh)))
goto cannot_release;
bh = bh->b_this_page;
- } while(bh != head);
+ } while (bh != head);
- head = bh = page_buffers(page);
+ bh = head;
do {
bd = bh->b_private;
if (bd) {
@@ -762,24 +732,23 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
} while (bh != head);
gfs2_log_unlock(sdp);
- return try_to_free_buffers(page);
+ return try_to_free_buffers(folio);
cannot_release:
gfs2_log_unlock(sdp);
- return 0;
+ return false;
}
static const struct address_space_operations gfs2_aops = {
- .writepage = gfs2_writepage,
.writepages = gfs2_writepages,
- .readpage = gfs2_readpage,
+ .read_folio = gfs2_read_folio,
.readahead = gfs2_readahead,
.dirty_folio = filemap_dirty_folio,
- .releasepage = iomap_releasepage,
+ .release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
.bmap = gfs2_bmap,
.direct_IO = noop_direct_IO,
- .migratepage = iomap_migrate_page,
+ .migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
@@ -787,12 +756,12 @@ static const struct address_space_operations gfs2_aops = {
static const struct address_space_operations gfs2_jdata_aops = {
.writepage = gfs2_jdata_writepage,
.writepages = gfs2_jdata_writepages,
- .readpage = gfs2_readpage,
+ .read_folio = gfs2_read_folio,
.readahead = gfs2_readahead,
.dirty_folio = jdata_dirty_folio,
.bmap = gfs2_bmap,
.invalidate_folio = gfs2_invalidate_folio,
- .releasepage = gfs2_releasepage,
+ .release_folio = gfs2_release_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
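Note: the aops changes above follow the generic ->readpage to ->read_folio conversion. A condensed sketch of the pattern for a hypothetical buffer-head-based filesystem "foo" (foo_* names are illustrative only):

    static int foo_read_folio(struct file *file, struct folio *folio)
    {
            /* block_read_full_page() became block_read_full_folio(). */
            return block_read_full_folio(folio, foo_get_block);
    }

    static bool foo_release_folio(struct folio *folio, gfp_t gfp)
    {
            /* ->release_folio returns bool instead of int. */
            return try_to_free_buffers(folio);
    }

    static const struct address_space_operations foo_aops = {
            .read_folio       = foo_read_folio,
            .release_folio    = foo_release_folio,
            .dirty_folio      = block_dirty_folio,
            .invalidate_folio = block_invalidate_folio,
    };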
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 39080b2d6cf8..3bdb2c668a71 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -310,9 +310,8 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
if (trylock_buffer(rabh)) {
if (!buffer_uptodate(rabh)) {
rabh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ,
- REQ_RAHEAD | REQ_META | REQ_PRIO,
- rabh);
+ submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META |
+ REQ_PRIO, rabh);
continue;
}
unlock_buffer(rabh);
@@ -1153,13 +1152,12 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (length != written && (iomap->flags & IOMAP_F_NEW)) {
/* Deallocate blocks that were just allocated. */
- loff_t blockmask = i_blocksize(inode) - 1;
- loff_t end = (pos + length) & ~blockmask;
+ loff_t hstart = round_up(pos + written, i_blocksize(inode));
+ loff_t hend = iomap->offset + iomap->length;
- pos = (pos + written + blockmask) & ~blockmask;
- if (pos < end) {
- truncate_pagecache_range(inode, pos, end - 1);
- punch_hole(ip, pos, end - pos);
+ if (hstart < hend) {
+ truncate_pagecache_range(inode, hstart, hend - 1);
+ punch_hole(ip, hstart, hend - hstart);
}
}
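Note: the rewritten gfs2_iomap_end() derives the punch range directly from the iomap instead of masking with the block size. A worked example with a 4096-byte block size, pos = 8192, written = 100, and a freshly allocated mapping covering [8192, 8192 + 16384):

    hstart = round_up(8192 + 100, 4096);    /* = 12288 */
    hend   = 8192 + 16384;                  /* = 24576 */
    /* truncate_pagecache_range(inode, 12288, 24575) followed by
     * punch_hole(ip, 12288, 12288) frees the three untouched
     * 4 KiB blocks at the tail of the new allocation. */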
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 42b7dfffb5e7..54a6d17b8c25 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1508,9 +1508,8 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
continue;
}
bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ,
- REQ_RAHEAD | REQ_META | REQ_PRIO,
- bh);
+ submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META |
+ REQ_PRIO, bh);
continue;
}
brelse(bh);
@@ -2017,7 +2016,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
l_blocks++;
}
- gfs2_rlist_alloc(&rlist);
+ gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE);
for (x = 0; x < rlist.rl_rgrps; x++) {
struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
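Note: the submit_bh() changes here are part of the tree-wide merge of the request operation and the request flags into a single blk_opf_t argument; the same conversion shows up in meta_io.c and the log code below. Before and after:

    /* old: the op and the flags were separate arguments */
    submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META | REQ_PRIO, bh);

    /* new: one blk_opf_t value carries both */
    submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META | REQ_PRIO, bh);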
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 756d05779200..cf40895233f5 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -66,7 +66,7 @@ struct get_name_filldir {
char *name;
};
-static int get_name_filldir(struct dir_context *ctx, const char *name,
+static bool get_name_filldir(struct dir_context *ctx, const char *name,
int length, loff_t offset, u64 inum,
unsigned int type)
{
@@ -74,12 +74,12 @@ static int get_name_filldir(struct dir_context *ctx, const char *name,
container_of(ctx, struct get_name_filldir, ctx);
if (inum != gnfd->inum.no_addr)
- return 0;
+ return true;
memcpy(gnfd->name, name, length);
gnfd->name[length] = 0;
- return 1;
+ return false;
}
static int gfs2_get_name(struct dentry *parent, char *name,
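Note: under the updated dir_context calling convention, filldir actors return bool: true means "keep iterating", false means "stop", replacing the old 0/non-zero returns. A self-contained sketch of an actor in the new style (foo_* names are hypothetical):

    struct foo_ctx {
            struct dir_context ctx;
            u64 wanted;
            char *name;
    };

    static bool foo_filldir(struct dir_context *ctx, const char *name,
                            int len, loff_t off, u64 ino,
                            unsigned int type)
    {
            struct foo_ctx *fc = container_of(ctx, struct foo_ctx, ctx);

            if (ino != fc->wanted)
                    return true;            /* not it; continue */
            memcpy(fc->name, name, len);
            fc->name[len] = 0;
            return false;                   /* found it; stop */
    }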
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 22b41acfbbc3..892006fbbb09 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -770,30 +770,27 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
return ret ? ret : ret1;
}
-static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
+static inline bool should_fault_in_pages(struct iov_iter *i,
+ struct kiocb *iocb,
size_t *prev_count,
size_t *window_size)
{
size_t count = iov_iter_count(i);
size_t size, offs;
- if (likely(!count))
+ if (!count)
return false;
- if (ret <= 0 && ret != -EFAULT)
- return false;
- if (!iter_is_iovec(i))
+ if (!user_backed_iter(i))
return false;
size = PAGE_SIZE;
- offs = offset_in_page(i->iov[0].iov_base + i->iov_offset);
+ offs = offset_in_page(iocb->ki_pos);
if (*prev_count != count || !*window_size) {
size_t nr_dirtied;
- size = ALIGN(offs + count, PAGE_SIZE);
- size = min_t(size_t, size, SZ_1M);
nr_dirtied = max(current->nr_dirtied_pause -
current->nr_dirtied, 8);
- size = min(size, nr_dirtied << PAGE_SHIFT);
+ size = min_t(size_t, SZ_1M, nr_dirtied << PAGE_SHIFT);
}
*prev_count = count;
@@ -807,7 +804,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
struct file *file = iocb->ki_filp;
struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
size_t prev_count = 0, window_size = 0;
- size_t written = 0;
+ size_t read = 0;
ssize_t ret;
/*
@@ -835,35 +832,33 @@ retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
-retry_under_glock:
pagefault_disable();
to->nofault = true;
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
- IOMAP_DIO_PARTIAL, written);
+ IOMAP_DIO_PARTIAL, NULL, read);
to->nofault = false;
pagefault_enable();
+ if (ret <= 0 && ret != -EFAULT)
+ goto out_unlock;
+ /* No increment (+=) because iomap_dio_rw returns a cumulative value. */
if (ret > 0)
- written = ret;
-
- if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
- size_t leftover;
+ read = ret;
- gfs2_holder_allow_demote(gh);
- leftover = fault_in_iov_iter_writeable(to, window_size);
- gfs2_holder_disallow_demote(gh);
- if (leftover != window_size) {
- if (gfs2_holder_queued(gh))
- goto retry_under_glock;
+ if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(gh);
+ window_size -= fault_in_iov_iter_writeable(to, window_size);
+ if (window_size)
goto retry;
- }
}
+out_unlock:
if (gfs2_holder_queued(gh))
gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
+ /* User space doesn't expect partial success. */
if (ret < 0)
return ret;
- return written;
+ return read;
}
static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
@@ -873,7 +868,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
struct inode *inode = file->f_mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
size_t prev_count = 0, window_size = 0;
- size_t read = 0;
+ size_t written = 0;
ssize_t ret;
/*
@@ -899,41 +894,39 @@ retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
-retry_under_glock:
/* Silently fall back to buffered I/O when writing beyond EOF */
if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
- goto out;
+ goto out_unlock;
from->nofault = true;
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
- IOMAP_DIO_PARTIAL, read);
+ IOMAP_DIO_PARTIAL, NULL, written);
from->nofault = false;
-
- if (ret == -ENOTBLK)
- ret = 0;
+ if (ret <= 0) {
+ if (ret == -ENOTBLK)
+ ret = 0;
+ if (ret != -EFAULT)
+ goto out_unlock;
+ }
+ /* No increment (+=) because iomap_dio_rw returns a cumulative value. */
if (ret > 0)
- read = ret;
-
- if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
- size_t leftover;
+ written = ret;
- gfs2_holder_allow_demote(gh);
- leftover = fault_in_iov_iter_readable(from, window_size);
- gfs2_holder_disallow_demote(gh);
- if (leftover != window_size) {
- if (gfs2_holder_queued(gh))
- goto retry_under_glock;
+ if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(gh);
+ window_size -= fault_in_iov_iter_readable(from, window_size);
+ if (window_size)
goto retry;
- }
}
-out:
+out_unlock:
if (gfs2_holder_queued(gh))
gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
+ /* User space doesn't expect partial success. */
if (ret < 0)
return ret;
- return read;
+ return written;
}
static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -941,7 +934,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct gfs2_inode *ip;
struct gfs2_holder gh;
size_t prev_count = 0, window_size = 0;
- size_t written = 0;
+ size_t read = 0;
ssize_t ret;
/*
@@ -962,7 +955,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (ret >= 0) {
if (!iov_iter_count(to))
return ret;
- written = ret;
+ read = ret;
} else if (ret != -EFAULT) {
if (ret != -EAGAIN)
return ret;
@@ -975,32 +968,26 @@ retry:
ret = gfs2_glock_nq(&gh);
if (ret)
goto out_uninit;
-retry_under_glock:
pagefault_disable();
ret = generic_file_read_iter(iocb, to);
pagefault_enable();
+ if (ret <= 0 && ret != -EFAULT)
+ goto out_unlock;
if (ret > 0)
- written += ret;
+ read += ret;
- if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
- size_t leftover;
-
- gfs2_holder_allow_demote(&gh);
- leftover = fault_in_iov_iter_writeable(to, window_size);
- gfs2_holder_disallow_demote(&gh);
- if (leftover != window_size) {
- if (gfs2_holder_queued(&gh))
- goto retry_under_glock;
- if (written)
- goto out_uninit;
+ if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(&gh);
+ window_size -= fault_in_iov_iter_writeable(to, window_size);
+ if (window_size)
goto retry;
- }
}
+out_unlock:
if (gfs2_holder_queued(&gh))
gfs2_glock_dq(&gh);
out_uninit:
gfs2_holder_uninit(&gh);
- return written ? written : ret;
+ return read ? read : ret;
}
static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
@@ -1014,7 +1001,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
struct gfs2_holder *statfs_gh = NULL;
size_t prev_count = 0, window_size = 0;
size_t orig_count = iov_iter_count(from);
- size_t read = 0;
+ size_t written = 0;
ssize_t ret;
/*
@@ -1032,10 +1019,18 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
retry:
+ if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
+ window_size -= fault_in_iov_iter_readable(from, window_size);
+ if (!window_size) {
+ ret = -EFAULT;
+ goto out_uninit;
+ }
+ from->count = min(from->count, window_size);
+ }
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
-retry_under_glock:
+
if (inode == sdp->sd_rindex) {
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@@ -1052,37 +1047,28 @@ retry_under_glock:
current->backing_dev_info = NULL;
if (ret > 0) {
iocb->ki_pos += ret;
- read += ret;
+ written += ret;
}
if (inode == sdp->sd_rindex)
gfs2_glock_dq_uninit(statfs_gh);
- from->count = orig_count - read;
- if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
- size_t leftover;
-
- gfs2_holder_allow_demote(gh);
- leftover = fault_in_iov_iter_readable(from, window_size);
- gfs2_holder_disallow_demote(gh);
- if (leftover != window_size) {
- from->count = min(from->count, window_size - leftover);
- if (gfs2_holder_queued(gh))
- goto retry_under_glock;
- if (read && !(iocb->ki_flags & IOCB_DIRECT))
- goto out_uninit;
- goto retry;
- }
+ if (ret <= 0 && ret != -EFAULT)
+ goto out_unlock;
+
+ from->count = orig_count - written;
+ if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
+ gfs2_glock_dq(gh);
+ goto retry;
}
out_unlock:
if (gfs2_holder_queued(gh))
gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
- if (statfs_gh)
- kfree(statfs_gh);
- from->count = orig_count - read;
- return read ? read : ret;
+ kfree(statfs_gh);
+ from->count = orig_count - written;
+ return written ? written : ret;
}
/**
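Note: all three I/O paths above now share the same fault-in shape: attempt the I/O under the glock with page faults disabled, and on -EFAULT drop the glock entirely, fault in a bounded window of the user buffer, and retry, rather than the old allow-demote dance. The window bounds how much is faulted in per pass, and if fault-in makes no progress at all the retry stops. A condensed sketch of the read-side loop (labels and error paths elided):

    retry:
            ret = gfs2_glock_nq(gh);
            if (ret)
                    goto out_uninit;
            pagefault_disable();
            ret = generic_file_read_iter(iocb, to);
            pagefault_enable();
            if (ret <= 0 && ret != -EFAULT)
                    goto out_unlock;
            if (ret > 0)
                    read += ret;
            if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) {
                    gfs2_glock_dq(gh);
                    /* fault_in_* returns the bytes NOT faulted in, so
                     * window_size ends up as the progress made */
                    window_size -= fault_in_iov_iter_writeable(to, window_size);
                    if (window_size)
                            goto retry;
            }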
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 630c6550eacf..41b6c89e4bf7 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -127,9 +127,11 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu)
struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
kfree(gl->gl_lksb.sb_lvbptr);
- if (gl->gl_ops->go_flags & GLOF_ASPACE)
- kmem_cache_free(gfs2_glock_aspace_cachep, gl);
- else
+ if (gl->gl_ops->go_flags & GLOF_ASPACE) {
+ struct gfs2_glock_aspace *gla =
+ container_of(gl, struct gfs2_glock_aspace, glock);
+ kmem_cache_free(gfs2_glock_aspace_cachep, gla);
+ } else
kmem_cache_free(gfs2_glock_cachep, gl);
}
@@ -403,10 +405,13 @@ static void do_error(struct gfs2_glock *gl, const int ret)
/**
* demote_incompat_holders - demote incompatible demoteable holders
* @gl: the glock we want to promote
- * @new_gh: the new holder to be promoted
+ * @current_gh: the newly promoted holder
+ *
+ * We're passing the newly promoted holder in @current_gh, but actually, any of
+ * the strong holders would do.
*/
static void demote_incompat_holders(struct gfs2_glock *gl,
- struct gfs2_holder *new_gh)
+ struct gfs2_holder *current_gh)
{
struct gfs2_holder *gh, *tmp;
@@ -422,8 +427,10 @@ static void demote_incompat_holders(struct gfs2_glock *gl,
*/
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
return;
+ if (gh == current_gh)
+ continue;
if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
- !may_grant(gl, new_gh, gh)) {
+ !may_grant(gl, current_gh, gh)) {
/*
* We should not recurse into do_promote because
* __gfs2_glock_dq only calls handle_callback,
@@ -476,8 +483,7 @@ find_first_strong_holder(struct gfs2_glock *gl)
* gfs2_instantiate - Call the glops instantiate function
* @gh: The glock holder
*
- * Returns: 0 if instantiate was successful, 2 if type specific operation is
- * underway, or error.
+ * Returns: 0 if instantiate was successful, or error.
*/
int gfs2_instantiate(struct gfs2_holder *gh)
{
@@ -487,7 +493,7 @@ int gfs2_instantiate(struct gfs2_holder *gh)
again:
if (!test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags))
- return 0;
+ goto done;
/*
* Since we unlock the lockref lock, we set a flag to indicate
@@ -506,78 +512,55 @@ again:
goto again;
}
- ret = glops->go_instantiate(gh);
+ ret = glops->go_instantiate(gl);
if (!ret)
clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags);
clear_and_wake_up_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags);
- return ret;
+ if (ret)
+ return ret;
+
+done:
+ if (glops->go_held)
+ return glops->go_held(gh);
+ return 0;
}
/**
* do_promote - promote as many requests as possible on the current queue
* @gl: The glock
*
- * Returns: 1 if there is a blocked holder at the head of the list, or 2
- * if a type specific operation is underway.
+ * Returns: 1 if there is a blocked holder at the head of the list
*/
static int do_promote(struct gfs2_glock *gl)
-__releases(&gl->gl_lockref.lock)
-__acquires(&gl->gl_lockref.lock)
{
- struct gfs2_holder *gh, *tmp, *first_gh;
+ struct gfs2_holder *gh, *current_gh;
bool incompat_holders_demoted = false;
- bool lock_released;
- int ret;
-restart:
- first_gh = find_first_strong_holder(gl);
- list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
- lock_released = false;
+ current_gh = find_first_strong_holder(gl);
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
- if (!may_grant(gl, first_gh, gh)) {
+ if (!may_grant(gl, current_gh, gh)) {
/*
- * If we get here, it means we may not grant this holder for
- * some reason. If this holder is the head of the list, it
- * means we have a blocked holder at the head, so return 1.
+ * If we get here, it means we may not grant this
+ * holder for some reason. If this holder is at the
+ * head of the list, it means we have a blocked holder
+ * at the head, so return 1.
*/
if (list_is_first(&gh->gh_list, &gl->gl_holders))
return 1;
do_error(gl, 0);
break;
}
- if (!incompat_holders_demoted) {
- demote_incompat_holders(gl, first_gh);
- incompat_holders_demoted = true;
- first_gh = gh;
- }
- if (test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags) &&
- !(gh->gh_flags & GL_SKIP) && gl->gl_ops->go_instantiate) {
- lock_released = true;
- spin_unlock(&gl->gl_lockref.lock);
- ret = gfs2_instantiate(gh);
- spin_lock(&gl->gl_lockref.lock);
- if (ret) {
- if (ret == 1)
- return 2;
- gh->gh_error = ret;
- list_del_init(&gh->gh_list);
- trace_gfs2_glock_queue(gh, 0);
- gfs2_holder_wake(gh);
- goto restart;
- }
- }
set_bit(HIF_HOLDER, &gh->gh_iflags);
trace_gfs2_promote(gh);
gfs2_holder_wake(gh);
- /*
- * If we released the gl_lockref.lock the holders list may have
- * changed. For that reason, we start again at the start of
- * the holders queue.
- */
- if (lock_released)
- goto restart;
+ if (!incompat_holders_demoted) {
+ current_gh = gh;
+ demote_incompat_holders(gl, current_gh);
+ incompat_holders_demoted = true;
+ }
}
return 0;
}
@@ -655,7 +638,6 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_holder *gh;
unsigned state = ret & LM_OUT_ST_MASK;
- int rv;
spin_lock(&gl->gl_lockref.lock);
trace_gfs2_glock_state_change(gl, state);
@@ -713,6 +695,8 @@ retry:
gfs2_demote_wake(gl);
if (state != LM_ST_UNLOCKED) {
if (glops->go_xmote_bh) {
+ int rv;
+
spin_unlock(&gl->gl_lockref.lock);
rv = glops->go_xmote_bh(gl);
spin_lock(&gl->gl_lockref.lock);
@@ -721,13 +705,10 @@ retry:
goto out;
}
}
- rv = do_promote(gl);
- if (rv == 2)
- goto out_locked;
+ do_promote(gl);
}
out:
clear_bit(GLF_LOCK, &gl->gl_flags);
-out_locked:
spin_unlock(&gl->gl_lockref.lock);
}
@@ -884,7 +865,6 @@ __releases(&gl->gl_lockref.lock)
__acquires(&gl->gl_lockref.lock)
{
struct gfs2_holder *gh = NULL;
- int ret;
if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
return;
@@ -903,18 +883,14 @@ __acquires(&gl->gl_lockref.lock)
} else {
if (test_bit(GLF_DEMOTE, &gl->gl_flags))
gfs2_demote_wake(gl);
- ret = do_promote(gl);
- if (ret == 0)
+ if (do_promote(gl) == 0)
goto out_unlock;
- if (ret == 2)
- goto out;
gh = find_first_waiter(gl);
gl->gl_target = gh->gh_state;
if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
do_error(gl, 0); /* Fail queued try locks */
}
do_xmote(gl, gh, gl->gl_target);
-out:
return;
out_sched:
@@ -1159,7 +1135,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
.ln_sbd = sdp };
struct gfs2_glock *gl, *tmp;
struct address_space *mapping;
- struct kmem_cache *cachep;
int ret = 0;
gl = find_insert_glock(&name, NULL);
@@ -1170,20 +1145,24 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
if (!create)
return -ENOENT;
- if (glops->go_flags & GLOF_ASPACE)
- cachep = gfs2_glock_aspace_cachep;
- else
- cachep = gfs2_glock_cachep;
- gl = kmem_cache_alloc(cachep, GFP_NOFS);
- if (!gl)
- return -ENOMEM;
-
+ if (glops->go_flags & GLOF_ASPACE) {
+ struct gfs2_glock_aspace *gla =
+ kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_NOFS);
+ if (!gla)
+ return -ENOMEM;
+ gl = &gla->glock;
+ } else {
+ gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_NOFS);
+ if (!gl)
+ return -ENOMEM;
+ }
memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
+ gl->gl_ops = glops;
if (glops->go_flags & GLOF_LVB) {
gl->gl_lksb.sb_lvbptr = kzalloc(GDLM_LVB_SIZE, GFP_NOFS);
if (!gl->gl_lksb.sb_lvbptr) {
- kmem_cache_free(cachep, gl);
+ gfs2_glock_dealloc(&gl->gl_rcu);
return -ENOMEM;
}
}
@@ -1197,7 +1176,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
gl->gl_state = LM_ST_UNLOCKED;
gl->gl_target = LM_ST_UNLOCKED;
gl->gl_demote_state = LM_ST_EXCLUSIVE;
- gl->gl_ops = glops;
gl->gl_dstamp = 0;
preempt_disable();
/* We use the global stats to estimate the initial per-glock stats */
@@ -1234,8 +1212,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
*glp = tmp;
out_free:
- kfree(gl->gl_lksb.sb_lvbptr);
- kmem_cache_free(cachep, gl);
+ gfs2_glock_dealloc(&gl->gl_rcu);
if (atomic_dec_and_test(&sdp->sd_glock_disposal))
wake_up(&sdp->sd_glock_wait);
@@ -1311,6 +1288,25 @@ static void gfs2_glock_update_hold_time(struct gfs2_glock *gl,
}
/**
+ * gfs2_glock_holder_ready - holder is ready and its error code can be collected
+ * @gh: the glock holder
+ *
+ * Called when a glock holder no longer needs to be waited for because it is
+ * now either held (HIF_HOLDER set; gh_error == 0), or acquiring the lock has
+ * failed (gh_error != 0).
+ */
+
+int gfs2_glock_holder_ready(struct gfs2_holder *gh)
+{
+ if (gh->gh_error || (gh->gh_flags & GL_SKIP))
+ return gh->gh_error;
+ gh->gh_error = gfs2_instantiate(gh);
+ if (gh->gh_error)
+ gfs2_glock_dq(gh);
+ return gh->gh_error;
+}
+
+/**
* gfs2_glock_wait - wait on a glock acquisition
* @gh: the glock holder
*
@@ -1324,7 +1320,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
might_sleep();
wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE);
gfs2_glock_update_hold_time(gh->gh_gl, start_time);
- return gh->gh_error;
+ return gfs2_glock_holder_ready(gh);
}
static int glocks_pending(unsigned int num_gh, struct gfs2_holder *ghs)
@@ -1352,7 +1348,6 @@ int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs)
struct gfs2_sbd *sdp = ghs[0].gh_gl->gl_name.ln_sbd;
int i, ret = 0, timeout = 0;
unsigned long start_time = jiffies;
- bool keep_waiting;
might_sleep();
/*
@@ -1362,53 +1357,33 @@ int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs)
for (i = 0; i < num_gh; i++)
timeout += ghs[i].gh_gl->gl_hold_time << 1;
-wait_for_dlm:
if (!wait_event_timeout(sdp->sd_async_glock_wait,
- !glocks_pending(num_gh, ghs), timeout))
+ !glocks_pending(num_gh, ghs), timeout)) {
ret = -ESTALE; /* request timed out. */
+ goto out;
+ }
- /*
- * If dlm granted all our requests, we need to adjust the glock
- * minimum hold time values according to how long we waited.
- *
- * If our request timed out, we need to repeatedly release any held
- * glocks we acquired thus far to allow dlm to acquire the remaining
- * glocks without deadlocking. We cannot currently cancel outstanding
- * glock acquisitions.
- *
- * The HIF_WAIT bit tells us which requests still need a response from
- * dlm.
- *
- * If dlm sent us any errors, we return the first error we find.
- */
- keep_waiting = false;
for (i = 0; i < num_gh; i++) {
- /* Skip holders we have already dequeued below. */
- if (!gfs2_holder_queued(&ghs[i]))
- continue;
- /* Skip holders with a pending DLM response. */
- if (test_bit(HIF_WAIT, &ghs[i].gh_iflags)) {
- keep_waiting = true;
- continue;
- }
+ struct gfs2_holder *gh = &ghs[i];
+ int ret2;
- if (test_bit(HIF_HOLDER, &ghs[i].gh_iflags)) {
- if (ret == -ESTALE)
- gfs2_glock_dq(&ghs[i]);
- else
- gfs2_glock_update_hold_time(ghs[i].gh_gl,
- start_time);
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags)) {
+ gfs2_glock_update_hold_time(gh->gh_gl,
+ start_time);
}
+ ret2 = gfs2_glock_holder_ready(gh);
if (!ret)
- ret = ghs[i].gh_error;
+ ret = ret2;
}
- if (keep_waiting)
- goto wait_for_dlm;
+out:
+ if (ret) {
+ for (i = 0; i < num_gh; i++) {
+ struct gfs2_holder *gh = &ghs[i];
- /*
- * At this point, we've either acquired all locks or released them all.
- */
+ gfs2_glock_dq(gh);
+ }
+ }
return ret;
}
@@ -1487,10 +1462,10 @@ __acquires(&gl->gl_lockref.lock)
if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
if (test_bit(GLF_LOCK, &gl->gl_flags)) {
- struct gfs2_holder *first_gh;
+ struct gfs2_holder *current_gh;
- first_gh = find_first_strong_holder(gl);
- try_futile = !may_grant(gl, first_gh, gh);
+ current_gh = find_first_strong_holder(gl);
+ try_futile = !may_grant(gl, current_gh, gh);
}
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
goto fail;
@@ -1776,7 +1751,7 @@ static int glock_compare(const void *arg_a, const void *arg_b)
}
/**
- * nq_m_sync - synchonously acquire more than one glock in deadlock free order
+ * nq_m_sync - synchronously acquire more than one glock in deadlock free order
* @num_gh: the number of structures
* @ghs: an array of struct gfs2_holder structures
* @p: placeholder for the holder structure to pass back
@@ -1797,8 +1772,6 @@ static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
for (x = 0; x < num_gh; x++) {
- p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
-
error = gfs2_glock_nq(p[x]);
if (error) {
while (x--)
@@ -1815,7 +1788,6 @@ static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
* @num_gh: the number of structures
* @ghs: an array of struct gfs2_holder structures
*
- *
* Returns: 0 on success (all glocks acquired),
* errno on failure (no glocks acquired)
*/
@@ -1830,7 +1802,6 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
case 0:
return 0;
case 1:
- ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
return gfs2_glock_nq(ghs);
default:
if (num_gh <= 4)
@@ -2242,20 +2213,6 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
glock_hash_walk(dump_glock_func, sdp);
}
-void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
-{
- struct gfs2_glock *gl = ip->i_gl;
- int ret;
-
- ret = gfs2_truncatei_resume(ip);
- gfs2_glock_assert_withdraw(gl, ret == 0);
-
- spin_lock(&gl->gl_lockref.lock);
- clear_bit(GLF_LOCK, &gl->gl_flags);
- run_queue(gl, 1);
- spin_unlock(&gl->gl_lockref.lock);
-}
-
static const char *state2str(unsigned state)
{
switch(state) {
@@ -2530,7 +2487,7 @@ int __init gfs2_glock_init(void)
return -ENOMEM;
}
- ret = register_shrinker(&glock_shrinker);
+ ret = register_shrinker(&glock_shrinker, "gfs2-glock");
if (ret) {
destroy_workqueue(gfs2_delete_workqueue);
destroy_workqueue(glock_workqueue);
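Note: register_shrinker() now takes a printf-style name, used to identify the shrinker in the shrinker debug interfaces. Sketch of the new calling convention (foo_* names are hypothetical):

    static struct shrinker foo_shrinker = {
            .count_objects = foo_count_objects,
            .scan_objects  = foo_scan_objects,
            .seeks         = DEFAULT_SEEKS,
    };

    ret = register_shrinker(&foo_shrinker, "foo-%s", "cache");
    if (ret)
            return ret;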
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 4f8642301801..5aed8b500cf5 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -138,6 +138,11 @@ struct lm_lockops {
const match_table_t *lm_tokens;
};
+struct gfs2_glock_aspace {
+ struct gfs2_glock glock;
+ struct address_space mapping;
+};
+
extern struct workqueue_struct *gfs2_delete_workqueue;
static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
{
@@ -179,8 +184,11 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
{
- if (gl->gl_ops->go_flags & GLOF_ASPACE)
- return (struct address_space *)(gl + 1);
+ if (gl->gl_ops->go_flags & GLOF_ASPACE) {
+ struct gfs2_glock_aspace *gla =
+ container_of(gl, struct gfs2_glock_aspace, glock);
+ return &gla->mapping;
+ }
return NULL;
}
@@ -205,6 +213,7 @@ extern void gfs2_holder_uninit(struct gfs2_holder *gh);
extern int gfs2_glock_nq(struct gfs2_holder *gh);
extern int gfs2_glock_poll(struct gfs2_holder *gh);
extern int gfs2_instantiate(struct gfs2_holder *gh);
+extern int gfs2_glock_holder_ready(struct gfs2_holder *gh);
extern int gfs2_glock_wait(struct gfs2_holder *gh);
extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs);
extern void gfs2_glock_dq(struct gfs2_holder *gh);
@@ -265,7 +274,6 @@ extern void gfs2_cancel_delete_work(struct gfs2_glock *gl);
extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl);
extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp);
extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
-extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
extern void gfs2_glock_free(struct gfs2_glock *gl);
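Note: with struct gfs2_glock_aspace the glock/address-space pairing becomes explicit embedding, so both directions are type-checked container_of() conversions instead of the old `(gl + 1)` pointer arithmetic:

    struct gfs2_glock_aspace *gla =
            container_of(gl, struct gfs2_glock_aspace, glock);
    struct address_space *mapping = &gla->mapping;

    /* and back again, as gfs2_mapping2sbd() does below: */
    gla = container_of(mapping, struct gfs2_glock_aspace, mapping);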
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 392800f082a6..49210a2e7ce7 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -485,35 +485,33 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
* Returns: errno
*/
-static int inode_go_instantiate(struct gfs2_holder *gh)
+static int inode_go_instantiate(struct gfs2_glock *gl)
+{
+ struct gfs2_inode *ip = gl->gl_object;
+
+ if (!ip) /* no inode to populate - read it in later */
+ return 0;
+
+ return gfs2_inode_refresh(ip);
+}
+
+static int inode_go_held(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip = gl->gl_object;
int error = 0;
if (!ip) /* no inode to populate - read it in later */
- goto out;
-
- error = gfs2_inode_refresh(ip);
- if (error)
- goto out;
+ return 0;
if (gh->gh_state != LM_ST_DEFERRED)
inode_dio_wait(&ip->i_inode);
if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
(gl->gl_state == LM_ST_EXCLUSIVE) &&
- (gh->gh_state == LM_ST_EXCLUSIVE)) {
- spin_lock(&sdp->sd_trunc_lock);
- if (list_empty(&ip->i_trunc_list))
- list_add(&ip->i_trunc_list, &sdp->sd_trunc_list);
- spin_unlock(&sdp->sd_trunc_lock);
- wake_up(&sdp->sd_quota_wait);
- error = 1;
- }
+ (gh->gh_state == LM_ST_EXCLUSIVE))
+ error = gfs2_truncatei_resume(ip);
-out:
return error;
}
@@ -737,6 +735,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
.go_inval = inode_go_inval,
.go_demote_ok = inode_go_demote_ok,
.go_instantiate = inode_go_instantiate,
+ .go_held = inode_go_held,
.go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
.go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB,
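Note: the instantiate path is split in two here: go_instantiate() populates the object once per glock (keyed off GLF_INSTANTIATE_NEEDED), while the new go_held() hook runs on every successful hold, which is where the dio-wait and truncate-resume work moved. Condensed flow of gfs2_instantiate() after the split:

    if (test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags))
            ret = glops->go_instantiate(gl);    /* once per glock */
    if (!ret && glops->go_held)
            ret = glops->go_held(gh);           /* on every hold */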
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 8c00fb389ae5..d09d9892cd05 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -219,7 +219,8 @@ struct gfs2_glock_operations {
int (*go_xmote_bh)(struct gfs2_glock *gl);
void (*go_inval) (struct gfs2_glock *gl, int flags);
int (*go_demote_ok) (const struct gfs2_glock *gl);
- int (*go_instantiate) (struct gfs2_holder *gh);
+ int (*go_instantiate) (struct gfs2_glock *gl);
+ int (*go_held)(struct gfs2_holder *gh);
void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl,
const char *fs_id_buf);
void (*go_callback)(struct gfs2_glock *gl, bool remote);
@@ -396,7 +397,6 @@ struct gfs2_inode {
atomic_t i_sizehint; /* hint of the write size */
struct rw_semaphore i_rw_mutex;
struct list_head i_ordered;
- struct list_head i_trunc_list;
__be64 *i_hash_cache;
u32 i_entries;
u32 i_diskflags;
@@ -784,8 +784,6 @@ struct gfs2_sbd {
struct mutex sd_quota_mutex;
struct mutex sd_quota_sync_mutex;
wait_queue_head_t sd_quota_wait;
- struct list_head sd_trunc_list;
- spinlock_t sd_trunc_lock;
unsigned int sd_quota_slots;
unsigned long *sd_quota_bitmap;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 7b2c1f390db7..0264d514dda7 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -12,7 +12,7 @@
#include <linux/mm.h>
#include "util.h"
-extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
+bool gfs2_release_folio(struct folio *folio, gfp_t gfp_mask);
extern int gfs2_internal_read(struct gfs2_inode *ip,
char *buf, loff_t *pos, unsigned size);
extern void gfs2_set_aops(struct inode *inode);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 2559a79cf14b..71911bf9ab34 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1058,7 +1058,7 @@ restart:
/*
* Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC)
- * to accomodate the largest slot number. (NB dlm slot numbers start at 1,
+ * to accommodate the largest slot number. (NB dlm slot numbers start at 1,
* gfs2 jids start at 0, so jid = slot - 1)
*/
@@ -1302,7 +1302,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
memcpy(cluster, table, strlen(table) - strlen(fsname));
fsname++;
- flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL;
+ flags = DLM_LSFL_NEWEXCL;
/*
* create/join lockspace
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index f0ee3ff6f9a8..723639376ae2 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -131,7 +131,7 @@ __acquires(&sdp->sd_ail_lock)
if (!mapping)
continue;
spin_unlock(&sdp->sd_ail_lock);
- ret = generic_writepages(mapping, wbc);
+ ret = filemap_fdatawrite_wbc(mapping, wbc);
if (need_resched()) {
blk_finish_plug(plug);
cond_resched();
@@ -222,8 +222,7 @@ out:
spin_unlock(&sdp->sd_ail_lock);
blk_finish_plug(&plug);
if (ret) {
- gfs2_lm(sdp, "gfs2_ail1_start_one (generic_writepages) "
- "returned: %d\n", ret);
+ gfs2_lm(sdp, "gfs2_ail1_start_one returned: %d\n", ret);
gfs2_withdraw(sdp);
}
trace_gfs2_ail_flush(sdp, wbc, 0);
@@ -823,7 +822,7 @@ void gfs2_flush_revokes(struct gfs2_sbd *sdp)
void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
u64 seq, u32 tail, u32 lblock, u32 flags,
- int op_flags)
+ blk_opf_t op_flags)
{
struct gfs2_log_header *lh;
u32 hash, crc;
@@ -905,7 +904,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
{
- int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC;
+ blk_opf_t op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC;
enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
gfs2_assert_withdraw(sdp, (state != SFS_FROZEN));
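Note: blk_opf_t is a __bitwise type, so sparse now flags code that mixes request op/flag values with plain integers; the prototype changes here and in log.h/lops.h are that type propagating through gfs2. For example:

    blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA |
                    REQ_META | REQ_SYNC;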
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index fc905c2af53c..653cffcbf869 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -82,7 +82,7 @@ extern void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
unsigned int *extra_revokes);
extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
u64 seq, u32 tail, u32 lblock, u32 flags,
- int op_flags);
+ blk_opf_t op_flags);
extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
u32 type);
extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 6ba51cbb94cf..1902413d5d12 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -238,7 +238,7 @@ static void gfs2_end_log_write(struct bio *bio)
* there is no pending bio, then this is a no-op.
*/
-void gfs2_log_submit_bio(struct bio **biop, int opf)
+void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf)
{
struct bio *bio = *biop;
if (bio) {
@@ -292,7 +292,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno,
*/
static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno,
- struct bio **biop, int op,
+ struct bio **biop, enum req_op op,
bio_end_io_t *end_io, bool flush)
{
struct bio *bio = *biop;
@@ -452,36 +452,36 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
* @head: The journal head to start from
* @done: If set, perform only cleanup, else search and set if found.
*
- * Find the page with 'index' in the journal's mapping. Search the page for
+ * Find the folio with 'index' in the journal's mapping. Search the folio for
* the journal head if requested (cleanup == false). Release refs on the
- * page so the page cache can reclaim it (put_page() twice). We grabbed a
- * reference on this page two times, first when we did a find_or_create_page()
- * to obtain the page to add it to the bio and second when we do a
- * find_get_page() here to get the page to wait on while I/O on it is being
+ * folio so the page cache can reclaim it. We grabbed a
+ * reference on this folio twice, first when we did a find_or_create_page()
+ * to obtain the folio to add it to the bio and second when we do a
+ * filemap_get_folio() here to get the folio to wait on while I/O on it is being
* completed.
- * This function is also used to free up a page we might've grabbed but not
+ * This function is also used to free up a folio we might've grabbed but not
* used. Maybe we added it to a bio, but not submitted it for I/O. Or we
* submitted the I/O, but we already found the jhead so we only need to drop
- * our references to the page.
+ * our references to the folio.
*/
static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
struct gfs2_log_header_host *head,
bool *done)
{
- struct page *page;
+ struct folio *folio;
- page = find_get_page(jd->jd_inode->i_mapping, index);
- wait_on_page_locked(page);
+ folio = filemap_get_folio(jd->jd_inode->i_mapping, index);
- if (PageError(page))
+ folio_wait_locked(folio);
+ if (folio_test_error(folio))
*done = true;
if (!*done)
- *done = gfs2_jhead_pg_srch(jd, head, page);
+ *done = gfs2_jhead_pg_srch(jd, head, &folio->page);
- put_page(page); /* Once for find_get_page */
- put_page(page); /* Once more for find_or_create_page */
+ /* filemap_get_folio() and the earlier find_or_create_page() */
+ folio_put_refs(folio, 2);
}
static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs)
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index f707601597dc..1412ffba1d44 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -16,7 +16,7 @@ extern u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn);
extern void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
struct page *page, unsigned size, unsigned offset,
u64 blkno);
-extern void gfs2_log_submit_bio(struct bio **biop, int opf);
+extern void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf);
extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, bool keep_cache);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 28d0eb23e18e..14ae9de76277 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -38,7 +38,6 @@ static void gfs2_init_inode_once(void *foo)
inode_init_once(&ip->i_inode);
atomic_set(&ip->i_sizehint, 0);
init_rwsem(&ip->i_rw_mutex);
- INIT_LIST_HEAD(&ip->i_trunc_list);
INIT_LIST_HEAD(&ip->i_ordered);
ip->i_qadata = NULL;
gfs2_holder_mark_uninitialized(&ip->i_rgd_gh);
@@ -62,11 +61,10 @@ static void gfs2_init_glock_once(void *foo)
static void gfs2_init_gl_aspace_once(void *foo)
{
- struct gfs2_glock *gl = foo;
- struct address_space *mapping = (struct address_space *)(gl + 1);
+ struct gfs2_glock_aspace *gla = foo;
- gfs2_init_glock_once(gl);
- address_space_init_once(mapping);
+ gfs2_init_glock_once(&gla->glock);
+ address_space_init_once(&gla->mapping);
}
/**
@@ -104,8 +102,7 @@ static int __init init_gfs2_fs(void)
goto fail_cachep1;
gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)",
- sizeof(struct gfs2_glock) +
- sizeof(struct address_space),
+ sizeof(struct gfs2_glock_aspace),
0, 0, gfs2_init_gl_aspace_once);
if (!gfs2_glock_aspace_cachep)
@@ -150,7 +147,7 @@ static int __init init_gfs2_fs(void)
if (!gfs2_trans_cachep)
goto fail_cachep8;
- error = register_shrinker(&gfs2_qd_shrinker);
+ error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd");
if (error)
goto fail_shrinker;
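Note: sizing the slab from sizeof(struct gfs2_glock_aspace) also accounts for any padding the compiler inserts between or after the two members, which the old sizeof(struct gfs2_glock) + sizeof(struct address_space) arithmetic silently ignored:

    struct gfs2_glock_aspace {
            struct gfs2_glock glock;        /* padding may follow */
            struct address_space mapping;
    };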
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index d8bd1d48bd78..7e70e0ba5a6c 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -34,7 +34,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
{
struct buffer_head *bh, *head;
int nr_underway = 0;
- int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc);
+ blk_opf_t write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc);
BUG_ON(!PageLocked(page));
BUG_ON(!page_has_buffers(page));
@@ -75,7 +75,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh(REQ_OP_WRITE, write_flags, bh);
+ submit_bh(REQ_OP_WRITE | write_flags, bh);
nr_underway++;
}
bh = next;
@@ -92,14 +92,14 @@ const struct address_space_operations gfs2_meta_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.writepage = gfs2_aspace_writepage,
- .releasepage = gfs2_releasepage,
+ .release_folio = gfs2_release_folio,
};
const struct address_space_operations gfs2_rgrp_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.writepage = gfs2_aspace_writepage,
- .releasepage = gfs2_releasepage,
+ .release_folio = gfs2_release_folio,
};
/**
@@ -217,14 +217,13 @@ static void gfs2_meta_read_endio(struct bio *bio)
* Submit several consecutive buffer head I/O requests as a single bio I/O
* request. (See submit_bh_wbc.)
*/
-static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[],
- int num)
+static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num)
{
while (num > 0) {
struct buffer_head *bh = *bhs;
struct bio *bio;
- bio = bio_alloc(bh->b_bdev, num, op | op_flags, GFP_NOIO);
+ bio = bio_alloc(bh->b_bdev, num, opf, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
while (num > 0) {
bh = *bhs;
@@ -288,7 +287,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
}
}
- gfs2_submit_bhs(REQ_OP_READ, REQ_META | REQ_PRIO, bhs, num);
+ gfs2_submit_bhs(REQ_OP_READ | REQ_META | REQ_PRIO, bhs, num);
if (!(flags & DIO_WAIT))
return 0;
@@ -527,7 +526,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
if (buffer_uptodate(first_bh))
goto out;
if (!buffer_locked(first_bh))
- ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh);
+ ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &first_bh);
dblock++;
extlen--;
@@ -536,9 +535,8 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
bh = gfs2_getbuf(gl, dblock, CREATE);
if (!buffer_uptodate(bh) && !buffer_locked(bh))
- ll_rw_block(REQ_OP_READ,
- REQ_RAHEAD | REQ_META | REQ_PRIO,
- 1, &bh);
+ ll_rw_block(REQ_OP_READ | REQ_RAHEAD | REQ_META |
+ REQ_PRIO, 1, &bh);
brelse(bh);
dblock++;
extlen--;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 21880d72081a..d0a58cdd433a 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -40,9 +40,11 @@ extern const struct address_space_operations gfs2_rgrp_aops;
static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
{
struct inode *inode = mapping->host;
- if (mapping->a_ops == &gfs2_meta_aops)
- return (((struct gfs2_glock *)mapping) - 1)->gl_name.ln_sbd;
- else if (mapping->a_ops == &gfs2_rgrp_aops)
+ if (mapping->a_ops == &gfs2_meta_aops) {
+ struct gfs2_glock_aspace *gla =
+ container_of(mapping, struct gfs2_glock_aspace, mapping);
+ return gla->glock.gl_name.ln_sbd;
+ } else if (mapping->a_ops == &gfs2_rgrp_aops)
return container_of(mapping, struct gfs2_sbd, sd_aspace);
else
return inode->i_sb->s_fs_info;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c9b423c874a3..549879929c84 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -106,8 +106,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
mutex_init(&sdp->sd_quota_mutex);
mutex_init(&sdp->sd_quota_sync_mutex);
init_waitqueue_head(&sdp->sd_quota_wait);
- INIT_LIST_HEAD(&sdp->sd_trunc_list);
- spin_lock_init(&sdp->sd_trunc_lock);
spin_lock_init(&sdp->sd_bitmap_lock);
INIT_LIST_HEAD(&sdp->sd_sc_inodes_list);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index be0997e24d60..f201eaf59d0d 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -365,11 +365,12 @@ static void slot_put(struct gfs2_quota_data *qd)
static int bh_get(struct gfs2_quota_data *qd)
{
struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
- struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
+ struct inode *inode = sdp->sd_qc_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
unsigned int block, offset;
struct buffer_head *bh;
+ struct iomap iomap = { };
int error;
- struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
mutex_lock(&sdp->sd_quota_mutex);
@@ -381,11 +382,17 @@ static int bh_get(struct gfs2_quota_data *qd)
block = qd->qd_slot / sdp->sd_qc_per_block;
offset = qd->qd_slot % sdp->sd_qc_per_block;
- bh_map.b_size = BIT(ip->i_inode.i_blkbits);
- error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
+ error = gfs2_iomap_get(inode,
+ (loff_t)block << inode->i_blkbits,
+ i_blocksize(inode), &iomap);
if (error)
goto fail;
- error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, 0, &bh);
+ error = -ENOENT;
+ if (iomap.type != IOMAP_MAPPED)
+ goto fail;
+
+ error = gfs2_meta_read(ip->i_gl, iomap.addr >> inode->i_blkbits,
+ DIO_WAIT, 0, &bh);
if (error)
goto fail;
error = -EIO;
@@ -443,9 +450,8 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
{
- struct gfs2_quota_data *qd = NULL;
+ struct gfs2_quota_data *qd = NULL, *iter;
int error;
- int found = 0;
*qdp = NULL;
@@ -454,15 +460,13 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
spin_lock(&qd_lock);
- list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
- found = qd_check_sync(sdp, qd, &sdp->sd_quota_sync_gen);
- if (found)
+ list_for_each_entry(iter, &sdp->sd_quota_list, qd_list) {
+ if (qd_check_sync(sdp, iter, &sdp->sd_quota_sync_gen)) {
+ qd = iter;
break;
+ }
}
- if (!found)
- qd = NULL;
-
spin_unlock(&qd_lock);
if (qd) {
@@ -531,34 +535,42 @@ static void qdsb_put(struct gfs2_quota_data *qd)
*/
int gfs2_qa_get(struct gfs2_inode *ip)
{
- int error = 0;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct inode *inode = &ip->i_inode;
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
- down_write(&ip->i_rw_mutex);
+ spin_lock(&inode->i_lock);
if (ip->i_qadata == NULL) {
- ip->i_qadata = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS);
- if (!ip->i_qadata) {
- error = -ENOMEM;
- goto out;
- }
+ struct gfs2_qadata *tmp;
+
+ spin_unlock(&inode->i_lock);
+ tmp = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
+ spin_lock(&inode->i_lock);
+ if (ip->i_qadata == NULL)
+ ip->i_qadata = tmp;
+ else
+ kmem_cache_free(gfs2_qadata_cachep, tmp);
}
ip->i_qadata->qa_ref++;
-out:
- up_write(&ip->i_rw_mutex);
- return error;
+ spin_unlock(&inode->i_lock);
+ return 0;
}
void gfs2_qa_put(struct gfs2_inode *ip)
{
- down_write(&ip->i_rw_mutex);
+ struct inode *inode = &ip->i_inode;
+
+ spin_lock(&inode->i_lock);
if (ip->i_qadata && --ip->i_qadata->qa_ref == 0) {
kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata);
ip->i_qadata = NULL;
}
- up_write(&ip->i_rw_mutex);
+ spin_unlock(&inode->i_lock);
}
int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
@@ -734,7 +746,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
if (PageUptodate(page))
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh)) {
- ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
+ ll_rw_block(REQ_OP_READ | REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh))
goto unlock_out;
@@ -1505,25 +1517,6 @@ static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
}
}
-static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
-{
- struct gfs2_inode *ip;
-
- while(1) {
- ip = NULL;
- spin_lock(&sdp->sd_trunc_lock);
- if (!list_empty(&sdp->sd_trunc_list)) {
- ip = list_first_entry(&sdp->sd_trunc_list,
- struct gfs2_inode, i_trunc_list);
- list_del_init(&ip->i_trunc_list);
- }
- spin_unlock(&sdp->sd_trunc_lock);
- if (ip == NULL)
- return;
- gfs2_glock_finish_truncate(ip);
- }
-}
-
void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) {
if (!sdp->sd_statfs_force_sync) {
sdp->sd_statfs_force_sync = 1;
@@ -1546,7 +1539,6 @@ int gfs2_quotad(void *data)
unsigned long quotad_timeo = 0;
unsigned long t = 0;
DEFINE_WAIT(wait);
- int empty;
while (!kthread_should_stop()) {
@@ -1567,19 +1559,13 @@ int gfs2_quotad(void *data)
quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
&quotad_timeo, &tune->gt_quota_quantum);
- /* Check for & recover partially truncated inodes */
- quotad_check_trunc_list(sdp);
-
try_to_freeze();
bypass:
t = min(quotad_timeo, statfs_timeo);
prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
- spin_lock(&sdp->sd_trunc_lock);
- empty = list_empty(&sdp->sd_trunc_list);
- spin_unlock(&sdp->sd_trunc_lock);
- if (empty && !sdp->sd_statfs_force_sync)
+ if (!sdp->sd_statfs_force_sync)
t -= schedule_timeout(t);
else
t = 0;
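Note: gfs2_qa_get() above switches to the standard allocate-outside-the-lock, recheck-under-the-lock idiom, since kmem_cache_zalloc(..., GFP_NOFS) may sleep while i_lock is a spinlock. A generic sketch of the idiom (names hypothetical):

    spin_lock(&obj->lock);
    if (!obj->data) {
            struct foo *tmp;

            spin_unlock(&obj->lock);
            tmp = kzalloc(sizeof(*tmp), GFP_NOFS);  /* may sleep */
            if (!tmp)
                    return -ENOMEM;

            spin_lock(&obj->lock);
            if (!obj->data)
                    obj->data = tmp;        /* we won the race */
            else
                    kfree(tmp);             /* somebody beat us to it */
    }
    obj->data->refs++;
    spin_unlock(&obj->lock);
    return 0;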
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 016ed1b2ca1d..2bb085a72e8e 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -55,17 +55,16 @@ int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
{
struct list_head *head = &jd->jd_revoke_list;
- struct gfs2_revoke_replay *rr;
- int found = 0;
+ struct gfs2_revoke_replay *rr = NULL, *iter;
- list_for_each_entry(rr, head, rr_list) {
- if (rr->rr_blkno == blkno) {
- found = 1;
+ list_for_each_entry(iter, head, rr_list) {
+ if (iter->rr_blkno == blkno) {
+ rr = iter;
break;
}
}
- if (found) {
+ if (rr) {
rr->rr_where = where;
return 0;
}
@@ -83,18 +82,17 @@ int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
{
- struct gfs2_revoke_replay *rr;
+ struct gfs2_revoke_replay *rr = NULL, *iter;
int wrap, a, b, revoke;
- int found = 0;
- list_for_each_entry(rr, &jd->jd_revoke_list, rr_list) {
- if (rr->rr_blkno == blkno) {
- found = 1;
+ list_for_each_entry(iter, &jd->jd_revoke_list, rr_list) {
+ if (iter->rr_blkno == blkno) {
+ rr = iter;
break;
}
}
- if (!found)
+ if (!rr)
return 0;
wrap = (rr->rr_where < jd->jd_replay_tail);
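Note: these hunks follow the tree-wide rule that a list_for_each_entry() iterator variable must not be used after the loop; the found element is captured in a separate pointer instead. The resulting idiom (types hypothetical):

    struct item *found = NULL, *iter;

    list_for_each_entry(iter, &head, list) {
            if (iter->key == key) {
                    found = iter;
                    break;
            }
    }
    if (!found)
            return -ENOENT;
    /* only "found" may be dereferenced from here on */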
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 801ad9f4f2be..f602fb844951 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1196,9 +1196,8 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
* Returns: errno
*/
-int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh)
+int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl)
{
- struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_rgrpd *rgd = gl->gl_object;
struct gfs2_sbd *sdp = rgd->rd_sbd;
unsigned int length = rgd->rd_length;
@@ -1315,7 +1314,7 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
u64 blk;
sector_t start = 0;
sector_t nr_blks = 0;
- int rv;
+ int rv = -EIO;
unsigned int x;
u32 trimmed = 0;
u8 diff;
@@ -1371,7 +1370,7 @@ fail:
if (sdp->sd_args.ar_discard)
fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
sdp->sd_args.ar_discard = 0;
- return -EIO;
+ return rv;
}
/**
@@ -1386,7 +1385,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
{
struct inode *inode = file_inode(filp);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
+ struct block_device *bdev = sdp->sd_vfs->s_bdev;
struct buffer_head *bh;
struct gfs2_rgrpd *rgd;
struct gfs2_rgrpd *rgd_end;
@@ -1405,7 +1404,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
return -EROFS;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
if (copy_from_user(&r, argp, sizeof(r)))
@@ -1418,8 +1417,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
start = r.start >> bs_shift;
end = start + (r.len >> bs_shift);
minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize);
- minlen = max_t(u64, minlen,
- q->limits.discard_granularity) >> bs_shift;
+ minlen = max_t(u64, minlen, bdev_discard_granularity(bdev)) >> bs_shift;
if (end <= start || minlen > sdp->sd_max_rg_data)
return -EINVAL;
@@ -2721,12 +2719,15 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
* gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
* and initialize an array of glock holders for them
* @rlist: the list of resource groups
+ * @state: the state we're requesting
+ * @flags: the modifier flags
*
* FIXME: Don't use NOFAIL
*
*/
-void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist,
+ unsigned int state, u16 flags)
{
unsigned int x;
@@ -2734,8 +2735,8 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
sizeof(struct gfs2_holder),
GFP_NOFS | __GFP_NOFAIL);
for (x = 0; x < rlist->rl_rgrps; x++)
- gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, LM_ST_EXCLUSIVE,
- LM_FLAG_NODE_SCOPE, &rlist->rl_ghs[x]);
+ gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, state, flags,
+ &rlist->rl_ghs[x]);
}
/**
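Note: discard capability and granularity are now queried through block-device-level helpers instead of reaching into the request queue, as the gfs2_fitrim() hunk above shows:

    /* old */
    struct request_queue *q = bdev_get_queue(bdev);
    if (!blk_queue_discard(q))
            return -EOPNOTSUPP;
    granularity = q->limits.discard_granularity;

    /* new */
    if (!bdev_max_discard_sectors(bdev))
            return -EOPNOTSUPP;
    granularity = bdev_discard_granularity(bdev);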
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 46dd94e9e085..00b30cf893af 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -31,7 +31,7 @@ extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
-extern int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh);
+extern int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl);
extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
@@ -64,7 +64,8 @@ struct gfs2_rgrp_list {
extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
u64 block);
-extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist);
+extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist,
+ unsigned int state, u16 flags);
extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index bdb773e5c88f..b5b0f285b27f 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1196,7 +1196,7 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode)
gfs2_glock_dq(gh);
return false;
}
- return true;
+ return gfs2_glock_holder_ready(gh) == 0;
}
/**
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 0c5650fe1fd1..f6a66050380e 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1313,7 +1313,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
else
goto out;
- gfs2_rlist_alloc(&rlist);
+ gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE);
for (x = 0; x < rlist.rl_rgrps; x++) {
rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index c0a73a6ffb28..c83fd0e8404d 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -296,10 +296,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page = read_mapping_page(mapping, block++, NULL);
if (IS_ERR(page))
goto fail;
- if (PageError(page)) {
- put_page(page);
- goto fail;
- }
node->page[i] = page;
}
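Note: read_mapping_page() now returns an ERR_PTR when the read fails instead of handing back a page with PageError set, so the PageError() recheck after a successful return was dead code. What remains is sufficient:

    page = read_mapping_page(mapping, block++, NULL);
    if (IS_ERR(page))       /* read errors surface here now */
            goto fail;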
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index 263d5028d9d1..3f7e9bef9874 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -491,10 +491,10 @@ void hfs_file_truncate(struct inode *inode)
/* XXX: Can use generic_cont_expand? */
size = inode->i_size - 1;
- res = pagecache_write_begin(NULL, mapping, size+1, 0, 0,
- &page, &fsdata);
+ res = hfs_write_begin(NULL, mapping, size + 1, 0, &page,
+ &fsdata);
if (!res) {
- res = pagecache_write_end(NULL, mapping, size+1, 0, 0,
+ res = generic_write_end(NULL, mapping, size + 1, 0, 0,
page, fsdata);
}
if (res)
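Note: two independent API changes meet in this hunk: the pagecache_write_begin()/pagecache_write_end() wrappers were removed in favour of calling the filesystem's own ->write_begin (and generic_write_end() where the write_end side was generic), and ->write_begin lost its unused flags parameter, which is why the now-exported helper takes one argument fewer:

    int hfs_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, struct page **pagep,
                        void **fsdata);     /* no flags argument anymore */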
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index b8eb0322a3e5..68d0305880f7 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -201,6 +201,8 @@ extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern const struct address_space_operations hfs_aops;
extern const struct address_space_operations hfs_btree_aops;
+int hfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata);
extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t);
extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
extern int hfs_write_inode(struct inode *, struct writeback_control *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 55f45e9b4930..c4526f16355d 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -34,9 +34,9 @@ static int hfs_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, hfs_get_block, wbc);
}
-static int hfs_readpage(struct file *file, struct page *page)
+static int hfs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, hfs_get_block);
+ return block_read_full_folio(folio, hfs_get_block);
}
static void hfs_write_failed(struct address_space *mapping, loff_t to)
@@ -49,14 +49,13 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int hfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+int hfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
hfs_get_block,
&HFS_I(mapping->host)->phys_size);
if (unlikely(ret))
@@ -70,14 +69,15 @@ static sector_t hfs_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, hfs_get_block);
}
-static int hfs_releasepage(struct page *page, gfp_t mask)
+static bool hfs_release_folio(struct folio *folio, gfp_t mask)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct super_block *sb = inode->i_sb;
struct hfs_btree *tree;
struct hfs_bnode *node;
u32 nidx;
- int i, res = 1;
+ int i;
+ bool res = true;
switch (inode->i_ino) {
case HFS_EXT_CNID:
@@ -88,27 +88,27 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
break;
default:
BUG();
- return 0;
+ return false;
}
if (!tree)
- return 0;
+ return false;
if (tree->node_size >= PAGE_SIZE) {
- nidx = page->index >> (tree->node_size_shift - PAGE_SHIFT);
+ nidx = folio->index >> (tree->node_size_shift - PAGE_SHIFT);
spin_lock(&tree->hash_lock);
node = hfs_bnode_findhash(tree, nidx);
if (!node)
;
else if (atomic_read(&node->refcnt))
- res = 0;
+ res = false;
if (res && node) {
hfs_bnode_unhash(node);
hfs_bnode_free(node);
}
spin_unlock(&tree->hash_lock);
} else {
- nidx = page->index << (PAGE_SHIFT - tree->node_size_shift);
+ nidx = folio->index << (PAGE_SHIFT - tree->node_size_shift);
i = 1 << (PAGE_SHIFT - tree->node_size_shift);
spin_lock(&tree->hash_lock);
do {
@@ -116,7 +116,7 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
if (!node)
continue;
if (atomic_read(&node->refcnt)) {
- res = 0;
+ res = false;
break;
}
hfs_bnode_unhash(node);
@@ -124,7 +124,7 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
} while (--i && nidx < tree->node_count);
spin_unlock(&tree->hash_lock);
}
- return res ? try_to_free_buffers(page) : 0;
+ return res ? try_to_free_buffers(folio) : false;
}
static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
@@ -161,18 +161,18 @@ static int hfs_writepages(struct address_space *mapping,
const struct address_space_operations hfs_btree_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = hfs_readpage,
+ .read_folio = hfs_read_folio,
.writepage = hfs_writepage,
.write_begin = hfs_write_begin,
.write_end = generic_write_end,
.bmap = hfs_bmap,
- .releasepage = hfs_releasepage,
+ .release_folio = hfs_release_folio,
};
const struct address_space_operations hfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = hfs_readpage,
+ .read_folio = hfs_read_folio,
.writepage = hfs_writepage,
.write_begin = hfs_write_begin,
.write_end = generic_write_end,
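
The ->readpage to ->read_folio conversion above is the standard mechanical pattern for block-based filesystems; a generic sketch with hypothetical myfs_* names:

	static int myfs_read_folio(struct file *file, struct folio *folio)
	{
		return block_read_full_folio(folio, myfs_get_block);
	}

	static const struct address_space_operations myfs_aops = {
		.read_folio	= myfs_read_folio,
		/* ->releasepage becomes ->release_folio, returning bool. */
	};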
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 177fae4e6581..a5ab00e54220 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -447,10 +447,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page = read_mapping_page(mapping, block, NULL);
if (IS_ERR(page))
goto fail;
- if (PageError(page)) {
- put_page(page);
- goto fail;
- }
node->page[i] = page;
}
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 7054a542689f..721f779b4ec3 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -557,12 +557,12 @@ void hfsplus_file_truncate(struct inode *inode)
void *fsdata;
loff_t size = inode->i_size;
- res = pagecache_write_begin(NULL, mapping, size, 0, 0,
- &page, &fsdata);
+ res = hfsplus_write_begin(NULL, mapping, size, 0,
+ &page, &fsdata);
if (res)
return;
- res = pagecache_write_end(NULL, mapping, size,
- 0, 0, page, fsdata);
+ res = generic_write_end(NULL, mapping, size, 0, 0,
+ page, fsdata);
if (res < 0)
return;
mark_inode_dirty(inode);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 1798949f269b..a5db2e3b2980 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -468,6 +468,8 @@ extern const struct address_space_operations hfsplus_aops;
extern const struct address_space_operations hfsplus_btree_aops;
extern const struct dentry_operations hfsplus_dentry_operations;
+int hfsplus_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata);
struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
umode_t mode);
void hfsplus_delete_inode(struct inode *inode);
@@ -523,7 +525,7 @@ int hfsplus_compare_dentry(const struct dentry *dentry, unsigned int len,
/* wrapper.c */
int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf,
- void **data, int op, int op_flags);
+ void **data, blk_opf_t opf);
int hfsplus_read_wrapper(struct super_block *sb);
/*
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 446a816aa8e1..aeab83ed1c9c 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -23,9 +23,9 @@
#include "hfsplus_raw.h"
#include "xattr.h"
-static int hfsplus_readpage(struct file *file, struct page *page)
+static int hfsplus_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, hfsplus_get_block);
+ return block_read_full_folio(folio, hfsplus_get_block);
}
static int hfsplus_writepage(struct page *page, struct writeback_control *wbc)
@@ -43,14 +43,13 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+int hfsplus_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
hfsplus_get_block,
&HFSPLUS_I(mapping->host)->phys_size);
if (unlikely(ret))
@@ -64,14 +63,15 @@ static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, hfsplus_get_block);
}
-static int hfsplus_releasepage(struct page *page, gfp_t mask)
+static bool hfsplus_release_folio(struct folio *folio, gfp_t mask)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct super_block *sb = inode->i_sb;
struct hfs_btree *tree;
struct hfs_bnode *node;
u32 nidx;
- int i, res = 1;
+ int i;
+ bool res = true;
switch (inode->i_ino) {
case HFSPLUS_EXT_CNID:
@@ -85,26 +85,26 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
break;
default:
BUG();
- return 0;
+ return false;
}
if (!tree)
- return 0;
+ return false;
if (tree->node_size >= PAGE_SIZE) {
- nidx = page->index >>
+ nidx = folio->index >>
(tree->node_size_shift - PAGE_SHIFT);
spin_lock(&tree->hash_lock);
node = hfs_bnode_findhash(tree, nidx);
if (!node)
;
else if (atomic_read(&node->refcnt))
- res = 0;
+ res = false;
if (res && node) {
hfs_bnode_unhash(node);
hfs_bnode_free(node);
}
spin_unlock(&tree->hash_lock);
} else {
- nidx = page->index <<
+ nidx = folio->index <<
(PAGE_SHIFT - tree->node_size_shift);
i = 1 << (PAGE_SHIFT - tree->node_size_shift);
spin_lock(&tree->hash_lock);
@@ -113,7 +113,7 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
if (!node)
continue;
if (atomic_read(&node->refcnt)) {
- res = 0;
+ res = false;
break;
}
hfs_bnode_unhash(node);
@@ -121,7 +121,7 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
} while (--i && nidx < tree->node_count);
spin_unlock(&tree->hash_lock);
}
- return res ? try_to_free_buffers(page) : 0;
+ return res ? try_to_free_buffers(folio) : false;
}
static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
@@ -158,18 +158,18 @@ static int hfsplus_writepages(struct address_space *mapping,
const struct address_space_operations hfsplus_btree_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = hfsplus_readpage,
+ .read_folio = hfsplus_read_folio,
.writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
.write_end = generic_write_end,
.bmap = hfsplus_bmap,
- .releasepage = hfsplus_releasepage,
+ .release_folio = hfsplus_release_folio,
};
const struct address_space_operations hfsplus_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = hfsplus_readpage,
+ .read_folio = hfsplus_read_folio,
.writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
.write_end = generic_write_end,
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 63164ebc52fa..9ec21664eda6 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -112,8 +112,7 @@ static int hfs_parse_new_pmap(struct super_block *sb, void *buf,
if ((u8 *)pm - (u8 *)buf >= buf_size) {
res = hfsplus_submit_bio(sb,
*part_start + HFS_PMAP_BLK + i,
- buf, (void **)&pm, REQ_OP_READ,
- 0);
+ buf, (void **)&pm, REQ_OP_READ);
if (res)
return res;
}
@@ -137,7 +136,7 @@ int hfs_part_find(struct super_block *sb,
return -ENOMEM;
res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK,
- buf, &data, REQ_OP_READ, 0);
+ buf, &data, REQ_OP_READ);
if (res)
goto out;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 8479add998b5..122ed89ebf9f 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -221,7 +221,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
error2 = hfsplus_submit_bio(sb,
sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
- sbi->s_vhdr_buf, NULL, REQ_OP_WRITE,
+ sbi->s_vhdr_buf, NULL, REQ_OP_WRITE |
REQ_SYNC);
if (!error)
error = error2;
@@ -230,7 +230,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
error2 = hfsplus_submit_bio(sb,
sbi->part_start + sbi->sect_count - 2,
- sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE,
+ sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE |
REQ_SYNC);
if (!error)
error2 = error;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 0b8ad6586df5..0b791adf02e5 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -45,8 +45,9 @@ struct hfsplus_wd {
* will work correctly.
*/
int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
- void *buf, void **data, int op, int op_flags)
+ void *buf, void **data, blk_opf_t opf)
{
+ const enum req_op op = opf & REQ_OP_MASK;
struct bio *bio;
int ret = 0;
u64 io_size;
@@ -63,10 +64,10 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
offset = start & (io_size - 1);
sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
- bio = bio_alloc(sb->s_bdev, 1, op | op_flags, GFP_NOIO);
+ bio = bio_alloc(sb->s_bdev, 1, opf, GFP_NOIO);
bio->bi_iter.bi_sector = sector;
- if (op != WRITE && data)
+ if (op != REQ_OP_WRITE && data)
*data = (u8 *)buf + offset;
while (io_size > 0) {
@@ -184,7 +185,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
reread:
error = hfsplus_submit_bio(sb, part_start + HFSPLUS_VOLHEAD_SECTOR,
sbi->s_vhdr_buf, (void **)&sbi->s_vhdr,
- REQ_OP_READ, 0);
+ REQ_OP_READ);
if (error)
goto out_free_backup_vhdr;
@@ -216,8 +217,7 @@ reread:
error = hfsplus_submit_bio(sb, part_start + part_size - 2,
sbi->s_backup_vhdr_buf,
- (void **)&sbi->s_backup_vhdr, REQ_OP_READ,
- 0);
+ (void **)&sbi->s_backup_vhdr, REQ_OP_READ);
if (error)
goto out_free_backup_vhdr;
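
Folding op and op_flags into a single blk_opf_t means callers OR the flags into the opcode while the callee masks the opcode back out when it needs to distinguish reads from writes; a sketch of both sides, mirroring the hunks above:

	/* Caller: opcode and flags travel together. */
	error = hfsplus_submit_bio(sb, sector, buf, NULL,
				   REQ_OP_WRITE | REQ_SYNC);

	/* Callee: recover the bare opcode. */
	const enum req_op op = opf & REQ_OP_MASK;
	if (op != REQ_OP_WRITE && data)
		*data = (u8 *)buf + offset;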
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 14f9ac973a2e..07881b76d42f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -416,15 +416,15 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count);
if (err != count) {
- ClearPageUptodate(page);
+ if (err >= 0)
+ err = -EIO;
+ mapping_set_error(mapping, err);
goto out;
}
if (base > inode->i_size)
inode->i_size = base;
- if (PageError(page))
- ClearPageError(page);
err = 0;
out:
@@ -434,8 +434,9 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
return err;
}
-static int hostfs_readpage(struct file *file, struct page *page)
+static int hostfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
char *buffer;
loff_t start = page_offset(page);
int bytes_read, ret = 0;
@@ -463,12 +464,12 @@ static int hostfs_readpage(struct file *file, struct page *page)
}
static int hostfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
pgoff_t index = pos >> PAGE_SHIFT;
- *pagep = grab_cache_page_write_begin(mapping, index, flags);
+ *pagep = grab_cache_page_write_begin(mapping, index);
if (!*pagep)
return -ENOMEM;
return 0;
@@ -504,7 +505,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
static const struct address_space_operations hostfs_aops = {
.writepage = hostfs_writepage,
- .readpage = hostfs_readpage,
+ .read_folio = hostfs_read_folio,
.dirty_folio = filemap_dirty_folio,
.write_begin = hostfs_write_begin,
.write_end = hostfs_write_end,
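
grab_cache_page_write_begin() lost its flags argument along with ->write_begin, so a minimal implementation reduces to the hostfs form above. As a generic sketch (hypothetical myfs name):

	static int myfs_write_begin(struct file *file,
				    struct address_space *mapping,
				    loff_t pos, unsigned len,
				    struct page **pagep, void **fsdata)
	{
		*pagep = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT);
		return *pagep ? 0 : -ENOMEM;
	}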
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 99493a23c5d0..f7547a62c81f 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -158,9 +158,9 @@ static const struct iomap_ops hpfs_iomap_ops = {
.iomap_begin = hpfs_iomap_begin,
};
-static int hpfs_readpage(struct file *file, struct page *page)
+static int hpfs_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, hpfs_get_block);
+ return mpage_read_folio(folio, hpfs_get_block);
}
static int hpfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -194,13 +194,13 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to)
}
static int hpfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
hpfs_get_block,
&hpfs_i(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -247,7 +247,7 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
const struct address_space_operations hpfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = hpfs_readpage,
+ .read_folio = hpfs_read_folio,
.writepage = hpfs_writepage,
.readahead = hpfs_readahead,
.writepages = hpfs_writepages,
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index d73f8a67168e..15fc63276caa 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -479,8 +479,9 @@ out:
return err;
}
-static int hpfs_symlink_readpage(struct file *file, struct page *page)
+static int hpfs_symlink_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
char *link = page_address(page);
struct inode *i = page->mapping->host;
struct fnode *fnode;
@@ -508,7 +509,7 @@ fail:
}
const struct address_space_operations hpfs_symlink_aops = {
- .readpage = hpfs_symlink_readpage
+ .read_folio = hpfs_symlink_read_folio
};
static int hpfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 99c7477cee5c..f7a5b5124d8a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -11,7 +11,6 @@
#include <linux/thread_info.h>
#include <asm/current.h>
-#include <linux/sched/signal.h> /* remove ASAP */
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
@@ -40,7 +39,6 @@
#include <linux/uaccess.h>
#include <linux/sched/mm.h>
-static const struct super_operations hugetlbfs_ops;
static const struct address_space_operations hugetlbfs_aops;
const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
@@ -108,16 +106,6 @@ static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
}
#endif
-static void huge_pagevec_release(struct pagevec *pvec)
-{
- int i;
-
- for (i = 0; i < pagevec_count(pvec); ++i)
- put_page(pvec->pages[i]);
-
- pagevec_reinit(pvec);
-}
-
/*
* Mask used when checking the page offset value passed in via system
* calls. This value will be converted to a loff_t which is signed.
@@ -195,7 +183,6 @@ out:
* Called under mmap_write_lock(mm).
*/
-#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long
hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
@@ -206,7 +193,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
info.flags = 0;
info.length = len;
info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = arch_get_mmap_end(addr, len, flags);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
@@ -222,7 +209,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = current->mm->mmap_base;
+ info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
addr = vm_unmapped_area(&info);
@@ -237,20 +224,22 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = arch_get_mmap_end(addr, len, flags);
addr = vm_unmapped_area(&info);
}
return addr;
}
-static unsigned long
-hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+unsigned long
+generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct hstate *h = hstate_file(file);
+ const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -266,7 +255,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (mmap_end - len >= addr &&
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -282,41 +271,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
pgoff, flags);
}
-#endif
-static size_t
-hugetlbfs_read_actor(struct page *page, unsigned long offset,
- struct iov_iter *to, unsigned long size)
+#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+static unsigned long
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
- size_t copied = 0;
- int i, chunksize;
-
- /* Find which 4k chunk and offset with in that chunk */
- i = offset >> PAGE_SHIFT;
- offset = offset & ~PAGE_MASK;
-
- while (size) {
- size_t n;
- chunksize = PAGE_SIZE;
- if (offset)
- chunksize -= offset;
- if (chunksize > size)
- chunksize = size;
- n = copy_page_to_iter(&page[i], offset, chunksize, to);
- copied += n;
- if (n != chunksize)
- return copied;
- offset = 0;
- size -= chunksize;
- i++;
- }
- return copied;
+ return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags);
}
+#endif
/*
* Support for read() - Find the page attached to f_mapping and copy out the
- * data. Its *very* similar to generic_file_buffered_read(), we can't use that
- * since it has PAGE_SIZE assumptions.
+ * data. This provides functionality similar to filemap_read().
*/
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
@@ -363,7 +331,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
/*
* We have the page, copy it to user space buffer.
*/
- copied = hugetlbfs_read_actor(page, offset, to, nr);
+ copied = copy_page_to_iter(page, offset, nr, to);
put_page(page);
}
offset += copied;
@@ -382,7 +350,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
static int hugetlbfs_write_begin(struct file *file,
struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
return -EINVAL;
@@ -404,7 +372,8 @@ static void remove_huge_page(struct page *page)
}
static void
-hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
+hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
+ zap_flags_t zap_flags)
{
struct vm_area_struct *vma;
@@ -438,7 +407,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
}
unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
- NULL);
+ NULL, zap_flags);
}
}
@@ -469,25 +438,19 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
const pgoff_t end = lend >> huge_page_shift(h);
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t next, index;
int i, freed = 0;
bool truncate_op = (lend == LLONG_MAX);
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
next = start;
- while (next < end) {
- /*
- * When no more pages are found, we are done.
- */
- if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1))
- break;
-
- for (i = 0; i < pagevec_count(&pvec); ++i) {
- struct page *page = pvec.pages[i];
+ while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+ struct folio *folio = fbatch.folios[i];
u32 hash = 0;
- index = page->index;
+ index = folio->index;
if (!truncate_op) {
/*
* Only need to hold the fault mutex in the
@@ -500,15 +463,15 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
}
/*
- * If page is mapped, it was faulted in after being
+ * If folio is mapped, it was faulted in after being
* unmapped in caller. Unmap (again) now after taking
* the fault mutex. The mutex will prevent faults
- * until we finish removing the page.
+ * until we finish removing the folio.
*
* This race can only happen in the hole punch case.
* Getting here in a truncate operation is a bug.
*/
- if (unlikely(page_mapped(page))) {
+ if (unlikely(folio_mapped(folio))) {
BUG_ON(truncate_op);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -516,11 +479,12 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vmdelete_list(&mapping->i_mmap,
index * pages_per_huge_page(h),
- (index + 1) * pages_per_huge_page(h));
+ (index + 1) * pages_per_huge_page(h),
+ ZAP_FLAG_DROP_MARKER);
i_mmap_unlock_write(mapping);
}
- lock_page(page);
+ folio_lock(folio);
/*
* We must free the huge page and remove from page
* cache (remove_huge_page) BEFORE removing the
@@ -530,8 +494,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
* the subpool and global reserve usage count can need
* to be adjusted.
*/
- VM_BUG_ON(HPageRestoreReserve(page));
- remove_huge_page(page);
+ VM_BUG_ON(HPageRestoreReserve(&folio->page));
+ remove_huge_page(&folio->page);
freed++;
if (!truncate_op) {
if (unlikely(hugetlb_unreserve_pages(inode,
@@ -539,11 +503,11 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
hugetlb_fix_reserve_counts(inode);
}
- unlock_page(page);
+ folio_unlock(folio);
if (!truncate_op)
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
- huge_pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
}
@@ -582,46 +546,85 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_mmap_lock_write(mapping);
i_size_write(inode, offset);
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
- hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
+ hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
+ ZAP_FLAG_DROP_MARKER);
i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, offset, LLONG_MAX);
}
+static void hugetlbfs_zero_partial_page(struct hstate *h,
+ struct address_space *mapping,
+ loff_t start,
+ loff_t end)
+{
+ pgoff_t idx = start >> huge_page_shift(h);
+ struct folio *folio;
+
+ folio = filemap_lock_folio(mapping, idx);
+ if (!folio)
+ return;
+
+ start = start & ~huge_page_mask(h);
+ end = end & ~huge_page_mask(h);
+ if (!end)
+ end = huge_page_size(h);
+
+ folio_zero_segment(folio, (size_t)start, (size_t)end);
+
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
+ struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+ struct address_space *mapping = inode->i_mapping;
struct hstate *h = hstate_inode(inode);
loff_t hpage_size = huge_page_size(h);
loff_t hole_start, hole_end;
/*
- * For hole punch round up the beginning offset of the hole and
- * round down the end.
+ * hole_start and hole_end indicate the full pages within the hole.
*/
hole_start = round_up(offset, hpage_size);
hole_end = round_down(offset + len, hpage_size);
- if (hole_end > hole_start) {
- struct address_space *mapping = inode->i_mapping;
- struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+ inode_lock(inode);
- inode_lock(inode);
+ /* protected by i_rwsem */
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
+ inode_unlock(inode);
+ return -EPERM;
+ }
- /* protected by i_rwsem */
- if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
- inode_unlock(inode);
- return -EPERM;
- }
+ i_mmap_lock_write(mapping);
- i_mmap_lock_write(mapping);
+ /* If range starts before first full page, zero partial page. */
+ if (offset < hole_start)
+ hugetlbfs_zero_partial_page(h, mapping,
+ offset, min(offset + len, hole_start));
+
+ /* Unmap users of full pages in the hole. */
+ if (hole_end > hole_start) {
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap,
- hole_start >> PAGE_SHIFT,
- hole_end >> PAGE_SHIFT);
- i_mmap_unlock_write(mapping);
- remove_inode_hugepages(inode, hole_start, hole_end);
- inode_unlock(inode);
+ hole_start >> PAGE_SHIFT,
+ hole_end >> PAGE_SHIFT, 0);
}
+ /* If range extends beyond last full page, zero partial page. */
+ if ((offset + len) > hole_end && (offset + len) > hole_start)
+ hugetlbfs_zero_partial_page(h, mapping,
+ hole_end, offset + len);
+
+ i_mmap_unlock_write(mapping);
+
+ /* Remove full pages from the file. */
+ if (hole_end > hole_start)
+ remove_inode_hugepages(inode, hole_start, hole_end);
+
+ inode_unlock(inode);
+
return 0;
}
@@ -746,7 +749,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
SetHPageMigratable(page);
/*
- * unlock_page because locked by add_to_page_cache()
+ * unlock_page because locked by huge_add_to_page_cache()
* put_page() due to reference from alloc_huge_page()
*/
unlock_page(page);
@@ -957,28 +960,33 @@ static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
return error;
}
-static int hugetlbfs_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page,
+#ifdef CONFIG_MIGRATION
+static int hugetlbfs_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
int rc;
- rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+ rc = migrate_huge_page_move_mapping(mapping, dst, src);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- if (hugetlb_page_subpool(page)) {
- hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page));
- hugetlb_set_page_subpool(page, NULL);
+ if (hugetlb_page_subpool(&src->page)) {
+ hugetlb_set_page_subpool(&dst->page,
+ hugetlb_page_subpool(&src->page));
+ hugetlb_set_page_subpool(&src->page, NULL);
}
if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
+ folio_migrate_copy(dst, src);
else
- migrate_page_states(newpage, page);
+ folio_migrate_flags(dst, src);
return MIGRATEPAGE_SUCCESS;
}
+#else
+#define hugetlbfs_migrate_folio NULL
+#endif
static int hugetlbfs_error_remove_page(struct address_space *mapping,
struct page *page)
@@ -1042,17 +1050,17 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bsize = huge_page_size(h);
if (sbinfo) {
spin_lock(&sbinfo->stat_lock);
- /* If no limits set, just report 0 for max/free/used
+ /* If no limits set, just report 0 or -1 for max/free/used
* blocks, like simple_statfs() */
if (sbinfo->spool) {
long free_pages;
- spin_lock(&sbinfo->spool->lock);
+ spin_lock_irq(&sbinfo->spool->lock);
buf->f_blocks = sbinfo->spool->max_hpages;
free_pages = sbinfo->spool->max_hpages
- sbinfo->spool->used_hpages;
buf->f_bavail = buf->f_bfree = free_pages;
- spin_unlock(&sbinfo->spool->lock);
+ spin_unlock_irq(&sbinfo->spool->lock);
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
@@ -1145,7 +1153,7 @@ static const struct address_space_operations hugetlbfs_aops = {
.write_begin = hugetlbfs_write_begin,
.write_end = hugetlbfs_write_end,
.dirty_folio = noop_dirty_folio,
- .migratepage = hugetlbfs_migrate_page,
+ .migrate_folio = hugetlbfs_migrate_folio,
.error_remove_page = hugetlbfs_error_remove_page,
};
@@ -1269,7 +1277,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
ps = memparse(param->string, &rest);
ctx->hstate = size_to_hstate(ps);
if (!ctx->hstate) {
- pr_err("Unsupported page size %lu MB\n", ps >> 20);
+ pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
return -EINVAL;
}
return 0;
@@ -1345,7 +1353,7 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
/*
* Allocate and initialize subpool if maximum or minimum size is
* specified. Any needed reservations (for minimum size) are taken
- * taken when the subpool is created.
+ * when the subpool is created.
*/
if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
sbinfo->spool = hugepage_new_subpool(ctx->hstate,
@@ -1515,7 +1523,7 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
}
if (IS_ERR(mnt))
pr_err("Cannot mount internal hugetlbfs for page size %luK",
- huge_page_size(h) >> 10);
+ huge_page_size(h) / SZ_1K);
return mnt;
}
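
The hole-punch rework above separates full huge pages from partial edges with round_up()/round_down(); a worked example, assuming a 2 MiB huge page size with offset = 1 MiB and len = 4 MiB:

	hole_start = round_up(offset, hpage_size);		/* -> 2 MiB */
	hole_end = round_down(offset + len, hpage_size);	/* -> 4 MiB */

	/*
	 * [1 MiB, 2 MiB) and [4 MiB, 5 MiB) are zeroed in place by
	 * hugetlbfs_zero_partial_page(); only the full pages in
	 * [2 MiB, 4 MiB) are unmapped and removed from the file.
	 */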
diff --git a/fs/inode.c b/fs/inode.c
index 9d9b422504d1..b608528efd3a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -27,7 +27,7 @@
* Inode locking rules:
*
* inode->i_lock protects:
- * inode->i_state, inode->i_hash, __iget()
+ * inode->i_state, inode->i_hash, __iget(), inode->i_io_list
* Inode LRU list locks protect:
* inode->i_sb->s_inode_lru, inode->i_lru
* inode->i_sb->s_inode_list_lock protects:
@@ -192,8 +192,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_wb_frn_history = 0;
#endif
- if (security_inode_alloc(inode))
- goto out;
spin_lock_init(&inode->i_lock);
lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
@@ -228,11 +226,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_fsnotify_mask = 0;
#endif
inode->i_flctx = NULL;
+
+ if (unlikely(security_inode_alloc(inode)))
+ return -ENOMEM;
this_cpu_inc(nr_inodes);
return 0;
-out:
- return -ENOMEM;
}
EXPORT_SYMBOL(inode_init_always);
@@ -422,6 +421,7 @@ void inode_init_once(struct inode *inode)
INIT_LIST_HEAD(&inode->i_io_list);
INIT_LIST_HEAD(&inode->i_wb_list);
INIT_LIST_HEAD(&inode->i_lru);
+ INIT_LIST_HEAD(&inode->i_sb_list);
__address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
}
@@ -604,7 +604,7 @@ void clear_inode(struct inode *inode)
{
/*
* We have to cycle the i_pages lock here because reclaim can be in the
- * process of removing the last page (in __delete_from_page_cache())
+ * process of removing the last page (in __filemap_remove_folio())
* and we must not free the mapping under it.
*/
xa_lock_irq(&inode->i_data.i_pages);
@@ -1021,7 +1021,6 @@ struct inode *new_inode_pseudo(struct super_block *sb)
spin_lock(&inode->i_lock);
inode->i_state = 0;
spin_unlock(&inode->i_lock);
- INIT_LIST_HEAD(&inode->i_sb_list);
}
return inode;
}
@@ -1165,7 +1164,6 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
{
struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
struct inode *old;
- bool creating = inode->i_state & I_CREATING;
again:
spin_lock(&inode_hash_lock);
@@ -1199,7 +1197,12 @@ again:
inode->i_state |= I_NEW;
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
- if (!creating)
+
+ /*
+ * Add inode to the sb list if it's not already. It has I_NEW at this
+ * point, so it should be safe to test i_sb_list locklessly.
+ */
+ if (list_empty(&inode->i_sb_list))
inode_sb_list_add(inode);
unlock:
spin_unlock(&inode_hash_lock);
@@ -2010,67 +2013,59 @@ static int __remove_privs(struct user_namespace *mnt_userns,
return notify_change(mnt_userns, dentry, &newattrs, NULL);
}
-/*
- * Remove special file priviledges (suid, capabilities) when file is written
- * to or truncated.
- */
-int file_remove_privs(struct file *file)
+static int __file_remove_privs(struct file *file, unsigned int flags)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file);
- int kill;
int error = 0;
+ int kill;
- /*
- * Fast path for nothing security related.
- * As well for non-regular files, e.g. blkdev inodes.
- * For example, blkdev_write_iter() might get here
- * trying to remove privs which it is not allowed to.
- */
if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
return 0;
kill = dentry_needs_remove_privs(dentry);
if (kill < 0)
return kill;
- if (kill)
+
+ if (kill) {
+ if (flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
+ }
+
if (!error)
inode_has_no_xattr(inode);
-
return error;
}
-EXPORT_SYMBOL(file_remove_privs);
/**
- * file_update_time - update mtime and ctime time
- * @file: file accessed
+ * file_remove_privs - remove special file privileges (suid, capabilities)
+ * @file: file to remove privileges from
*
- * Update the mtime and ctime members of an inode and mark the inode
- * for writeback. Note that this function is meant exclusively for
- * usage in the file write path of filesystems, and filesystems may
- * choose to explicitly ignore update via this function with the
- * S_NOCMTIME inode flag, e.g. for network filesystem where these
- * timestamps are handled by the server. This can return an error for
- * file systems who need to allocate space in order to update an inode.
+ * When file is modified by a write or truncation ensure that special
+ * file privileges are removed.
+ *
+ * Return: 0 on success, negative errno on failure.
*/
+int file_remove_privs(struct file *file)
+{
+ return __file_remove_privs(file, 0);
+}
+EXPORT_SYMBOL(file_remove_privs);
-int file_update_time(struct file *file)
+static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
{
- struct inode *inode = file_inode(file);
- struct timespec64 now;
int sync_it = 0;
- int ret;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
- now = current_time(inode);
- if (!timespec64_equal(&inode->i_mtime, &now))
+ if (!timespec64_equal(&inode->i_mtime, now))
sync_it = S_MTIME;
- if (!timespec64_equal(&inode->i_ctime, &now))
+ if (!timespec64_equal(&inode->i_ctime, now))
sync_it |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@@ -2079,37 +2074,127 @@ int file_update_time(struct file *file)
if (!sync_it)
return 0;
- /* Finally allowed to write? Takes lock. */
- if (__mnt_want_write_file(file))
- return 0;
+ return sync_it;
+}
- ret = inode_update_time(inode, &now, sync_it);
- __mnt_drop_write_file(file);
+static int __file_update_time(struct file *file, struct timespec64 *now,
+ int sync_mode)
+{
+ int ret = 0;
+ struct inode *inode = file_inode(file);
+
+ /* try to update time settings */
+ if (!__mnt_want_write_file(file)) {
+ ret = inode_update_time(inode, now, sync_mode);
+ __mnt_drop_write_file(file);
+ }
return ret;
}
+
+/**
+ * file_update_time - update mtime and ctime time
+ * @file: file accessed
+ *
+ * Update the mtime and ctime members of an inode and mark the inode for
+ * writeback. Note that this function is meant exclusively for usage in
+ * the file write path of filesystems, and filesystems may choose to
+ * explicitly ignore updates via this function with the S_NOCMTIME inode
+ * flag, e.g. for network filesystem where these timestamps are handled
+ * by the server. This can return an error for file systems who need to
+ * allocate space in order to update an inode.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int file_update_time(struct file *file)
+{
+ int ret;
+ struct inode *inode = file_inode(file);
+ struct timespec64 now = current_time(inode);
+
+ ret = inode_needs_update_time(inode, &now);
+ if (ret <= 0)
+ return ret;
+
+ return __file_update_time(file, &now, ret);
+}
EXPORT_SYMBOL(file_update_time);
-/* Caller must hold the file's inode lock */
-int file_modified(struct file *file)
+/**
+ * file_modified_flags - handle mandated vfs changes when modifying a file
+ * @file: file that was modified
+ * @flags: kiocb flags
+ *
+ * When file has been modified ensure that special
+ * file privileges are removed and time settings are updated.
+ *
+ * If IOCB_NOWAIT is set, special file privileges will not be removed and
+ * time settings will not be updated. It will return -EAGAIN.
+ *
+ * Context: Caller must hold the file's inode lock.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+static int file_modified_flags(struct file *file, int flags)
{
- int err;
+ int ret;
+ struct inode *inode = file_inode(file);
+ struct timespec64 now = current_time(inode);
/*
* Clear the security bits if the process is not being run by root.
* This keeps people from modifying setuid and setgid binaries.
*/
- err = file_remove_privs(file);
- if (err)
- return err;
+ ret = __file_remove_privs(file, flags);
+ if (ret)
+ return ret;
if (unlikely(file->f_mode & FMODE_NOCMTIME))
return 0;
- return file_update_time(file);
+ ret = inode_needs_update_time(inode, &now);
+ if (ret <= 0)
+ return ret;
+ if (flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
+ return __file_update_time(file, &now, ret);
+}
+
+/**
+ * file_modified - handle mandated vfs changes when modifying a file
+ * @file: file that was modified
+ *
+ * When file has been modified ensure that special
+ * file privileges are removed and time settings are updated.
+ *
+ * Context: Caller must hold the file's inode lock.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int file_modified(struct file *file)
+{
+ return file_modified_flags(file, 0);
}
EXPORT_SYMBOL(file_modified);
+/**
+ * kiocb_modified - handle mandated vfs changes when modifying a file
+ * @iocb: iocb that was modified
+ *
+ * When file has been modified ensure that special
+ * file privileges are removed and time settings are updated.
+ *
+ * Context: Caller must hold the file's inode lock.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int kiocb_modified(struct kiocb *iocb)
+{
+ return file_modified_flags(iocb->ki_filp, iocb->ki_flags);
+}
+EXPORT_SYMBOL_GPL(kiocb_modified);
+
int inode_needs_sync(struct inode *inode)
{
if (IS_SYNC(inode))
@@ -2246,10 +2331,6 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
/* Directories are special, and always inherit S_ISGID */
if (S_ISDIR(mode))
mode |= S_ISGID;
- else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
- !in_group_p(i_gid_into_mnt(mnt_userns, dir)) &&
- !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
- mode &= ~S_ISGID;
} else
inode_fsgid_set(inode, mnt_userns);
inode->i_mode = mode;
@@ -2405,3 +2486,33 @@ struct timespec64 current_time(struct inode *inode)
return timestamp_truncate(now, inode);
}
EXPORT_SYMBOL(current_time);
+
+/**
+ * mode_strip_sgid - handle the sgid bit for non-directories
+ * @mnt_userns: User namespace of the mount the inode was created from
+ * @dir: parent directory inode
+ * @mode: mode of the file to be created in @dir
+ *
+ * If the @mode of the new file has both the S_ISGID and S_IXGRP bit
+ * raised and @dir has the S_ISGID bit raised ensure that the caller is
+ * either in the group of the parent directory or they have CAP_FSETID
+ * in their user namespace and are privileged over the parent directory.
+ * In all other cases, strip the S_ISGID bit from @mode.
+ *
+ * Return: the new mode to use for the file
+ */
+umode_t mode_strip_sgid(struct user_namespace *mnt_userns,
+ const struct inode *dir, umode_t mode)
+{
+ if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP))
+ return mode;
+ if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID))
+ return mode;
+ if (in_group_p(i_gid_into_mnt(mnt_userns, dir)))
+ return mode;
+ if (capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
+ return mode;
+
+ return mode & ~S_ISGID;
+}
+EXPORT_SYMBOL(mode_strip_sgid);
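
A sketch of how the new kiocb_modified() helper is meant to slot into a filesystem's buffered-write path (hypothetical caller; the IOCB_NOWAIT behaviour follows the kernel-doc above):

	/* In a ->write_iter() prologue, with the inode lock held: */
	ret = kiocb_modified(iocb);	/* -EAGAIN under IOCB_NOWAIT if updates are needed */
	if (ret)
		return ret;

and mode_strip_sgid() is likewise expected to be applied to a candidate mode before inode creation:

	mode = mode_strip_sgid(mnt_userns, dir, mode);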
diff --git a/fs/internal.h b/fs/internal.h
index 08503dc68d2b..6f0386b34fae 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -16,6 +16,7 @@ struct shrink_control;
struct fs_context;
struct user_namespace;
struct pipe_inode_info;
+struct iov_iter;
/*
* block/bdev.c
@@ -62,7 +63,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
const char *, unsigned int, struct path *);
int do_rmdir(int dfd, struct filename *name);
int do_unlinkat(int dfd, struct filename *name);
-int may_linkat(struct user_namespace *mnt_userns, struct path *link);
+int may_linkat(struct user_namespace *mnt_userns, const struct path *link);
int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
struct filename *newname, unsigned int flags);
int do_mkdirat(int dfd, struct filename *name, umode_t mode);
@@ -84,6 +85,7 @@ extern int __mnt_want_write_file(struct file *);
extern void __mnt_drop_write_file(struct file *);
extern void dissolve_on_fput(struct vfsmount *);
+extern bool may_mount(void);
int path_mount(const char *dev_name, struct path *path,
const char *type_page, unsigned long flags, void *data_page);
@@ -100,6 +102,16 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
extern struct file *alloc_empty_file(int, const struct cred *);
extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
+static inline void put_file_access(struct file *file)
+{
+ if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
+ i_readcount_dec(file->f_inode);
+ } else if (file->f_mode & FMODE_WRITER) {
+ put_write_access(file->f_inode);
+ __mnt_drop_write(file->f_path.mnt);
+ }
+}
+
/*
* super.c
*/
@@ -125,7 +137,7 @@ extern struct file *do_file_open_root(const struct path *,
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
-extern int __close_fd_get_file(unsigned int fd, struct file **res);
+extern struct file *__close_fd_get_file(unsigned int fd);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
@@ -191,3 +203,34 @@ long splice_file_to_pipe(struct file *in,
struct pipe_inode_info *opipe,
loff_t *offset,
size_t len, unsigned int flags);
+
+/*
+ * fs/xattr.c:
+ */
+struct xattr_name {
+ char name[XATTR_NAME_MAX + 1];
+};
+
+struct xattr_ctx {
+ /* Value of attribute */
+ union {
+ const void __user *cvalue;
+ void __user *value;
+ };
+ void *kvalue;
+ size_t size;
+ /* Attribute name */
+ struct xattr_name *kname;
+ unsigned int flags;
+};
+
+
+ssize_t do_getxattr(struct user_namespace *mnt_userns,
+ struct dentry *d,
+ struct xattr_ctx *ctx);
+
+int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
+int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct xattr_ctx *ctx);
+
+ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);
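
For orientation, a sketch of how the setxattr helpers above are intended to compose (an assumption about the flow: setxattr_copy() pulls the name and value into kernel memory, and the caller frees ctx.kvalue):

	struct xattr_name kname;
	struct xattr_ctx ctx = {
		.cvalue	= value,	/* userspace pointer */
		.kvalue	= NULL,
		.size	= size,
		.kname	= &kname,
		.flags	= flags,
	};
	int error;

	error = setxattr_copy(name, &ctx);
	if (!error)
		error = do_setxattr(mnt_userns, dentry, &ctx);
	kvfree(ctx.kvalue);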
diff --git a/fs/io-wq.c b/fs/io-wq.c
deleted file mode 100644
index 32aeb2c581c5..000000000000
--- a/fs/io-wq.c
+++ /dev/null
@@ -1,1424 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Basic worker thread pool for io_uring
- *
- * Copyright (C) 2019 Jens Axboe
- *
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/sched/signal.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/rculist_nulls.h>
-#include <linux/cpu.h>
-#include <linux/task_work.h>
-#include <linux/audit.h>
-#include <uapi/linux/io_uring.h>
-
-#include "io-wq.h"
-
-#define WORKER_IDLE_TIMEOUT (5 * HZ)
-
-enum {
- IO_WORKER_F_UP = 1, /* up and active */
- IO_WORKER_F_RUNNING = 2, /* account as running */
- IO_WORKER_F_FREE = 4, /* worker on free list */
- IO_WORKER_F_BOUND = 8, /* is doing bounded work */
-};
-
-enum {
- IO_WQ_BIT_EXIT = 0, /* wq exiting */
-};
-
-enum {
- IO_ACCT_STALLED_BIT = 0, /* stalled on hash */
-};
-
-/*
- * One for each thread in a wqe pool
- */
-struct io_worker {
- refcount_t ref;
- unsigned flags;
- struct hlist_nulls_node nulls_node;
- struct list_head all_list;
- struct task_struct *task;
- struct io_wqe *wqe;
-
- struct io_wq_work *cur_work;
- struct io_wq_work *next_work;
- raw_spinlock_t lock;
-
- struct completion ref_done;
-
- unsigned long create_state;
- struct callback_head create_work;
- int create_index;
-
- union {
- struct rcu_head rcu;
- struct work_struct work;
- };
-};
-
-#if BITS_PER_LONG == 64
-#define IO_WQ_HASH_ORDER 6
-#else
-#define IO_WQ_HASH_ORDER 5
-#endif
-
-#define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER)
-
-struct io_wqe_acct {
- unsigned nr_workers;
- unsigned max_workers;
- int index;
- atomic_t nr_running;
- raw_spinlock_t lock;
- struct io_wq_work_list work_list;
- unsigned long flags;
-};
-
-enum {
- IO_WQ_ACCT_BOUND,
- IO_WQ_ACCT_UNBOUND,
- IO_WQ_ACCT_NR,
-};
-
-/*
- * Per-node worker thread pool
- */
-struct io_wqe {
- raw_spinlock_t lock;
- struct io_wqe_acct acct[IO_WQ_ACCT_NR];
-
- int node;
-
- struct hlist_nulls_head free_list;
- struct list_head all_list;
-
- struct wait_queue_entry wait;
-
- struct io_wq *wq;
- struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
-
- cpumask_var_t cpu_mask;
-};
-
-/*
- * Per io_wq state
- */
-struct io_wq {
- unsigned long state;
-
- free_work_fn *free_work;
- io_wq_work_fn *do_work;
-
- struct io_wq_hash *hash;
-
- atomic_t worker_refs;
- struct completion worker_done;
-
- struct hlist_node cpuhp_node;
-
- struct task_struct *task;
-
- struct io_wqe *wqes[];
-};
-
-static enum cpuhp_state io_wq_online;
-
-struct io_cb_cancel_data {
- work_cancel_fn *fn;
- void *data;
- int nr_running;
- int nr_pending;
- bool cancel_all;
-};
-
-static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
-static void io_wqe_dec_running(struct io_worker *worker);
-static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
- struct io_wqe_acct *acct,
- struct io_cb_cancel_data *match);
-static void create_worker_cb(struct callback_head *cb);
-static void io_wq_cancel_tw_create(struct io_wq *wq);
-
-static bool io_worker_get(struct io_worker *worker)
-{
- return refcount_inc_not_zero(&worker->ref);
-}
-
-static void io_worker_release(struct io_worker *worker)
-{
- if (refcount_dec_and_test(&worker->ref))
- complete(&worker->ref_done);
-}
-
-static inline struct io_wqe_acct *io_get_acct(struct io_wqe *wqe, bool bound)
-{
- return &wqe->acct[bound ? IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND];
-}
-
-static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
- struct io_wq_work *work)
-{
- return io_get_acct(wqe, !(work->flags & IO_WQ_WORK_UNBOUND));
-}
-
-static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker)
-{
- return io_get_acct(worker->wqe, worker->flags & IO_WORKER_F_BOUND);
-}
-
-static void io_worker_ref_put(struct io_wq *wq)
-{
- if (atomic_dec_and_test(&wq->worker_refs))
- complete(&wq->worker_done);
-}
-
-static void io_worker_cancel_cb(struct io_worker *worker)
-{
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- struct io_wqe *wqe = worker->wqe;
- struct io_wq *wq = wqe->wq;
-
- atomic_dec(&acct->nr_running);
- raw_spin_lock(&worker->wqe->lock);
- acct->nr_workers--;
- raw_spin_unlock(&worker->wqe->lock);
- io_worker_ref_put(wq);
- clear_bit_unlock(0, &worker->create_state);
- io_worker_release(worker);
-}
-
-static bool io_task_worker_match(struct callback_head *cb, void *data)
-{
- struct io_worker *worker;
-
- if (cb->func != create_worker_cb)
- return false;
- worker = container_of(cb, struct io_worker, create_work);
- return worker == data;
-}
-
-static void io_worker_exit(struct io_worker *worker)
-{
- struct io_wqe *wqe = worker->wqe;
- struct io_wq *wq = wqe->wq;
-
- while (1) {
- struct callback_head *cb = task_work_cancel_match(wq->task,
- io_task_worker_match, worker);
-
- if (!cb)
- break;
- io_worker_cancel_cb(worker);
- }
-
- io_worker_release(worker);
- wait_for_completion(&worker->ref_done);
-
- raw_spin_lock(&wqe->lock);
- if (worker->flags & IO_WORKER_F_FREE)
- hlist_nulls_del_rcu(&worker->nulls_node);
- list_del_rcu(&worker->all_list);
- raw_spin_unlock(&wqe->lock);
- io_wqe_dec_running(worker);
- worker->flags = 0;
- preempt_disable();
- current->flags &= ~PF_IO_WORKER;
- preempt_enable();
-
- kfree_rcu(worker, rcu);
- io_worker_ref_put(wqe->wq);
- do_exit(0);
-}
-
-static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
-{
- bool ret = false;
-
- raw_spin_lock(&acct->lock);
- if (!wq_list_empty(&acct->work_list) &&
- !test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
- ret = true;
- raw_spin_unlock(&acct->lock);
-
- return ret;
-}
-
-/*
- * Check head of free list for an available worker. If one isn't available,
- * caller must create one.
- */
-static bool io_wqe_activate_free_worker(struct io_wqe *wqe,
- struct io_wqe_acct *acct)
- __must_hold(RCU)
-{
- struct hlist_nulls_node *n;
- struct io_worker *worker;
-
- /*
- * Iterate free_list and see if we can find an idle worker to
- * activate. If a given worker is on the free_list but in the process
- * of exiting, keep trying.
- */
- hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
- if (!io_worker_get(worker))
- continue;
- if (io_wqe_get_acct(worker) != acct) {
- io_worker_release(worker);
- continue;
- }
- if (wake_up_process(worker->task)) {
- io_worker_release(worker);
- return true;
- }
- io_worker_release(worker);
- }
-
- return false;
-}
-
-/*
- * We need a worker. If we find a free one, we're good. If not, and we're
- * below the max number of workers, create one.
- */
-static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
-{
- /*
- * Most likely an attempt to queue unbounded work on an io_wq that
- * wasn't setup with any unbounded workers.
- */
- if (unlikely(!acct->max_workers))
- pr_warn_once("io-wq is not configured for unbound workers");
-
- raw_spin_lock(&wqe->lock);
- if (acct->nr_workers >= acct->max_workers) {
- raw_spin_unlock(&wqe->lock);
- return true;
- }
- acct->nr_workers++;
- raw_spin_unlock(&wqe->lock);
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- return create_io_worker(wqe->wq, wqe, acct->index);
-}
-
-static void io_wqe_inc_running(struct io_worker *worker)
-{
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
-
- atomic_inc(&acct->nr_running);
-}
-
-static void create_worker_cb(struct callback_head *cb)
-{
- struct io_worker *worker;
- struct io_wq *wq;
- struct io_wqe *wqe;
- struct io_wqe_acct *acct;
- bool do_create = false;
-
- worker = container_of(cb, struct io_worker, create_work);
- wqe = worker->wqe;
- wq = wqe->wq;
- acct = &wqe->acct[worker->create_index];
- raw_spin_lock(&wqe->lock);
- if (acct->nr_workers < acct->max_workers) {
- acct->nr_workers++;
- do_create = true;
- }
- raw_spin_unlock(&wqe->lock);
- if (do_create) {
- create_io_worker(wq, wqe, worker->create_index);
- } else {
- atomic_dec(&acct->nr_running);
- io_worker_ref_put(wq);
- }
- clear_bit_unlock(0, &worker->create_state);
- io_worker_release(worker);
-}
-
-static bool io_queue_worker_create(struct io_worker *worker,
- struct io_wqe_acct *acct,
- task_work_func_t func)
-{
- struct io_wqe *wqe = worker->wqe;
- struct io_wq *wq = wqe->wq;
-
- /* raced with exit, just ignore create call */
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
- goto fail;
- if (!io_worker_get(worker))
- goto fail;
- /*
- * create_state manages ownership of create_work/index. We should
- * only need one entry per worker, as the worker going to sleep
- * will trigger the condition, and waking will clear it once it
- * runs the task_work.
- */
- if (test_bit(0, &worker->create_state) ||
- test_and_set_bit_lock(0, &worker->create_state))
- goto fail_release;
-
- atomic_inc(&wq->worker_refs);
- init_task_work(&worker->create_work, func);
- worker->create_index = acct->index;
- if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
- /*
- * EXIT may have been set after checking it above, check after
- * adding the task_work and remove any creation item if it is
- * now set. wq exit does that too, but we can have added this
- * work item after we canceled in io_wq_exit_workers().
- */
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
- io_wq_cancel_tw_create(wq);
- io_worker_ref_put(wq);
- return true;
- }
- io_worker_ref_put(wq);
- clear_bit_unlock(0, &worker->create_state);
-fail_release:
- io_worker_release(worker);
-fail:
- atomic_dec(&acct->nr_running);
- io_worker_ref_put(wq);
- return false;
-}
-
-static void io_wqe_dec_running(struct io_worker *worker)
-{
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- struct io_wqe *wqe = worker->wqe;
-
- if (!(worker->flags & IO_WORKER_F_UP))
- return;
-
- if (!atomic_dec_and_test(&acct->nr_running))
- return;
- if (!io_acct_run_queue(acct))
- return;
-
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- io_queue_worker_create(worker, acct, create_worker_cb);
-}
-
-/*
- * Worker will start processing some work. Move it to the busy list, if
- * it's currently on the freelist
- */
-static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker)
-{
- if (worker->flags & IO_WORKER_F_FREE) {
- worker->flags &= ~IO_WORKER_F_FREE;
- raw_spin_lock(&wqe->lock);
- hlist_nulls_del_init_rcu(&worker->nulls_node);
- raw_spin_unlock(&wqe->lock);
- }
-}
-
-/*
- * No work, worker going to sleep. Move to freelist, and unuse mm if we
- * have one attached. Dropping the mm may potentially sleep, so we drop
- * the lock in that case and return success. Since the caller has to
- * retry the loop in that case (we changed task state), we don't regrab
- * the lock if we return success.
- */
-static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
- __must_hold(wqe->lock)
-{
- if (!(worker->flags & IO_WORKER_F_FREE)) {
- worker->flags |= IO_WORKER_F_FREE;
- hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
- }
-}
-
-static inline unsigned int io_get_work_hash(struct io_wq_work *work)
-{
- return work->flags >> IO_WQ_HASH_SHIFT;
-}
-
-static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
-{
- struct io_wq *wq = wqe->wq;
- bool ret = false;
-
- spin_lock_irq(&wq->hash->wait.lock);
- if (list_empty(&wqe->wait.entry)) {
- __add_wait_queue(&wq->hash->wait, &wqe->wait);
- if (!test_bit(hash, &wq->hash->map)) {
- __set_current_state(TASK_RUNNING);
- list_del_init(&wqe->wait.entry);
- ret = true;
- }
- }
- spin_unlock_irq(&wq->hash->wait.lock);
- return ret;
-}
-
-static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
- struct io_worker *worker)
- __must_hold(acct->lock)
-{
- struct io_wq_work_node *node, *prev;
- struct io_wq_work *work, *tail;
- unsigned int stall_hash = -1U;
- struct io_wqe *wqe = worker->wqe;
-
- wq_list_for_each(node, prev, &acct->work_list) {
- unsigned int hash;
-
- work = container_of(node, struct io_wq_work, list);
-
- /* not hashed, can run anytime */
- if (!io_wq_is_hashed(work)) {
- wq_list_del(&acct->work_list, node, prev);
- return work;
- }
-
- hash = io_get_work_hash(work);
- /* all items with this hash lie in [work, tail] */
- tail = wqe->hash_tail[hash];
-
- /* hashed, can run if not already running */
- if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
- wqe->hash_tail[hash] = NULL;
- wq_list_cut(&acct->work_list, &tail->list, prev);
- return work;
- }
- if (stall_hash == -1U)
- stall_hash = hash;
- /* fast forward to the next hash, for-each will fix up @prev */
- node = &tail->list;
- }
-
- if (stall_hash != -1U) {
- bool unstalled;
-
- /*
- * Set this before dropping the lock to avoid racing with new
- * work being added and clearing the stalled bit.
- */
- set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- raw_spin_unlock(&acct->lock);
- unstalled = io_wait_on_hash(wqe, stall_hash);
- raw_spin_lock(&acct->lock);
- if (unstalled) {
- clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- if (wq_has_sleeper(&wqe->wq->hash->wait))
- wake_up(&wqe->wq->hash->wait);
- }
- }
-
- return NULL;
-}
-
-static bool io_flush_signals(void)
-{
- if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) {
- __set_current_state(TASK_RUNNING);
- clear_notify_signal();
- if (task_work_pending(current))
- task_work_run();
- return true;
- }
- return false;
-}
-
-static void io_assign_current_work(struct io_worker *worker,
- struct io_wq_work *work)
-{
- if (work) {
- io_flush_signals();
- cond_resched();
- }
-
- raw_spin_lock(&worker->lock);
- worker->cur_work = work;
- worker->next_work = NULL;
- raw_spin_unlock(&worker->lock);
-}
-
-static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
-
-static void io_worker_handle_work(struct io_worker *worker)
-{
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- struct io_wqe *wqe = worker->wqe;
- struct io_wq *wq = wqe->wq;
- bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
-
- do {
- struct io_wq_work *work;
-
- /*
- * If we got some work, mark us as busy. If we didn't, but
- * the list isn't empty, it means we stalled on hashed work.
- * Mark us stalled so we don't keep looking for work when we
- * can't make progress; any work completion or insertion will
- * clear the stalled flag.
- */
- raw_spin_lock(&acct->lock);
- work = io_get_next_work(acct, worker);
- raw_spin_unlock(&acct->lock);
- if (work) {
- __io_worker_busy(wqe, worker);
-
- /*
- * Make sure cancelation can find this, even before
- * it becomes the active work. That avoids a window
- * where the work has been removed from our general
- * work list, but isn't yet discoverable as the
- * current work item for this worker.
- */
- raw_spin_lock(&worker->lock);
- worker->next_work = work;
- raw_spin_unlock(&worker->lock);
- } else {
- break;
- }
- io_assign_current_work(worker, work);
- __set_current_state(TASK_RUNNING);
-
- /* handle a whole dependent link */
- do {
- struct io_wq_work *next_hashed, *linked;
- unsigned int hash = io_get_work_hash(work);
-
- next_hashed = wq_next_work(work);
-
- if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND))
- work->flags |= IO_WQ_WORK_CANCEL;
- wq->do_work(work);
- io_assign_current_work(worker, NULL);
-
- linked = wq->free_work(work);
- work = next_hashed;
- if (!work && linked && !io_wq_is_hashed(linked)) {
- work = linked;
- linked = NULL;
- }
- io_assign_current_work(worker, work);
- if (linked)
- io_wqe_enqueue(wqe, linked);
-
- if (hash != -1U && !next_hashed) {
- /* serialize hash clear with wake_up() */
- spin_lock_irq(&wq->hash->wait.lock);
- clear_bit(hash, &wq->hash->map);
- clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- spin_unlock_irq(&wq->hash->wait.lock);
- if (wq_has_sleeper(&wq->hash->wait))
- wake_up(&wq->hash->wait);
- }
- } while (work);
- } while (1);
-}
-
-static int io_wqe_worker(void *data)
-{
- struct io_worker *worker = data;
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- struct io_wqe *wqe = worker->wqe;
- struct io_wq *wq = wqe->wq;
- bool last_timeout = false;
- char buf[TASK_COMM_LEN];
-
- worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
-
- snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
- set_task_comm(current, buf);
-
- audit_alloc_kernel(current);
-
- while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- long ret;
-
- set_current_state(TASK_INTERRUPTIBLE);
- while (io_acct_run_queue(acct))
- io_worker_handle_work(worker);
-
- raw_spin_lock(&wqe->lock);
- /* timed out, exit unless we're the last worker */
- if (last_timeout && acct->nr_workers > 1) {
- acct->nr_workers--;
- raw_spin_unlock(&wqe->lock);
- __set_current_state(TASK_RUNNING);
- break;
- }
- last_timeout = false;
- __io_worker_idle(wqe, worker);
- raw_spin_unlock(&wqe->lock);
- if (io_flush_signals())
- continue;
- ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
- if (signal_pending(current)) {
- struct ksignal ksig;
-
- if (!get_signal(&ksig))
- continue;
- break;
- }
- last_timeout = !ret;
- }
-
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
- io_worker_handle_work(worker);
-
- audit_free(current);
- io_worker_exit(worker);
- return 0;
-}
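The loop above parks with schedule_timeout() and lets an idle worker exit on timeout unless it is the last one for its accounting class. A rough pthreads analogue of that idle-timeout shape, assuming a hypothetical struct pool (nothing here is a kernel interface):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <time.h>

/* Illustrative pool state; all names are invented for this sketch. */
struct pool {
	pthread_mutex_t lock;
	pthread_cond_t more_work;
	int nr_workers;		/* incremented when a worker is spawned */
	int nr_pending;
	bool exiting;
};

#define IDLE_TIMEOUT_SEC 5

static void *worker_fn(void *arg)
{
	struct pool *p = arg;

	pthread_mutex_lock(&p->lock);
	while (!p->exiting) {
		if (p->nr_pending) {
			p->nr_pending--;
			pthread_mutex_unlock(&p->lock);
			/* ... run one work item ... */
			pthread_mutex_lock(&p->lock);
			continue;
		}
		struct timespec ts;
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += IDLE_TIMEOUT_SEC;
		int rc = pthread_cond_timedwait(&p->more_work, &p->lock, &ts);
		/* Timed out with nothing queued: exit unless we are the
		 * last worker, mirroring the nr_workers > 1 check above. */
		if (rc == ETIMEDOUT && !p->nr_pending && p->nr_workers > 1) {
			p->nr_workers--;
			break;
		}
	}
	pthread_mutex_unlock(&p->lock);
	return NULL;
}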
-
-/*
- * Called when a worker is scheduled in. Mark us as currently running.
- */
-void io_wq_worker_running(struct task_struct *tsk)
-{
- struct io_worker *worker = tsk->worker_private;
-
- if (!worker)
- return;
- if (!(worker->flags & IO_WORKER_F_UP))
- return;
- if (worker->flags & IO_WORKER_F_RUNNING)
- return;
- worker->flags |= IO_WORKER_F_RUNNING;
- io_wqe_inc_running(worker);
-}
-
-/*
- * Called when a worker is going to sleep. If there are no workers currently
- * running and we have work pending, wake up a free one or create a new one.
- */
-void io_wq_worker_sleeping(struct task_struct *tsk)
-{
- struct io_worker *worker = tsk->worker_private;
-
- if (!worker)
- return;
- if (!(worker->flags & IO_WORKER_F_UP))
- return;
- if (!(worker->flags & IO_WORKER_F_RUNNING))
- return;
-
- worker->flags &= ~IO_WORKER_F_RUNNING;
- io_wqe_dec_running(worker);
-}
-
-static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
- struct task_struct *tsk)
-{
- tsk->worker_private = worker;
- worker->task = tsk;
- set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
- tsk->flags |= PF_NO_SETAFFINITY;
-
- raw_spin_lock(&wqe->lock);
- hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
- list_add_tail_rcu(&worker->all_list, &wqe->all_list);
- worker->flags |= IO_WORKER_F_FREE;
- raw_spin_unlock(&wqe->lock);
- wake_up_new_task(tsk);
-}
-
-static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
-{
- return true;
-}
-
-static inline bool io_should_retry_thread(long err)
-{
- /*
- * Prevent perpetual task_work retry if the task (or its group) is
- * exiting.
- */
- if (fatal_signal_pending(current))
- return false;
-
- switch (err) {
- case -EAGAIN:
- case -ERESTARTSYS:
- case -ERESTARTNOINTR:
- case -ERESTARTNOHAND:
- return true;
- default:
- return false;
- }
-}
-
-static void create_worker_cont(struct callback_head *cb)
-{
- struct io_worker *worker;
- struct task_struct *tsk;
- struct io_wqe *wqe;
-
- worker = container_of(cb, struct io_worker, create_work);
- clear_bit_unlock(0, &worker->create_state);
- wqe = worker->wqe;
- tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
- if (!IS_ERR(tsk)) {
- io_init_new_worker(wqe, worker, tsk);
- io_worker_release(worker);
- return;
- } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
-
- atomic_dec(&acct->nr_running);
- raw_spin_lock(&wqe->lock);
- acct->nr_workers--;
- if (!acct->nr_workers) {
- struct io_cb_cancel_data match = {
- .fn = io_wq_work_match_all,
- .cancel_all = true,
- };
-
- raw_spin_unlock(&wqe->lock);
- while (io_acct_cancel_pending_work(wqe, acct, &match))
- ;
- } else {
- raw_spin_unlock(&wqe->lock);
- }
- io_worker_ref_put(wqe->wq);
- kfree(worker);
- return;
- }
-
- /* re-create attempts grab a new worker ref, drop the existing one */
- io_worker_release(worker);
- schedule_work(&worker->work);
-}
-
-static void io_workqueue_create(struct work_struct *work)
-{
- struct io_worker *worker = container_of(work, struct io_worker, work);
- struct io_wqe_acct *acct = io_wqe_get_acct(worker);
-
- if (!io_queue_worker_create(worker, acct, create_worker_cont))
- kfree(worker);
-}
-
-static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
-{
- struct io_wqe_acct *acct = &wqe->acct[index];
- struct io_worker *worker;
- struct task_struct *tsk;
-
- __set_current_state(TASK_RUNNING);
-
- worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
- if (!worker) {
-fail:
- atomic_dec(&acct->nr_running);
- raw_spin_lock(&wqe->lock);
- acct->nr_workers--;
- raw_spin_unlock(&wqe->lock);
- io_worker_ref_put(wq);
- return false;
- }
-
- refcount_set(&worker->ref, 1);
- worker->wqe = wqe;
- raw_spin_lock_init(&worker->lock);
- init_completion(&worker->ref_done);
-
- if (index == IO_WQ_ACCT_BOUND)
- worker->flags |= IO_WORKER_F_BOUND;
-
- tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
- if (!IS_ERR(tsk)) {
- io_init_new_worker(wqe, worker, tsk);
- } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
- kfree(worker);
- goto fail;
- } else {
- INIT_WORK(&worker->work, io_workqueue_create);
- schedule_work(&worker->work);
- }
-
- return true;
-}
-
-/*
- * Iterate the passed-in list, calling the given function for each
- * worker that isn't exiting.
- */
-static bool io_wq_for_each_worker(struct io_wqe *wqe,
- bool (*func)(struct io_worker *, void *),
- void *data)
-{
- struct io_worker *worker;
- bool ret = false;
-
- list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
- if (io_worker_get(worker)) {
- /* no task if node is/was offline */
- if (worker->task)
- ret = func(worker, data);
- io_worker_release(worker);
- if (ret)
- break;
- }
- }
-
- return ret;
-}
-
-static bool io_wq_worker_wake(struct io_worker *worker, void *data)
-{
- set_notify_signal(worker->task);
- wake_up_process(worker->task);
- return false;
-}
-
-static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
-{
- struct io_wq *wq = wqe->wq;
-
- do {
- work->flags |= IO_WQ_WORK_CANCEL;
- wq->do_work(work);
- work = wq->free_work(work);
- } while (work);
-}
-
-static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
-{
- struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- unsigned int hash;
- struct io_wq_work *tail;
-
- if (!io_wq_is_hashed(work)) {
-append:
- wq_list_add_tail(&work->list, &acct->work_list);
- return;
- }
-
- hash = io_get_work_hash(work);
- tail = wqe->hash_tail[hash];
- wqe->hash_tail[hash] = work;
- if (!tail)
- goto append;
-
- wq_list_add_after(&work->list, &tail->list, &acct->work_list);
-}
-
-static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
-{
- return work == data;
-}
-
-static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
-{
- struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- struct io_cb_cancel_data match;
- unsigned work_flags = work->flags;
- bool do_create;
-
- /*
- * If io-wq is exiting for this task, or if the request has explicitly
- * been marked as one that should not get executed, cancel it here.
- */
- if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) ||
- (work->flags & IO_WQ_WORK_CANCEL)) {
- io_run_cancel(work, wqe);
- return;
- }
-
- raw_spin_lock(&acct->lock);
- io_wqe_insert_work(wqe, work);
- clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- raw_spin_unlock(&acct->lock);
-
- raw_spin_lock(&wqe->lock);
- rcu_read_lock();
- do_create = !io_wqe_activate_free_worker(wqe, acct);
- rcu_read_unlock();
-
- raw_spin_unlock(&wqe->lock);
-
- if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) ||
- !atomic_read(&acct->nr_running))) {
- bool did_create;
-
- did_create = io_wqe_create_worker(wqe, acct);
- if (likely(did_create))
- return;
-
- raw_spin_lock(&wqe->lock);
- if (acct->nr_workers) {
- raw_spin_unlock(&wqe->lock);
- return;
- }
- raw_spin_unlock(&wqe->lock);
-
- /* fatal condition, failed to create the first worker */
- match.fn = io_wq_work_match_item;
- match.data = work;
- match.cancel_all = false;
-
- io_acct_cancel_pending_work(wqe, acct, &match);
- }
-}
-
-void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
-{
- struct io_wqe *wqe = wq->wqes[numa_node_id()];
-
- io_wqe_enqueue(wqe, work);
-}
-
-/*
- * Work items that hash to the same value will not be done in parallel.
- * Used to limit concurrent writes, generally hashed by inode.
- */
-void io_wq_hash_work(struct io_wq_work *work, void *val)
-{
- unsigned int bit;
-
- bit = hash_ptr(val, IO_WQ_HASH_ORDER);
- work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
-}
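io_wq_hash_work() stuffs the bucket into the upper bits of work->flags; io_get_work_hash() earlier shifts it back out. A self-contained round trip of that encoding, using made-up names but the same shift of 24:

#include <stdio.h>

/* Mirrors the layout above: low bits hold work flags, the upper
 * bits (from HASH_SHIFT up) hold the bucket a work item hashed to. */
enum {
	WORK_HASHED = 2,
	HASH_SHIFT  = 24,
};

static unsigned int set_hash(unsigned int flags, unsigned int bucket)
{
	return flags | WORK_HASHED | (bucket << HASH_SHIFT);
}

static unsigned int get_hash(unsigned int flags)
{
	return flags >> HASH_SHIFT;
}

int main(void)
{
	unsigned int flags = set_hash(0, 42);

	/* Prints "bucket 42, hashed 1" */
	printf("bucket %u, hashed %u\n", get_hash(flags),
	       !!(flags & WORK_HASHED));
	return 0;
}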
-
-static bool __io_wq_worker_cancel(struct io_worker *worker,
- struct io_cb_cancel_data *match,
- struct io_wq_work *work)
-{
- if (work && match->fn(work, match->data)) {
- work->flags |= IO_WQ_WORK_CANCEL;
- set_notify_signal(worker->task);
- return true;
- }
-
- return false;
-}
-
-static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
-{
- struct io_cb_cancel_data *match = data;
-
- /*
- * Hold the lock to avoid ->cur_work going out of scope, as the
- * caller may dereference the passed-in work.
- */
- raw_spin_lock(&worker->lock);
- if (__io_wq_worker_cancel(worker, match, worker->cur_work) ||
- __io_wq_worker_cancel(worker, match, worker->next_work))
- match->nr_running++;
- raw_spin_unlock(&worker->lock);
-
- return match->nr_running && !match->cancel_all;
-}
-
-static inline void io_wqe_remove_pending(struct io_wqe *wqe,
- struct io_wq_work *work,
- struct io_wq_work_node *prev)
-{
- struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- unsigned int hash = io_get_work_hash(work);
- struct io_wq_work *prev_work = NULL;
-
- if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) {
- if (prev)
- prev_work = container_of(prev, struct io_wq_work, list);
- if (prev_work && io_get_work_hash(prev_work) == hash)
- wqe->hash_tail[hash] = prev_work;
- else
- wqe->hash_tail[hash] = NULL;
- }
- wq_list_del(&acct->work_list, &work->list, prev);
-}
-
-static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
- struct io_wqe_acct *acct,
- struct io_cb_cancel_data *match)
-{
- struct io_wq_work_node *node, *prev;
- struct io_wq_work *work;
-
- raw_spin_lock(&acct->lock);
- wq_list_for_each(node, prev, &acct->work_list) {
- work = container_of(node, struct io_wq_work, list);
- if (!match->fn(work, match->data))
- continue;
- io_wqe_remove_pending(wqe, work, prev);
- raw_spin_unlock(&acct->lock);
- io_run_cancel(work, wqe);
- match->nr_pending++;
- /* not safe to continue after unlock */
- return true;
- }
- raw_spin_unlock(&acct->lock);
-
- return false;
-}
-
-static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
- struct io_cb_cancel_data *match)
-{
- int i;
-retry:
- for (i = 0; i < IO_WQ_ACCT_NR; i++) {
- struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
-
- if (io_acct_cancel_pending_work(wqe, acct, match)) {
- if (match->cancel_all)
- goto retry;
- break;
- }
- }
-}
-
-static void io_wqe_cancel_running_work(struct io_wqe *wqe,
- struct io_cb_cancel_data *match)
-{
- rcu_read_lock();
- io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
- rcu_read_unlock();
-}
-
-enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
- void *data, bool cancel_all)
-{
- struct io_cb_cancel_data match = {
- .fn = cancel,
- .data = data,
- .cancel_all = cancel_all,
- };
- int node;
-
- /*
- * First check pending list, if we're lucky we can just remove it
- * from there. CANCEL_OK means that the work is returned as-new,
- * no completion will be posted for it.
- *
- * Then check if a free (going busy) or busy worker has the work
- * currently running. If we find it there, we'll return CANCEL_RUNNING
- * as an indication that we attempted to signal cancellation. The
- * completion will run normally in this case.
- *
- * Do both of these while holding the wqe->lock, to ensure that
- * we'll find a work item regardless of state.
- */
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
-
- io_wqe_cancel_pending_work(wqe, &match);
- if (match.nr_pending && !match.cancel_all)
- return IO_WQ_CANCEL_OK;
-
- raw_spin_lock(&wqe->lock);
- io_wqe_cancel_running_work(wqe, &match);
- raw_spin_unlock(&wqe->lock);
- if (match.nr_running && !match.cancel_all)
- return IO_WQ_CANCEL_RUNNING;
- }
-
- if (match.nr_running)
- return IO_WQ_CANCEL_RUNNING;
- if (match.nr_pending)
- return IO_WQ_CANCEL_OK;
- return IO_WQ_CANCEL_NOTFOUND;
-}
-
-static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
- int sync, void *key)
-{
- struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
- int i;
-
- list_del_init(&wait->entry);
-
- rcu_read_lock();
- for (i = 0; i < IO_WQ_ACCT_NR; i++) {
- struct io_wqe_acct *acct = &wqe->acct[i];
-
- if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags))
- io_wqe_activate_free_worker(wqe, acct);
- }
- rcu_read_unlock();
- return 1;
-}
-
-struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
-{
- int ret, node, i;
- struct io_wq *wq;
-
- if (WARN_ON_ONCE(!data->free_work || !data->do_work))
- return ERR_PTR(-EINVAL);
- if (WARN_ON_ONCE(!bounded))
- return ERR_PTR(-EINVAL);
-
- wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL);
- if (!wq)
- return ERR_PTR(-ENOMEM);
- ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
- if (ret)
- goto err_wq;
-
- refcount_inc(&data->hash->refs);
- wq->hash = data->hash;
- wq->free_work = data->free_work;
- wq->do_work = data->do_work;
-
- ret = -ENOMEM;
- for_each_node(node) {
- struct io_wqe *wqe;
- int alloc_node = node;
-
- if (!node_online(alloc_node))
- alloc_node = NUMA_NO_NODE;
- wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
- if (!wqe)
- goto err;
- if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL))
- goto err;
- cpumask_copy(wqe->cpu_mask, cpumask_of_node(node));
- wq->wqes[node] = wqe;
- wqe->node = alloc_node;
- wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
- wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
- task_rlimit(current, RLIMIT_NPROC);
- INIT_LIST_HEAD(&wqe->wait.entry);
- wqe->wait.func = io_wqe_hash_wake;
- for (i = 0; i < IO_WQ_ACCT_NR; i++) {
- struct io_wqe_acct *acct = &wqe->acct[i];
-
- acct->index = i;
- atomic_set(&acct->nr_running, 0);
- INIT_WQ_LIST(&acct->work_list);
- raw_spin_lock_init(&acct->lock);
- }
- wqe->wq = wq;
- raw_spin_lock_init(&wqe->lock);
- INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
- INIT_LIST_HEAD(&wqe->all_list);
- }
-
- wq->task = get_task_struct(data->task);
- atomic_set(&wq->worker_refs, 1);
- init_completion(&wq->worker_done);
- return wq;
-err:
- io_wq_put_hash(data->hash);
- cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
- for_each_node(node) {
- if (!wq->wqes[node])
- continue;
- free_cpumask_var(wq->wqes[node]->cpu_mask);
- kfree(wq->wqes[node]);
- }
-err_wq:
- kfree(wq);
- return ERR_PTR(ret);
-}
-
-static bool io_task_work_match(struct callback_head *cb, void *data)
-{
- struct io_worker *worker;
-
- if (cb->func != create_worker_cb && cb->func != create_worker_cont)
- return false;
- worker = container_of(cb, struct io_worker, create_work);
- return worker->wqe->wq == data;
-}
-
-void io_wq_exit_start(struct io_wq *wq)
-{
- set_bit(IO_WQ_BIT_EXIT, &wq->state);
-}
-
-static void io_wq_cancel_tw_create(struct io_wq *wq)
-{
- struct callback_head *cb;
-
- while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
- struct io_worker *worker;
-
- worker = container_of(cb, struct io_worker, create_work);
- io_worker_cancel_cb(worker);
- }
-}
-
-static void io_wq_exit_workers(struct io_wq *wq)
-{
- int node;
-
- if (!wq->task)
- return;
-
- io_wq_cancel_tw_create(wq);
-
- rcu_read_lock();
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
-
- io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL);
- }
- rcu_read_unlock();
- io_worker_ref_put(wq);
- wait_for_completion(&wq->worker_done);
-
- for_each_node(node) {
- spin_lock_irq(&wq->hash->wait.lock);
- list_del_init(&wq->wqes[node]->wait.entry);
- spin_unlock_irq(&wq->hash->wait.lock);
- }
- put_task_struct(wq->task);
- wq->task = NULL;
-}
-
-static void io_wq_destroy(struct io_wq *wq)
-{
- int node;
-
- cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
-
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
- struct io_cb_cancel_data match = {
- .fn = io_wq_work_match_all,
- .cancel_all = true,
- };
- io_wqe_cancel_pending_work(wqe, &match);
- free_cpumask_var(wqe->cpu_mask);
- kfree(wqe);
- }
- io_wq_put_hash(wq->hash);
- kfree(wq);
-}
-
-void io_wq_put_and_exit(struct io_wq *wq)
-{
- WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state));
-
- io_wq_exit_workers(wq);
- io_wq_destroy(wq);
-}
-
-struct online_data {
- unsigned int cpu;
- bool online;
-};
-
-static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
-{
- struct online_data *od = data;
-
- if (od->online)
- cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask);
- else
- cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask);
- return false;
-}
-
-static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online)
-{
- struct online_data od = {
- .cpu = cpu,
- .online = online
- };
- int i;
-
- rcu_read_lock();
- for_each_node(i)
- io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od);
- rcu_read_unlock();
- return 0;
-}
-
-static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
-{
- struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
-
- return __io_wq_cpu_online(wq, cpu, true);
-}
-
-static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
-{
- struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
-
- return __io_wq_cpu_online(wq, cpu, false);
-}
-
-int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
-{
- int i;
-
- rcu_read_lock();
- for_each_node(i) {
- struct io_wqe *wqe = wq->wqes[i];
-
- if (mask)
- cpumask_copy(wqe->cpu_mask, mask);
- else
- cpumask_copy(wqe->cpu_mask, cpumask_of_node(i));
- }
- rcu_read_unlock();
- return 0;
-}
-
-/*
- * Set the max number of workers for each accounting class (bounded and
- * unbounded), returning the old values. A zero entry in new_count leaves
- * that class unchanged and only reports the old value.
- */
-int io_wq_max_workers(struct io_wq *wq, int *new_count)
-{
- int prev[IO_WQ_ACCT_NR];
- bool first_node = true;
- int i, node;
-
- BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND);
- BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
- BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2);
-
- for (i = 0; i < IO_WQ_ACCT_NR; i++) {
- if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
- new_count[i] = task_rlimit(current, RLIMIT_NPROC);
- }
-
- for (i = 0; i < IO_WQ_ACCT_NR; i++)
- prev[i] = 0;
-
- rcu_read_lock();
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
- struct io_wqe_acct *acct;
-
- raw_spin_lock(&wqe->lock);
- for (i = 0; i < IO_WQ_ACCT_NR; i++) {
- acct = &wqe->acct[i];
- if (first_node)
- prev[i] = max_t(int, acct->max_workers, prev[i]);
- if (new_count[i])
- acct->max_workers = new_count[i];
- }
- raw_spin_unlock(&wqe->lock);
- first_node = false;
- }
- rcu_read_unlock();
-
- for (i = 0; i < IO_WQ_ACCT_NR; i++)
- new_count[i] = prev[i];
-
- return 0;
-}
-
-static __init int io_wq_init(void)
-{
- int ret;
-
- ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
- io_wq_cpu_online, io_wq_cpu_offline);
- if (ret < 0)
- return ret;
- io_wq_online = ret;
- return 0;
-}
-subsys_initcall(io_wq_init);
diff --git a/fs/io-wq.h b/fs/io-wq.h
deleted file mode 100644
index dbecd27656c7..000000000000
--- a/fs/io-wq.h
+++ /dev/null
@@ -1,227 +0,0 @@
-#ifndef INTERNAL_IO_WQ_H
-#define INTERNAL_IO_WQ_H
-
-#include <linux/refcount.h>
-
-struct io_wq;
-
-enum {
- IO_WQ_WORK_CANCEL = 1,
- IO_WQ_WORK_HASHED = 2,
- IO_WQ_WORK_UNBOUND = 4,
- IO_WQ_WORK_CONCURRENT = 16,
-
- IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
-};
-
-enum io_wq_cancel {
- IO_WQ_CANCEL_OK, /* cancelled before started */
- IO_WQ_CANCEL_RUNNING, /* found running; cancellation attempted */
- IO_WQ_CANCEL_NOTFOUND, /* work not found */
-};
-
-struct io_wq_work_node {
- struct io_wq_work_node *next;
-};
-
-struct io_wq_work_list {
- struct io_wq_work_node *first;
- struct io_wq_work_node *last;
-};
-
-#define wq_list_for_each(pos, prv, head) \
- for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
-
-#define wq_list_for_each_resume(pos, prv) \
- for (; pos; prv = pos, pos = (pos)->next)
-
-#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
-#define INIT_WQ_LIST(list) do { \
- (list)->first = NULL; \
-} while (0)
-
-static inline void wq_list_add_after(struct io_wq_work_node *node,
- struct io_wq_work_node *pos,
- struct io_wq_work_list *list)
-{
- struct io_wq_work_node *next = pos->next;
-
- pos->next = node;
- node->next = next;
- if (!next)
- list->last = node;
-}
-
-/**
- * wq_list_merge - merge the second list into the first one.
- * @list0: the first list
- * @list1: the second list
- * Return the first node of the merged list.
- */
-static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0,
- struct io_wq_work_list *list1)
-{
- struct io_wq_work_node *ret;
-
- if (!list0->first) {
- ret = list1->first;
- } else {
- ret = list0->first;
- list0->last->next = list1->first;
- }
- INIT_WQ_LIST(list0);
- INIT_WQ_LIST(list1);
- return ret;
-}
-
-static inline void wq_list_add_tail(struct io_wq_work_node *node,
- struct io_wq_work_list *list)
-{
- node->next = NULL;
- if (!list->first) {
- list->last = node;
- WRITE_ONCE(list->first, node);
- } else {
- list->last->next = node;
- list->last = node;
- }
-}
-
-static inline void wq_list_add_head(struct io_wq_work_node *node,
- struct io_wq_work_list *list)
-{
- node->next = list->first;
- if (!node->next)
- list->last = node;
- WRITE_ONCE(list->first, node);
-}
-
-static inline void wq_list_cut(struct io_wq_work_list *list,
- struct io_wq_work_node *last,
- struct io_wq_work_node *prev)
-{
- /* first in the list, if prev==NULL */
- if (!prev)
- WRITE_ONCE(list->first, last->next);
- else
- prev->next = last->next;
-
- if (last == list->last)
- list->last = prev;
- last->next = NULL;
-}
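wq_list_cut() unlinks the whole span (prev, last] in O(1); io_get_next_work() earlier uses it to grab an entire run of same-hash items at once. A standalone demo of a first/last singly linked list with the same add-tail and cut semantics:

#include <stdio.h>

struct node { struct node *next; };
struct list { struct node *first, *last; };

static void add_tail(struct node *n, struct list *l)
{
	n->next = NULL;
	if (!l->first)
		l->first = n;
	else
		l->last->next = n;
	l->last = n;
}

/* Unlink everything after @prev up to and including @last. */
static void cut(struct list *l, struct node *last, struct node *prev)
{
	if (!prev)
		l->first = last->next;
	else
		prev->next = last->next;
	if (last == l->last)
		l->last = prev;
	last->next = NULL;
}

int main(void)
{
	struct node a, b, c;
	struct list l = { NULL, NULL };

	add_tail(&a, &l);
	add_tail(&b, &l);
	add_tail(&c, &l);
	cut(&l, &b, &a);	/* removes just b: list is now a -> c */
	printf("%d\n", l.first == &a && a.next == &c && l.last == &c);
	return 0;
}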
-
-static inline void __wq_list_splice(struct io_wq_work_list *list,
- struct io_wq_work_node *to)
-{
- list->last->next = to->next;
- to->next = list->first;
- INIT_WQ_LIST(list);
-}
-
-static inline bool wq_list_splice(struct io_wq_work_list *list,
- struct io_wq_work_node *to)
-{
- if (!wq_list_empty(list)) {
- __wq_list_splice(list, to);
- return true;
- }
- return false;
-}
-
-static inline void wq_stack_add_head(struct io_wq_work_node *node,
- struct io_wq_work_node *stack)
-{
- node->next = stack->next;
- stack->next = node;
-}
-
-static inline void wq_list_del(struct io_wq_work_list *list,
- struct io_wq_work_node *node,
- struct io_wq_work_node *prev)
-{
- wq_list_cut(list, node, prev);
-}
-
-static inline
-struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
-{
- struct io_wq_work_node *node = stack->next;
-
- stack->next = node->next;
- return node;
-}
-
-struct io_wq_work {
- struct io_wq_work_node list;
- unsigned flags;
-};
-
-static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
-{
- if (!work->list.next)
- return NULL;
-
- return container_of(work->list.next, struct io_wq_work, list);
-}
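wq_next_work() maps an embedded list node back to its io_wq_work with container_of(), which is plain offset arithmetic. A freestanding version for illustration (the kernel macro adds type checking that this simplified one omits):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_node { struct work_node *next; };

struct work {
	int id;
	struct work_node list;	/* embedded node, not a pointer */
};

int main(void)
{
	struct work w = { .id = 7 };
	struct work_node *node = &w.list;

	/* Walk back from the member to the enclosing struct. */
	struct work *back = container_of(node, struct work, list);
	printf("%d\n", back->id);	/* prints 7 */
	return 0;
}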
-
-typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
-typedef void (io_wq_work_fn)(struct io_wq_work *);
-
-struct io_wq_hash {
- refcount_t refs;
- unsigned long map;
- struct wait_queue_head wait;
-};
-
-static inline void io_wq_put_hash(struct io_wq_hash *hash)
-{
- if (refcount_dec_and_test(&hash->refs))
- kfree(hash);
-}
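io_wq_put_hash() is the classic put-side refcount idiom: free only when the final reference drops. Roughly the same thing with C11 atomics (refcount_t additionally saturates on over/underflow, which this sketch leaves out):

#include <stdatomic.h>
#include <stdlib.h>

struct hash_set {
	atomic_int refs;
	/* ... map, wait queue ... */
};

static void put_hash(struct hash_set *h)
{
	/*
	 * acq_rel: release orders our prior writes before the drop;
	 * acquire on the final decrement orders the free after every
	 * other thread's put.
	 */
	if (atomic_fetch_sub_explicit(&h->refs, 1, memory_order_acq_rel) == 1)
		free(h);
}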
-
-struct io_wq_data {
- struct io_wq_hash *hash;
- struct task_struct *task;
- io_wq_work_fn *do_work;
- free_work_fn *free_work;
-};
-
-struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
-void io_wq_exit_start(struct io_wq *wq);
-void io_wq_put_and_exit(struct io_wq *wq);
-
-void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
-void io_wq_hash_work(struct io_wq_work *work, void *val);
-
-int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
-int io_wq_max_workers(struct io_wq *wq, int *new_count);
-
-static inline bool io_wq_is_hashed(struct io_wq_work *work)
-{
- return work->flags & IO_WQ_WORK_HASHED;
-}
-
-typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
-
-enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
- void *data, bool cancel_all);
-
-#if defined(CONFIG_IO_WQ)
-extern void io_wq_worker_sleeping(struct task_struct *);
-extern void io_wq_worker_running(struct task_struct *);
-#else
-static inline void io_wq_worker_sleeping(struct task_struct *tsk)
-{
-}
-static inline void io_wq_worker_running(struct task_struct *tsk)
-{
-}
-#endif
-
-static inline bool io_wq_current_is_worker(void)
-{
- return in_task() && (current->flags & PF_IO_WORKER) &&
- current->worker_private;
-}
-#endif
diff --git a/fs/io_uring.c b/fs/io_uring.c
deleted file mode 100644
index a8413f006417..000000000000
--- a/fs/io_uring.c
+++ /dev/null
@@ -1,12038 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Shared application/kernel submission and completion ring pairs, for
- * supporting fast/efficient IO.
- *
- * A note on the read/write ordering memory barriers that are matched between
- * the application and kernel side.
- *
- * After the application reads the CQ ring tail, it must use an
- * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
- * before writing the tail (using smp_load_acquire to read the tail will
- * do). It also needs a smp_mb() before updating CQ head (ordering the
- * entry load(s) with the head store), pairing with an implicit barrier
- * through a control-dependency in io_get_cqe (smp_store_release to
- * store head will do). Failure to do so could lead to reading invalid
- * CQ entries.
- *
- * Likewise, the application must use an appropriate smp_wmb() before
- * writing the SQ tail (ordering SQ entry stores with the tail store),
- * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
- * to store the tail will do). And it needs a barrier ordering the SQ
- * head load before writing new SQ entries (smp_load_acquire to read
- * head will do).
- *
- * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
- * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
- * updating the SQ tail; a full memory barrier smp_mb() is needed
- * between.
- *
- * Also see the examples in the liburing library:
- *
- * git://git.kernel.dk/liburing
- *
- * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
- * from data shared between the kernel and application. This is done both
- * for ordering purposes, but also to ensure that once a value is loaded from
- * data that the application could potentially modify, it remains stable.
- *
- * Copyright (C) 2018-2019 Jens Axboe
- * Copyright (c) 2018-2019 Christoph Hellwig
- */
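These rules are what liburing implements on the application side. A hedged sketch of CQ consumption with C11 acquire/release standing in for the smp_* macros; the field names follow struct io_rings below, while khead/ktail are this sketch's own names for the mmapped head/tail words:

#include <stdatomic.h>
#include <stdint.h>
#include <stddef.h>

struct cqe { uint64_t user_data; int32_t res; uint32_t flags; };

/* Pointers into the mmapped CQ ring, set up at io_uring_setup() time. */
struct cq {
	_Atomic uint32_t *khead;	/* written by us, read by kernel */
	_Atomic uint32_t *ktail;	/* written by kernel, read by us */
	uint32_t ring_mask;
	struct cqe *cqes;
};

static struct cqe *cq_peek(struct cq *cq)
{
	/* acquire pairs with the kernel's release store of the tail,
	 * making the CQE contents visible before we read them */
	uint32_t tail = atomic_load_explicit(cq->ktail, memory_order_acquire);
	uint32_t head = atomic_load_explicit(cq->khead, memory_order_relaxed);

	if (head == tail)
		return NULL;
	return &cq->cqes[head & cq->ring_mask];
}

static void cq_advance(struct cq *cq)
{
	uint32_t head = atomic_load_explicit(cq->khead, memory_order_relaxed);

	/* release orders our CQE loads before publishing the new head */
	atomic_store_explicit(cq->khead, head + 1, memory_order_release);
}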
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/syscalls.h>
-#include <linux/compat.h>
-#include <net/compat.h>
-#include <linux/refcount.h>
-#include <linux/uio.h>
-#include <linux/bits.h>
-
-#include <linux/sched/signal.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/fdtable.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/blk-mq.h>
-#include <linux/bvec.h>
-#include <linux/net.h>
-#include <net/sock.h>
-#include <net/af_unix.h>
-#include <net/scm.h>
-#include <net/busy_poll.h>
-#include <linux/anon_inodes.h>
-#include <linux/sched/mm.h>
-#include <linux/uaccess.h>
-#include <linux/nospec.h>
-#include <linux/sizes.h>
-#include <linux/hugetlb.h>
-#include <linux/highmem.h>
-#include <linux/namei.h>
-#include <linux/fsnotify.h>
-#include <linux/fadvise.h>
-#include <linux/eventpoll.h>
-#include <linux/splice.h>
-#include <linux/task_work.h>
-#include <linux/pagemap.h>
-#include <linux/io_uring.h>
-#include <linux/audit.h>
-#include <linux/security.h>
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/io_uring.h>
-
-#include <uapi/linux/io_uring.h>
-
-#include "internal.h"
-#include "io-wq.h"
-
-#define IORING_MAX_ENTRIES 32768
-#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
-#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
-
-/* only define max */
-#define IORING_MAX_FIXED_FILES (1U << 15)
-#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
- IORING_REGISTER_LAST + IORING_OP_LAST)
-
-#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
-#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
-#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
-
-#define IORING_MAX_REG_BUFFERS (1U << 14)
-
-#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC)
-
-#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
- IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
-
-#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
- REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
- REQ_F_ASYNC_DATA)
-
-#define IO_TCTX_REFS_CACHE_NR (1U << 10)
-
-struct io_uring {
- u32 head ____cacheline_aligned_in_smp;
- u32 tail ____cacheline_aligned_in_smp;
-};
-
-/*
- * This data is shared with the application through the mmap at offsets
- * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
- *
- * The offsets to the member fields are published through struct
- * io_sqring_offsets when calling io_uring_setup.
- */
-struct io_rings {
- /*
- * Head and tail offsets into the ring; the offsets need to be
- * masked to get valid indices.
- *
- * The kernel controls head of the sq ring and the tail of the cq ring,
- * and the application controls tail of the sq ring and the head of the
- * cq ring.
- */
- struct io_uring sq, cq;
- /*
- * Bitmasks to apply to head and tail offsets (constant, equals
- * ring_entries - 1)
- */
- u32 sq_ring_mask, cq_ring_mask;
- /* Ring sizes (constant, power of 2) */
- u32 sq_ring_entries, cq_ring_entries;
- /*
- * Number of invalid entries dropped by the kernel due to
- * invalid index stored in array
- *
- * Written by the kernel, shouldn't be modified by the
- * application (i.e. get number of "new events" by comparing to
- * cached value).
- *
- * After the application reads a new SQ head value, this
- * counter includes all submissions that were dropped before
- * reaching the new SQ head (and possibly more).
- */
- u32 sq_dropped;
- /*
- * Runtime SQ flags
- *
- * Written by the kernel, shouldn't be modified by the
- * application.
- *
- * The application needs a full memory barrier before checking
- * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
- */
- u32 sq_flags;
- /*
- * Runtime CQ flags
- *
- * Written by the application, shouldn't be modified by the
- * kernel.
- */
- u32 cq_flags;
- /*
- * Number of completion events lost because the queue was full;
- * this should be avoided by the application by making sure
- * there are not more requests pending than there is space in
- * the completion queue.
- *
- * Written by the kernel, shouldn't be modified by the
- * application (i.e. get number of "new events" by comparing to
- * cached value).
- *
- * As completion events come in out of order this counter is not
- * ordered with any other data.
- */
- u32 cq_overflow;
- /*
- * Ring buffer of completion events.
- *
- * The kernel writes completion events fresh every time they are
- * produced, so the application is allowed to modify pending
- * entries.
- */
- struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
-};
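The submission side mirrors the CQ rules above: the application owns the SQ tail, the kernel owns the SQ head. A sketch of one submission under the same conventions (names invented for this sketch; a real ring also routes through the sq_array indirection described further down):

#include <stdatomic.h>
#include <stdint.h>

struct sqe { uint8_t opcode; /* ... abbreviated ... */ uint64_t user_data; };

struct sq {
	_Atomic uint32_t *khead;	/* written by kernel */
	_Atomic uint32_t *ktail;	/* written by us */
	uint32_t ring_mask;
	uint32_t *array;		/* index ring at IORING_OFF_SQ_RING */
	struct sqe *sqes;		/* SQE array at IORING_OFF_SQES */
};

static int sq_submit_one(struct sq *sq, const struct sqe *src)
{
	uint32_t tail = atomic_load_explicit(sq->ktail, memory_order_relaxed);
	/* acquire: don't reuse a slot before the kernel is done with it */
	uint32_t head = atomic_load_explicit(sq->khead, memory_order_acquire);

	if (tail - head == sq->ring_mask + 1)
		return -1;			/* ring full */

	uint32_t idx = tail & sq->ring_mask;
	sq->sqes[idx] = *src;
	sq->array[idx] = idx;			/* simple 1:1 indirection */
	/* release: SQE stores become visible before the new tail */
	atomic_store_explicit(sq->ktail, tail + 1, memory_order_release);
	return 0;
}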
-
-enum io_uring_cmd_flags {
- IO_URING_F_COMPLETE_DEFER = 1,
- IO_URING_F_UNLOCKED = 2,
- /* int's last bit, sign checks are usually faster than a bit test */
- IO_URING_F_NONBLOCK = INT_MIN,
-};
-
-struct io_mapped_ubuf {
- u64 ubuf;
- u64 ubuf_end;
- unsigned int nr_bvecs;
- unsigned long acct_pages;
- struct bio_vec bvec[];
-};
-
-struct io_ring_ctx;
-
-struct io_overflow_cqe {
- struct io_uring_cqe cqe;
- struct list_head list;
-};
-
-struct io_fixed_file {
- /* file * with additional FFS_* flags */
- unsigned long file_ptr;
-};
-
-struct io_rsrc_put {
- struct list_head list;
- u64 tag;
- union {
- void *rsrc;
- struct file *file;
- struct io_mapped_ubuf *buf;
- };
-};
-
-struct io_file_table {
- struct io_fixed_file *files;
-};
-
-struct io_rsrc_node {
- struct percpu_ref refs;
- struct list_head node;
- struct list_head rsrc_list;
- struct io_rsrc_data *rsrc_data;
- struct llist_node llist;
- bool done;
-};
-
-typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
-
-struct io_rsrc_data {
- struct io_ring_ctx *ctx;
-
- u64 **tags;
- unsigned int nr;
- rsrc_put_fn *do_put;
- atomic_t refs;
- struct completion done;
- bool quiesce;
-};
-
-struct io_buffer_list {
- struct list_head list;
- struct list_head buf_list;
- __u16 bgid;
-};
-
-struct io_buffer {
- struct list_head list;
- __u64 addr;
- __u32 len;
- __u16 bid;
- __u16 bgid;
-};
-
-struct io_restriction {
- DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
- DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
- u8 sqe_flags_allowed;
- u8 sqe_flags_required;
- bool registered;
-};
-
-enum {
- IO_SQ_THREAD_SHOULD_STOP = 0,
- IO_SQ_THREAD_SHOULD_PARK,
-};
-
-struct io_sq_data {
- refcount_t refs;
- atomic_t park_pending;
- struct mutex lock;
-
- /* ctx's that are using this sqd */
- struct list_head ctx_list;
-
- struct task_struct *thread;
- struct wait_queue_head wait;
-
- unsigned sq_thread_idle;
- int sq_cpu;
- pid_t task_pid;
- pid_t task_tgid;
-
- unsigned long state;
- struct completion exited;
-};
-
-#define IO_COMPL_BATCH 32
-#define IO_REQ_CACHE_SIZE 32
-#define IO_REQ_ALLOC_BATCH 8
-
-struct io_submit_link {
- struct io_kiocb *head;
- struct io_kiocb *last;
-};
-
-struct io_submit_state {
- /* inline/task_work completion list, under ->uring_lock */
- struct io_wq_work_node free_list;
- /* batch completion logic */
- struct io_wq_work_list compl_reqs;
- struct io_submit_link link;
-
- bool plug_started;
- bool need_plug;
- bool flush_cqes;
- unsigned short submit_nr;
- struct blk_plug plug;
-};
-
-struct io_ev_fd {
- struct eventfd_ctx *cq_ev_fd;
- unsigned int eventfd_async: 1;
- struct rcu_head rcu;
-};
-
-#define IO_BUFFERS_HASH_BITS 5
-
-struct io_ring_ctx {
- /* const or read-mostly hot data */
- struct {
- struct percpu_ref refs;
-
- struct io_rings *rings;
- unsigned int flags;
- unsigned int compat: 1;
- unsigned int drain_next: 1;
- unsigned int restricted: 1;
- unsigned int off_timeout_used: 1;
- unsigned int drain_active: 1;
- unsigned int drain_disabled: 1;
- unsigned int has_evfd: 1;
- } ____cacheline_aligned_in_smp;
-
- /* submission data */
- struct {
- struct mutex uring_lock;
-
- /*
- * Ring buffer of indices into array of io_uring_sqe, which is
- * mmapped by the application using the IORING_OFF_SQES offset.
- *
- * This indirection could e.g. be used to assign fixed
- * io_uring_sqe entries to operations and only submit them to
- * the queue when needed.
- *
- * The kernel modifies neither the indices array nor the entries
- * array.
- */
- u32 *sq_array;
- struct io_uring_sqe *sq_sqes;
- unsigned cached_sq_head;
- unsigned sq_entries;
- struct list_head defer_list;
-
- /*
- * Fixed resources fast path, should be accessed only under
- * uring_lock, and updated through io_uring_register(2)
- */
- struct io_rsrc_node *rsrc_node;
- int rsrc_cached_refs;
- struct io_file_table file_table;
- unsigned nr_user_files;
- unsigned nr_user_bufs;
- struct io_mapped_ubuf **user_bufs;
-
- struct io_submit_state submit_state;
- struct list_head timeout_list;
- struct list_head ltimeout_list;
- struct list_head cq_overflow_list;
- struct list_head *io_buffers;
- struct list_head io_buffers_cache;
- struct list_head apoll_cache;
- struct xarray personalities;
- u32 pers_next;
- unsigned sq_thread_idle;
- } ____cacheline_aligned_in_smp;
-
- /* IRQ completion list, under ->completion_lock */
- struct io_wq_work_list locked_free_list;
- unsigned int locked_free_nr;
-
- const struct cred *sq_creds; /* cred used for __io_sq_thread() */
- struct io_sq_data *sq_data; /* if using sq thread polling */
-
- struct wait_queue_head sqo_sq_wait;
- struct list_head sqd_list;
-
- unsigned long check_cq_overflow;
-#ifdef CONFIG_NET_RX_BUSY_POLL
- /* used to track busy poll napi_id */
- struct list_head napi_list;
- spinlock_t napi_lock; /* napi_list lock */
-#endif
-
- struct {
- unsigned cached_cq_tail;
- unsigned cq_entries;
- struct io_ev_fd __rcu *io_ev_fd;
- struct wait_queue_head cq_wait;
- unsigned cq_extra;
- atomic_t cq_timeouts;
- unsigned cq_last_tm_flush;
- } ____cacheline_aligned_in_smp;
-
- struct {
- spinlock_t completion_lock;
-
- spinlock_t timeout_lock;
-
- /*
- * ->iopoll_list is protected by the ctx->uring_lock for
- * io_uring instances that don't use IORING_SETUP_SQPOLL.
- * For SQPOLL, only the single threaded io_sq_thread() will
- * manipulate the list, hence no extra locking is needed there.
- */
- struct io_wq_work_list iopoll_list;
- struct hlist_head *cancel_hash;
- unsigned cancel_hash_bits;
- bool poll_multi_queue;
-
- struct list_head io_buffers_comp;
- } ____cacheline_aligned_in_smp;
-
- struct io_restriction restrictions;
-
- /* slow path rsrc auxiliary data, used by update/register */
- struct {
- struct io_rsrc_node *rsrc_backup_node;
- struct io_mapped_ubuf *dummy_ubuf;
- struct io_rsrc_data *file_data;
- struct io_rsrc_data *buf_data;
-
- struct delayed_work rsrc_put_work;
- struct llist_head rsrc_put_llist;
- struct list_head rsrc_ref_list;
- spinlock_t rsrc_ref_lock;
-
- struct list_head io_buffers_pages;
- };
-
- /* Keep this last, we don't need it for the fast path */
- struct {
- #if defined(CONFIG_UNIX)
- struct socket *ring_sock;
- #endif
- /* hashed buffered write serialization */
- struct io_wq_hash *hash_map;
-
- /* Only used for accounting purposes */
- struct user_struct *user;
- struct mm_struct *mm_account;
-
- /* ctx exit and cancelation */
- struct llist_head fallback_llist;
- struct delayed_work fallback_work;
- struct work_struct exit_work;
- struct list_head tctx_list;
- struct completion ref_comp;
- u32 iowq_limits[2];
- bool iowq_limits_set;
- };
-};
-
-/*
- * Arbitrary limit, can be raised if need be
- */
-#define IO_RINGFD_REG_MAX 16
-
-struct io_uring_task {
- /* submission side */
- int cached_refs;
- struct xarray xa;
- struct wait_queue_head wait;
- const struct io_ring_ctx *last;
- struct io_wq *io_wq;
- struct percpu_counter inflight;
- atomic_t inflight_tracked;
- atomic_t in_idle;
-
- spinlock_t task_lock;
- struct io_wq_work_list task_list;
- struct io_wq_work_list prior_task_list;
- struct callback_head task_work;
- struct file **registered_rings;
- bool task_running;
-};
-
-/*
- * First field must be the file pointer in all the
- * iocb unions! See also 'struct kiocb' in <linux/fs.h>
- */
-struct io_poll_iocb {
- struct file *file;
- struct wait_queue_head *head;
- __poll_t events;
- struct wait_queue_entry wait;
-};
-
-struct io_poll_update {
- struct file *file;
- u64 old_user_data;
- u64 new_user_data;
- __poll_t events;
- bool update_events;
- bool update_user_data;
-};
-
-struct io_close {
- struct file *file;
- int fd;
- u32 file_slot;
-};
-
-struct io_timeout_data {
- struct io_kiocb *req;
- struct hrtimer timer;
- struct timespec64 ts;
- enum hrtimer_mode mode;
- u32 flags;
-};
-
-struct io_accept {
- struct file *file;
- struct sockaddr __user *addr;
- int __user *addr_len;
- int flags;
- u32 file_slot;
- unsigned long nofile;
-};
-
-struct io_sync {
- struct file *file;
- loff_t len;
- loff_t off;
- int flags;
- int mode;
-};
-
-struct io_cancel {
- struct file *file;
- u64 addr;
-};
-
-struct io_timeout {
- struct file *file;
- u32 off;
- u32 target_seq;
- struct list_head list;
- /* head of the link, used by linked timeouts only */
- struct io_kiocb *head;
- /* for linked completions */
- struct io_kiocb *prev;
-};
-
-struct io_timeout_rem {
- struct file *file;
- u64 addr;
-
- /* timeout update */
- struct timespec64 ts;
- u32 flags;
- bool ltimeout;
-};
-
-struct io_rw {
- /* NOTE: kiocb has the file as the first member, so don't do it here */
- struct kiocb kiocb;
- u64 addr;
- u64 len;
-};
-
-struct io_connect {
- struct file *file;
- struct sockaddr __user *addr;
- int addr_len;
-};
-
-struct io_sr_msg {
- struct file *file;
- union {
- struct compat_msghdr __user *umsg_compat;
- struct user_msghdr __user *umsg;
- void __user *buf;
- };
- int msg_flags;
- int bgid;
- size_t len;
- size_t done_io;
-};
-
-struct io_open {
- struct file *file;
- int dfd;
- u32 file_slot;
- struct filename *filename;
- struct open_how how;
- unsigned long nofile;
-};
-
-struct io_rsrc_update {
- struct file *file;
- u64 arg;
- u32 nr_args;
- u32 offset;
-};
-
-struct io_fadvise {
- struct file *file;
- u64 offset;
- u32 len;
- u32 advice;
-};
-
-struct io_madvise {
- struct file *file;
- u64 addr;
- u32 len;
- u32 advice;
-};
-
-struct io_epoll {
- struct file *file;
- int epfd;
- int op;
- int fd;
- struct epoll_event event;
-};
-
-struct io_splice {
- struct file *file_out;
- struct file *file_in;
- loff_t off_out;
- loff_t off_in;
- u64 len;
- unsigned int flags;
-};
-
-struct io_provide_buf {
- struct file *file;
- __u64 addr;
- __u32 len;
- __u32 bgid;
- __u16 nbufs;
- __u16 bid;
-};
-
-struct io_statx {
- struct file *file;
- int dfd;
- unsigned int mask;
- unsigned int flags;
- struct filename *filename;
- struct statx __user *buffer;
-};
-
-struct io_shutdown {
- struct file *file;
- int how;
-};
-
-struct io_rename {
- struct file *file;
- int old_dfd;
- int new_dfd;
- struct filename *oldpath;
- struct filename *newpath;
- int flags;
-};
-
-struct io_unlink {
- struct file *file;
- int dfd;
- int flags;
- struct filename *filename;
-};
-
-struct io_mkdir {
- struct file *file;
- int dfd;
- umode_t mode;
- struct filename *filename;
-};
-
-struct io_symlink {
- struct file *file;
- int new_dfd;
- struct filename *oldpath;
- struct filename *newpath;
-};
-
-struct io_hardlink {
- struct file *file;
- int old_dfd;
- int new_dfd;
- struct filename *oldpath;
- struct filename *newpath;
- int flags;
-};
-
-struct io_msg {
- struct file *file;
- u64 user_data;
- u32 len;
-};
-
-struct io_async_connect {
- struct sockaddr_storage address;
-};
-
-struct io_async_msghdr {
- struct iovec fast_iov[UIO_FASTIOV];
- /* points to an allocated iov, if NULL we use fast_iov instead */
- struct iovec *free_iov;
- struct sockaddr __user *uaddr;
- struct msghdr msg;
- struct sockaddr_storage addr;
-};
-
-struct io_rw_state {
- struct iov_iter iter;
- struct iov_iter_state iter_state;
- struct iovec fast_iov[UIO_FASTIOV];
-};
-
-struct io_async_rw {
- struct io_rw_state s;
- const struct iovec *free_iovec;
- size_t bytes_done;
- struct wait_page_queue wpq;
-};
-
-enum {
- REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
- REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
- REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
- REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
- REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
- REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
- REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,
-
- /* first byte is taken by user flags, shift it to not overlap */
- REQ_F_FAIL_BIT = 8,
- REQ_F_INFLIGHT_BIT,
- REQ_F_CUR_POS_BIT,
- REQ_F_NOWAIT_BIT,
- REQ_F_LINK_TIMEOUT_BIT,
- REQ_F_NEED_CLEANUP_BIT,
- REQ_F_POLLED_BIT,
- REQ_F_BUFFER_SELECTED_BIT,
- REQ_F_COMPLETE_INLINE_BIT,
- REQ_F_REISSUE_BIT,
- REQ_F_CREDS_BIT,
- REQ_F_REFCOUNT_BIT,
- REQ_F_ARM_LTIMEOUT_BIT,
- REQ_F_ASYNC_DATA_BIT,
- REQ_F_SKIP_LINK_CQES_BIT,
- REQ_F_SINGLE_POLL_BIT,
- REQ_F_DOUBLE_POLL_BIT,
- REQ_F_PARTIAL_IO_BIT,
- /* keep async read/write and isreg together and in order */
- REQ_F_SUPPORT_NOWAIT_BIT,
- REQ_F_ISREG_BIT,
-
- /* not a real bit, just to check we're not overflowing the space */
- __REQ_F_LAST_BIT,
-};
-
-enum {
- /* ctx owns file */
- REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
- /* drain existing IO first */
- REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
- /* linked sqes */
- REQ_F_LINK = BIT(REQ_F_LINK_BIT),
- /* doesn't sever on completion < 0 */
- REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
- /* IOSQE_ASYNC */
- REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
- /* IOSQE_BUFFER_SELECT */
- REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
- /* IOSQE_CQE_SKIP_SUCCESS */
- REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),
-
- /* fail rest of links */
- REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
- /* on inflight list, should be cancelled and waited on exit reliably */
- REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
- /* read/write uses file position */
- REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
- /* must not punt to workers */
- REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
- /* has or had linked timeout */
- REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
- /* needs cleanup */
- REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
- /* already went through poll handler */
- REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
- /* buffer already selected */
- REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
- /* completion is deferred through io_comp_state */
- REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
- /* caller should reissue async */
- REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
- /* supports async reads/writes */
- REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
- /* regular file */
- REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
- /* has creds assigned */
- REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
- /* skip refcounting if not set */
- REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
- /* there is a linked timeout that has to be armed */
- REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
- /* ->async_data allocated */
- REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
- /* don't post CQEs while failing linked requests */
- REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
- /* single poll may be active */
- REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
- /* double poll may be active */
- REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
- /* request has already done partial IO */
- REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
-};
-
-struct async_poll {
- struct io_poll_iocb poll;
- struct io_poll_iocb *double_poll;
-};
-
-typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
-
-struct io_task_work {
- union {
- struct io_wq_work_node node;
- struct llist_node fallback_node;
- };
- io_req_tw_func_t func;
-};
-
-enum {
- IORING_RSRC_FILE = 0,
- IORING_RSRC_BUFFER = 1,
-};
-
-/*
- * NOTE! Each of the iocb union members has the file pointer
- * as the first entry in their struct definition. So you can
- * access the file pointer through any of the sub-structs,
- * or directly as just 'file' in this struct.
- */
-struct io_kiocb {
- union {
- struct file *file;
- struct io_rw rw;
- struct io_poll_iocb poll;
- struct io_poll_update poll_update;
- struct io_accept accept;
- struct io_sync sync;
- struct io_cancel cancel;
- struct io_timeout timeout;
- struct io_timeout_rem timeout_rem;
- struct io_connect connect;
- struct io_sr_msg sr_msg;
- struct io_open open;
- struct io_close close;
- struct io_rsrc_update rsrc_update;
- struct io_fadvise fadvise;
- struct io_madvise madvise;
- struct io_epoll epoll;
- struct io_splice splice;
- struct io_provide_buf pbuf;
- struct io_statx statx;
- struct io_shutdown shutdown;
- struct io_rename rename;
- struct io_unlink unlink;
- struct io_mkdir mkdir;
- struct io_symlink symlink;
- struct io_hardlink hardlink;
- struct io_msg msg;
- };
-
- u8 opcode;
- /* polled IO has completed */
- u8 iopoll_completed;
- u16 buf_index;
- unsigned int flags;
-
- u64 user_data;
- u32 result;
- u32 cflags;
-
- struct io_ring_ctx *ctx;
- struct task_struct *task;
-
- struct percpu_ref *fixed_rsrc_refs;
- /* store used ubuf, so we can prevent reloading */
- struct io_mapped_ubuf *imu;
-
- /* used by request caches, completion batching and iopoll */
- struct io_wq_work_node comp_list;
- atomic_t refs;
- atomic_t poll_refs;
- struct io_task_work io_task_work;
- /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
- struct hlist_node hash_node;
- /* internal polling, see IORING_FEAT_FAST_POLL */
- struct async_poll *apoll;
- /* opcode allocated if it needs to store data for async defer */
- void *async_data;
- /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
- struct io_buffer *kbuf;
- /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
- struct io_kiocb *link;
- /* custom credentials, valid IFF REQ_F_CREDS is set */
- const struct cred *creds;
- struct io_wq_work work;
-};
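The NOTE above works because every union member places the file pointer first, so it aliases at offset zero regardless of which op struct is live. A toy demonstration of that convention, with invented struct names:

#include <stdio.h>

struct file;	/* opaque, as in the kernel */

struct op_read  { struct file *file; unsigned long long addr; };
struct op_close { struct file *file; int fd; };

struct request {
	union {
		struct file *file;	/* usable through any member below */
		struct op_read  rw;
		struct op_close close;
	};
};

int main(void)
{
	struct request req = { .close = { .file = (struct file *)0x1, .fd = 3 } };

	/* The first member of every op struct is the file pointer, so
	 * req.file reads the same bytes no matter which op is live. */
	printf("%p\n", (void *)req.file);
	return 0;
}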
-
-struct io_tctx_node {
- struct list_head ctx_node;
- struct task_struct *task;
- struct io_ring_ctx *ctx;
-};
-
-struct io_defer_entry {
- struct list_head list;
- struct io_kiocb *req;
- u32 seq;
-};
-
-struct io_op_def {
- /* needs req->file assigned */
- unsigned needs_file : 1;
- /* should block plug */
- unsigned plug : 1;
- /* hash wq insertion if file is a regular file */
- unsigned hash_reg_file : 1;
- /* unbound wq insertion if file is a non-regular file */
- unsigned unbound_nonreg_file : 1;
- /* set if opcode supports polled "wait" */
- unsigned pollin : 1;
- unsigned pollout : 1;
- unsigned poll_exclusive : 1;
- /* op supports buffer selection */
- unsigned buffer_select : 1;
- /* do async prep if the request is going to be punted */
- unsigned needs_async_setup : 1;
- /* opcode is not supported by this kernel */
- unsigned not_supported : 1;
- /* skip auditing */
- unsigned audit_skip : 1;
- /* size of async data needed, if any */
- unsigned short async_size;
-};
-
-static const struct io_op_def io_op_defs[] = {
- [IORING_OP_NOP] = {},
- [IORING_OP_READV] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollin = 1,
- .buffer_select = 1,
- .needs_async_setup = 1,
- .plug = 1,
- .audit_skip = 1,
- .async_size = sizeof(struct io_async_rw),
- },
- [IORING_OP_WRITEV] = {
- .needs_file = 1,
- .hash_reg_file = 1,
- .unbound_nonreg_file = 1,
- .pollout = 1,
- .needs_async_setup = 1,
- .plug = 1,
- .audit_skip = 1,
- .async_size = sizeof(struct io_async_rw),
- },
- [IORING_OP_FSYNC] = {
- .needs_file = 1,
- .audit_skip = 1,
- },
- [IORING_OP_READ_FIXED] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollin = 1,
- .plug = 1,
- .audit_skip = 1,
- .async_size = sizeof(struct io_async_rw),
- },
- [IORING_OP_WRITE_FIXED] = {
- .needs_file = 1,
- .hash_reg_file = 1,
- .unbound_nonreg_file = 1,
- .pollout = 1,
- .plug = 1,
- .audit_skip = 1,
- .async_size = sizeof(struct io_async_rw),
- },
- [IORING_OP_POLL_ADD] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .audit_skip = 1,
- },
- [IORING_OP_POLL_REMOVE] = {
- .audit_skip = 1,
- },
- [IORING_OP_SYNC_FILE_RANGE] = {
- .needs_file = 1,
- .audit_skip = 1,
- },
- [IORING_OP_SENDMSG] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollout = 1,
- .needs_async_setup = 1,
- .async_size = sizeof(struct io_async_msghdr),
- },
- [IORING_OP_RECVMSG] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollin = 1,
- .buffer_select = 1,
- .needs_async_setup = 1,
- .async_size = sizeof(struct io_async_msghdr),
- },
- [IORING_OP_TIMEOUT] = {
- .audit_skip = 1,
- .async_size = sizeof(struct io_timeout_data),
- },
- [IORING_OP_TIMEOUT_REMOVE] = {
- /* used by timeout updates' prep() */
- .audit_skip = 1,
- },
- [IORING_OP_ACCEPT] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollin = 1,
- .poll_exclusive = 1,
- },
- [IORING_OP_ASYNC_CANCEL] = {
- .audit_skip = 1,
- },
- [IORING_OP_LINK_TIMEOUT] = {
- .audit_skip = 1,
- .async_size = sizeof(struct io_timeout_data),
- },
- [IORING_OP_CONNECT] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollout = 1,
- .needs_async_setup = 1,
- .async_size = sizeof(struct io_async_connect),
- },
- [IORING_OP_FALLOCATE] = {
- .needs_file = 1,
- },
- [IORING_OP_OPENAT] = {},
- [IORING_OP_CLOSE] = {},
- [IORING_OP_FILES_UPDATE] = {
- .audit_skip = 1,
- },
- [IORING_OP_STATX] = {
- .audit_skip = 1,
- },
- [IORING_OP_READ] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollin = 1,
- .buffer_select = 1,
- .plug = 1,
- .audit_skip = 1,
- .async_size = sizeof(struct io_async_rw),
- },
- [IORING_OP_WRITE] = {
- .needs_file = 1,
- .hash_reg_file = 1,
- .unbound_nonreg_file = 1,
- .pollout = 1,
- .plug = 1,
- .audit_skip = 1,
- .async_size = sizeof(struct io_async_rw),
- },
- [IORING_OP_FADVISE] = {
- .needs_file = 1,
- .audit_skip = 1,
- },
- [IORING_OP_MADVISE] = {},
- [IORING_OP_SEND] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollout = 1,
- .audit_skip = 1,
- },
- [IORING_OP_RECV] = {
- .needs_file = 1,
- .unbound_nonreg_file = 1,
- .pollin = 1,
- .buffer_select = 1,
- .audit_skip = 1,
- },
- [IORING_OP_OPENAT2] = {
- },
- [IORING_OP_EPOLL_CTL] = {
- .unbound_nonreg_file = 1,
- .audit_skip = 1,
- },
- [IORING_OP_SPLICE] = {
- .needs_file = 1,
- .hash_reg_file = 1,
- .unbound_nonreg_file = 1,
- .audit_skip = 1,
- },
- [IORING_OP_PROVIDE_BUFFERS] = {
- .audit_skip = 1,
- },
- [IORING_OP_REMOVE_BUFFERS] = {
- .audit_skip = 1,
- },
- [IORING_OP_TEE] = {
- .needs_file = 1,
- .hash_reg_file = 1,
- .unbound_nonreg_file = 1,
- .audit_skip = 1,
- },
- [IORING_OP_SHUTDOWN] = {
- .needs_file = 1,
- },
- [IORING_OP_RENAMEAT] = {},
- [IORING_OP_UNLINKAT] = {},
- [IORING_OP_MKDIRAT] = {},
- [IORING_OP_SYMLINKAT] = {},
- [IORING_OP_LINKAT] = {},
- [IORING_OP_MSG_RING] = {
- .needs_file = 1,
- },
-};
-
-/* requests with any of these flags set should undergo io_disarm_next() */
-#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
-
-static bool io_disarm_next(struct io_kiocb *req);
-static void io_uring_del_tctx_node(unsigned long index);
-static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
- bool cancel_all);
-static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
-
-static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);
-
-static void io_put_req(struct io_kiocb *req);
-static void io_put_req_deferred(struct io_kiocb *req);
-static void io_dismantle_req(struct io_kiocb *req);
-static void io_queue_linked_timeout(struct io_kiocb *req);
-static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
- struct io_uring_rsrc_update2 *up,
- unsigned nr_args);
-static void io_clean_op(struct io_kiocb *req);
-static struct file *io_file_get(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd, bool fixed);
-static void __io_queue_sqe(struct io_kiocb *req);
-static void io_rsrc_put_work(struct work_struct *work);
-
-static void io_req_task_queue(struct io_kiocb *req);
-static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
-static int io_req_prep_async(struct io_kiocb *req);
-
-static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
- unsigned int issue_flags, u32 slot_index);
-static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
-
-static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
-static void io_eventfd_signal(struct io_ring_ctx *ctx);
-
-static struct kmem_cache *req_cachep;
-
-static const struct file_operations io_uring_fops;
-
-struct sock *io_uring_get_socket(struct file *file)
-{
-#if defined(CONFIG_UNIX)
- if (file->f_op == &io_uring_fops) {
- struct io_ring_ctx *ctx = file->private_data;
-
- return ctx->ring_sock->sk;
- }
-#endif
- return NULL;
-}
-EXPORT_SYMBOL(io_uring_get_socket);
-
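-/* Take ->uring_lock on behalf of task work if it isn't held yet. */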
-static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
-{
- if (!*locked) {
- mutex_lock(&ctx->uring_lock);
- *locked = true;
- }
-}
-
-#define io_for_each_link(pos, head) \
- for (pos = (head); pos; pos = pos->link)
-
-/*
- * Shamelessly stolen from the mm implementation of page reference checking,
- * see commit f958d7b528b1 for details.
- */
-#define req_ref_zero_or_close_to_overflow(req) \
- ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
-
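-/* Request refs are only maintained once REQ_F_REFCOUNT has been set. */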
-static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
-{
- WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
- return atomic_inc_not_zero(&req->refs);
-}
-
-static inline bool req_ref_put_and_test(struct io_kiocb *req)
-{
- if (likely(!(req->flags & REQ_F_REFCOUNT)))
- return true;
-
- WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
- return atomic_dec_and_test(&req->refs);
-}
-
-static inline void req_ref_get(struct io_kiocb *req)
-{
- WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
- WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
- atomic_inc(&req->refs);
-}
-
-static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
-{
- if (!wq_list_empty(&ctx->submit_state.compl_reqs))
- __io_submit_flush_completions(ctx);
-}
-
-static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
-{
- if (!(req->flags & REQ_F_REFCOUNT)) {
- req->flags |= REQ_F_REFCOUNT;
- atomic_set(&req->refs, nr);
- }
-}
-
-static inline void io_req_set_refcount(struct io_kiocb *req)
-{
- __io_req_set_refcount(req, 1);
-}
-
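-/*
- * References to the current rsrc node are cached under ->uring_lock and
- * taken or dropped in batches of IO_RSRC_REF_BATCH, so the completion path
- * doesn't hit percpu_ref_put() for every single request.
- */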
-#define IO_RSRC_REF_BATCH 100
-
-static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
- struct io_ring_ctx *ctx)
- __must_hold(&ctx->uring_lock)
-{
- struct percpu_ref *ref = req->fixed_rsrc_refs;
-
- if (ref) {
- if (ref == &ctx->rsrc_node->refs)
- ctx->rsrc_cached_refs++;
- else
- percpu_ref_put(ref);
- }
-}
-
-static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
-{
- if (req->fixed_rsrc_refs)
- percpu_ref_put(req->fixed_rsrc_refs);
-}
-
-static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
- __must_hold(&ctx->uring_lock)
-{
- if (ctx->rsrc_cached_refs) {
- percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
- ctx->rsrc_cached_refs = 0;
- }
-}
-
-static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
- __must_hold(&ctx->uring_lock)
-{
- ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
- percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
-}
-
-static inline void io_req_set_rsrc_node(struct io_kiocb *req,
- struct io_ring_ctx *ctx)
-{
- if (!req->fixed_rsrc_refs) {
- req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
- ctx->rsrc_cached_refs--;
- if (unlikely(ctx->rsrc_cached_refs < 0))
- io_rsrc_refs_refill(ctx);
- }
-}
-
-static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
-{
- struct io_buffer *kbuf = req->kbuf;
- unsigned int cflags;
-
- cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
- req->flags &= ~REQ_F_BUFFER_SELECTED;
- list_add(&kbuf->list, list);
- req->kbuf = NULL;
- return cflags;
-}
-
-static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
-{
- lockdep_assert_held(&req->ctx->completion_lock);
-
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
- return 0;
- return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
-}
-
-static inline unsigned int io_put_kbuf(struct io_kiocb *req,
- unsigned issue_flags)
-{
- unsigned int cflags;
-
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
- return 0;
-
- /*
- * We can add this buffer back to two lists:
- *
- * 1) The io_buffers_cache list. This one is protected by the
- * ctx->uring_lock. If we already hold this lock, add back to this
- * list as we can grab it from issue as well.
- * 2) The io_buffers_comp list. This one is protected by the
- * ctx->completion_lock.
- *
- * We migrate buffers from the comp_list to the issue cache list
- * when we need one.
- */
- if (issue_flags & IO_URING_F_UNLOCKED) {
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock(&ctx->completion_lock);
- cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
- spin_unlock(&ctx->completion_lock);
- } else {
- lockdep_assert_held(&req->ctx->uring_lock);
-
- cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
- }
-
- return cflags;
-}
-
-static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
- unsigned int bgid)
-{
- struct list_head *hash_list;
- struct io_buffer_list *bl;
-
- hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
- list_for_each_entry(bl, hash_list, list)
- if (bl->bgid == bgid || bgid == -1U)
- return bl;
-
- return NULL;
-}
-
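-/* Hand an unused selected buffer back to its group so it can be reissued. */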
-static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
- struct io_buffer *buf;
-
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
- return;
- /* don't recycle if we already did IO to this buffer */
- if (req->flags & REQ_F_PARTIAL_IO)
- return;
-
- if (issue_flags & IO_URING_F_UNLOCKED)
- mutex_lock(&ctx->uring_lock);
-
- lockdep_assert_held(&ctx->uring_lock);
-
- buf = req->kbuf;
- bl = io_buffer_get_list(ctx, buf->bgid);
- list_add(&buf->list, &bl->buf_list);
- req->flags &= ~REQ_F_BUFFER_SELECTED;
- req->kbuf = NULL;
-
- if (issue_flags & IO_URING_F_UNLOCKED)
- mutex_unlock(&ctx->uring_lock);
-}
-
-static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
- bool cancel_all)
- __must_hold(&req->ctx->timeout_lock)
-{
- struct io_kiocb *req;
-
- if (task && head->task != task)
- return false;
- if (cancel_all)
- return true;
-
- io_for_each_link(req, head) {
- if (req->flags & REQ_F_INFLIGHT)
- return true;
- }
- return false;
-}
-
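-/* Returns true if any request in the link chain is marked inflight. */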
-static bool io_match_linked(struct io_kiocb *head)
-{
- struct io_kiocb *req;
-
- io_for_each_link(req, head) {
- if (req->flags & REQ_F_INFLIGHT)
- return true;
- }
- return false;
-}
-
-/*
- * As io_match_task() but protected against racing with linked timeouts.
- * User must not hold timeout_lock.
- */
-static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
- bool cancel_all)
-{
- bool matched;
-
- if (task && head->task != task)
- return false;
- if (cancel_all)
- return true;
-
- if (head->flags & REQ_F_LINK_TIMEOUT) {
- struct io_ring_ctx *ctx = head->ctx;
-
- /* protect against races with linked timeouts */
- spin_lock_irq(&ctx->timeout_lock);
- matched = io_match_linked(head);
- spin_unlock_irq(&ctx->timeout_lock);
- } else {
- matched = io_match_linked(head);
- }
- return matched;
-}
-
-static inline bool req_has_async_data(struct io_kiocb *req)
-{
- return req->flags & REQ_F_ASYNC_DATA;
-}
-
-static inline void req_set_fail(struct io_kiocb *req)
-{
- req->flags |= REQ_F_FAIL;
- if (req->flags & REQ_F_CQE_SKIP) {
- req->flags &= ~REQ_F_CQE_SKIP;
- req->flags |= REQ_F_SKIP_LINK_CQES;
- }
-}
-
-static inline void req_fail_link_node(struct io_kiocb *req, int res)
-{
- req_set_fail(req);
- req->result = res;
-}
-
-static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
-{
- struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
-
- complete(&ctx->ref_comp);
-}
-
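-/* Timeouts with no sequence offset fire on the timer alone, not on CQE count. */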
-static inline bool io_is_timeout_noseq(struct io_kiocb *req)
-{
- return !req->timeout.off;
-}
-
-static __cold void io_fallback_req_func(struct work_struct *work)
-{
- struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
- fallback_work.work);
- struct llist_node *node = llist_del_all(&ctx->fallback_llist);
- struct io_kiocb *req, *tmp;
- bool locked = false;
-
- percpu_ref_get(&ctx->refs);
- llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
- req->io_task_work.func(req, &locked);
-
- if (locked) {
- io_submit_flush_completions(ctx);
- mutex_unlock(&ctx->uring_lock);
- }
- percpu_ref_put(&ctx->refs);
-}
-
-static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
-{
- struct io_ring_ctx *ctx;
- int i, hash_bits;
-
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
- return NULL;
-
- /*
-	 * Use 5 bits less than the max cq entries; that should give us around
-	 * 32 entries per hash list if totally full and uniformly spread.
- */
- hash_bits = ilog2(p->cq_entries);
- hash_bits -= 5;
- if (hash_bits <= 0)
- hash_bits = 1;
- ctx->cancel_hash_bits = hash_bits;
- ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
- GFP_KERNEL);
- if (!ctx->cancel_hash)
- goto err;
- __hash_init(ctx->cancel_hash, 1U << hash_bits);
-
- ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
- if (!ctx->dummy_ubuf)
- goto err;
- /* set invalid range, so io_import_fixed() fails meeting it */
- ctx->dummy_ubuf->ubuf = -1UL;
-
- ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
- sizeof(struct list_head), GFP_KERNEL);
- if (!ctx->io_buffers)
- goto err;
- for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
- INIT_LIST_HEAD(&ctx->io_buffers[i]);
-
- if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
- goto err;
-
- ctx->flags = p->flags;
- init_waitqueue_head(&ctx->sqo_sq_wait);
- INIT_LIST_HEAD(&ctx->sqd_list);
- INIT_LIST_HEAD(&ctx->cq_overflow_list);
- INIT_LIST_HEAD(&ctx->io_buffers_cache);
- INIT_LIST_HEAD(&ctx->apoll_cache);
- init_completion(&ctx->ref_comp);
- xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
- mutex_init(&ctx->uring_lock);
- init_waitqueue_head(&ctx->cq_wait);
- spin_lock_init(&ctx->completion_lock);
- spin_lock_init(&ctx->timeout_lock);
- INIT_WQ_LIST(&ctx->iopoll_list);
- INIT_LIST_HEAD(&ctx->io_buffers_pages);
- INIT_LIST_HEAD(&ctx->io_buffers_comp);
- INIT_LIST_HEAD(&ctx->defer_list);
- INIT_LIST_HEAD(&ctx->timeout_list);
- INIT_LIST_HEAD(&ctx->ltimeout_list);
- spin_lock_init(&ctx->rsrc_ref_lock);
- INIT_LIST_HEAD(&ctx->rsrc_ref_list);
- INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
- init_llist_head(&ctx->rsrc_put_llist);
- INIT_LIST_HEAD(&ctx->tctx_list);
- ctx->submit_state.free_list.next = NULL;
- INIT_WQ_LIST(&ctx->locked_free_list);
- INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
- INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
-#ifdef CONFIG_NET_RX_BUSY_POLL
- INIT_LIST_HEAD(&ctx->napi_list);
- spin_lock_init(&ctx->napi_lock);
-#endif
- return ctx;
-err:
- kfree(ctx->dummy_ubuf);
- kfree(ctx->cancel_hash);
- kfree(ctx->io_buffers);
- kfree(ctx);
- return NULL;
-}
-
-static void io_account_cq_overflow(struct io_ring_ctx *ctx)
-{
- struct io_rings *r = ctx->rings;
-
- WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
- ctx->cq_extra--;
-}
-
-static bool req_need_defer(struct io_kiocb *req, u32 seq)
-{
- if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
- struct io_ring_ctx *ctx = req->ctx;
-
- return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
- }
-
- return false;
-}
-
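-/* Per-file flags stored in the low bits of a fixed file table entry. */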
-#define FFS_NOWAIT 0x1UL
-#define FFS_ISREG 0x2UL
-#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
-
-static inline bool io_req_ffs_set(struct io_kiocb *req)
-{
- return req->flags & REQ_F_FIXED_FILE;
-}
-
-static inline void io_req_track_inflight(struct io_kiocb *req)
-{
- if (!(req->flags & REQ_F_INFLIGHT)) {
- req->flags |= REQ_F_INFLIGHT;
- atomic_inc(&current->io_uring->inflight_tracked);
- }
-}
-
-static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
-{
- if (WARN_ON_ONCE(!req->link))
- return NULL;
-
- req->flags &= ~REQ_F_ARM_LTIMEOUT;
- req->flags |= REQ_F_LINK_TIMEOUT;
-
- /* linked timeouts should have two refs once prep'ed */
- io_req_set_refcount(req);
- __io_req_set_refcount(req->link, 2);
- return req->link;
-}
-
-static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
-{
- if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
- return NULL;
- return __io_prep_linked_timeout(req);
-}
-
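-/* Initialise ->work and decide how io-wq should bound and hash this request. */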
-static void io_prep_async_work(struct io_kiocb *req)
-{
- const struct io_op_def *def = &io_op_defs[req->opcode];
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!(req->flags & REQ_F_CREDS)) {
- req->flags |= REQ_F_CREDS;
- req->creds = get_current_cred();
- }
-
- req->work.list.next = NULL;
- req->work.flags = 0;
- if (req->flags & REQ_F_FORCE_ASYNC)
- req->work.flags |= IO_WQ_WORK_CONCURRENT;
-
- if (req->flags & REQ_F_ISREG) {
- if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
- io_wq_hash_work(&req->work, file_inode(req->file));
- } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
- if (def->unbound_nonreg_file)
- req->work.flags |= IO_WQ_WORK_UNBOUND;
- }
-
- switch (req->opcode) {
- case IORING_OP_SPLICE:
- case IORING_OP_TEE:
- if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
- req->work.flags |= IO_WQ_WORK_UNBOUND;
- break;
- }
-}
-
-static void io_prep_async_link(struct io_kiocb *req)
-{
- struct io_kiocb *cur;
-
- if (req->flags & REQ_F_LINK_TIMEOUT) {
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock_irq(&ctx->timeout_lock);
- io_for_each_link(cur, req)
- io_prep_async_work(cur);
- spin_unlock_irq(&ctx->timeout_lock);
- } else {
- io_for_each_link(cur, req)
- io_prep_async_work(cur);
- }
-}
-
-static inline void io_req_add_compl_list(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_submit_state *state = &ctx->submit_state;
-
- if (!(req->flags & REQ_F_CQE_SKIP))
- ctx->submit_state.flush_cqes = true;
- wq_list_add_tail(&req->comp_list, &state->compl_reqs);
-}
-
-static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *link = io_prep_linked_timeout(req);
- struct io_uring_task *tctx = req->task->io_uring;
-
- BUG_ON(!tctx);
- BUG_ON(!tctx->io_wq);
-
- /* init ->work of the whole link before punting */
- io_prep_async_link(req);
-
- /*
- * Not expected to happen, but if we do have a bug where this _can_
- * happen, catch it here and ensure the request is marked as
- * canceled. That will make io-wq go through the usual work cancel
- * procedure rather than attempt to run this request (or create a new
- * worker for it).
- */
- if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
- req->work.flags |= IO_WQ_WORK_CANCEL;
-
- trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags,
- &req->work, io_wq_is_hashed(&req->work));
- io_wq_enqueue(tctx->io_wq, &req->work);
- if (link)
- io_queue_linked_timeout(link);
-}
-
-static void io_kill_timeout(struct io_kiocb *req, int status)
- __must_hold(&req->ctx->completion_lock)
- __must_hold(&req->ctx->timeout_lock)
-{
- struct io_timeout_data *io = req->async_data;
-
- if (hrtimer_try_to_cancel(&io->timer) != -1) {
- if (status)
- req_set_fail(req);
- atomic_set(&req->ctx->cq_timeouts,
- atomic_read(&req->ctx->cq_timeouts) + 1);
- list_del_init(&req->timeout.list);
- io_fill_cqe_req(req, status, 0);
- io_put_req_deferred(req);
- }
-}
-
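-/* Queue up deferred requests whose drain sequence has now been reached. */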
-static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
-{
- while (!list_empty(&ctx->defer_list)) {
- struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
- struct io_defer_entry, list);
-
- if (req_need_defer(de->req, de->seq))
- break;
- list_del_init(&de->list);
- io_req_task_queue(de->req);
- kfree(de);
- }
-}
-
-static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
- __must_hold(&ctx->completion_lock)
-{
- u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
-
- spin_lock_irq(&ctx->timeout_lock);
- while (!list_empty(&ctx->timeout_list)) {
- u32 events_needed, events_got;
- struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
- struct io_kiocb, timeout.list);
-
- if (io_is_timeout_noseq(req))
- break;
-
- /*
- * Since seq can easily wrap around over time, subtract
- * the last seq at which timeouts were flushed before comparing.
- * Assuming not more than 2^31-1 events have happened since,
- * these subtractions won't have wrapped, so we can check if
- * target is in [last_seq, current_seq] by comparing the two.
- */
- events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
- events_got = seq - ctx->cq_last_tm_flush;
- if (events_got < events_needed)
- break;
-
- list_del_init(&req->timeout.list);
- io_kill_timeout(req, 0);
- }
- ctx->cq_last_tm_flush = seq;
- spin_unlock_irq(&ctx->timeout_lock);
-}
-
-static inline void io_commit_cqring(struct io_ring_ctx *ctx)
-{
- /* order cqe stores with ring update */
- smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
-}
-
-static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
-{
- if (ctx->off_timeout_used || ctx->drain_active) {
- spin_lock(&ctx->completion_lock);
- if (ctx->off_timeout_used)
- io_flush_timeouts(ctx);
- if (ctx->drain_active)
- io_queue_deferred(ctx);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- }
- if (ctx->has_evfd)
- io_eventfd_signal(ctx);
-}
-
-static inline bool io_sqring_full(struct io_ring_ctx *ctx)
-{
- struct io_rings *r = ctx->rings;
-
- return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
-}
-
-static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
-{
- return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
-}
-
-static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
-{
- struct io_rings *rings = ctx->rings;
- unsigned tail, mask = ctx->cq_entries - 1;
-
- /*
- * writes to the cq entry need to come after reading head; the
- * control dependency is enough as we're using WRITE_ONCE to
- * fill the cq entry
- */
- if (__io_cqring_events(ctx) == ctx->cq_entries)
- return NULL;
-
- tail = ctx->cached_cq_tail++;
- return &rings->cqes[tail & mask];
-}
-
-static void io_eventfd_signal(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- rcu_read_lock();
- /*
-	 * rcu_dereference ctx->io_ev_fd once and use it both for checking
-	 * and for eventfd_signal
- */
- ev_fd = rcu_dereference(ctx->io_ev_fd);
-
- /*
-	 * Check again if ev_fd exists in case an io_eventfd_unregister call
- * completed between the NULL check of ctx->io_ev_fd at the start of
- * the function and rcu_read_lock.
- */
- if (unlikely(!ev_fd))
- goto out;
- if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- goto out;
-
- if (!ev_fd->eventfd_async || io_wq_current_is_worker())
- eventfd_signal(ev_fd->cq_ev_fd, 1);
-out:
- rcu_read_unlock();
-}
-
-static inline void io_cqring_wake(struct io_ring_ctx *ctx)
-{
- /*
- * wake_up_all() may seem excessive, but io_wake_function() and
- * io_should_wake() handle the termination of the loop and only
- * wake as many waiters as we need to.
- */
- if (wq_has_sleeper(&ctx->cq_wait))
- wake_up_all(&ctx->cq_wait);
-}
-
-/*
- * This should only get called when at least one event has been posted.
- * Some applications rely on the eventfd notification count only changing
- * IFF a new CQE has been added to the CQ ring. There's no dependency on a
- * 1:1 relationship between how many times this function is called (and
- * hence the eventfd count) and the number of CQEs posted to the CQ ring.
- */
-static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
-{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
- ctx->has_evfd))
- __io_commit_cqring_flush(ctx);
-
- io_cqring_wake(ctx);
-}
-
-static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
-{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
- ctx->has_evfd))
- __io_commit_cqring_flush(ctx);
-
- if (ctx->flags & IORING_SETUP_SQPOLL)
- io_cqring_wake(ctx);
-}
-
-/* Returns true if there are no backlogged entries after the flush */
-static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
-{
- bool all_flushed, posted;
-
- if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
- return false;
-
- posted = false;
- spin_lock(&ctx->completion_lock);
- while (!list_empty(&ctx->cq_overflow_list)) {
- struct io_uring_cqe *cqe = io_get_cqe(ctx);
- struct io_overflow_cqe *ocqe;
-
- if (!cqe && !force)
- break;
- ocqe = list_first_entry(&ctx->cq_overflow_list,
- struct io_overflow_cqe, list);
- if (cqe)
- memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
- else
- io_account_cq_overflow(ctx);
-
- posted = true;
- list_del(&ocqe->list);
- kfree(ocqe);
- }
-
- all_flushed = list_empty(&ctx->cq_overflow_list);
- if (all_flushed) {
- clear_bit(0, &ctx->check_cq_overflow);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
- }
-
- if (posted)
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- if (posted)
- io_cqring_ev_posted(ctx);
- return all_flushed;
-}
-
-static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
-{
- bool ret = true;
-
- if (test_bit(0, &ctx->check_cq_overflow)) {
- /* iopoll syncs against uring_lock, not completion_lock */
- if (ctx->flags & IORING_SETUP_IOPOLL)
- mutex_lock(&ctx->uring_lock);
- ret = __io_cqring_overflow_flush(ctx, false);
- if (ctx->flags & IORING_SETUP_IOPOLL)
- mutex_unlock(&ctx->uring_lock);
- }
-
- return ret;
-}
-
-/* must be called shortly after putting a request */
-static inline void io_put_task(struct task_struct *task, int nr)
-{
- struct io_uring_task *tctx = task->io_uring;
-
- if (likely(task == current)) {
- tctx->cached_refs += nr;
- } else {
- percpu_counter_sub(&tctx->inflight, nr);
- if (unlikely(atomic_read(&tctx->in_idle)))
- wake_up(&tctx->wait);
- put_task_struct_many(task, nr);
- }
-}
-
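-/* Refill the cached task refs back up to IO_TCTX_REFS_CACHE_NR. */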
-static void io_task_refs_refill(struct io_uring_task *tctx)
-{
- unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
-
- percpu_counter_add(&tctx->inflight, refill);
- refcount_add(refill, &current->usage);
- tctx->cached_refs += refill;
-}
-
-static inline void io_get_task_refs(int nr)
-{
- struct io_uring_task *tctx = current->io_uring;
-
- tctx->cached_refs -= nr;
- if (unlikely(tctx->cached_refs < 0))
- io_task_refs_refill(tctx);
-}
-
-static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
-{
- struct io_uring_task *tctx = task->io_uring;
- unsigned int refs = tctx->cached_refs;
-
- if (refs) {
- tctx->cached_refs = 0;
- percpu_counter_sub(&tctx->inflight, refs);
- put_task_struct_many(task, refs);
- }
-}
-
-static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags)
-{
- struct io_overflow_cqe *ocqe;
-
- ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
- if (!ocqe) {
- /*
- * If we're in ring overflow flush mode, or in task cancel mode,
- * or cannot allocate an overflow entry, then we need to drop it
- * on the floor.
- */
- io_account_cq_overflow(ctx);
- return false;
- }
- if (list_empty(&ctx->cq_overflow_list)) {
- set_bit(0, &ctx->check_cq_overflow);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
-
- }
- ocqe->cqe.user_data = user_data;
- ocqe->cqe.res = res;
- ocqe->cqe.flags = cflags;
- list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
- return true;
-}
-
-static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags)
-{
- struct io_uring_cqe *cqe;
-
- /*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
- */
- cqe = io_get_cqe(ctx);
- if (likely(cqe)) {
- WRITE_ONCE(cqe->user_data, user_data);
- WRITE_ONCE(cqe->res, res);
- WRITE_ONCE(cqe->flags, cflags);
- return true;
- }
- return io_cqring_event_overflow(ctx, user_data, res, cflags);
-}
-
-static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
-{
- trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
- return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
-}
-
-static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
-{
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe_req(req, res, cflags);
-}
-
-static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags)
-{
- ctx->cq_extra++;
- trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
- return __io_fill_cqe(ctx, user_data, res, cflags);
-}
-
-static void __io_req_complete_post(struct io_kiocb *req, s32 res,
- u32 cflags)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe_req(req, res, cflags);
- /*
- * If we're the last reference to this request, add to our locked
- * free_list cache.
- */
- if (req_ref_put_and_test(req)) {
- if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- if (req->flags & IO_DISARM_MASK)
- io_disarm_next(req);
- if (req->link) {
- io_req_task_queue(req->link);
- req->link = NULL;
- }
- }
- io_req_put_rsrc(req, ctx);
- /*
- * Selected buffer deallocation in io_clean_op() assumes that
- * we don't hold ->completion_lock. Clean them here to avoid
- * deadlocks.
- */
- io_put_kbuf_comp(req);
- io_dismantle_req(req);
- io_put_task(req->task, 1);
- wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
- ctx->locked_free_nr++;
- }
-}
-
-static void io_req_complete_post(struct io_kiocb *req, s32 res,
- u32 cflags)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock(&ctx->completion_lock);
- __io_req_complete_post(req, res, cflags);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
-}
-
-static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
- u32 cflags)
-{
- req->result = res;
- req->cflags = cflags;
- req->flags |= REQ_F_COMPLETE_INLINE;
-}
-
-static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
- s32 res, u32 cflags)
-{
- if (issue_flags & IO_URING_F_COMPLETE_DEFER)
- io_req_complete_state(req, res, cflags);
- else
- io_req_complete_post(req, res, cflags);
-}
-
-static inline void io_req_complete(struct io_kiocb *req, s32 res)
-{
- __io_req_complete(req, 0, res, 0);
-}
-
-static void io_req_complete_failed(struct io_kiocb *req, s32 res)
-{
- req_set_fail(req);
- io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
-}
-
-static void io_req_complete_fail_submit(struct io_kiocb *req)
-{
- /*
-	 * We don't submit, so fail them all. To do that, replace hardlinks
-	 * with normal links; an extra REQ_F_LINK is tolerated.
- */
- req->flags &= ~REQ_F_HARDLINK;
- req->flags |= REQ_F_LINK;
- io_req_complete_failed(req, req->result);
-}
-
-/*
- * Don't initialise the fields below on every allocation, but do that in
- * advance and keep them valid across allocations.
- */
-static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
-{
- req->ctx = ctx;
- req->link = NULL;
- req->async_data = NULL;
- /* not necessary, but safer to zero */
- req->result = 0;
-}
-
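-/* Splice the completion-side locked free list into the submission-side cache. */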
-static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
- struct io_submit_state *state)
-{
- spin_lock(&ctx->completion_lock);
- wq_list_splice(&ctx->locked_free_list, &state->free_list);
- ctx->locked_free_nr = 0;
- spin_unlock(&ctx->completion_lock);
-}
-
-/* Returns true IFF there are requests in the cache */
-static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
-{
- struct io_submit_state *state = &ctx->submit_state;
-
- /*
- * If we have more than a batch's worth of requests in our IRQ side
- * locked cache, grab the lock and move them over to our submission
- * side cache.
- */
- if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
- io_flush_cached_locked_reqs(ctx, state);
- return !!state->free_list.next;
-}
-
-/*
- * A request might get retired back into the request caches even before opcode
- * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
- * Because of that, io_alloc_req() should be called only under ->uring_lock
- * and with extra caution to not get a request that is still worked on.
- */
-static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
- __must_hold(&ctx->uring_lock)
-{
- struct io_submit_state *state = &ctx->submit_state;
- gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
- void *reqs[IO_REQ_ALLOC_BATCH];
- struct io_kiocb *req;
- int ret, i;
-
- if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
- return true;
-
- ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
-
- /*
- * Bulk alloc is all-or-nothing. If we fail to get a batch,
- * retry single alloc to be on the safe side.
- */
- if (unlikely(ret <= 0)) {
- reqs[0] = kmem_cache_alloc(req_cachep, gfp);
- if (!reqs[0])
- return false;
- ret = 1;
- }
-
- percpu_ref_get_many(&ctx->refs, ret);
- for (i = 0; i < ret; i++) {
- req = reqs[i];
-
- io_preinit_req(req, ctx);
- wq_stack_add_head(&req->comp_list, &state->free_list);
- }
- return true;
-}
-
-static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
-{
- if (unlikely(!ctx->submit_state.free_list.next))
- return __io_alloc_req_refill(ctx);
- return true;
-}
-
-static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
-{
- struct io_wq_work_node *node;
-
- node = wq_stack_extract(&ctx->submit_state.free_list);
- return container_of(node, struct io_kiocb, comp_list);
-}
-
-static inline void io_put_file(struct file *file)
-{
- if (file)
- fput(file);
-}
-
-static inline void io_dismantle_req(struct io_kiocb *req)
-{
- unsigned int flags = req->flags;
-
- if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
- io_clean_op(req);
- if (!(flags & REQ_F_FIXED_FILE))
- io_put_file(req->file);
-}
-
-static __cold void __io_free_req(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- io_req_put_rsrc(req, ctx);
- io_dismantle_req(req);
- io_put_task(req->task, 1);
-
- spin_lock(&ctx->completion_lock);
- wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
- ctx->locked_free_nr++;
- spin_unlock(&ctx->completion_lock);
-}
-
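-/* Unlink req's immediate successor from the link chain. */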
-static inline void io_remove_next_linked(struct io_kiocb *req)
-{
- struct io_kiocb *nxt = req->link;
-
- req->link = nxt->link;
- nxt->link = NULL;
-}
-
-static bool io_kill_linked_timeout(struct io_kiocb *req)
- __must_hold(&req->ctx->completion_lock)
- __must_hold(&req->ctx->timeout_lock)
-{
- struct io_kiocb *link = req->link;
-
- if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
- struct io_timeout_data *io = link->async_data;
-
- io_remove_next_linked(req);
- link->timeout.head = NULL;
- if (hrtimer_try_to_cancel(&io->timer) != -1) {
- list_del(&link->timeout.list);
- /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
- io_fill_cqe_req(link, -ECANCELED, 0);
- io_put_req_deferred(link);
- return true;
- }
- }
- return false;
-}
-
-static void io_fail_links(struct io_kiocb *req)
- __must_hold(&req->ctx->completion_lock)
-{
- struct io_kiocb *nxt, *link = req->link;
- bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
-
- req->link = NULL;
- while (link) {
- long res = -ECANCELED;
-
- if (link->flags & REQ_F_FAIL)
- res = link->result;
-
- nxt = link->link;
- link->link = NULL;
-
- trace_io_uring_fail_link(req->ctx, req, req->user_data,
- req->opcode, link);
-
- if (!ignore_cqes) {
- link->flags &= ~REQ_F_CQE_SKIP;
- io_fill_cqe_req(link, res, 0);
- }
- io_put_req_deferred(link);
- link = nxt;
- }
-}
-
-static bool io_disarm_next(struct io_kiocb *req)
- __must_hold(&req->ctx->completion_lock)
-{
- bool posted = false;
-
- if (req->flags & REQ_F_ARM_LTIMEOUT) {
- struct io_kiocb *link = req->link;
-
- req->flags &= ~REQ_F_ARM_LTIMEOUT;
- if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
- io_remove_next_linked(req);
- /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
- io_fill_cqe_req(link, -ECANCELED, 0);
- io_put_req_deferred(link);
- posted = true;
- }
- } else if (req->flags & REQ_F_LINK_TIMEOUT) {
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock_irq(&ctx->timeout_lock);
- posted = io_kill_linked_timeout(req);
- spin_unlock_irq(&ctx->timeout_lock);
- }
- if (unlikely((req->flags & REQ_F_FAIL) &&
- !(req->flags & REQ_F_HARDLINK))) {
- posted |= (req->link != NULL);
- io_fail_links(req);
- }
- return posted;
-}
-
-static void __io_req_find_next_prep(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- bool posted;
-
- spin_lock(&ctx->completion_lock);
- posted = io_disarm_next(req);
- if (posted)
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- if (posted)
- io_cqring_ev_posted(ctx);
-}
-
-static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
-{
- struct io_kiocb *nxt;
-
- if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
- return NULL;
- /*
- * If LINK is set, we have dependent requests in this chain. If we
- * didn't fail this request, queue the first one up, moving any other
- * dependencies to the next request. In case of failure, fail the rest
- * of the chain.
- */
- if (unlikely(req->flags & IO_DISARM_MASK))
- __io_req_find_next_prep(req);
- nxt = req->link;
- req->link = NULL;
- return nxt;
-}
-
-static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
-{
- if (!ctx)
- return;
- if (*locked) {
- io_submit_flush_completions(ctx);
- mutex_unlock(&ctx->uring_lock);
- *locked = false;
- }
- percpu_ref_put(&ctx->refs);
-}
-
-static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
-{
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
-}
-
-static void handle_prev_tw_list(struct io_wq_work_node *node,
- struct io_ring_ctx **ctx, bool *uring_locked)
-{
- if (*ctx && !*uring_locked)
- spin_lock(&(*ctx)->completion_lock);
-
- do {
- struct io_wq_work_node *next = node->next;
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- io_task_work.node);
-
- prefetch(container_of(next, struct io_kiocb, io_task_work.node));
-
- if (req->ctx != *ctx) {
- if (unlikely(!*uring_locked && *ctx))
- ctx_commit_and_unlock(*ctx);
-
- ctx_flush_and_put(*ctx, uring_locked);
- *ctx = req->ctx;
- /* if not contended, grab and improve batching */
- *uring_locked = mutex_trylock(&(*ctx)->uring_lock);
- percpu_ref_get(&(*ctx)->refs);
- if (unlikely(!*uring_locked))
- spin_lock(&(*ctx)->completion_lock);
- }
- if (likely(*uring_locked))
- req->io_task_work.func(req, uring_locked);
- else
- __io_req_complete_post(req, req->result,
- io_put_kbuf_comp(req));
- node = next;
- } while (node);
-
- if (unlikely(!*uring_locked))
- ctx_commit_and_unlock(*ctx);
-}
-
-static void handle_tw_list(struct io_wq_work_node *node,
- struct io_ring_ctx **ctx, bool *locked)
-{
- do {
- struct io_wq_work_node *next = node->next;
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- io_task_work.node);
-
- prefetch(container_of(next, struct io_kiocb, io_task_work.node));
-
- if (req->ctx != *ctx) {
- ctx_flush_and_put(*ctx, locked);
- *ctx = req->ctx;
- /* if not contended, grab and improve batching */
- *locked = mutex_trylock(&(*ctx)->uring_lock);
- percpu_ref_get(&(*ctx)->refs);
- }
- req->io_task_work.func(req, locked);
- node = next;
- } while (node);
-}
-
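-/* Run all pending task_work for this task, batching completions per ctx. */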
-static void tctx_task_work(struct callback_head *cb)
-{
- bool uring_locked = false;
- struct io_ring_ctx *ctx = NULL;
- struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
- task_work);
-
- while (1) {
- struct io_wq_work_node *node1, *node2;
-
- if (!tctx->task_list.first &&
- !tctx->prior_task_list.first && uring_locked)
- io_submit_flush_completions(ctx);
-
- spin_lock_irq(&tctx->task_lock);
- node1 = tctx->prior_task_list.first;
- node2 = tctx->task_list.first;
- INIT_WQ_LIST(&tctx->task_list);
- INIT_WQ_LIST(&tctx->prior_task_list);
- if (!node2 && !node1)
- tctx->task_running = false;
- spin_unlock_irq(&tctx->task_lock);
- if (!node2 && !node1)
- break;
-
- if (node1)
- handle_prev_tw_list(node1, &ctx, &uring_locked);
-
- if (node2)
- handle_tw_list(node2, &ctx, &uring_locked);
- cond_resched();
- }
-
- ctx_flush_and_put(ctx, &uring_locked);
-
- /* relaxed read is enough as only the task itself sets ->in_idle */
- if (unlikely(atomic_read(&tctx->in_idle)))
- io_uring_drop_tctx_refs(current);
-}
-
-static void io_req_task_work_add(struct io_kiocb *req, bool priority)
-{
- struct task_struct *tsk = req->task;
- struct io_uring_task *tctx = tsk->io_uring;
- enum task_work_notify_mode notify;
- struct io_wq_work_node *node;
- unsigned long flags;
- bool running;
-
- WARN_ON_ONCE(!tctx);
-
- spin_lock_irqsave(&tctx->task_lock, flags);
- if (priority)
- wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
- else
- wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
- running = tctx->task_running;
- if (!running)
- tctx->task_running = true;
- spin_unlock_irqrestore(&tctx->task_lock, flags);
-
- /* task_work already pending, we're done */
- if (running)
- return;
-
- /*
- * SQPOLL kernel thread doesn't need notification, just a wakeup. For
- * all other cases, use TWA_SIGNAL unconditionally to ensure we're
- * processing task_work. There's no reliable way to tell if TWA_RESUME
- * will do the job.
- */
- notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
- if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
- if (notify == TWA_NONE)
- wake_up_process(tsk);
- return;
- }
-
- spin_lock_irqsave(&tctx->task_lock, flags);
- tctx->task_running = false;
- node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
- spin_unlock_irqrestore(&tctx->task_lock, flags);
-
- while (node) {
- req = container_of(node, struct io_kiocb, io_task_work.node);
- node = node->next;
- if (llist_add(&req->io_task_work.fallback_node,
- &req->ctx->fallback_llist))
- schedule_delayed_work(&req->ctx->fallback_work, 1);
- }
-}
-
-static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- /* not needed for normal modes, but SQPOLL depends on it */
- io_tw_lock(ctx, locked);
- io_req_complete_failed(req, req->result);
-}
-
-static void io_req_task_submit(struct io_kiocb *req, bool *locked)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- io_tw_lock(ctx, locked);
- /* req->task == current here, checking PF_EXITING is safe */
- if (likely(!(req->task->flags & PF_EXITING)))
- __io_queue_sqe(req);
- else
- io_req_complete_failed(req, -EFAULT);
-}
-
-static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
-{
- req->result = ret;
- req->io_task_work.func = io_req_task_cancel;
- io_req_task_work_add(req, false);
-}
-
-static void io_req_task_queue(struct io_kiocb *req)
-{
- req->io_task_work.func = io_req_task_submit;
- io_req_task_work_add(req, false);
-}
-
-static void io_req_task_queue_reissue(struct io_kiocb *req)
-{
- req->io_task_work.func = io_queue_async_work;
- io_req_task_work_add(req, false);
-}
-
-static inline void io_queue_next(struct io_kiocb *req)
-{
- struct io_kiocb *nxt = io_req_find_next(req);
-
- if (nxt)
- io_req_task_queue(nxt);
-}
-
-static void io_free_req(struct io_kiocb *req)
-{
- io_queue_next(req);
- __io_free_req(req);
-}
-
-static void io_free_req_work(struct io_kiocb *req, bool *locked)
-{
- io_free_req(req);
-}
-
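-/* Recycle a batch of completed requests into the submission free cache. */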
-static void io_free_batch_list(struct io_ring_ctx *ctx,
- struct io_wq_work_node *node)
- __must_hold(&ctx->uring_lock)
-{
- struct task_struct *task = NULL;
- int task_refs = 0;
-
- do {
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- comp_list);
-
- if (unlikely(req->flags & REQ_F_REFCOUNT)) {
- node = req->comp_list.next;
- if (!req_ref_put_and_test(req))
- continue;
- }
-
- io_req_put_rsrc_locked(req, ctx);
- io_queue_next(req);
- io_dismantle_req(req);
-
- if (req->task != task) {
- if (task)
- io_put_task(task, task_refs);
- task = req->task;
- task_refs = 0;
- }
- task_refs++;
- node = req->comp_list.next;
- wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
- } while (node);
-
- if (task)
- io_put_task(task, task_refs);
-}
-
-static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
- __must_hold(&ctx->uring_lock)
-{
- struct io_wq_work_node *node, *prev;
- struct io_submit_state *state = &ctx->submit_state;
-
- if (state->flush_cqes) {
- spin_lock(&ctx->completion_lock);
- wq_list_for_each(node, prev, &state->compl_reqs) {
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- comp_list);
-
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe_req(req, req->result, req->cflags);
- if ((req->flags & REQ_F_POLLED) && req->apoll) {
- struct async_poll *apoll = req->apoll;
-
- if (apoll->double_poll)
- kfree(apoll->double_poll);
- list_add(&apoll->poll.wait.entry,
- &ctx->apoll_cache);
- req->flags &= ~REQ_F_POLLED;
- }
- }
-
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
- state->flush_cqes = false;
- }
-
- io_free_batch_list(ctx, state->compl_reqs.first);
- INIT_WQ_LIST(&state->compl_reqs);
-}
-
-/*
- * Drop a reference to the request; if it was the last one, return the next
- * request in the chain (if there is one).
- */
-static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
-{
- struct io_kiocb *nxt = NULL;
-
- if (req_ref_put_and_test(req)) {
- nxt = io_req_find_next(req);
- __io_free_req(req);
- }
- return nxt;
-}
-
-static inline void io_put_req(struct io_kiocb *req)
-{
- if (req_ref_put_and_test(req))
- io_free_req(req);
-}
-
-static inline void io_put_req_deferred(struct io_kiocb *req)
-{
- if (req_ref_put_and_test(req)) {
- req->io_task_work.func = io_free_req_work;
- io_req_task_work_add(req, false);
- }
-}
-
-static unsigned io_cqring_events(struct io_ring_ctx *ctx)
-{
- /* See comment at the top of this file */
- smp_rmb();
- return __io_cqring_events(ctx);
-}
-
-static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
-{
- struct io_rings *rings = ctx->rings;
-
- /* make sure SQ entry isn't read before tail */
- return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
-}
-
-static inline bool io_run_task_work(void)
-{
- if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
- __set_current_state(TASK_RUNNING);
- clear_notify_signal();
- if (task_work_pending(current))
- task_work_run();
- return true;
- }
-
- return false;
-}
-
-static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
-{
- struct io_wq_work_node *pos, *start, *prev;
- unsigned int poll_flags = BLK_POLL_NOSLEEP;
- DEFINE_IO_COMP_BATCH(iob);
- int nr_events = 0;
-
- /*
- * Only spin for completions if we don't have multiple devices hanging
- * off our complete list.
- */
- if (ctx->poll_multi_queue || force_nonspin)
- poll_flags |= BLK_POLL_ONESHOT;
-
- wq_list_for_each(pos, start, &ctx->iopoll_list) {
- struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
- struct kiocb *kiocb = &req->rw.kiocb;
- int ret;
-
- /*
- * Move completed and retryable entries to our local lists.
- * If we find a request that requires polling, break out
- * and complete those lists first, if we have entries there.
- */
- if (READ_ONCE(req->iopoll_completed))
- break;
-
- ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
- if (unlikely(ret < 0))
- return ret;
- else if (ret)
- poll_flags |= BLK_POLL_ONESHOT;
-
- /* iopoll may have completed current req */
- if (!rq_list_empty(iob.req_list) ||
- READ_ONCE(req->iopoll_completed))
- break;
- }
-
- if (!rq_list_empty(iob.req_list))
- iob.complete(&iob);
- else if (!pos)
- return 0;
-
- prev = start;
- wq_list_for_each_resume(pos, prev) {
- struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
-
- /* order with io_complete_rw_iopoll(), e.g. ->result updates */
- if (!smp_load_acquire(&req->iopoll_completed))
- break;
- if (unlikely(req->flags & REQ_F_CQE_SKIP))
- continue;
-
- __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0));
- nr_events++;
- }
-
- if (unlikely(!nr_events))
- return 0;
-
- io_commit_cqring(ctx);
- io_cqring_ev_posted_iopoll(ctx);
- pos = start ? start->next : ctx->iopoll_list.first;
- wq_list_cut(&ctx->iopoll_list, prev, start);
- io_free_batch_list(ctx, pos);
- return nr_events;
-}
-
-/*
- * We can't just wait for polled events to come to us, we have to actively
- * find and complete them.
- */
-static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
-{
- if (!(ctx->flags & IORING_SETUP_IOPOLL))
- return;
-
- mutex_lock(&ctx->uring_lock);
- while (!wq_list_empty(&ctx->iopoll_list)) {
- /* let it sleep and repeat later if can't complete a request */
- if (io_do_iopoll(ctx, true) == 0)
- break;
- /*
-		 * Ensure we allow local-to-the-cpu processing to take place;
-		 * in this case we need to ensure that we reap all events.
-		 * Also let task_work etc. progress by releasing the mutex.
- */
- if (need_resched()) {
- mutex_unlock(&ctx->uring_lock);
- cond_resched();
- mutex_lock(&ctx->uring_lock);
- }
- }
- mutex_unlock(&ctx->uring_lock);
-}
-
-static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
-{
- unsigned int nr_events = 0;
- int ret = 0;
-
- /*
- * We disallow the app entering submit/complete with polling, but we
- * still need to lock the ring to prevent racing with polled issue
- * that got punted to a workqueue.
- */
- mutex_lock(&ctx->uring_lock);
- /*
- * Don't enter poll loop if we already have events pending.
- * If we do, we can potentially be spinning for commands that
- * already triggered a CQE (eg in error).
- */
- if (test_bit(0, &ctx->check_cq_overflow))
- __io_cqring_overflow_flush(ctx, false);
- if (io_cqring_events(ctx))
- goto out;
- do {
- /*
- * If a submit got punted to a workqueue, we can have the
- * application entering polling for a command before it gets
- * issued. That app will hold the uring_lock for the duration
- * of the poll right here, so we need to take a breather every
- * now and then to ensure that the issue has a chance to add
- * the poll to the issued list. Otherwise we can spin here
- * forever, while the workqueue is stuck trying to acquire the
- * very same mutex.
- */
- if (wq_list_empty(&ctx->iopoll_list)) {
- u32 tail = ctx->cached_cq_tail;
-
- mutex_unlock(&ctx->uring_lock);
- io_run_task_work();
- mutex_lock(&ctx->uring_lock);
-
- /* some requests don't go through iopoll_list */
- if (tail != ctx->cached_cq_tail ||
- wq_list_empty(&ctx->iopoll_list))
- break;
- }
- ret = io_do_iopoll(ctx, !min);
- if (ret < 0)
- break;
- nr_events += ret;
- ret = 0;
- } while (nr_events < min && !need_resched());
-out:
- mutex_unlock(&ctx->uring_lock);
- return ret;
-}
-
-static void kiocb_end_write(struct io_kiocb *req)
-{
- /*
- * Tell lockdep we inherited freeze protection from submission
- * thread.
- */
- if (req->flags & REQ_F_ISREG) {
- struct super_block *sb = file_inode(req->file)->i_sb;
-
- __sb_writers_acquired(sb, SB_FREEZE_WRITE);
- sb_end_write(sb);
- }
-}
-
-#ifdef CONFIG_BLOCK
-static bool io_resubmit_prep(struct io_kiocb *req)
-{
- struct io_async_rw *rw = req->async_data;
-
- if (!req_has_async_data(req))
- return !io_req_prep_async(req);
- iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
- return true;
-}
-
-static bool io_rw_should_reissue(struct io_kiocb *req)
-{
- umode_t mode = file_inode(req->file)->i_mode;
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!S_ISBLK(mode) && !S_ISREG(mode))
- return false;
- if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
- !(ctx->flags & IORING_SETUP_IOPOLL)))
- return false;
- /*
- * If ref is dying, we might be running poll reap from the exit work.
- * Don't attempt to reissue from that path, just let it fail with
- * -EAGAIN.
- */
- if (percpu_ref_is_dying(&ctx->refs))
- return false;
- /*
- * Play it safe and assume not safe to re-import and reissue if we're
- * not in the original thread group (or in task context).
- */
- if (!same_thread_group(req->task, current) || !in_task())
- return false;
- return true;
-}
-#else
-static bool io_resubmit_prep(struct io_kiocb *req)
-{
- return false;
-}
-static bool io_rw_should_reissue(struct io_kiocb *req)
-{
- return false;
-}
-#endif
-
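-/* Returns true if the request was marked for reissue via REQ_F_REISSUE. */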
-static bool __io_complete_rw_common(struct io_kiocb *req, long res)
-{
- if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
- kiocb_end_write(req);
- fsnotify_modify(req->file);
- } else {
- fsnotify_access(req->file);
- }
- if (unlikely(res != req->result)) {
- if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
- io_rw_should_reissue(req)) {
- req->flags |= REQ_F_REISSUE;
- return true;
- }
- req_set_fail(req);
- req->result = res;
- }
- return false;
-}
-
-static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
-{
- int res = req->result;
-
- if (*locked) {
- io_req_complete_state(req, res, io_put_kbuf(req, 0));
- io_req_add_compl_list(req);
- } else {
- io_req_complete_post(req, res,
- io_put_kbuf(req, IO_URING_F_UNLOCKED));
- }
-}
-
-static void __io_complete_rw(struct io_kiocb *req, long res,
- unsigned int issue_flags)
-{
- if (__io_complete_rw_common(req, res))
- return;
- __io_req_complete(req, issue_flags, req->result,
- io_put_kbuf(req, issue_flags));
-}
-
-static void io_complete_rw(struct kiocb *kiocb, long res)
-{
- struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
-
- if (__io_complete_rw_common(req, res))
- return;
- req->result = res;
- req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
-}
-
-static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
-{
- struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
-
- if (kiocb->ki_flags & IOCB_WRITE)
- kiocb_end_write(req);
- if (unlikely(res != req->result)) {
- if (res == -EAGAIN && io_rw_should_reissue(req)) {
- req->flags |= REQ_F_REISSUE;
- return;
- }
- req->result = res;
- }
-
- /* order with io_iopoll_complete() checking ->iopoll_completed */
- smp_store_release(&req->iopoll_completed, 1);
-}
-
-/*
- * After the iocb has been issued, it's safe to be found on the poll list.
- * Adding the kiocb to the list AFTER submission ensures that we don't
- * find it from an io_do_iopoll() thread before the issuer is done
- * accessing the kiocb cookie.
- */
-static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
-
- /* workqueue context doesn't hold uring_lock, grab it now */
- if (unlikely(needs_lock))
- mutex_lock(&ctx->uring_lock);
-
- /*
- * Track whether we have multiple files in our lists. This will impact
- * how we do polling eventually, not spinning if we're on potentially
- * different devices.
- */
- if (wq_list_empty(&ctx->iopoll_list)) {
- ctx->poll_multi_queue = false;
- } else if (!ctx->poll_multi_queue) {
- struct io_kiocb *list_req;
-
- list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
- comp_list);
- if (list_req->file != req->file)
- ctx->poll_multi_queue = true;
- }
-
- /*
- * For fast devices, IO may have already completed. If it has, add
- * it to the front so we find it first.
- */
- if (READ_ONCE(req->iopoll_completed))
- wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
- else
- wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
-
- if (unlikely(needs_lock)) {
- /*
-		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
-		 * in sq thread task context or in io worker task context. If
-		 * the current task context is the sq thread, we don't need to
-		 * check whether we should wake up the sq thread.
- */
- if ((ctx->flags & IORING_SETUP_SQPOLL) &&
- wq_has_sleeper(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
-
- mutex_unlock(&ctx->uring_lock);
- }
-}
-
-static bool io_bdev_nowait(struct block_device *bdev)
-{
- return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
-}
-
-/*
- * If we tracked the file through the SCM inflight mechanism, we could support
- * any file. For now, just ensure that anything potentially problematic is done
- * inline.
- */
-static bool __io_file_supports_nowait(struct file *file, umode_t mode)
-{
- if (S_ISBLK(mode)) {
- if (IS_ENABLED(CONFIG_BLOCK) &&
- io_bdev_nowait(I_BDEV(file->f_mapping->host)))
- return true;
- return false;
- }
- if (S_ISSOCK(mode))
- return true;
- if (S_ISREG(mode)) {
- if (IS_ENABLED(CONFIG_BLOCK) &&
- io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
- file->f_op != &io_uring_fops)
- return true;
- return false;
- }
-
- /* any ->read/write should understand O_NONBLOCK */
- if (file->f_flags & O_NONBLOCK)
- return true;
- return file->f_mode & FMODE_NOWAIT;
-}
-
-/*
- * Compute the FFS_* flags for a file: whether it is a regular file, and
- * whether it supports nowait issue.
- */
-static unsigned int io_file_get_flags(struct file *file)
-{
- umode_t mode = file_inode(file)->i_mode;
- unsigned int res = 0;
-
- if (S_ISREG(mode))
- res |= FFS_ISREG;
- if (__io_file_supports_nowait(file, mode))
- res |= FFS_NOWAIT;
- return res;
-}
-
-static inline bool io_file_supports_nowait(struct io_kiocb *req)
-{
- return req->flags & REQ_F_SUPPORT_NOWAIT;
-}
-
-static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct kiocb *kiocb = &req->rw.kiocb;
- struct file *file = req->file;
- unsigned ioprio;
- int ret;
-
- if (!io_req_ffs_set(req))
- req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
-
- kiocb->ki_pos = READ_ONCE(sqe->off);
- kiocb->ki_flags = iocb_flags(file);
- ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
- if (unlikely(ret))
- return ret;
-
- /*
- * If the file is marked O_NONBLOCK, still allow retry for it if it
- * supports async. Otherwise it's impossible to use O_NONBLOCK files
-	 * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
- */
- if ((kiocb->ki_flags & IOCB_NOWAIT) ||
- ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
- req->flags |= REQ_F_NOWAIT;
-
- if (ctx->flags & IORING_SETUP_IOPOLL) {
- if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
- return -EOPNOTSUPP;
-
- kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
- kiocb->ki_complete = io_complete_rw_iopoll;
- req->iopoll_completed = 0;
- } else {
- if (kiocb->ki_flags & IOCB_HIPRI)
- return -EINVAL;
- kiocb->ki_complete = io_complete_rw;
- }
-
- ioprio = READ_ONCE(sqe->ioprio);
- if (ioprio) {
- ret = ioprio_check_cap(ioprio);
- if (ret)
- return ret;
-
- kiocb->ki_ioprio = ioprio;
- } else {
- kiocb->ki_ioprio = get_current_ioprio();
- }
-
- req->imu = NULL;
- req->rw.addr = READ_ONCE(sqe->addr);
- req->rw.len = READ_ONCE(sqe->len);
- req->buf_index = READ_ONCE(sqe->buf_index);
- return 0;
-}
-
-static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
-{
- switch (ret) {
- case -EIOCBQUEUED:
- break;
- case -ERESTARTSYS:
- case -ERESTARTNOINTR:
- case -ERESTARTNOHAND:
- case -ERESTART_RESTARTBLOCK:
- /*
- * We can't just restart the syscall, since previously
- * submitted sqes may already be in progress. Just fail this
- * IO with EINTR.
- */
- ret = -EINTR;
- fallthrough;
- default:
- kiocb->ki_complete(kiocb, ret);
- }
-}
-
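-/*
- * Resolve the position to use for this request: a ki_pos of -1 means use
- * (and later update) the file's ->f_pos for non-stream files, while stream
- * files have no position at all.
- */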
-static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
-{
- struct kiocb *kiocb = &req->rw.kiocb;
- bool is_stream = req->file->f_mode & FMODE_STREAM;
-
- if (kiocb->ki_pos == -1) {
- if (!is_stream) {
- req->flags |= REQ_F_CUR_POS;
- kiocb->ki_pos = req->file->f_pos;
- return &kiocb->ki_pos;
- } else {
- kiocb->ki_pos = 0;
- return NULL;
- }
- }
- return is_stream ? NULL : &kiocb->ki_pos;
-}
-
-static void kiocb_done(struct io_kiocb *req, ssize_t ret,
- unsigned int issue_flags)
-{
- struct io_async_rw *io = req->async_data;
-
- /* add previously done IO, if any */
- if (req_has_async_data(req) && io->bytes_done > 0) {
- if (ret < 0)
- ret = io->bytes_done;
- else
- ret += io->bytes_done;
- }
-
- if (req->flags & REQ_F_CUR_POS)
- req->file->f_pos = req->rw.kiocb.ki_pos;
- if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw))
- __io_complete_rw(req, ret, issue_flags);
- else
- io_rw_done(&req->rw.kiocb, ret);
-
- if (req->flags & REQ_F_REISSUE) {
- req->flags &= ~REQ_F_REISSUE;
- if (io_resubmit_prep(req))
- io_req_task_queue_reissue(req);
- else
- io_req_task_queue_fail(req, ret);
- }
-}
-
-static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
- struct io_mapped_ubuf *imu)
-{
- size_t len = req->rw.len;
- u64 buf_end, buf_addr = req->rw.addr;
- size_t offset;
-
- if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
- return -EFAULT;
- /* not inside the mapped region */
- if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
- return -EFAULT;
-
- /*
- * May not be the start of the buffer; set the size appropriately
- * and advance us to the beginning.
- */
- offset = buf_addr - imu->ubuf;
- iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
-
- if (offset) {
- /*
- * Don't use iov_iter_advance() here, as it's really slow for
- * using the latter parts of a big fixed buffer - it iterates
- * over each segment manually. We can cheat a bit here, because
- * we know that:
- *
- * 1) it's a BVEC iter, we set it up
- * 2) all bvecs are PAGE_SIZE in size, except potentially the
- * first and last bvec
- *
- * So just find our index, and adjust the iterator afterwards.
- * If the offset is within the first bvec (or the whole first
- * bvec), just use iov_iter_advance(). This makes it easier
- * since we can just skip the first segment, which may not
- * be PAGE_SIZE aligned.
- */
- const struct bio_vec *bvec = imu->bvec;
-
- if (offset <= bvec->bv_len) {
- iov_iter_advance(iter, offset);
- } else {
- unsigned long seg_skip;
-
- /* skip first vec */
- offset -= bvec->bv_len;
- seg_skip = 1 + (offset >> PAGE_SHIFT);
-
- iter->bvec = bvec + seg_skip;
- iter->nr_segs -= seg_skip;
- iter->count -= bvec->bv_len + offset;
- iter->iov_offset = offset & ~PAGE_MASK;
- }
- }
-
- return 0;
-}
-
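[Editorial aside: the segment-skip arithmetic in the comment above can be checked in isolation. A self-contained sketch, assuming page-sized middle segments as the comment states; names are illustrative, not kernel API.]

        #include <assert.h>
        #include <stddef.h>

        #define PAGE_SHIFT 12
        #define PAGE_SIZE  (1UL << PAGE_SHIFT)
        #define PAGE_MASK  (~(PAGE_SIZE - 1))

        /* Given an offset into a buffer whose first segment holds first_len
         * bytes and whose later segments are page sized, find the segment the
         * offset lands in and the offset within it, without walking the list. */
        static void locate_segment(size_t offset, size_t first_len,
                                   size_t *seg, size_t *seg_off)
        {
                if (offset <= first_len) {
                        *seg = 0;
                        *seg_off = offset;
                } else {
                        offset -= first_len;            /* skip the first vec */
                        *seg = 1 + (offset >> PAGE_SHIFT);
                        *seg_off = offset & ~PAGE_MASK;
                }
        }

        int main(void)
        {
                size_t seg, off;

                /* 100-byte first vec; land 3 pages + 7 bytes past it */
                locate_segment(100 + 3 * PAGE_SIZE + 7, 100, &seg, &off);
                assert(seg == 4 && off == 7);
                return 0;
        }
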
-static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
-{
- struct io_mapped_ubuf *imu = req->imu;
- u16 index, buf_index = req->buf_index;
-
- if (likely(!imu)) {
- struct io_ring_ctx *ctx = req->ctx;
-
- if (unlikely(buf_index >= ctx->nr_user_bufs))
- return -EFAULT;
- io_req_set_rsrc_node(req, ctx);
- index = array_index_nospec(buf_index, ctx->nr_user_bufs);
- imu = READ_ONCE(ctx->user_bufs[index]);
- req->imu = imu;
- }
- return __io_import_fixed(req, rw, iter, imu);
-}
-
-static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
-{
- if (needs_lock)
- mutex_unlock(&ctx->uring_lock);
-}
-
-static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
-{
- /*
- * "Normal" inline submissions always hold the uring_lock, since we
- * grab it from the system call. Same is true for the SQPOLL offload.
- * The only exception is when we've detached the request and issue it
- * from an async worker thread, grab the lock for that case.
- */
- if (needs_lock)
- mutex_lock(&ctx->uring_lock);
-}
-
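[Editorial aside: the same conditional-locking shape is easy to reproduce in userspace. A minimal sketch with pthreads, purely to illustrate the pattern the two helpers above implement.]

        #include <pthread.h>
        #include <stdbool.h>

        /* Lock/unlock only when the caller does not already hold the mutex,
         * e.g. the detached async-worker path described in the comment above. */
        static void submit_lock(pthread_mutex_t *m, bool needs_lock)
        {
                if (needs_lock)
                        pthread_mutex_lock(m);
        }

        static void submit_unlock(pthread_mutex_t *m, bool needs_lock)
        {
                if (needs_lock)
                        pthread_mutex_unlock(m);
        }
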
-static void io_buffer_add_list(struct io_ring_ctx *ctx,
- struct io_buffer_list *bl, unsigned int bgid)
-{
- struct list_head *list;
-
- list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
- INIT_LIST_HEAD(&bl->buf_list);
- bl->bgid = bgid;
- list_add(&bl->list, list);
-}
-
-static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
- int bgid, unsigned int issue_flags)
-{
- struct io_buffer *kbuf = req->kbuf;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
-
- if (req->flags & REQ_F_BUFFER_SELECTED)
- return kbuf;
-
- io_ring_submit_lock(ctx, needs_lock);
-
- lockdep_assert_held(&ctx->uring_lock);
-
- bl = io_buffer_get_list(ctx, bgid);
- if (bl && !list_empty(&bl->buf_list)) {
- kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
- list_del(&kbuf->list);
- if (*len > kbuf->len)
- *len = kbuf->len;
- req->flags |= REQ_F_BUFFER_SELECTED;
- req->kbuf = kbuf;
- } else {
- kbuf = ERR_PTR(-ENOBUFS);
- }
-
- io_ring_submit_unlock(req->ctx, needs_lock);
- return kbuf;
-}
-
-static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
- unsigned int issue_flags)
-{
- struct io_buffer *kbuf;
- u16 bgid;
-
- bgid = req->buf_index;
- kbuf = io_buffer_select(req, len, bgid, issue_flags);
- if (IS_ERR(kbuf))
- return kbuf;
- return u64_to_user_ptr(kbuf->addr);
-}
-
-#ifdef CONFIG_COMPAT
-static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
- unsigned int issue_flags)
-{
- struct compat_iovec __user *uiov;
- compat_ssize_t clen;
- void __user *buf;
- ssize_t len;
-
- uiov = u64_to_user_ptr(req->rw.addr);
- if (!access_ok(uiov, sizeof(*uiov)))
- return -EFAULT;
- if (__get_user(clen, &uiov->iov_len))
- return -EFAULT;
- if (clen < 0)
- return -EINVAL;
-
- len = clen;
- buf = io_rw_buffer_select(req, &len, issue_flags);
- if (IS_ERR(buf))
- return PTR_ERR(buf);
- iov[0].iov_base = buf;
- iov[0].iov_len = (compat_size_t) len;
- return 0;
-}
-#endif
-
-static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- unsigned int issue_flags)
-{
- struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
- void __user *buf;
- ssize_t len;
-
- if (copy_from_user(iov, uiov, sizeof(*uiov)))
- return -EFAULT;
-
- len = iov[0].iov_len;
- if (len < 0)
- return -EINVAL;
- buf = io_rw_buffer_select(req, &len, issue_flags);
- if (IS_ERR(buf))
- return PTR_ERR(buf);
- iov[0].iov_base = buf;
- iov[0].iov_len = len;
- return 0;
-}
-
-static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
- unsigned int issue_flags)
-{
- if (req->flags & REQ_F_BUFFER_SELECTED) {
- struct io_buffer *kbuf = req->kbuf;
-
- iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
- iov[0].iov_len = kbuf->len;
- return 0;
- }
- if (req->rw.len != 1)
- return -EINVAL;
-
-#ifdef CONFIG_COMPAT
- if (req->ctx->compat)
- return io_compat_import(req, iov, issue_flags);
-#endif
-
- return __io_iov_buffer_select(req, iov, issue_flags);
-}
-
-static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
- struct io_rw_state *s,
- unsigned int issue_flags)
-{
- struct iov_iter *iter = &s->iter;
- u8 opcode = req->opcode;
- struct iovec *iovec;
- void __user *buf;
- size_t sqe_len;
- ssize_t ret;
-
- if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
- ret = io_import_fixed(req, rw, iter);
- if (ret)
- return ERR_PTR(ret);
- return NULL;
- }
-
- /* buffer index only valid with fixed read/write, or buffer select */
- if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
- return ERR_PTR(-EINVAL);
-
- buf = u64_to_user_ptr(req->rw.addr);
- sqe_len = req->rw.len;
-
- if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
- if (req->flags & REQ_F_BUFFER_SELECT) {
- buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
- if (IS_ERR(buf))
- return ERR_CAST(buf);
- req->rw.len = sqe_len;
- }
-
- ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
- if (ret)
- return ERR_PTR(ret);
- return NULL;
- }
-
- iovec = s->fast_iov;
- if (req->flags & REQ_F_BUFFER_SELECT) {
- ret = io_iov_buffer_select(req, iovec, issue_flags);
- if (ret)
- return ERR_PTR(ret);
- iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
- return NULL;
- }
-
- ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
- req->ctx->compat);
- if (unlikely(ret < 0))
- return ERR_PTR(ret);
- return iovec;
-}
-
-static inline int io_import_iovec(int rw, struct io_kiocb *req,
- struct iovec **iovec, struct io_rw_state *s,
- unsigned int issue_flags)
-{
- *iovec = __io_import_iovec(rw, req, s, issue_flags);
- if (unlikely(IS_ERR(*iovec)))
- return PTR_ERR(*iovec);
-
- iov_iter_save_state(&s->iter, &s->iter_state);
- return 0;
-}
-
-static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
-{
- return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
-}
-
-/*
- * For files that don't have ->read_iter() and ->write_iter(), handle them
- * by looping over ->read() or ->write() manually.
- */
-static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
-{
- struct kiocb *kiocb = &req->rw.kiocb;
- struct file *file = req->file;
- ssize_t ret = 0;
- loff_t *ppos;
-
- /*
- * Don't support polled IO through this interface, and we can't
- * support non-blocking either. For the latter, this just causes
- * the kiocb to be handled from an async context.
- */
- if (kiocb->ki_flags & IOCB_HIPRI)
- return -EOPNOTSUPP;
- if ((kiocb->ki_flags & IOCB_NOWAIT) &&
- !(kiocb->ki_filp->f_flags & O_NONBLOCK))
- return -EAGAIN;
-
- ppos = io_kiocb_ppos(kiocb);
-
- while (iov_iter_count(iter)) {
- struct iovec iovec;
- ssize_t nr;
-
- if (!iov_iter_is_bvec(iter)) {
- iovec = iov_iter_iovec(iter);
- } else {
- iovec.iov_base = u64_to_user_ptr(req->rw.addr);
- iovec.iov_len = req->rw.len;
- }
-
- if (rw == READ) {
- nr = file->f_op->read(file, iovec.iov_base,
- iovec.iov_len, ppos);
- } else {
- nr = file->f_op->write(file, iovec.iov_base,
- iovec.iov_len, ppos);
- }
-
- if (nr < 0) {
- if (!ret)
- ret = nr;
- break;
- }
- ret += nr;
- if (!iov_iter_is_bvec(iter)) {
- iov_iter_advance(iter, nr);
- } else {
- req->rw.addr += nr;
- req->rw.len -= nr;
- if (!req->rw.len)
- break;
- }
- if (nr != iovec.iov_len)
- break;
- }
-
- return ret;
-}
-
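[Editorial aside: for a single flat buffer, the loop above has a familiar userspace shape. A sketch assuming a plain fd, keeping partial progress and stopping on a short read just as the kernel loop does.]

        #include <sys/types.h>
        #include <unistd.h>

        static ssize_t read_full(int fd, char *buf, size_t len)
        {
                size_t done = 0;

                while (done < len) {
                        size_t want = len - done;
                        ssize_t nr = read(fd, buf + done, want);

                        if (nr < 0)
                                return done ? (ssize_t)done : -1; /* keep progress */
                        done += nr;
                        if ((size_t)nr < want)
                                break;  /* short read or EOF: return what we have */
                }
                return done;
        }
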
-static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
- const struct iovec *fast_iov, struct iov_iter *iter)
-{
- struct io_async_rw *rw = req->async_data;
-
- memcpy(&rw->s.iter, iter, sizeof(*iter));
- rw->free_iovec = iovec;
- rw->bytes_done = 0;
- /* can only be fixed buffers, no need to do anything */
- if (iov_iter_is_bvec(iter))
- return;
- if (!iovec) {
- unsigned iov_off = 0;
-
- rw->s.iter.iov = rw->s.fast_iov;
- if (iter->iov != fast_iov) {
- iov_off = iter->iov - fast_iov;
- rw->s.iter.iov += iov_off;
- }
- if (rw->s.fast_iov != fast_iov)
- memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
- sizeof(struct iovec) * iter->nr_segs);
- } else {
- req->flags |= REQ_F_NEED_CLEANUP;
- }
-}
-
-static inline bool io_alloc_async_data(struct io_kiocb *req)
-{
- WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
- req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
- if (req->async_data) {
- req->flags |= REQ_F_ASYNC_DATA;
- return false;
- }
- return true;
-}
-
-static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
- struct io_rw_state *s, bool force)
-{
- if (!force && !io_op_defs[req->opcode].needs_async_setup)
- return 0;
- if (!req_has_async_data(req)) {
- struct io_async_rw *iorw;
-
- if (io_alloc_async_data(req)) {
- kfree(iovec);
- return -ENOMEM;
- }
-
- io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
- iorw = req->async_data;
- /* we've copied and mapped the iter, ensure state is saved */
- iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
- }
- return 0;
-}
-
-static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
-{
- struct io_async_rw *iorw = req->async_data;
- struct iovec *iov;
- int ret;
-
- /* submission path, ->uring_lock should already be taken */
- ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
- if (unlikely(ret < 0))
- return ret;
-
- iorw->bytes_done = 0;
- iorw->free_iovec = iov;
- if (iov)
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- if (unlikely(!(req->file->f_mode & FMODE_READ)))
- return -EBADF;
- return io_prep_rw(req, sqe);
-}
-
-/*
- * This is our waitqueue callback handler, registered through __folio_lock_async()
- * when we initially tried to do the IO with the iocb and armed our waitqueue.
- * This gets called when the page is unlocked, and we generally expect that to
- * happen when the page IO is completed and the page is now uptodate. This will
- * queue a task_work based retry of the operation, attempting to copy the data
- * again. If the latter fails because the page was NOT uptodate, then we will
- * do a thread based blocking retry of the operation. That's the unexpected
- * slow path.
- */
-static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
- int sync, void *arg)
-{
- struct wait_page_queue *wpq;
- struct io_kiocb *req = wait->private;
- struct wait_page_key *key = arg;
-
- wpq = container_of(wait, struct wait_page_queue, wait);
-
- if (!wake_page_match(wpq, key))
- return 0;
-
- req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
- list_del_init(&wait->entry);
- io_req_task_queue(req);
- return 1;
-}
-
-/*
- * This controls whether a given IO request should be armed for async page
- * based retry. If we return false here, the request is handed to the async
- * worker threads for retry. If we're doing buffered reads on a regular file,
- * we prepare a private wait_page_queue entry and retry the operation. This
- * will either succeed because the page is now uptodate and unlocked, or it
- * will register a callback when the page is unlocked at IO completion. Through
- * that callback, io_uring uses task_work to set up a retry of the operation.
- * That retry will attempt the buffered read again. The retry will generally
- * succeed, or in rare cases where it fails, we then fall back to using the
- * async worker threads for a blocking retry.
- */
-static bool io_rw_should_retry(struct io_kiocb *req)
-{
- struct io_async_rw *rw = req->async_data;
- struct wait_page_queue *wait = &rw->wpq;
- struct kiocb *kiocb = &req->rw.kiocb;
-
- /* never retry for NOWAIT, we just complete with -EAGAIN */
- if (req->flags & REQ_F_NOWAIT)
- return false;
-
- /* Only for buffered IO */
- if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
- return false;
-
- /*
- * just use poll if we can, and don't attempt if the fs doesn't
- * support callback based unlocks
- */
- if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
- return false;
-
- wait->wait.func = io_async_buf_func;
- wait->wait.private = req;
- wait->wait.flags = 0;
- INIT_LIST_HEAD(&wait->wait.entry);
- kiocb->ki_flags |= IOCB_WAITQ;
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- kiocb->ki_waitq = wait;
- return true;
-}
-
-static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
-{
- if (likely(req->file->f_op->read_iter))
- return call_read_iter(req->file, &req->rw.kiocb, iter);
- else if (req->file->f_op->read)
- return loop_rw_iter(READ, req, iter);
- else
- return -EINVAL;
-}
-
-static bool need_read_all(struct io_kiocb *req)
-{
- return req->flags & REQ_F_ISREG ||
- S_ISBLK(file_inode(req->file)->i_mode);
-}
-
-static int io_read(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_rw_state __s, *s = &__s;
- struct iovec *iovec;
- struct kiocb *kiocb = &req->rw.kiocb;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- struct io_async_rw *rw;
- ssize_t ret, ret2;
- loff_t *ppos;
-
- if (!req_has_async_data(req)) {
- ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
- if (unlikely(ret < 0))
- return ret;
- } else {
- /*
- * Safe and required to re-import if we're using provided
- * buffers, as we dropped the selected one before retry.
- */
- if (req->flags & REQ_F_BUFFER_SELECT) {
- ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
- if (unlikely(ret < 0))
- return ret;
- }
-
- rw = req->async_data;
- s = &rw->s;
- /*
- * We come here from an earlier attempt, restore our state to
- * match in case it doesn't. It's cheap enough that we don't
- * need to make this conditional.
- */
- iov_iter_restore(&s->iter, &s->iter_state);
- iovec = NULL;
- }
- req->result = iov_iter_count(&s->iter);
-
- if (force_nonblock) {
- /* If the file doesn't support async, just async punt */
- if (unlikely(!io_file_supports_nowait(req))) {
- ret = io_setup_async_rw(req, iovec, s, true);
- return ret ?: -EAGAIN;
- }
- kiocb->ki_flags |= IOCB_NOWAIT;
- } else {
- /* Ensure we clear previously set non-block flag */
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- }
-
- ppos = io_kiocb_update_pos(req);
-
- ret = rw_verify_area(READ, req->file, ppos, req->result);
- if (unlikely(ret)) {
- kfree(iovec);
- return ret;
- }
-
- ret = io_iter_do_read(req, &s->iter);
-
- if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
- req->flags &= ~REQ_F_REISSUE;
- /* if we can poll, just do that */
- if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
- return -EAGAIN;
- /* IOPOLL retry should happen for io-wq threads */
- if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
- goto done;
- /* no retry on NONBLOCK nor RWF_NOWAIT */
- if (req->flags & REQ_F_NOWAIT)
- goto done;
- ret = 0;
- } else if (ret == -EIOCBQUEUED) {
- goto out_free;
- } else if (ret == req->result || ret <= 0 || !force_nonblock ||
- (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
- /* read all, failed, already did sync or don't want to retry */
- goto done;
- }
-
- /*
- * Don't depend on the iter state matching what was consumed, or being
- * untouched in case of error. Restore it and we'll advance it
- * manually if we need to.
- */
- iov_iter_restore(&s->iter, &s->iter_state);
-
- ret2 = io_setup_async_rw(req, iovec, s, true);
- if (ret2)
- return ret2;
-
- iovec = NULL;
- rw = req->async_data;
- s = &rw->s;
- /*
- * Now use our persistent iterator and state, if we aren't already.
- * We've restored and mapped the iter to match.
- */
-
- do {
- /*
- * We end up here because of a partial read, either from
- * above or inside this loop. Advance the iter by the bytes
- * that were consumed.
- */
- iov_iter_advance(&s->iter, ret);
- if (!iov_iter_count(&s->iter))
- break;
- rw->bytes_done += ret;
- iov_iter_save_state(&s->iter, &s->iter_state);
-
- /* if we can retry, do so with the callbacks armed */
- if (!io_rw_should_retry(req)) {
- kiocb->ki_flags &= ~IOCB_WAITQ;
- return -EAGAIN;
- }
-
- /*
- * Now retry read with the IOCB_WAITQ parts set in the iocb. If
- * we get -EIOCBQUEUED, then we'll get a notification when the
- * desired page gets unlocked. We can also get a partial read
- * here, and if we do, then just retry at the new offset.
- */
- ret = io_iter_do_read(req, &s->iter);
- if (ret == -EIOCBQUEUED)
- return 0;
- /* we got some bytes, but not all. retry. */
- kiocb->ki_flags &= ~IOCB_WAITQ;
- iov_iter_restore(&s->iter, &s->iter_state);
- } while (ret > 0);
-done:
- kiocb_done(req, ret, issue_flags);
-out_free:
- /* it's faster to check here than to delegate to kfree */
- if (iovec)
- kfree(iovec);
- return 0;
-}
-
-static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
- return -EBADF;
- return io_prep_rw(req, sqe);
-}
-
-static int io_write(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_rw_state __s, *s = &__s;
- struct iovec *iovec;
- struct kiocb *kiocb = &req->rw.kiocb;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- ssize_t ret, ret2;
- loff_t *ppos;
-
- if (!req_has_async_data(req)) {
- ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
- if (unlikely(ret < 0))
- return ret;
- } else {
- struct io_async_rw *rw = req->async_data;
-
- s = &rw->s;
- iov_iter_restore(&s->iter, &s->iter_state);
- iovec = NULL;
- }
- req->result = iov_iter_count(&s->iter);
-
- if (force_nonblock) {
- /* If the file doesn't support async, just async punt */
- if (unlikely(!io_file_supports_nowait(req)))
- goto copy_iov;
-
- /* file path doesn't support NOWAIT for non-direct_IO */
- if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
- (req->flags & REQ_F_ISREG))
- goto copy_iov;
-
- kiocb->ki_flags |= IOCB_NOWAIT;
- } else {
- /* Ensure we clear previously set non-block flag */
- kiocb->ki_flags &= ~IOCB_NOWAIT;
- }
-
- ppos = io_kiocb_update_pos(req);
-
- ret = rw_verify_area(WRITE, req->file, ppos, req->result);
- if (unlikely(ret))
- goto out_free;
-
- /*
- * Open-code file_start_write here to grab freeze protection,
- * which will be released by another thread in
- * io_complete_rw(). Fool lockdep by telling it the lock got
- * released so that it doesn't complain about the held lock when
- * we return to userspace.
- */
- if (req->flags & REQ_F_ISREG) {
- sb_start_write(file_inode(req->file)->i_sb);
- __sb_writers_release(file_inode(req->file)->i_sb,
- SB_FREEZE_WRITE);
- }
- kiocb->ki_flags |= IOCB_WRITE;
-
- if (likely(req->file->f_op->write_iter))
- ret2 = call_write_iter(req->file, kiocb, &s->iter);
- else if (req->file->f_op->write)
- ret2 = loop_rw_iter(WRITE, req, &s->iter);
- else
- ret2 = -EINVAL;
-
- if (req->flags & REQ_F_REISSUE) {
- req->flags &= ~REQ_F_REISSUE;
- ret2 = -EAGAIN;
- }
-
- /*
- * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
- * retry them without IOCB_NOWAIT.
- */
- if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
- ret2 = -EAGAIN;
- /* no retry on NONBLOCK nor RWF_NOWAIT */
- if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
- goto done;
- if (!force_nonblock || ret2 != -EAGAIN) {
- /* IOPOLL retry should happen for io-wq threads */
- if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
- goto copy_iov;
-done:
- kiocb_done(req, ret2, issue_flags);
- } else {
-copy_iov:
- iov_iter_restore(&s->iter, &s->iter_state);
- ret = io_setup_async_rw(req, iovec, s, false);
- return ret ?: -EAGAIN;
- }
-out_free:
- /* it's reportedly faster than delegating the null check to kfree() */
- if (iovec)
- kfree(iovec);
- return ret;
-}
-
-static int io_renameat_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_rename *ren = &req->rename;
- const char __user *oldf, *newf;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
- ren->old_dfd = READ_ONCE(sqe->fd);
- oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
- newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- ren->new_dfd = READ_ONCE(sqe->len);
- ren->flags = READ_ONCE(sqe->rename_flags);
-
- ren->oldpath = getname(oldf);
- if (IS_ERR(ren->oldpath))
- return PTR_ERR(ren->oldpath);
-
- ren->newpath = getname(newf);
- if (IS_ERR(ren->newpath)) {
- putname(ren->oldpath);
- return PTR_ERR(ren->newpath);
- }
-
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_rename *ren = &req->rename;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
- ren->newpath, ren->flags);
-
- req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
- io_req_complete(req, ret);
- return 0;
-}
-
-static int io_unlinkat_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_unlink *un = &req->unlink;
- const char __user *fname;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
- sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
- un->dfd = READ_ONCE(sqe->fd);
-
- un->flags = READ_ONCE(sqe->unlink_flags);
- if (un->flags & ~AT_REMOVEDIR)
- return -EINVAL;
-
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
- un->filename = getname(fname);
- if (IS_ERR(un->filename))
- return PTR_ERR(un->filename);
-
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_unlink *un = &req->unlink;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- if (un->flags & AT_REMOVEDIR)
- ret = do_rmdir(un->dfd, un->filename);
- else
- ret = do_unlinkat(un->dfd, un->filename);
-
- req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
- io_req_complete(req, ret);
- return 0;
-}
-
-static int io_mkdirat_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_mkdir *mkd = &req->mkdir;
- const char __user *fname;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
- sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
- mkd->dfd = READ_ONCE(sqe->fd);
- mkd->mode = READ_ONCE(sqe->len);
-
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
- mkd->filename = getname(fname);
- if (IS_ERR(mkd->filename))
- return PTR_ERR(mkd->filename);
-
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_mkdir *mkd = &req->mkdir;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
-
- req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
- io_req_complete(req, ret);
- return 0;
-}
-
-static int io_symlinkat_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_symlink *sl = &req->symlink;
- const char __user *oldpath, *newpath;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
- sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
- sl->new_dfd = READ_ONCE(sqe->fd);
- oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
- newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
-
- sl->oldpath = getname(oldpath);
- if (IS_ERR(sl->oldpath))
- return PTR_ERR(sl->oldpath);
-
- sl->newpath = getname(newpath);
- if (IS_ERR(sl->newpath)) {
- putname(sl->oldpath);
- return PTR_ERR(sl->newpath);
- }
-
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_symlink *sl = &req->symlink;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
-
- req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
- io_req_complete(req, ret);
- return 0;
-}
-
-static int io_linkat_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_hardlink *lnk = &req->hardlink;
- const char __user *oldf, *newf;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
- lnk->old_dfd = READ_ONCE(sqe->fd);
- lnk->new_dfd = READ_ONCE(sqe->len);
- oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
- newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- lnk->flags = READ_ONCE(sqe->hardlink_flags);
-
- lnk->oldpath = getname(oldf);
- if (IS_ERR(lnk->oldpath))
- return PTR_ERR(lnk->oldpath);
-
- lnk->newpath = getname(newf);
- if (IS_ERR(lnk->newpath)) {
- putname(lnk->oldpath);
- return PTR_ERR(lnk->newpath);
- }
-
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_hardlink *lnk = &req->hardlink;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
- lnk->newpath, lnk->flags);
-
- req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
- io_req_complete(req, ret);
- return 0;
-}
-
-static int io_shutdown_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
-#if defined(CONFIG_NET)
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
- sqe->buf_index || sqe->splice_fd_in))
- return -EINVAL;
-
- req->shutdown.how = READ_ONCE(sqe->len);
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
-}
-
-static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
-{
-#if defined(CONFIG_NET)
- struct socket *sock;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- sock = sock_from_file(req->file);
- if (unlikely(!sock))
- return -ENOTSOCK;
-
- ret = __sys_shutdown_sock(sock, req->shutdown.how);
- if (ret < 0)
- req_set_fail(req);
- io_req_complete(req, ret);
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
-}
-
-static int __io_splice_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_splice *sp = &req->splice;
- unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
- sp->file_in = NULL;
- sp->len = READ_ONCE(sqe->len);
- sp->flags = READ_ONCE(sqe->splice_flags);
-
- if (unlikely(sp->flags & ~valid_flags))
- return -EINVAL;
-
- sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
- (sp->flags & SPLICE_F_FD_IN_FIXED));
- if (!sp->file_in)
- return -EBADF;
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_tee_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
- return -EINVAL;
- return __io_splice_prep(req, sqe);
-}
-
-static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_splice *sp = &req->splice;
- struct file *in = sp->file_in;
- struct file *out = sp->file_out;
- unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
- long ret = 0;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
- if (sp->len)
- ret = do_tee(in, out, sp->len, flags);
-
- if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
- io_put_file(in);
- req->flags &= ~REQ_F_NEED_CLEANUP;
-
- if (ret != sp->len)
- req_set_fail(req);
- io_req_complete(req, ret);
- return 0;
-}
-
-static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_splice *sp = &req->splice;
-
- sp->off_in = READ_ONCE(sqe->splice_off_in);
- sp->off_out = READ_ONCE(sqe->off);
- return __io_splice_prep(req, sqe);
-}
-
-static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_splice *sp = &req->splice;
- struct file *in = sp->file_in;
- struct file *out = sp->file_out;
- unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
- loff_t *poff_in, *poff_out;
- long ret = 0;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
- poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
-
- if (sp->len)
- ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
-
- if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
- io_put_file(in);
- req->flags &= ~REQ_F_NEED_CLEANUP;
-
- if (ret != sp->len)
- req_set_fail(req);
- io_req_complete(req, ret);
- return 0;
-}
-
-/*
- * IORING_OP_NOP just posts a completion event, nothing else.
- */
-static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
- __io_req_complete(req, issue_flags, 0, 0);
- return 0;
-}
-
-static int io_msg_ring_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
- sqe->splice_fd_in || sqe->buf_index || sqe->personality))
- return -EINVAL;
-
- req->msg.user_data = READ_ONCE(sqe->off);
- req->msg.len = READ_ONCE(sqe->len);
- return 0;
-}
-
-static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_ring_ctx *target_ctx;
- struct io_msg *msg = &req->msg;
- bool filled;
- int ret;
-
- ret = -EBADFD;
- if (req->file->f_op != &io_uring_fops)
- goto done;
-
- ret = -EOVERFLOW;
- target_ctx = req->file->private_data;
-
- spin_lock(&target_ctx->completion_lock);
- filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0);
- io_commit_cqring(target_ctx);
- spin_unlock(&target_ctx->completion_lock);
-
- if (filled) {
- io_cqring_ev_posted(target_ctx);
- ret = 0;
- }
-
-done:
- if (ret < 0)
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!req->file)
- return -EBADF;
-
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
- sqe->splice_fd_in))
- return -EINVAL;
-
- req->sync.flags = READ_ONCE(sqe->fsync_flags);
- if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
- return -EINVAL;
-
- req->sync.off = READ_ONCE(sqe->off);
- req->sync.len = READ_ONCE(sqe->len);
- return 0;
-}
-
-static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
-{
- loff_t end = req->sync.off + req->sync.len;
- int ret;
-
- /* fsync always requires a blocking context */
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = vfs_fsync_range(req->file, req->sync.off,
- end > 0 ? end : LLONG_MAX,
- req->sync.flags & IORING_FSYNC_DATASYNC);
- if (ret < 0)
- req_set_fail(req);
- io_req_complete(req, ret);
- return 0;
-}
-
-static int io_fallocate_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
- sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
- req->sync.off = READ_ONCE(sqe->off);
- req->sync.len = READ_ONCE(sqe->addr);
- req->sync.mode = READ_ONCE(sqe->len);
- return 0;
-}
-
-static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
-{
- int ret;
-
- /* fallocate always requires a blocking context */
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
- ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
- req->sync.len);
- if (ret < 0)
- req_set_fail(req);
- else
- fsnotify_modify(req->file);
- io_req_complete(req, ret);
- return 0;
-}
-
-static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- const char __user *fname;
- int ret;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->ioprio || sqe->buf_index))
- return -EINVAL;
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
- /* open.how should be already initialised */
- if (!(req->open.how.flags & O_PATH) && force_o_largefile())
- req->open.how.flags |= O_LARGEFILE;
-
- req->open.dfd = READ_ONCE(sqe->fd);
- fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
- req->open.filename = getname(fname);
- if (IS_ERR(req->open.filename)) {
- ret = PTR_ERR(req->open.filename);
- req->open.filename = NULL;
- return ret;
- }
-
- req->open.file_slot = READ_ONCE(sqe->file_index);
- if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
- return -EINVAL;
-
- req->open.nofile = rlimit(RLIMIT_NOFILE);
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- u64 mode = READ_ONCE(sqe->len);
- u64 flags = READ_ONCE(sqe->open_flags);
-
- req->open.how = build_open_how(flags, mode);
- return __io_openat_prep(req, sqe);
-}
-
-static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct open_how __user *how;
- size_t len;
- int ret;
-
- how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- len = READ_ONCE(sqe->len);
- if (len < OPEN_HOW_SIZE_VER0)
- return -EINVAL;
-
- ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
- len);
- if (ret)
- return ret;
-
- return __io_openat_prep(req, sqe);
-}
-
-static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct open_flags op;
- struct file *file;
- bool resolve_nonblock, nonblock_set;
- bool fixed = !!req->open.file_slot;
- int ret;
-
- ret = build_open_flags(&req->open.how, &op);
- if (ret)
- goto err;
- nonblock_set = op.open_flag & O_NONBLOCK;
- resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
- if (issue_flags & IO_URING_F_NONBLOCK) {
- /*
- * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
- * it'll always return -EAGAIN
- */
- if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
- return -EAGAIN;
- op.lookup_flags |= LOOKUP_CACHED;
- op.open_flag |= O_NONBLOCK;
- }
-
- if (!fixed) {
- ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
- if (ret < 0)
- goto err;
- }
-
- file = do_filp_open(req->open.dfd, req->open.filename, &op);
- if (IS_ERR(file)) {
- /*
- * We could hang on to this 'fd' on retrying, but seems like
- * marginal gain for something that is now known to be a slower
- * path. So just put it, and we'll get a new one when we retry.
- */
- if (!fixed)
- put_unused_fd(ret);
-
- ret = PTR_ERR(file);
- /* only retry if RESOLVE_CACHED wasn't already set by application */
- if (ret == -EAGAIN &&
- (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
- return -EAGAIN;
- goto err;
- }
-
- if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
- file->f_flags &= ~O_NONBLOCK;
- fsnotify_open(file);
-
- if (!fixed)
- fd_install(ret, file);
- else
- ret = io_install_fixed_file(req, file, issue_flags,
- req->open.file_slot - 1);
-err:
- putname(req->open.filename);
- req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
-{
- return io_openat2(req, issue_flags);
-}
-
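[Editorial aside: the RESOLVE_CACHED fast path used by io_openat2() above is also exposed to userspace via openat2(2). A hedged sketch: RESOLVE_CACHED needs Linux 5.12+, and openat2 has no glibc wrapper, so it is invoked via syscall().]

        #include <fcntl.h>
        #include <linux/openat2.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        /* Ask for a dcache-only lookup: fail with EAGAIN instead of blocking,
         * the same contract the LOOKUP_CACHED branch above relies on. */
        static int open_cached(const char *path)
        {
                struct open_how how = {
                        .flags   = O_RDONLY,
                        .resolve = RESOLVE_CACHED,
                };

                return syscall(SYS_openat2, AT_FDCWD, path, &how, sizeof(how));
        }
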
-static int io_remove_buffers_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_provide_buf *p = &req->pbuf;
- u64 tmp;
-
- if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
- sqe->splice_fd_in)
- return -EINVAL;
-
- tmp = READ_ONCE(sqe->fd);
- if (!tmp || tmp > USHRT_MAX)
- return -EINVAL;
-
- memset(p, 0, sizeof(*p));
- p->nbufs = tmp;
- p->bgid = READ_ONCE(sqe->buf_group);
- return 0;
-}
-
-static int __io_remove_buffers(struct io_ring_ctx *ctx,
- struct io_buffer_list *bl, unsigned nbufs)
-{
- unsigned i = 0;
-
- /* shouldn't happen */
- if (!nbufs)
- return 0;
-
- /* the head kbuf is the list itself */
- while (!list_empty(&bl->buf_list)) {
- struct io_buffer *nxt;
-
- nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
- list_del(&nxt->list);
- if (++i == nbufs)
- return i;
- cond_resched();
- }
- i++;
-
- return i;
-}
-
-static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_provide_buf *p = &req->pbuf;
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
- int ret = 0;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
-
- io_ring_submit_lock(ctx, needs_lock);
-
- lockdep_assert_held(&ctx->uring_lock);
-
- ret = -ENOENT;
- bl = io_buffer_get_list(ctx, p->bgid);
- if (bl)
- ret = __io_remove_buffers(ctx, bl, p->nbufs);
- if (ret < 0)
- req_set_fail(req);
-
- /* complete before unlock, IOPOLL may need the lock */
- __io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, needs_lock);
- return 0;
-}
-
-static int io_provide_buffers_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- unsigned long size, tmp_check;
- struct io_provide_buf *p = &req->pbuf;
- u64 tmp;
-
- if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
- return -EINVAL;
-
- tmp = READ_ONCE(sqe->fd);
- if (!tmp || tmp > USHRT_MAX)
- return -E2BIG;
- p->nbufs = tmp;
- p->addr = READ_ONCE(sqe->addr);
- p->len = READ_ONCE(sqe->len);
-
- if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
- &size))
- return -EOVERFLOW;
- if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
- return -EOVERFLOW;
-
- size = (unsigned long)p->len * p->nbufs;
- if (!access_ok(u64_to_user_ptr(p->addr), size))
- return -EFAULT;
-
- p->bgid = READ_ONCE(sqe->buf_group);
- tmp = READ_ONCE(sqe->off);
- if (tmp > USHRT_MAX)
- return -E2BIG;
- p->bid = tmp;
- return 0;
-}
-
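[Editorial aside: the overflow-checked validation above maps directly onto the GCC/Clang builtins that back check_mul_overflow()/check_add_overflow(). A standalone sketch.]

        #include <stdbool.h>
        #include <stdint.h>

        /* Reject (addr, len, nbufs) ranges whose total size or end address
         * wraps, mirroring the two checks in io_provide_buffers_prep() above. */
        static bool range_ok(uint64_t addr, uint64_t len, uint64_t nbufs)
        {
                unsigned long size, end;

                if (__builtin_mul_overflow((unsigned long)len,
                                           (unsigned long)nbufs, &size))
                        return false;           /* len * nbufs wrapped */
                if (__builtin_add_overflow((unsigned long)addr, size, &end))
                        return false;           /* addr + size wrapped */
                return true;
        }
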
-static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
-{
- struct io_buffer *buf;
- struct page *page;
- int bufs_in_page;
-
- /*
- * Completions that don't happen inline (eg not under uring_lock) will
- * add to ->io_buffers_comp. If we don't have any free buffers, check
- * the completion list and splice those entries first.
- */
- if (!list_empty_careful(&ctx->io_buffers_comp)) {
- spin_lock(&ctx->completion_lock);
- if (!list_empty(&ctx->io_buffers_comp)) {
- list_splice_init(&ctx->io_buffers_comp,
- &ctx->io_buffers_cache);
- spin_unlock(&ctx->completion_lock);
- return 0;
- }
- spin_unlock(&ctx->completion_lock);
- }
-
- /*
- * No free buffers and no completion entries either. Allocate a new
- * page worth of buffer entries and add those to our freelist.
- */
- page = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!page)
- return -ENOMEM;
-
- list_add(&page->lru, &ctx->io_buffers_pages);
-
- buf = page_address(page);
- bufs_in_page = PAGE_SIZE / sizeof(*buf);
- while (bufs_in_page) {
- list_add_tail(&buf->list, &ctx->io_buffers_cache);
- buf++;
- bufs_in_page--;
- }
-
- return 0;
-}
-
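[Editorial aside: the page-carving step above is a classic slab-style refill. A userspace sketch with malloc() standing in for alloc_page(); names are illustrative.]

        #include <stdlib.h>

        #define PAGE_SZ 4096

        struct buf_node {
                struct buf_node *next;
                /* payload fields would follow, as in struct io_buffer */
        };

        /* Carve one page-sized allocation into nodes and push them all onto
         * a singly linked freelist, returning the new list head. */
        static struct buf_node *refill_cache(struct buf_node *freelist)
        {
                char *page = malloc(PAGE_SZ);
                size_t i, n = PAGE_SZ / sizeof(struct buf_node);

                if (!page)
                        return freelist;
                for (i = 0; i < n; i++) {
                        struct buf_node *node = (struct buf_node *)page + i;

                        node->next = freelist;
                        freelist = node;
                }
                return freelist;
        }
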
-static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
- struct io_buffer_list *bl)
-{
- struct io_buffer *buf;
- u64 addr = pbuf->addr;
- int i, bid = pbuf->bid;
-
- for (i = 0; i < pbuf->nbufs; i++) {
- if (list_empty(&ctx->io_buffers_cache) &&
- io_refill_buffer_cache(ctx))
- break;
- buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
- list);
- list_move_tail(&buf->list, &bl->buf_list);
- buf->addr = addr;
- buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
- buf->bid = bid;
- buf->bgid = pbuf->bgid;
- addr += pbuf->len;
- bid++;
- cond_resched();
- }
-
- return i ? 0 : -ENOMEM;
-}
-
-static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_provide_buf *p = &req->pbuf;
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
- int ret = 0;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
-
- io_ring_submit_lock(ctx, needs_lock);
-
- lockdep_assert_held(&ctx->uring_lock);
-
- bl = io_buffer_get_list(ctx, p->bgid);
- if (unlikely(!bl)) {
- bl = kmalloc(sizeof(*bl), GFP_KERNEL);
- if (!bl) {
- ret = -ENOMEM;
- goto err;
- }
- io_buffer_add_list(ctx, bl, p->bgid);
- }
-
- ret = io_add_buffers(ctx, p, bl);
-err:
- if (ret < 0)
- req_set_fail(req);
- /* complete before unlock, IOPOLL may need the lock */
- __io_req_complete(req, issue_flags, ret, 0);
- io_ring_submit_unlock(ctx, needs_lock);
- return 0;
-}
-
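[Editorial aside: from the application side, this opcode pairs with IOSQE_BUFFER_SELECT. A hedged liburing sketch; group and buffer ids are made up for the demo, error handling elided.]

        #include <liburing.h>

        #define BGID  1
        #define NBUFS 8
        #define BSIZE 4096

        static void provide_and_read(struct io_uring *ring, int fd, char *pool)
        {
                struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

                /* hand NBUFS buffers of BSIZE bytes to group BGID, ids 0.. */
                io_uring_prep_provide_buffers(sqe, pool, BSIZE, NBUFS, BGID, 0);
                io_uring_submit(ring);

                /* a later read lets the kernel pick a buffer from the group */
                sqe = io_uring_get_sqe(ring);
                io_uring_prep_read(sqe, fd, NULL, BSIZE, 0);
                sqe->flags |= IOSQE_BUFFER_SELECT;
                sqe->buf_group = BGID;
                io_uring_submit(ring);
        }
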
-static int io_epoll_ctl_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
-#if defined(CONFIG_EPOLL)
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
- req->epoll.epfd = READ_ONCE(sqe->fd);
- req->epoll.op = READ_ONCE(sqe->len);
- req->epoll.fd = READ_ONCE(sqe->off);
-
- if (ep_op_has_event(req->epoll.op)) {
- struct epoll_event __user *ev;
-
- ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
- if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
- return -EFAULT;
- }
-
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
-}
-
-static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
-{
-#if defined(CONFIG_EPOLL)
- struct io_epoll *ie = &req->epoll;
- int ret;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
-
- ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
- if (force_nonblock && ret == -EAGAIN)
- return -EAGAIN;
-
- if (ret < 0)
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
-}
-
-static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
-#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
- if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
- req->madvise.addr = READ_ONCE(sqe->addr);
- req->madvise.len = READ_ONCE(sqe->len);
- req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
-}
-
-static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
-{
-#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
- struct io_madvise *ma = &req->madvise;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
- if (ret < 0)
- req_set_fail(req);
- io_req_complete(req, ret);
- return 0;
-#else
- return -EOPNOTSUPP;
-#endif
-}
-
-static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
- req->fadvise.offset = READ_ONCE(sqe->off);
- req->fadvise.len = READ_ONCE(sqe->len);
- req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
- return 0;
-}
-
-static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_fadvise *fa = &req->fadvise;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK) {
- switch (fa->advice) {
- case POSIX_FADV_NORMAL:
- case POSIX_FADV_RANDOM:
- case POSIX_FADV_SEQUENTIAL:
- break;
- default:
- return -EAGAIN;
- }
- }
-
- ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
- if (ret < 0)
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- const char __user *path;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- if (req->flags & REQ_F_FIXED_FILE)
- return -EBADF;
-
- req->statx.dfd = READ_ONCE(sqe->fd);
- req->statx.mask = READ_ONCE(sqe->len);
- path = u64_to_user_ptr(READ_ONCE(sqe->addr));
- req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- req->statx.flags = READ_ONCE(sqe->statx_flags);
-
- req->statx.filename = getname_flags(path,
- getname_statx_lookup_flags(req->statx.flags),
- NULL);
-
- if (IS_ERR(req->statx.filename)) {
- int ret = PTR_ERR(req->statx.filename);
-
- req->statx.filename = NULL;
- return ret;
- }
-
- req->flags |= REQ_F_NEED_CLEANUP;
- return 0;
-}
-
-static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_statx *ctx = &req->statx;
- int ret;
-
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
- ctx->buffer);
-
- if (ret < 0)
- req_set_fail(req);
- io_req_complete(req, ret);
- return 0;
-}
-
-static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
- sqe->rw_flags || sqe->buf_index)
- return -EINVAL;
- if (req->flags & REQ_F_FIXED_FILE)
- return -EBADF;
-
- req->close.fd = READ_ONCE(sqe->fd);
- req->close.file_slot = READ_ONCE(sqe->file_index);
- if (req->close.file_slot && req->close.fd)
- return -EINVAL;
-
- return 0;
-}
-
-static int io_close(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct files_struct *files = current->files;
- struct io_close *close = &req->close;
- struct fdtable *fdt;
- struct file *file = NULL;
- int ret = -EBADF;
-
- if (req->close.file_slot) {
- ret = io_close_fixed(req, issue_flags);
- goto err;
- }
-
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (close->fd >= fdt->max_fds) {
- spin_unlock(&files->file_lock);
- goto err;
- }
- file = fdt->fd[close->fd];
- if (!file || file->f_op == &io_uring_fops) {
- spin_unlock(&files->file_lock);
- file = NULL;
- goto err;
- }
-
- /* if the file has a flush method, be safe and punt to async */
- if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
- spin_unlock(&files->file_lock);
- return -EAGAIN;
- }
-
- ret = __close_fd_get_file(close->fd, &file);
- spin_unlock(&files->file_lock);
- if (ret < 0) {
- if (ret == -ENOENT)
- ret = -EBADF;
- goto err;
- }
-
- /* No ->flush() or already async, safely close from here */
- ret = filp_close(file, current->files);
-err:
- if (ret < 0)
- req_set_fail(req);
- if (file)
- fput(file);
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
- sqe->splice_fd_in))
- return -EINVAL;
-
- req->sync.off = READ_ONCE(sqe->off);
- req->sync.len = READ_ONCE(sqe->len);
- req->sync.flags = READ_ONCE(sqe->sync_range_flags);
- return 0;
-}
-
-static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
-{
- int ret;
-
- /* sync_file_range always requires a blocking context */
- if (issue_flags & IO_URING_F_NONBLOCK)
- return -EAGAIN;
-
- ret = sync_file_range(req->file, req->sync.off, req->sync.len,
- req->sync.flags);
- if (ret < 0)
- req_set_fail(req);
- io_req_complete(req, ret);
- return 0;
-}
-
-#if defined(CONFIG_NET)
-static int io_setup_async_msg(struct io_kiocb *req,
- struct io_async_msghdr *kmsg)
-{
- struct io_async_msghdr *async_msg = req->async_data;
-
- if (async_msg)
- return -EAGAIN;
- if (io_alloc_async_data(req)) {
- kfree(kmsg->free_iov);
- return -ENOMEM;
- }
- async_msg = req->async_data;
- req->flags |= REQ_F_NEED_CLEANUP;
- memcpy(async_msg, kmsg, sizeof(*kmsg));
- async_msg->msg.msg_name = &async_msg->addr;
- /* if we were using fast_iov, set it to the new one */
- if (!async_msg->free_iov)
- async_msg->msg.msg_iter.iov = async_msg->fast_iov;
-
- return -EAGAIN;
-}
-
-static int io_sendmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
-{
- iomsg->msg.msg_name = &iomsg->addr;
- iomsg->free_iov = iomsg->fast_iov;
- return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
- req->sr_msg.msg_flags, &iomsg->free_iov);
-}
-
-static int io_sendmsg_prep_async(struct io_kiocb *req)
-{
- int ret;
-
- ret = io_sendmsg_copy_hdr(req, req->async_data);
- if (!ret)
- req->flags |= REQ_F_NEED_CLEANUP;
- return ret;
-}
-
-static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_sr_msg *sr = &req->sr_msg;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
- sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
- sr->len = READ_ONCE(sqe->len);
- sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
- if (sr->msg_flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
-
-#ifdef CONFIG_COMPAT
- if (req->ctx->compat)
- sr->msg_flags |= MSG_CMSG_COMPAT;
-#endif
- return 0;
-}
-
-static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_async_msghdr iomsg, *kmsg;
- struct socket *sock;
- unsigned flags;
- int min_ret = 0;
- int ret;
-
- sock = sock_from_file(req->file);
- if (unlikely(!sock))
- return -ENOTSOCK;
-
- if (req_has_async_data(req)) {
- kmsg = req->async_data;
- } else {
- ret = io_sendmsg_copy_hdr(req, &iomsg);
- if (ret)
- return ret;
- kmsg = &iomsg;
- }
-
- flags = req->sr_msg.msg_flags;
- if (issue_flags & IO_URING_F_NONBLOCK)
- flags |= MSG_DONTWAIT;
- if (flags & MSG_WAITALL)
- min_ret = iov_iter_count(&kmsg->msg.msg_iter);
-
- ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
-
- if (ret < min_ret) {
- if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
- return io_setup_async_msg(req, kmsg);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- req_set_fail(req);
- }
- /* fast path, check for non-NULL to avoid function call */
- if (kmsg->free_iov)
- kfree(kmsg->free_iov);
- req->flags &= ~REQ_F_NEED_CLEANUP;
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int io_send(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_sr_msg *sr = &req->sr_msg;
- struct msghdr msg;
- struct iovec iov;
- struct socket *sock;
- unsigned flags;
- int min_ret = 0;
- int ret;
-
- sock = sock_from_file(req->file);
- if (unlikely(!sock))
- return -ENOTSOCK;
-
- ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
- if (unlikely(ret))
- return ret;
-
- msg.msg_name = NULL;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_namelen = 0;
-
- flags = req->sr_msg.msg_flags;
- if (issue_flags & IO_URING_F_NONBLOCK)
- flags |= MSG_DONTWAIT;
- if (flags & MSG_WAITALL)
- min_ret = iov_iter_count(&msg.msg_iter);
-
- msg.msg_flags = flags;
- ret = sock_sendmsg(sock, &msg);
- if (ret < min_ret) {
- if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- req_set_fail(req);
- }
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
-{
- struct io_sr_msg *sr = &req->sr_msg;
- struct iovec __user *uiov;
- size_t iov_len;
- int ret;
-
- ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
- &iomsg->uaddr, &uiov, &iov_len);
- if (ret)
- return ret;
-
- if (req->flags & REQ_F_BUFFER_SELECT) {
- if (iov_len > 1)
- return -EINVAL;
- if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
- return -EFAULT;
- sr->len = iomsg->fast_iov[0].iov_len;
- iomsg->free_iov = NULL;
- } else {
- iomsg->free_iov = iomsg->fast_iov;
- ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
- &iomsg->free_iov, &iomsg->msg.msg_iter,
- false);
- if (ret > 0)
- ret = 0;
- }
-
- return ret;
-}
-
-#ifdef CONFIG_COMPAT
-static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
-{
- struct io_sr_msg *sr = &req->sr_msg;
- struct compat_iovec __user *uiov;
- compat_uptr_t ptr;
- compat_size_t len;
- int ret;
-
- ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
- &ptr, &len);
- if (ret)
- return ret;
-
- uiov = compat_ptr(ptr);
- if (req->flags & REQ_F_BUFFER_SELECT) {
- compat_ssize_t clen;
-
- if (len > 1)
- return -EINVAL;
- if (!access_ok(uiov, sizeof(*uiov)))
- return -EFAULT;
- if (__get_user(clen, &uiov->iov_len))
- return -EFAULT;
- if (clen < 0)
- return -EINVAL;
- sr->len = clen;
- iomsg->free_iov = NULL;
- } else {
- iomsg->free_iov = iomsg->fast_iov;
- ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
- UIO_FASTIOV, &iomsg->free_iov,
- &iomsg->msg.msg_iter, true);
- if (ret < 0)
- return ret;
- }
-
- return 0;
-}
-#endif
-
-static int io_recvmsg_copy_hdr(struct io_kiocb *req,
- struct io_async_msghdr *iomsg)
-{
- iomsg->msg.msg_name = &iomsg->addr;
-
-#ifdef CONFIG_COMPAT
- if (req->ctx->compat)
- return __io_compat_recvmsg_copy_hdr(req, iomsg);
-#endif
-
- return __io_recvmsg_copy_hdr(req, iomsg);
-}
-
-static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
- unsigned int issue_flags)
-{
- struct io_sr_msg *sr = &req->sr_msg;
-
- return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
-}
-
-static int io_recvmsg_prep_async(struct io_kiocb *req)
-{
- int ret;
-
- ret = io_recvmsg_copy_hdr(req, req->async_data);
- if (!ret)
- req->flags |= REQ_F_NEED_CLEANUP;
- return ret;
-}
-
-static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_sr_msg *sr = &req->sr_msg;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
- sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
- sr->len = READ_ONCE(sqe->len);
- sr->bgid = READ_ONCE(sqe->buf_group);
- sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
- if (sr->msg_flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
-
-#ifdef CONFIG_COMPAT
- if (req->ctx->compat)
- sr->msg_flags |= MSG_CMSG_COMPAT;
-#endif
- sr->done_io = 0;
- return 0;
-}
-
-static bool io_net_retry(struct socket *sock, int flags)
-{
- if (!(flags & MSG_WAITALL))
- return false;
- return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
-}
-
-static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_async_msghdr iomsg, *kmsg;
- struct io_sr_msg *sr = &req->sr_msg;
- struct socket *sock;
- struct io_buffer *kbuf;
- unsigned flags;
- int ret, min_ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
-
- sock = sock_from_file(req->file);
- if (unlikely(!sock))
- return -ENOTSOCK;
-
- if (req_has_async_data(req)) {
- kmsg = req->async_data;
- } else {
- ret = io_recvmsg_copy_hdr(req, &iomsg);
- if (ret)
- return ret;
- kmsg = &iomsg;
- }
-
- if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, issue_flags);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
- kmsg->fast_iov[0].iov_len = req->sr_msg.len;
- iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
- 1, req->sr_msg.len);
- }
-
- flags = req->sr_msg.msg_flags;
- if (force_nonblock)
- flags |= MSG_DONTWAIT;
- if (flags & MSG_WAITALL)
- min_ret = iov_iter_count(&kmsg->msg.msg_iter);
-
- ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
- kmsg->uaddr, flags);
- if (ret < min_ret) {
- if (ret == -EAGAIN && force_nonblock)
- return io_setup_async_msg(req, kmsg);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- if (ret > 0 && io_net_retry(sock, flags)) {
- sr->done_io += ret;
- req->flags |= REQ_F_PARTIAL_IO;
- return io_setup_async_msg(req, kmsg);
- }
- req_set_fail(req);
- } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
- req_set_fail(req);
- }
-
- /* fast path, check for non-NULL to avoid function call */
- if (kmsg->free_iov)
- kfree(kmsg->free_iov);
- req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret >= 0)
- ret += sr->done_io;
- else if (sr->done_io)
- ret = sr->done_io;
- __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
- return 0;
-}
-
-static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_buffer *kbuf;
- struct io_sr_msg *sr = &req->sr_msg;
- struct msghdr msg;
- void __user *buf = sr->buf;
- struct socket *sock;
- struct iovec iov;
- unsigned flags;
- int ret, min_ret = 0;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
-
- sock = sock_from_file(req->file);
- if (unlikely(!sock))
- return -ENOTSOCK;
-
- if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, issue_flags);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- buf = u64_to_user_ptr(kbuf->addr);
- }
-
- ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
- if (unlikely(ret))
- goto out_free;
-
- msg.msg_name = NULL;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_namelen = 0;
- msg.msg_iocb = NULL;
- msg.msg_flags = 0;
-
- flags = req->sr_msg.msg_flags;
- if (force_nonblock)
- flags |= MSG_DONTWAIT;
- if (flags & MSG_WAITALL)
- min_ret = iov_iter_count(&msg.msg_iter);
-
- ret = sock_recvmsg(sock, &msg, flags);
- if (ret < min_ret) {
- if (ret == -EAGAIN && force_nonblock)
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- if (ret > 0 && io_net_retry(sock, flags)) {
- sr->len -= ret;
- sr->buf += ret;
- sr->done_io += ret;
- req->flags |= REQ_F_PARTIAL_IO;
- return -EAGAIN;
- }
- req_set_fail(req);
- } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
-out_free:
- req_set_fail(req);
- }
-
- if (ret >= 0)
- ret += sr->done_io;
- else if (sr->done_io)
- ret = sr->done_io;
- __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
- return 0;
-}
-
-static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_accept *accept = &req->accept;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->buf_index)
- return -EINVAL;
-
- accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
- accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- accept->flags = READ_ONCE(sqe->accept_flags);
- accept->nofile = rlimit(RLIMIT_NOFILE);
-
- accept->file_slot = READ_ONCE(sqe->file_index);
- if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
- return -EINVAL;
- if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
- return -EINVAL;
- if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
- accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
- return 0;
-}
-
-static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_accept *accept = &req->accept;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
- bool fixed = !!accept->file_slot;
- struct file *file;
- int ret, fd;
-
- if (!fixed) {
- fd = __get_unused_fd_flags(accept->flags, accept->nofile);
- if (unlikely(fd < 0))
- return fd;
- }
- file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
- accept->flags);
- if (IS_ERR(file)) {
- if (!fixed)
- put_unused_fd(fd);
- ret = PTR_ERR(file);
- if (ret == -EAGAIN && force_nonblock)
- return -EAGAIN;
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
- req_set_fail(req);
- } else if (!fixed) {
- fd_install(fd, file);
- ret = fd;
- } else {
- ret = io_install_fixed_file(req, file, issue_flags,
- accept->file_slot - 1);
- }
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
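-
-/*
- * A sketch of the file-slot convention used above: sqe->file_index == 0
- * means "allocate a normal fd", while a non-zero value N installs the
- * accepted file into fixed-file slot N - 1:
- *
- *	file_slot == 0	-> __get_unused_fd_flags() + fd_install()
- *	file_slot == N	-> io_install_fixed_file(..., N - 1)
- */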
-
-static int io_connect_prep_async(struct io_kiocb *req)
-{
- struct io_async_connect *io = req->async_data;
- struct io_connect *conn = &req->connect;
-
- return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
-}
-
-static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_connect *conn = &req->connect;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
- sqe->splice_fd_in)
- return -EINVAL;
-
- conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
- conn->addr_len = READ_ONCE(sqe->addr2);
- return 0;
-}
-
-static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_async_connect __io, *io;
- unsigned file_flags;
- int ret;
- bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
-
- if (req_has_async_data(req)) {
- io = req->async_data;
- } else {
- ret = move_addr_to_kernel(req->connect.addr,
- req->connect.addr_len,
- &__io.address);
- if (ret)
- goto out;
- io = &__io;
- }
-
- file_flags = force_nonblock ? O_NONBLOCK : 0;
-
- ret = __sys_connect_file(req->file, &io->address,
- req->connect.addr_len, file_flags);
- if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
- if (req_has_async_data(req))
- return -EAGAIN;
- if (io_alloc_async_data(req)) {
- ret = -ENOMEM;
- goto out;
- }
- memcpy(req->async_data, &__io, sizeof(__io));
- return -EAGAIN;
- }
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
-out:
- if (ret < 0)
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-#else /* !CONFIG_NET */
-#define IO_NETOP_FN(op) \
-static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
-{ \
- return -EOPNOTSUPP; \
-}
-
-#define IO_NETOP_PREP(op) \
-IO_NETOP_FN(op) \
-static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
-{ \
- return -EOPNOTSUPP; \
-} \
-
-#define IO_NETOP_PREP_ASYNC(op) \
-IO_NETOP_PREP(op) \
-static int io_##op##_prep_async(struct io_kiocb *req) \
-{ \
- return -EOPNOTSUPP; \
-}
-
-IO_NETOP_PREP_ASYNC(sendmsg);
-IO_NETOP_PREP_ASYNC(recvmsg);
-IO_NETOP_PREP_ASYNC(connect);
-IO_NETOP_PREP(accept);
-IO_NETOP_FN(send);
-IO_NETOP_FN(recv);
-#endif /* CONFIG_NET */
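-
-/*
- * For reference, a sketch of what one stub above expands to when
- * CONFIG_NET is not set, e.g. IO_NETOP_FN(send):
- *
- *	static int io_send(struct io_kiocb *req, unsigned int issue_flags)
- *	{
- *		return -EOPNOTSUPP;
- *	}
- */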
-
-#ifdef CONFIG_NET_RX_BUSY_POLL
-
-#define NAPI_TIMEOUT (60 * SEC_CONVERSION)
-
-struct napi_entry {
- struct list_head list;
- unsigned int napi_id;
- unsigned long timeout;
-};
-
-/*
- * Add the socket's busy-poll NAPI ID (from sk) to the ctx list.
- */
-static void io_add_napi(struct file *file, struct io_ring_ctx *ctx)
-{
- unsigned int napi_id;
- struct socket *sock;
- struct sock *sk;
- struct napi_entry *ne;
-
- if (!net_busy_loop_on())
- return;
-
- sock = sock_from_file(file);
- if (!sock)
- return;
-
- sk = sock->sk;
- if (!sk)
- return;
-
- napi_id = READ_ONCE(sk->sk_napi_id);
-
-	/* IDs below MIN_NAPI_ID aren't NAPI IDs, reject them */
- if (napi_id < MIN_NAPI_ID)
- return;
-
- spin_lock(&ctx->napi_lock);
- list_for_each_entry(ne, &ctx->napi_list, list) {
- if (ne->napi_id == napi_id) {
- ne->timeout = jiffies + NAPI_TIMEOUT;
- goto out;
- }
- }
-
- ne = kmalloc(sizeof(*ne), GFP_NOWAIT);
- if (!ne)
- goto out;
-
- ne->napi_id = napi_id;
- ne->timeout = jiffies + NAPI_TIMEOUT;
- list_add_tail(&ne->list, &ctx->napi_list);
-out:
- spin_unlock(&ctx->napi_lock);
-}
-
-static inline void io_check_napi_entry_timeout(struct napi_entry *ne)
-{
- if (time_after(jiffies, ne->timeout)) {
- list_del(&ne->list);
- kfree(ne);
- }
-}
-
-/*
- * Busy poll if busy polling is enabled globally and sockets that
- * support it were found.
- */
-static bool io_napi_busy_loop(struct list_head *napi_list)
-{
- struct napi_entry *ne, *n;
-
- list_for_each_entry_safe(ne, n, napi_list, list) {
- napi_busy_loop(ne->napi_id, NULL, NULL, true,
- BUSY_POLL_BUDGET);
- io_check_napi_entry_timeout(ne);
- }
- return !list_empty(napi_list);
-}
-
-static void io_free_napi_list(struct io_ring_ctx *ctx)
-{
- spin_lock(&ctx->napi_lock);
- while (!list_empty(&ctx->napi_list)) {
- struct napi_entry *ne =
- list_first_entry(&ctx->napi_list, struct napi_entry,
- list);
-
- list_del(&ne->list);
- kfree(ne);
- }
- spin_unlock(&ctx->napi_lock);
-}
-#else
-static inline void io_add_napi(struct file *file, struct io_ring_ctx *ctx)
-{
-}
-
-static inline void io_free_napi_list(struct io_ring_ctx *ctx)
-{
-}
-#endif /* CONFIG_NET_RX_BUSY_POLL */
-
-struct io_poll_table {
- struct poll_table_struct pt;
- struct io_kiocb *req;
- int nr_entries;
- int error;
-};
-
-#define IO_POLL_CANCEL_FLAG BIT(31)
-#define IO_POLL_REF_MASK GENMASK(30, 0)
-
-/*
- * If the refcount part of ->poll_refs (see IO_POLL_REF_MASK) is 0, the
- * request is free and we can bump it to acquire ownership. It's
- * disallowed to modify a request while not owning it; that prevents
- * races when enqueueing task_work and between arming poll and wakeups.
- */
-static inline bool io_poll_get_ownership(struct io_kiocb *req)
-{
- return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
-}
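-
-/*
- * A sketch of the protocol: atomic_fetch_inc() returns the old value, so
- * only the caller that bumps the refcount part up from zero wins:
- *
- *	refs == 0: fetch_inc -> 0, ownership acquired
- *	refs == 1: fetch_inc -> 1, someone else owns it, hands off the req
- *
- * IO_POLL_CANCEL_FLAG sits outside IO_POLL_REF_MASK, so a cancelled
- * request can still be claimed and torn down by whoever owns it.
- */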
-
-static void io_poll_mark_cancelled(struct io_kiocb *req)
-{
- atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
-}
-
-static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
-{
- /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
- if (req->opcode == IORING_OP_POLL_ADD)
- return req->async_data;
- return req->apoll->double_poll;
-}
-
-static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
-{
- if (req->opcode == IORING_OP_POLL_ADD)
- return &req->poll;
- return &req->apoll->poll;
-}
-
-static void io_poll_req_insert(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct hlist_head *list;
-
- list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
- hlist_add_head(&req->hash_node, list);
-}
-
-static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
- wait_queue_func_t wake_func)
-{
- poll->head = NULL;
-#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
- /* mask in events that we always want/need */
- poll->events = events | IO_POLL_UNMASK;
- INIT_LIST_HEAD(&poll->wait.entry);
- init_waitqueue_func_entry(&poll->wait, wake_func);
-}
-
-static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
-{
- struct wait_queue_head *head = smp_load_acquire(&poll->head);
-
- if (head) {
- spin_lock_irq(&head->lock);
- list_del_init(&poll->wait.entry);
- poll->head = NULL;
- spin_unlock_irq(&head->lock);
- }
-}
-
-static void io_poll_remove_entries(struct io_kiocb *req)
-{
-	/*
-	 * Nothing to do if neither of those flags is set. Avoid dipping
-	 * into the poll/apoll/double cachelines if we can.
-	 */
- if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
- return;
-
- /*
- * While we hold the waitqueue lock and the waitqueue is nonempty,
- * wake_up_pollfree() will wait for us. However, taking the waitqueue
- * lock in the first place can race with the waitqueue being freed.
- *
- * We solve this as eventpoll does: by taking advantage of the fact that
- * all users of wake_up_pollfree() will RCU-delay the actual free. If
- * we enter rcu_read_lock() and see that the pointer to the queue is
- * non-NULL, we can then lock it without the memory being freed out from
- * under us.
- *
- * Keep holding rcu_read_lock() as long as we hold the queue lock, in
- * case the caller deletes the entry from the queue, leaving it empty.
- * In that case, only RCU prevents the queue memory from being freed.
- */
- rcu_read_lock();
- if (req->flags & REQ_F_SINGLE_POLL)
- io_poll_remove_entry(io_poll_get_single(req));
- if (req->flags & REQ_F_DOUBLE_POLL)
- io_poll_remove_entry(io_poll_get_double(req));
- rcu_read_unlock();
-}
-
-/*
- * All poll task_work should go through this. Checks for poll events,
- * manages references, does rewait, etc.
- *
- * Returns a negative error on failure. >0 when no further action is
- * required, which is either a spurious wakeup or a multishot CQE having
- * been served. 0 when it's done with the request, in which case the
- * mask is stored in req->result.
- */
-static int io_poll_check_events(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_poll_iocb *poll = io_poll_get_single(req);
- int v;
-
- /* req->task == current here, checking PF_EXITING is safe */
- if (unlikely(req->task->flags & PF_EXITING))
- io_poll_mark_cancelled(req);
-
- do {
- v = atomic_read(&req->poll_refs);
-
- /* tw handler should be the owner, and so have some references */
- if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
- return 0;
- if (v & IO_POLL_CANCEL_FLAG)
- return -ECANCELED;
-
- if (!req->result) {
- struct poll_table_struct pt = { ._key = req->cflags };
-
- req->result = vfs_poll(req->file, &pt) & req->cflags;
- }
-
-		/* multishot, just fill a CQE and proceed */
- if (req->result && !(req->cflags & EPOLLONESHOT)) {
- __poll_t mask = mangle_poll(req->result & poll->events);
- bool filled;
-
- spin_lock(&ctx->completion_lock);
- filled = io_fill_cqe_aux(ctx, req->user_data, mask,
- IORING_CQE_F_MORE);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- if (unlikely(!filled))
- return -ECANCELED;
- io_cqring_ev_posted(ctx);
- io_add_napi(req->file, ctx);
- } else if (req->result) {
- return 0;
- }
-
- /*
- * Release all references, retry if someone tried to restart
- * task_work while we were executing it.
- */
- } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
-
- return 1;
-}
-
-static void io_poll_task_func(struct io_kiocb *req, bool *locked)
-{
- struct io_ring_ctx *ctx = req->ctx;
- int ret;
-
- ret = io_poll_check_events(req);
- if (ret > 0)
- return;
-
- if (!ret) {
- req->result = mangle_poll(req->result & req->poll.events);
- } else {
- req->result = ret;
- req_set_fail(req);
- }
-
- io_poll_remove_entries(req);
- spin_lock(&ctx->completion_lock);
- hash_del(&req->hash_node);
- __io_req_complete_post(req, req->result, 0);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
-}
-
-static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
-{
- struct io_ring_ctx *ctx = req->ctx;
- int ret;
-
- ret = io_poll_check_events(req);
- if (ret > 0)
- return;
-
- io_poll_remove_entries(req);
- spin_lock(&ctx->completion_lock);
- hash_del(&req->hash_node);
- spin_unlock(&ctx->completion_lock);
-
- if (!ret)
- io_req_task_submit(req, locked);
- else
- io_req_complete_failed(req, ret);
-}
-
-static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
-{
- req->result = mask;
- /*
- * This is useful for poll that is armed on behalf of another
- * request, and where the wakeup path could be on a different
- * CPU. We want to avoid pulling in req->apoll->events for that
- * case.
- */
- req->cflags = events;
- if (req->opcode == IORING_OP_POLL_ADD)
- req->io_task_work.func = io_poll_task_func;
- else
- req->io_task_work.func = io_apoll_task_func;
-
- trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask);
- io_req_task_work_add(req, false);
-}
-
-static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
-{
- if (io_poll_get_ownership(req))
- __io_poll_execute(req, res, events);
-}
-
-static void io_poll_cancel_req(struct io_kiocb *req)
-{
- io_poll_mark_cancelled(req);
- /* kick tw, which should complete the request */
- io_poll_execute(req, 0, 0);
-}
-
-#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1))
-#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1)
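-
-/*
- * A sketch of the tagging scheme behind these macros: bit 0 of
- * wait->private marks a double-poll entry, the remaining bits hold the
- * req pointer (struct io_kiocb is aligned, so bit 0 is always clear):
- *
- *	wait->private = (void *)((unsigned long)req | is_double);
- *	wqe_to_req(wait)	-> req, with bit 0 masked off
- *	wqe_is_double(wait)	-> is_double, bit 0 tested
- */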
-
-static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
- void *key)
-{
- struct io_kiocb *req = wqe_to_req(wait);
- struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
- wait);
- __poll_t mask = key_to_poll(key);
-
- if (unlikely(mask & POLLFREE)) {
- io_poll_mark_cancelled(req);
- /* we have to kick tw in case it's not already */
- io_poll_execute(req, 0, poll->events);
-
-		/*
-		 * If the waitqueue is being freed early but someone else
-		 * already holds ownership over it, we have to tear down the
-		 * request as best we can. That means immediately removing
-		 * the request from its waitqueue and preventing all further
-		 * accesses to the waitqueue via the request.
-		 */
- list_del_init(&poll->wait.entry);
-
- /*
- * Careful: this *must* be the last step, since as soon
- * as req->head is NULL'ed out, the request can be
- * completed and freed, since aio_poll_complete_work()
- * will no longer need to take the waitqueue lock.
- */
- smp_store_release(&poll->head, NULL);
- return 1;
- }
-
-	/* for instances that support it, check for an event match first */
- if (mask && !(mask & poll->events))
- return 0;
-
- if (io_poll_get_ownership(req)) {
- /* optional, saves extra locking for removal in tw handler */
- if (mask && poll->events & EPOLLONESHOT) {
- list_del_init(&poll->wait.entry);
- poll->head = NULL;
- if (wqe_is_double(wait))
- req->flags &= ~REQ_F_DOUBLE_POLL;
- else
- req->flags &= ~REQ_F_SINGLE_POLL;
- }
- __io_poll_execute(req, mask, poll->events);
- }
- return 1;
-}
-
-static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
- struct wait_queue_head *head,
- struct io_poll_iocb **poll_ptr)
-{
- struct io_kiocb *req = pt->req;
- unsigned long wqe_private = (unsigned long) req;
-
-	/*
-	 * The file being polled uses multiple waitqueues for poll handling
-	 * (e.g. one for read, one for write). Set up a separate io_poll_iocb
-	 * if this happens.
-	 */
- if (unlikely(pt->nr_entries)) {
- struct io_poll_iocb *first = poll;
-
- /* double add on the same waitqueue head, ignore */
- if (first->head == head)
- return;
- /* already have a 2nd entry, fail a third attempt */
- if (*poll_ptr) {
- if ((*poll_ptr)->head == head)
- return;
- pt->error = -EINVAL;
- return;
- }
-
- poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
- if (!poll) {
- pt->error = -ENOMEM;
- return;
- }
- /* mark as double wq entry */
- wqe_private |= 1;
- req->flags |= REQ_F_DOUBLE_POLL;
- io_init_poll_iocb(poll, first->events, first->wait.func);
- *poll_ptr = poll;
- if (req->opcode == IORING_OP_POLL_ADD)
- req->flags |= REQ_F_ASYNC_DATA;
- }
-
- req->flags |= REQ_F_SINGLE_POLL;
- pt->nr_entries++;
- poll->head = head;
- poll->wait.private = (void *) wqe_private;
-
- if (poll->events & EPOLLEXCLUSIVE)
- add_wait_queue_exclusive(head, &poll->wait);
- else
- add_wait_queue(head, &poll->wait);
-}
-
-static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
- struct poll_table_struct *p)
-{
- struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
-
- __io_queue_proc(&pt->req->poll, pt, head,
- (struct io_poll_iocb **) &pt->req->async_data);
-}
-
-static int __io_arm_poll_handler(struct io_kiocb *req,
- struct io_poll_iocb *poll,
- struct io_poll_table *ipt, __poll_t mask)
-{
- struct io_ring_ctx *ctx = req->ctx;
- int v;
-
- INIT_HLIST_NODE(&req->hash_node);
- io_init_poll_iocb(poll, mask, io_poll_wake);
- poll->file = req->file;
-
- ipt->pt._key = mask;
- ipt->req = req;
- ipt->error = 0;
- ipt->nr_entries = 0;
-
-	/*
-	 * Take ownership to delay any tw execution up until we're done
-	 * with poll arming; see io_poll_get_ownership().
-	 */
- atomic_set(&req->poll_refs, 1);
- mask = vfs_poll(req->file, &ipt->pt) & poll->events;
-
- if (mask && (poll->events & EPOLLONESHOT)) {
- io_poll_remove_entries(req);
- /* no one else has access to the req, forget about the ref */
- return mask;
- }
- if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
- io_poll_remove_entries(req);
- if (!ipt->error)
- ipt->error = -EINVAL;
- return 0;
- }
-
- spin_lock(&ctx->completion_lock);
- io_poll_req_insert(req);
- spin_unlock(&ctx->completion_lock);
-
- if (mask) {
- /* can't multishot if failed, just queue the event we've got */
- if (unlikely(ipt->error || !ipt->nr_entries))
- poll->events |= EPOLLONESHOT;
- __io_poll_execute(req, mask, poll->events);
- return 0;
- }
- io_add_napi(req->file, req->ctx);
-
- /*
- * Release ownership. If someone tried to queue a tw while it was
- * locked, kick it off for them.
- */
- v = atomic_dec_return(&req->poll_refs);
- if (unlikely(v & IO_POLL_REF_MASK))
- __io_poll_execute(req, 0, poll->events);
- return 0;
-}
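-
-/*
- * A sketch of the poll_refs flow during arming, as implemented above:
- *
- *	atomic_set(&req->poll_refs, 1);		own req across vfs_poll()
- *	mask = vfs_poll(req->file, &ipt->pt);	may race with a wakeup
- *	v = atomic_dec_return(&req->poll_refs);
- *	if (v & IO_POLL_REF_MASK)		a wakeup queued work while
- *		__io_poll_execute(...);		we owned it, run it now
- */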
-
-static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
- struct poll_table_struct *p)
-{
- struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
- struct async_poll *apoll = pt->req->apoll;
-
- __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
-}
-
-enum {
- IO_APOLL_OK,
- IO_APOLL_ABORTED,
- IO_APOLL_READY
-};
-
-static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
-{
- const struct io_op_def *def = &io_op_defs[req->opcode];
- struct io_ring_ctx *ctx = req->ctx;
- struct async_poll *apoll;
- struct io_poll_table ipt;
- __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
- int ret;
-
- if (!def->pollin && !def->pollout)
- return IO_APOLL_ABORTED;
- if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
- return IO_APOLL_ABORTED;
-
- if (def->pollin) {
- mask |= POLLIN | POLLRDNORM;
-
- /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
- if ((req->opcode == IORING_OP_RECVMSG) &&
- (req->sr_msg.msg_flags & MSG_ERRQUEUE))
- mask &= ~POLLIN;
- } else {
- mask |= POLLOUT | POLLWRNORM;
- }
- if (def->poll_exclusive)
- mask |= EPOLLEXCLUSIVE;
- if (!(issue_flags & IO_URING_F_UNLOCKED) &&
- !list_empty(&ctx->apoll_cache)) {
- apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
- poll.wait.entry);
- list_del_init(&apoll->poll.wait.entry);
- } else {
- apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
- if (unlikely(!apoll))
- return IO_APOLL_ABORTED;
- }
- apoll->double_poll = NULL;
- req->apoll = apoll;
- req->flags |= REQ_F_POLLED;
- ipt.pt._qproc = io_async_queue_proc;
-
- io_kbuf_recycle(req, issue_flags);
-
- ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
- if (ret || ipt.error)
- return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
-
- trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode,
- mask, apoll->poll.events);
- return IO_APOLL_OK;
-}
-
-/*
- * Returns true if we found and killed one or more poll requests
- */
-static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
- struct task_struct *tsk, bool cancel_all)
-{
- struct hlist_node *tmp;
- struct io_kiocb *req;
- bool found = false;
- int i;
-
- spin_lock(&ctx->completion_lock);
- for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
- struct hlist_head *list;
-
- list = &ctx->cancel_hash[i];
- hlist_for_each_entry_safe(req, tmp, list, hash_node) {
- if (io_match_task_safe(req, tsk, cancel_all)) {
- hlist_del_init(&req->hash_node);
- io_poll_cancel_req(req);
- found = true;
- }
- }
- }
- spin_unlock(&ctx->completion_lock);
- return found;
-}
-
-static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
- bool poll_only)
- __must_hold(&ctx->completion_lock)
-{
- struct hlist_head *list;
- struct io_kiocb *req;
-
- list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
- hlist_for_each_entry(req, list, hash_node) {
- if (sqe_addr != req->user_data)
- continue;
- if (poll_only && req->opcode != IORING_OP_POLL_ADD)
- continue;
- return req;
- }
- return NULL;
-}
-
-static bool io_poll_disarm(struct io_kiocb *req)
- __must_hold(&ctx->completion_lock)
-{
- if (!io_poll_get_ownership(req))
- return false;
- io_poll_remove_entries(req);
- hash_del(&req->hash_node);
- return true;
-}
-
-static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
- bool poll_only)
- __must_hold(&ctx->completion_lock)
-{
- struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only);
-
- if (!req)
- return -ENOENT;
- io_poll_cancel_req(req);
- return 0;
-}
-
-static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
- unsigned int flags)
-{
- u32 events;
-
- events = READ_ONCE(sqe->poll32_events);
-#ifdef __BIG_ENDIAN
- events = swahw32(events);
-#endif
- if (!(flags & IORING_POLL_ADD_MULTI))
- events |= EPOLLONESHOT;
- return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
-}
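-
-/*
- * A sketch of the parsing above for a single-shot POLLIN request on a
- * little-endian machine (on big-endian, swahw32() first swaps the
- * userspace halfwords back into place):
- *
- *	sqe->poll32_events == EPOLLIN, flags == 0 (no IORING_POLL_ADD_MULTI)
- *	-> events |= EPOLLONESHOT
- *	-> returns demangle_poll(events) | EPOLLONESHOT
- */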
-
-static int io_poll_update_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_poll_update *upd = &req->poll_update;
- u32 flags;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- flags = READ_ONCE(sqe->len);
- if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
- IORING_POLL_ADD_MULTI))
- return -EINVAL;
- /* meaningless without update */
- if (flags == IORING_POLL_ADD_MULTI)
- return -EINVAL;
-
- upd->old_user_data = READ_ONCE(sqe->addr);
- upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
- upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
-
- upd->new_user_data = READ_ONCE(sqe->off);
- if (!upd->update_user_data && upd->new_user_data)
- return -EINVAL;
- if (upd->update_events)
- upd->events = io_poll_parse_events(sqe, flags);
- else if (sqe->poll32_events)
- return -EINVAL;
-
- return 0;
-}
-
-static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- struct io_poll_iocb *poll = &req->poll;
- u32 flags;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
- return -EINVAL;
- flags = READ_ONCE(sqe->len);
- if (flags & ~IORING_POLL_ADD_MULTI)
- return -EINVAL;
- if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
- return -EINVAL;
-
- io_req_set_refcount(req);
- req->cflags = poll->events = io_poll_parse_events(sqe, flags);
- return 0;
-}
-
-static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_poll_iocb *poll = &req->poll;
- struct io_poll_table ipt;
- int ret;
-
- ipt.pt._qproc = io_poll_queue_proc;
-
- ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
- ret = ret ?: ipt.error;
- if (ret)
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *preq;
- int ret2, ret = 0;
- bool locked;
-
- spin_lock(&ctx->completion_lock);
- preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
- if (!preq || !io_poll_disarm(preq)) {
- spin_unlock(&ctx->completion_lock);
- ret = preq ? -EALREADY : -ENOENT;
- goto out;
- }
- spin_unlock(&ctx->completion_lock);
-
- if (req->poll_update.update_events || req->poll_update.update_user_data) {
-		/* only update the event mask bits, keep the behavior flags */
- if (req->poll_update.update_events) {
- preq->poll.events &= ~0xffff;
- preq->poll.events |= req->poll_update.events & 0xffff;
- preq->poll.events |= IO_POLL_UNMASK;
- }
- if (req->poll_update.update_user_data)
- preq->user_data = req->poll_update.new_user_data;
-
- ret2 = io_poll_add(preq, issue_flags);
- /* successfully updated, don't complete poll request */
- if (!ret2)
- goto out;
- }
-
- req_set_fail(preq);
- preq->result = -ECANCELED;
- locked = !(issue_flags & IO_URING_F_UNLOCKED);
- io_req_task_complete(preq, &locked);
-out:
- if (ret < 0)
- req_set_fail(req);
- /* complete update request, we're done with it */
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
-{
- struct io_timeout_data *data = container_of(timer,
- struct io_timeout_data, timer);
- struct io_kiocb *req = data->req;
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->timeout_lock, flags);
- list_del_init(&req->timeout.list);
- atomic_set(&req->ctx->cq_timeouts,
- atomic_read(&req->ctx->cq_timeouts) + 1);
- spin_unlock_irqrestore(&ctx->timeout_lock, flags);
-
- if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
- req_set_fail(req);
-
- req->result = -ETIME;
- req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req, false);
- return HRTIMER_NORESTART;
-}
-
-static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
- __u64 user_data)
- __must_hold(&ctx->timeout_lock)
-{
- struct io_timeout_data *io;
- struct io_kiocb *req;
- bool found = false;
-
- list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
- found = user_data == req->user_data;
- if (found)
- break;
- }
- if (!found)
- return ERR_PTR(-ENOENT);
-
- io = req->async_data;
- if (hrtimer_try_to_cancel(&io->timer) == -1)
- return ERR_PTR(-EALREADY);
- list_del_init(&req->timeout.list);
- return req;
-}
-
-static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
- __must_hold(&ctx->completion_lock)
- __must_hold(&ctx->timeout_lock)
-{
- struct io_kiocb *req = io_timeout_extract(ctx, user_data);
-
- if (IS_ERR(req))
- return PTR_ERR(req);
- io_req_task_queue_fail(req, -ECANCELED);
- return 0;
-}
-
-static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
-{
- switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
- case IORING_TIMEOUT_BOOTTIME:
- return CLOCK_BOOTTIME;
- case IORING_TIMEOUT_REALTIME:
- return CLOCK_REALTIME;
- default:
- /* can't happen, vetted at prep time */
- WARN_ON_ONCE(1);
- fallthrough;
- case 0:
- return CLOCK_MONOTONIC;
- }
-}
-
-static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
- struct timespec64 *ts, enum hrtimer_mode mode)
- __must_hold(&ctx->timeout_lock)
-{
- struct io_timeout_data *io;
- struct io_kiocb *req;
- bool found = false;
-
- list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
- found = user_data == req->user_data;
- if (found)
- break;
- }
- if (!found)
- return -ENOENT;
-
- io = req->async_data;
- if (hrtimer_try_to_cancel(&io->timer) == -1)
- return -EALREADY;
- hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
- io->timer.function = io_link_timeout_fn;
- hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
- return 0;
-}
-
-static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
- struct timespec64 *ts, enum hrtimer_mode mode)
- __must_hold(&ctx->timeout_lock)
-{
- struct io_kiocb *req = io_timeout_extract(ctx, user_data);
- struct io_timeout_data *data;
-
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- req->timeout.off = 0; /* noseq */
- data = req->async_data;
- list_add_tail(&req->timeout.list, &ctx->timeout_list);
- hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
- data->timer.function = io_timeout_fn;
- hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
- return 0;
-}
-
-static int io_timeout_remove_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- struct io_timeout_rem *tr = &req->timeout_rem;
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
- return -EINVAL;
-
- tr->ltimeout = false;
- tr->addr = READ_ONCE(sqe->addr);
- tr->flags = READ_ONCE(sqe->timeout_flags);
- if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
- if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
- return -EINVAL;
- if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
- tr->ltimeout = true;
- if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
- return -EINVAL;
- if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
- return -EFAULT;
- if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
- return -EINVAL;
- } else if (tr->flags) {
- /* timeout removal doesn't support flags */
- return -EINVAL;
- }
-
- return 0;
-}
-
-static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
-{
- return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
- : HRTIMER_MODE_REL;
-}
-
-/*
- * Remove or update an existing timeout command
- */
-static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_timeout_rem *tr = &req->timeout_rem;
- struct io_ring_ctx *ctx = req->ctx;
- int ret;
-
- if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
- spin_lock(&ctx->completion_lock);
- spin_lock_irq(&ctx->timeout_lock);
- ret = io_timeout_cancel(ctx, tr->addr);
- spin_unlock_irq(&ctx->timeout_lock);
- spin_unlock(&ctx->completion_lock);
- } else {
- enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
-
- spin_lock_irq(&ctx->timeout_lock);
- if (tr->ltimeout)
- ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
- else
- ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
- spin_unlock_irq(&ctx->timeout_lock);
- }
-
- if (ret < 0)
- req_set_fail(req);
- io_req_complete_post(req, ret, 0);
- return 0;
-}
-
-static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool is_timeout_link)
-{
- struct io_timeout_data *data;
- unsigned flags;
- u32 off = READ_ONCE(sqe->off);
-
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
- sqe->splice_fd_in)
- return -EINVAL;
- if (off && is_timeout_link)
- return -EINVAL;
- flags = READ_ONCE(sqe->timeout_flags);
- if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
- IORING_TIMEOUT_ETIME_SUCCESS))
- return -EINVAL;
- /* more than one clock specified is invalid, obviously */
- if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
- return -EINVAL;
-
- INIT_LIST_HEAD(&req->timeout.list);
- req->timeout.off = off;
- if (unlikely(off && !req->ctx->off_timeout_used))
- req->ctx->off_timeout_used = true;
-
- if (WARN_ON_ONCE(req_has_async_data(req)))
- return -EFAULT;
- if (io_alloc_async_data(req))
- return -ENOMEM;
-
- data = req->async_data;
- data->req = req;
- data->flags = flags;
-
- if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
- return -EFAULT;
-
- if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
- return -EINVAL;
-
- data->mode = io_translate_timeout_mode(flags);
- hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
-
- if (is_timeout_link) {
- struct io_submit_link *link = &req->ctx->submit_state.link;
-
- if (!link->head)
- return -EINVAL;
- if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
- return -EINVAL;
- req->timeout.head = link->last;
- link->last->flags |= REQ_F_ARM_LTIMEOUT;
- }
- return 0;
-}
-
-static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_timeout_data *data = req->async_data;
- struct list_head *entry;
- u32 tail, off = req->timeout.off;
-
- spin_lock_irq(&ctx->timeout_lock);
-
-	/*
-	 * sqe->off holds how many events need to occur for this timeout
-	 * event to be satisfied. If it isn't set, then this is a pure
-	 * timeout request and sequence isn't used.
-	 */
- if (io_is_timeout_noseq(req)) {
- entry = ctx->timeout_list.prev;
- goto add;
- }
-
- tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
- req->timeout.target_seq = tail + off;
-
-	/*
-	 * Update the last seq here in case io_flush_timeouts() hasn't.
-	 * This is safe because ->completion_lock is held, and submissions
-	 * and completions are never mixed in the same ->completion_lock
-	 * section.
-	 */
- ctx->cq_last_tm_flush = tail;
-
- /*
- * Insertion sort, ensuring the first entry in the list is always
- * the one we need first.
- */
- list_for_each_prev(entry, &ctx->timeout_list) {
- struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
- timeout.list);
-
- if (io_is_timeout_noseq(nxt))
- continue;
- /* nxt.seq is behind @tail, otherwise would've been completed */
- if (off >= nxt->timeout.target_seq - tail)
- break;
- }
-add:
- list_add(&req->timeout.list, entry);
- data->timer.function = io_timeout_fn;
- hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
- spin_unlock_irq(&ctx->timeout_lock);
- return 0;
-}
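-
-/*
- * A worked example of the sequence math above: with 10 non-timeout CQEs
- * posted so far (tail == 10) and a new timeout with off == 3, target_seq
- * is 13, i.e. the timeout completes after three more completions unless
- * the timer expires first. The insertion sort keeps the list ordered so
- * io_flush_timeouts() only ever has to look at the front.
- */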
-
-struct io_cancel_data {
- struct io_ring_ctx *ctx;
- u64 user_data;
-};
-
-static bool io_cancel_cb(struct io_wq_work *work, void *data)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct io_cancel_data *cd = data;
-
- return req->ctx == cd->ctx && req->user_data == cd->user_data;
-}
-
-static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
- struct io_ring_ctx *ctx)
-{
- struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
- enum io_wq_cancel cancel_ret;
- int ret = 0;
-
- if (!tctx || !tctx->io_wq)
- return -ENOENT;
-
- cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
- switch (cancel_ret) {
- case IO_WQ_CANCEL_OK:
- ret = 0;
- break;
- case IO_WQ_CANCEL_RUNNING:
- ret = -EALREADY;
- break;
- case IO_WQ_CANCEL_NOTFOUND:
- ret = -ENOENT;
- break;
- }
-
- return ret;
-}
-
-static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
-{
- struct io_ring_ctx *ctx = req->ctx;
- int ret;
-
- WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
-
- ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
-	/*
-	 * Fall through even for -EALREADY, as we may have a poll armed
-	 * that needs unarming.
-	 */
- if (!ret)
- return 0;
-
- spin_lock(&ctx->completion_lock);
- ret = io_poll_cancel(ctx, sqe_addr, false);
- if (ret != -ENOENT)
- goto out;
-
- spin_lock_irq(&ctx->timeout_lock);
- ret = io_timeout_cancel(ctx, sqe_addr);
- spin_unlock_irq(&ctx->timeout_lock);
-out:
- spin_unlock(&ctx->completion_lock);
- return ret;
-}
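-
-/*
- * Cancellation above cascades through the places a request can hide, in
- * order: the task's io-wq (not yet running), then armed poll requests,
- * then timeouts. Note that even -EALREADY from io-wq falls through, as
- * an armed poll may still need unarming.
- */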
-
-static int io_async_cancel_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
- sqe->splice_fd_in)
- return -EINVAL;
-
- req->cancel.addr = READ_ONCE(sqe->addr);
- return 0;
-}
-
-static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- u64 sqe_addr = req->cancel.addr;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- struct io_tctx_node *node;
- int ret;
-
- ret = io_try_cancel_userdata(req, sqe_addr);
- if (ret != -ENOENT)
- goto done;
-
- /* slow path, try all io-wq's */
- io_ring_submit_lock(ctx, needs_lock);
- ret = -ENOENT;
- list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
- struct io_uring_task *tctx = node->task->io_uring;
-
- ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
- if (ret != -ENOENT)
- break;
- }
- io_ring_submit_unlock(ctx, needs_lock);
-done:
- if (ret < 0)
- req_set_fail(req);
- io_req_complete_post(req, ret, 0);
- return 0;
-}
-
-static int io_rsrc_update_prep(struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
-{
- if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
- return -EINVAL;
- if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
- return -EINVAL;
-
- req->rsrc_update.offset = READ_ONCE(sqe->off);
- req->rsrc_update.nr_args = READ_ONCE(sqe->len);
- if (!req->rsrc_update.nr_args)
- return -EINVAL;
- req->rsrc_update.arg = READ_ONCE(sqe->addr);
- return 0;
-}
-
-static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- struct io_uring_rsrc_update2 up;
- int ret;
-
- up.offset = req->rsrc_update.offset;
- up.data = req->rsrc_update.arg;
- up.nr = 0;
- up.tags = 0;
- up.resv = 0;
-
- io_ring_submit_lock(ctx, needs_lock);
- ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
- &up, req->rsrc_update.nr_args);
- io_ring_submit_unlock(ctx, needs_lock);
-
- if (ret < 0)
- req_set_fail(req);
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
-}
-
-static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
- switch (req->opcode) {
- case IORING_OP_NOP:
- return 0;
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- return io_read_prep(req, sqe);
- case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- case IORING_OP_WRITE:
- return io_write_prep(req, sqe);
- case IORING_OP_POLL_ADD:
- return io_poll_add_prep(req, sqe);
- case IORING_OP_POLL_REMOVE:
- return io_poll_update_prep(req, sqe);
- case IORING_OP_FSYNC:
- return io_fsync_prep(req, sqe);
- case IORING_OP_SYNC_FILE_RANGE:
- return io_sfr_prep(req, sqe);
- case IORING_OP_SENDMSG:
- case IORING_OP_SEND:
- return io_sendmsg_prep(req, sqe);
- case IORING_OP_RECVMSG:
- case IORING_OP_RECV:
- return io_recvmsg_prep(req, sqe);
- case IORING_OP_CONNECT:
- return io_connect_prep(req, sqe);
- case IORING_OP_TIMEOUT:
- return io_timeout_prep(req, sqe, false);
- case IORING_OP_TIMEOUT_REMOVE:
- return io_timeout_remove_prep(req, sqe);
- case IORING_OP_ASYNC_CANCEL:
- return io_async_cancel_prep(req, sqe);
- case IORING_OP_LINK_TIMEOUT:
- return io_timeout_prep(req, sqe, true);
- case IORING_OP_ACCEPT:
- return io_accept_prep(req, sqe);
- case IORING_OP_FALLOCATE:
- return io_fallocate_prep(req, sqe);
- case IORING_OP_OPENAT:
- return io_openat_prep(req, sqe);
- case IORING_OP_CLOSE:
- return io_close_prep(req, sqe);
- case IORING_OP_FILES_UPDATE:
- return io_rsrc_update_prep(req, sqe);
- case IORING_OP_STATX:
- return io_statx_prep(req, sqe);
- case IORING_OP_FADVISE:
- return io_fadvise_prep(req, sqe);
- case IORING_OP_MADVISE:
- return io_madvise_prep(req, sqe);
- case IORING_OP_OPENAT2:
- return io_openat2_prep(req, sqe);
- case IORING_OP_EPOLL_CTL:
- return io_epoll_ctl_prep(req, sqe);
- case IORING_OP_SPLICE:
- return io_splice_prep(req, sqe);
- case IORING_OP_PROVIDE_BUFFERS:
- return io_provide_buffers_prep(req, sqe);
- case IORING_OP_REMOVE_BUFFERS:
- return io_remove_buffers_prep(req, sqe);
- case IORING_OP_TEE:
- return io_tee_prep(req, sqe);
- case IORING_OP_SHUTDOWN:
- return io_shutdown_prep(req, sqe);
- case IORING_OP_RENAMEAT:
- return io_renameat_prep(req, sqe);
- case IORING_OP_UNLINKAT:
- return io_unlinkat_prep(req, sqe);
- case IORING_OP_MKDIRAT:
- return io_mkdirat_prep(req, sqe);
- case IORING_OP_SYMLINKAT:
- return io_symlinkat_prep(req, sqe);
- case IORING_OP_LINKAT:
- return io_linkat_prep(req, sqe);
- case IORING_OP_MSG_RING:
- return io_msg_ring_prep(req, sqe);
- }
-
- printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
- req->opcode);
- return -EINVAL;
-}
-
-static int io_req_prep_async(struct io_kiocb *req)
-{
- if (!io_op_defs[req->opcode].needs_async_setup)
- return 0;
- if (WARN_ON_ONCE(req_has_async_data(req)))
- return -EFAULT;
- if (io_alloc_async_data(req))
- return -EAGAIN;
-
- switch (req->opcode) {
- case IORING_OP_READV:
- return io_rw_prep_async(req, READ);
- case IORING_OP_WRITEV:
- return io_rw_prep_async(req, WRITE);
- case IORING_OP_SENDMSG:
- return io_sendmsg_prep_async(req);
- case IORING_OP_RECVMSG:
- return io_recvmsg_prep_async(req);
- case IORING_OP_CONNECT:
- return io_connect_prep_async(req);
- }
- printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
- req->opcode);
- return -EFAULT;
-}
-
-static u32 io_get_sequence(struct io_kiocb *req)
-{
- u32 seq = req->ctx->cached_sq_head;
-
- /* need original cached_sq_head, but it was increased for each req */
- io_for_each_link(req, req)
- seq--;
- return seq;
-}
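-
-/*
- * A worked example: for a link of three requests, cached_sq_head was
- * bumped three times at submission, so walking the link decrements seq
- * back to the value cached_sq_head had when the head of the link was
- * submitted.
- */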
-
-static __cold void io_drain_req(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_defer_entry *de;
- int ret;
- u32 seq = io_get_sequence(req);
-
-	/* Still need to defer if there are pending reqs in the defer list. */
- spin_lock(&ctx->completion_lock);
- if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
- spin_unlock(&ctx->completion_lock);
-queue:
- ctx->drain_active = false;
- io_req_task_queue(req);
- return;
- }
- spin_unlock(&ctx->completion_lock);
-
- ret = io_req_prep_async(req);
- if (ret) {
-fail:
- io_req_complete_failed(req, ret);
- return;
- }
- io_prep_async_link(req);
- de = kmalloc(sizeof(*de), GFP_KERNEL);
- if (!de) {
- ret = -ENOMEM;
- goto fail;
- }
-
- spin_lock(&ctx->completion_lock);
- if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
- spin_unlock(&ctx->completion_lock);
- kfree(de);
- goto queue;
- }
-
- trace_io_uring_defer(ctx, req, req->user_data, req->opcode);
- de->req = req;
- de->seq = seq;
- list_add_tail(&de->list, &ctx->defer_list);
- spin_unlock(&ctx->completion_lock);
-}
-
-static void io_clean_op(struct io_kiocb *req)
-{
- if (req->flags & REQ_F_BUFFER_SELECTED) {
- spin_lock(&req->ctx->completion_lock);
- io_put_kbuf_comp(req);
- spin_unlock(&req->ctx->completion_lock);
- }
-
- if (req->flags & REQ_F_NEED_CLEANUP) {
- switch (req->opcode) {
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- case IORING_OP_WRITE: {
- struct io_async_rw *io = req->async_data;
-
- kfree(io->free_iovec);
- break;
- }
- case IORING_OP_RECVMSG:
- case IORING_OP_SENDMSG: {
- struct io_async_msghdr *io = req->async_data;
-
- kfree(io->free_iov);
- break;
- }
- case IORING_OP_SPLICE:
- case IORING_OP_TEE:
- if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
- io_put_file(req->splice.file_in);
- break;
- case IORING_OP_OPENAT:
- case IORING_OP_OPENAT2:
- if (req->open.filename)
- putname(req->open.filename);
- break;
- case IORING_OP_RENAMEAT:
- putname(req->rename.oldpath);
- putname(req->rename.newpath);
- break;
- case IORING_OP_UNLINKAT:
- putname(req->unlink.filename);
- break;
- case IORING_OP_MKDIRAT:
- putname(req->mkdir.filename);
- break;
- case IORING_OP_SYMLINKAT:
- putname(req->symlink.oldpath);
- putname(req->symlink.newpath);
- break;
- case IORING_OP_LINKAT:
- putname(req->hardlink.oldpath);
- putname(req->hardlink.newpath);
- break;
- case IORING_OP_STATX:
- if (req->statx.filename)
- putname(req->statx.filename);
- break;
- }
- }
- if ((req->flags & REQ_F_POLLED) && req->apoll) {
- kfree(req->apoll->double_poll);
- kfree(req->apoll);
- req->apoll = NULL;
- }
- if (req->flags & REQ_F_INFLIGHT) {
- struct io_uring_task *tctx = req->task->io_uring;
-
- atomic_dec(&tctx->inflight_tracked);
- }
- if (req->flags & REQ_F_CREDS)
- put_cred(req->creds);
- if (req->flags & REQ_F_ASYNC_DATA) {
- kfree(req->async_data);
- req->async_data = NULL;
- }
- req->flags &= ~IO_REQ_CLEAN_FLAGS;
-}
-
-static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
-{
- const struct cred *creds = NULL;
- int ret;
-
- if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
- creds = override_creds(req->creds);
-
- if (!io_op_defs[req->opcode].audit_skip)
- audit_uring_entry(req->opcode);
-
- switch (req->opcode) {
- case IORING_OP_NOP:
- ret = io_nop(req, issue_flags);
- break;
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- ret = io_read(req, issue_flags);
- break;
- case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- case IORING_OP_WRITE:
- ret = io_write(req, issue_flags);
- break;
- case IORING_OP_FSYNC:
- ret = io_fsync(req, issue_flags);
- break;
- case IORING_OP_POLL_ADD:
- ret = io_poll_add(req, issue_flags);
- break;
- case IORING_OP_POLL_REMOVE:
- ret = io_poll_update(req, issue_flags);
- break;
- case IORING_OP_SYNC_FILE_RANGE:
- ret = io_sync_file_range(req, issue_flags);
- break;
- case IORING_OP_SENDMSG:
- ret = io_sendmsg(req, issue_flags);
- break;
- case IORING_OP_SEND:
- ret = io_send(req, issue_flags);
- break;
- case IORING_OP_RECVMSG:
- ret = io_recvmsg(req, issue_flags);
- break;
- case IORING_OP_RECV:
- ret = io_recv(req, issue_flags);
- break;
- case IORING_OP_TIMEOUT:
- ret = io_timeout(req, issue_flags);
- break;
- case IORING_OP_TIMEOUT_REMOVE:
- ret = io_timeout_remove(req, issue_flags);
- break;
- case IORING_OP_ACCEPT:
- ret = io_accept(req, issue_flags);
- break;
- case IORING_OP_CONNECT:
- ret = io_connect(req, issue_flags);
- break;
- case IORING_OP_ASYNC_CANCEL:
- ret = io_async_cancel(req, issue_flags);
- break;
- case IORING_OP_FALLOCATE:
- ret = io_fallocate(req, issue_flags);
- break;
- case IORING_OP_OPENAT:
- ret = io_openat(req, issue_flags);
- break;
- case IORING_OP_CLOSE:
- ret = io_close(req, issue_flags);
- break;
- case IORING_OP_FILES_UPDATE:
- ret = io_files_update(req, issue_flags);
- break;
- case IORING_OP_STATX:
- ret = io_statx(req, issue_flags);
- break;
- case IORING_OP_FADVISE:
- ret = io_fadvise(req, issue_flags);
- break;
- case IORING_OP_MADVISE:
- ret = io_madvise(req, issue_flags);
- break;
- case IORING_OP_OPENAT2:
- ret = io_openat2(req, issue_flags);
- break;
- case IORING_OP_EPOLL_CTL:
- ret = io_epoll_ctl(req, issue_flags);
- break;
- case IORING_OP_SPLICE:
- ret = io_splice(req, issue_flags);
- break;
- case IORING_OP_PROVIDE_BUFFERS:
- ret = io_provide_buffers(req, issue_flags);
- break;
- case IORING_OP_REMOVE_BUFFERS:
- ret = io_remove_buffers(req, issue_flags);
- break;
- case IORING_OP_TEE:
- ret = io_tee(req, issue_flags);
- break;
- case IORING_OP_SHUTDOWN:
- ret = io_shutdown(req, issue_flags);
- break;
- case IORING_OP_RENAMEAT:
- ret = io_renameat(req, issue_flags);
- break;
- case IORING_OP_UNLINKAT:
- ret = io_unlinkat(req, issue_flags);
- break;
- case IORING_OP_MKDIRAT:
- ret = io_mkdirat(req, issue_flags);
- break;
- case IORING_OP_SYMLINKAT:
- ret = io_symlinkat(req, issue_flags);
- break;
- case IORING_OP_LINKAT:
- ret = io_linkat(req, issue_flags);
- break;
- case IORING_OP_MSG_RING:
- ret = io_msg_ring(req, issue_flags);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- if (!io_op_defs[req->opcode].audit_skip)
- audit_uring_exit(!ret, ret);
-
- if (creds)
- revert_creds(creds);
- if (ret)
- return ret;
- /* If the op doesn't have a file, we're not polling for it */
- if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
- io_iopoll_req_issued(req, issue_flags);
-
- return 0;
-}
-
-static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
- req = io_put_req_find_next(req);
- return req ? &req->work : NULL;
-}
-
-static void io_wq_submit_work(struct io_wq_work *work)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- unsigned int issue_flags = IO_URING_F_UNLOCKED;
- bool needs_poll = false;
- struct io_kiocb *timeout;
- int ret = 0;
-
- /* one will be dropped by ->io_free_work() after returning to io-wq */
- if (!(req->flags & REQ_F_REFCOUNT))
- __io_req_set_refcount(req, 2);
- else
- req_ref_get(req);
-
- timeout = io_prep_linked_timeout(req);
- if (timeout)
- io_queue_linked_timeout(timeout);
-
- /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
- if (work->flags & IO_WQ_WORK_CANCEL) {
- io_req_task_queue_fail(req, -ECANCELED);
- return;
- }
-
- if (req->flags & REQ_F_FORCE_ASYNC) {
- const struct io_op_def *def = &io_op_defs[req->opcode];
- bool opcode_poll = def->pollin || def->pollout;
-
- if (opcode_poll && file_can_poll(req->file)) {
- needs_poll = true;
- issue_flags |= IO_URING_F_NONBLOCK;
- }
- }
-
- do {
- ret = io_issue_sqe(req, issue_flags);
- if (ret != -EAGAIN)
- break;
- /*
- * We can get EAGAIN for iopolled IO even though we're
- * forcing a sync submission from here, since we can't
- * wait for request slots on the block side.
- */
- if (!needs_poll) {
- cond_resched();
- continue;
- }
-
- if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
- return;
- /* aborted or ready, in either case retry blocking */
- needs_poll = false;
- issue_flags &= ~IO_URING_F_NONBLOCK;
- } while (1);
-
- /* avoid locking problems by failing it from a clean context */
- if (ret)
- io_req_task_queue_fail(req, ret);
-}
-
-static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
- unsigned i)
-{
- return &table->files[i];
-}
-
-static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
- int index)
-{
- struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
-
- return (struct file *) (slot->file_ptr & FFS_MASK);
-}
-
-static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
-{
- unsigned long file_ptr = (unsigned long) file;
-
- file_ptr |= io_file_get_flags(file);
- file_slot->file_ptr = file_ptr;
-}
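-
-/*
- * A sketch of the tagging used above: struct file pointers are
- * word-aligned, so the low FFS_* bits are free to cache
- * io_file_get_flags() (e.g. NOWAIT support). FFS_MASK recovers the
- * pointer:
- *
- *	slot->file_ptr = (unsigned long)file | io_file_get_flags(file);
- *	file = (struct file *)(slot->file_ptr & FFS_MASK);
- */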
-
-static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd)
-{
- struct file *file;
- unsigned long file_ptr;
-
- if (unlikely((unsigned int)fd >= ctx->nr_user_files))
- return NULL;
- fd = array_index_nospec(fd, ctx->nr_user_files);
- file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
- file = (struct file *) (file_ptr & FFS_MASK);
- file_ptr &= ~FFS_MASK;
- /* mask in overlapping REQ_F and FFS bits */
- req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
- io_req_set_rsrc_node(req, ctx);
- return file;
-}
-
-static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd)
-{
- struct file *file = fget(fd);
-
- trace_io_uring_file_get(ctx, req, req->user_data, fd);
-
- /* we don't allow fixed io_uring files */
- if (file && unlikely(file->f_op == &io_uring_fops))
- io_req_track_inflight(req);
- return file;
-}
-
-static inline struct file *io_file_get(struct io_ring_ctx *ctx,
- struct io_kiocb *req, int fd, bool fixed)
-{
- if (fixed)
- return io_file_get_fixed(ctx, req, fd);
- else
- return io_file_get_normal(ctx, req, fd);
-}
-
-static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
-{
- struct io_kiocb *prev = req->timeout.prev;
- int ret = -ENOENT;
-
- if (prev) {
- if (!(req->task->flags & PF_EXITING))
- ret = io_try_cancel_userdata(req, prev->user_data);
- io_req_complete_post(req, ret ?: -ETIME, 0);
- io_put_req(prev);
- } else {
- io_req_complete_post(req, -ETIME, 0);
- }
-}
-
-static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
-{
- struct io_timeout_data *data = container_of(timer,
- struct io_timeout_data, timer);
- struct io_kiocb *prev, *req = data->req;
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->timeout_lock, flags);
- prev = req->timeout.head;
- req->timeout.head = NULL;
-
-	/*
-	 * We don't expect the list to be empty; that will only happen if we
-	 * race with the completion of the linked work.
-	 */
- if (prev) {
- io_remove_next_linked(prev);
- if (!req_ref_inc_not_zero(prev))
- prev = NULL;
- }
- list_del(&req->timeout.list);
- req->timeout.prev = prev;
- spin_unlock_irqrestore(&ctx->timeout_lock, flags);
-
- req->io_task_work.func = io_req_task_link_timeout;
- io_req_task_work_add(req, false);
- return HRTIMER_NORESTART;
-}
-
-static void io_queue_linked_timeout(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock_irq(&ctx->timeout_lock);
-	/*
-	 * If the back reference is NULL, then our linked request finished
-	 * before we got a chance to set up the timer.
-	 */
- if (req->timeout.head) {
- struct io_timeout_data *data = req->async_data;
-
- data->timer.function = io_link_timeout_fn;
- hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
- data->mode);
- list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
- }
- spin_unlock_irq(&ctx->timeout_lock);
- /* drop submission reference */
- io_put_req(req);
-}
-
-static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
- __must_hold(&req->ctx->uring_lock)
-{
- struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
-
- switch (io_arm_poll_handler(req, 0)) {
- case IO_APOLL_READY:
- io_req_task_queue(req);
- break;
- case IO_APOLL_ABORTED:
-		/*
-		 * Queued up for async execution; the worker will release the
-		 * submit reference when the iocb is actually submitted.
-		 */
- io_queue_async_work(req, NULL);
- break;
- case IO_APOLL_OK:
- break;
- }
-
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
-}
-
-static inline void __io_queue_sqe(struct io_kiocb *req)
- __must_hold(&req->ctx->uring_lock)
-{
- struct io_kiocb *linked_timeout;
- int ret;
-
- ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
-
- if (req->flags & REQ_F_COMPLETE_INLINE) {
- io_req_add_compl_list(req);
- return;
- }
- /*
- * We async punt it if the file wasn't marked NOWAIT, or if the file
- * doesn't support non-blocking read/write attempts
- */
- if (likely(!ret)) {
- linked_timeout = io_prep_linked_timeout(req);
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
- } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
- io_queue_sqe_arm_apoll(req);
- } else {
- io_req_complete_failed(req, ret);
- }
-}
-
-static void io_queue_sqe_fallback(struct io_kiocb *req)
- __must_hold(&req->ctx->uring_lock)
-{
- if (req->flags & REQ_F_FAIL) {
- io_req_complete_fail_submit(req);
- } else if (unlikely(req->ctx->drain_active)) {
- io_drain_req(req);
- } else {
- int ret = io_req_prep_async(req);
-
- if (unlikely(ret))
- io_req_complete_failed(req, ret);
- else
- io_queue_async_work(req, NULL);
- }
-}
-
-static inline void io_queue_sqe(struct io_kiocb *req)
- __must_hold(&req->ctx->uring_lock)
-{
- if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
- __io_queue_sqe(req);
- else
- io_queue_sqe_fallback(req);
-}
-
-/*
- * Check SQE restrictions (opcode and flags).
- *
- * Returns 'true' if SQE is allowed, 'false' otherwise.
- */
-static inline bool io_check_restriction(struct io_ring_ctx *ctx,
- struct io_kiocb *req,
- unsigned int sqe_flags)
-{
- if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
- return false;
-
- if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
- ctx->restrictions.sqe_flags_required)
- return false;
-
- if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
- ctx->restrictions.sqe_flags_required))
- return false;
-
- return true;
-}
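
These masks are installed through io_uring_register() with
IORING_REGISTER_RESTRICTIONS, which is only allowed while the ring is
still disabled. A hedged liburing sketch of restricting a ring to
read/write opcodes:

/* Sketch: restrict a ring created with IORING_SETUP_R_DISABLED. */
#include <liburing.h>

static int restrict_ring(struct io_uring *ring)
{
	struct io_uring_restriction res[2] = {
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_READ },
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_WRITE },
	};
	int ret;

	ret = io_uring_register_restrictions(ring, res, 2);
	if (ret)
		return ret;
	/* lift IORING_SETUP_R_DISABLED; io_check_restriction() now gates SQEs */
	return io_uring_enable_rings(ring);
}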
-
-static void io_init_req_drain(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *head = ctx->submit_state.link.head;
-
- ctx->drain_active = true;
- if (head) {
- /*
- * If we need to drain a request in the middle of a link, drain
- * the head request and the next request/link after the current
- * link. Because links execute sequentially, REQ_F_IO_DRAIN
- * will then be maintained for every request of our link.
- */
- head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
- ctx->drain_next = true;
- }
-}
-
-static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
- __must_hold(&ctx->uring_lock)
-{
- unsigned int sqe_flags;
- int personality;
- u8 opcode;
-
- /* req is partially pre-initialised, see io_preinit_req() */
- req->opcode = opcode = READ_ONCE(sqe->opcode);
- /* same numerical values as corresponding REQ_F_*, safe to copy */
- req->flags = sqe_flags = READ_ONCE(sqe->flags);
- req->user_data = READ_ONCE(sqe->user_data);
- req->file = NULL;
- req->fixed_rsrc_refs = NULL;
- req->task = current;
-
- if (unlikely(opcode >= IORING_OP_LAST)) {
- req->opcode = 0;
- return -EINVAL;
- }
- if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
- /* enforce forwards compatibility on users */
- if (sqe_flags & ~SQE_VALID_FLAGS)
- return -EINVAL;
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[opcode].buffer_select)
- return -EOPNOTSUPP;
- if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
- ctx->drain_disabled = true;
- if (sqe_flags & IOSQE_IO_DRAIN) {
- if (ctx->drain_disabled)
- return -EOPNOTSUPP;
- io_init_req_drain(req);
- }
- }
- if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
- if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
- return -EACCES;
- /* punt it to the slow queue path; it will be drained there */
- if (ctx->drain_active)
- req->flags |= REQ_F_FORCE_ASYNC;
- /* if there is no link, we're at "next" request and need to drain */
- if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
- ctx->drain_next = false;
- ctx->drain_active = true;
- req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
- }
- }
-
- if (io_op_defs[opcode].needs_file) {
- struct io_submit_state *state = &ctx->submit_state;
-
- /*
- * Plug now if we have more than two IOs left after this, and the
- * target is potentially a read/write to block-based storage.
- */
- if (state->need_plug && io_op_defs[opcode].plug) {
- state->plug_started = true;
- state->need_plug = false;
- blk_start_plug_nr_ios(&state->plug, state->submit_nr);
- }
-
- req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
- (sqe_flags & IOSQE_FIXED_FILE));
- if (unlikely(!req->file))
- return -EBADF;
- }
-
- personality = READ_ONCE(sqe->personality);
- if (personality) {
- int ret;
-
- req->creds = xa_load(&ctx->personalities, personality);
- if (!req->creds)
- return -EINVAL;
- get_cred(req->creds);
- ret = security_uring_override_creds(req->creds);
- if (ret) {
- put_cred(req->creds);
- return ret;
- }
- req->flags |= REQ_F_CREDS;
- }
-
- return io_req_prep(req, sqe);
-}
-
-static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
- __must_hold(&ctx->uring_lock)
-{
- struct io_submit_link *link = &ctx->submit_state.link;
- int ret;
-
- ret = io_init_req(ctx, req, sqe);
- if (unlikely(ret)) {
- trace_io_uring_req_failed(sqe, ctx, req, ret);
-
- /* fail even hard links since we don't submit */
- if (link->head) {
- /*
- * whether a link req failed or was cancelled can normally be
- * judged from REQ_F_FAIL, but the head is an exception: it
- * may have REQ_F_FAIL set because of another req's failure.
- * Leverage req->result to distinguish a head that failed on
- * its own from one failed by another req, so that the correct
- * ret code can be set for it. Init the result here to avoid
- * affecting the normal path.
- */
- if (!(link->head->flags & REQ_F_FAIL))
- req_fail_link_node(link->head, -ECANCELED);
- } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- /*
- * the current req is a normal req, so we should return the
- * error and thus break the submission loop.
- */
- io_req_complete_failed(req, ret);
- return ret;
- }
- req_fail_link_node(req, ret);
- }
-
- /* don't need @sqe from now on */
- trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode,
- req->flags, true,
- ctx->flags & IORING_SETUP_SQPOLL);
-
- /*
- * If we already have a head request, queue this one for async
- * submittal once the head completes. If we don't have a head but
- * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
- * submitted sync once the chain is complete. If none of those
- * conditions are true (normal request), then just queue it.
- */
- if (link->head) {
- struct io_kiocb *head = link->head;
-
- if (!(req->flags & REQ_F_FAIL)) {
- ret = io_req_prep_async(req);
- if (unlikely(ret)) {
- req_fail_link_node(req, ret);
- if (!(head->flags & REQ_F_FAIL))
- req_fail_link_node(head, -ECANCELED);
- }
- }
- trace_io_uring_link(ctx, req, head);
- link->last->link = req;
- link->last = req;
-
- if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
- return 0;
- /* last request of a link, enqueue the link */
- link->head = NULL;
- req = head;
- } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- link->head = req;
- link->last = req;
- return 0;
- }
-
- io_queue_sqe(req);
- return 0;
-}
-
-/*
- * Batched submission is done, ensure local IO is flushed out.
- */
-static void io_submit_state_end(struct io_ring_ctx *ctx)
-{
- struct io_submit_state *state = &ctx->submit_state;
-
- if (state->link.head)
- io_queue_sqe(state->link.head);
- /* flush only after queuing links as they can generate completions */
- io_submit_flush_completions(ctx);
- if (state->plug_started)
- blk_finish_plug(&state->plug);
-}
-
-/*
- * Start submission side cache.
- */
-static void io_submit_state_start(struct io_submit_state *state,
- unsigned int max_ios)
-{
- state->plug_started = false;
- state->need_plug = max_ios > 2;
- state->submit_nr = max_ios;
- /* set only the head, no need to init link.last in advance */
- state->link.head = NULL;
-}
-
-static void io_commit_sqring(struct io_ring_ctx *ctx)
-{
- struct io_rings *rings = ctx->rings;
-
- /*
- * Ensure any loads from the SQEs are done at this point,
- * since once we write the new head, the application could
- * write new data to them.
- */
- smp_store_release(&rings->sq.head, ctx->cached_sq_head);
-}
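
The smp_store_release() pairs with an acquire load of sq.head on the
userspace side of the mmap'd ring. A minimal sketch of that consumer
view, assuming C11 atomics over the mapped head word:

/* Sketch: userspace acquire-load matching io_commit_sqring(). */
#include <stdatomic.h>

static unsigned sq_space_left(const _Atomic unsigned *khead,
			      unsigned local_tail, unsigned ring_entries)
{
	/* acquire pairs with the kernel's release store of sq.head */
	unsigned head = atomic_load_explicit(khead, memory_order_acquire);

	/* unsigned subtraction handles tail/head wraparound */
	return ring_entries - (local_tail - head);
}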
-
-/*
- * Fetch an sqe, if one is available. Note this returns a pointer to memory
- * that is mapped by userspace. This means that care needs to be taken to
- * ensure that reads are stable, as we cannot rely on userspace always
- * being a good citizen. If members of the sqe are validated and then later
- * used, it's important that those reads are done through READ_ONCE() to
- * prevent a re-load down the line.
- */
-static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
-{
- unsigned head, mask = ctx->sq_entries - 1;
- unsigned sq_idx = ctx->cached_sq_head++ & mask;
-
- /*
- * The cached sq head (or cq tail) serves two purposes:
- *
- * 1) allows us to batch the cost of updating the user-visible
- * head.
- * 2) allows the kernel side to track the head on its own, even
- * though the application is the one updating it.
- */
- head = READ_ONCE(ctx->sq_array[sq_idx]);
- if (likely(head < ctx->sq_entries))
- return &ctx->sq_sqes[head];
-
- /* drop invalid entries */
- ctx->cq_extra--;
- WRITE_ONCE(ctx->rings->sq_dropped,
- READ_ONCE(ctx->rings->sq_dropped) + 1);
- return NULL;
-}
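
The sq_array adds a level of indirection: userspace queues the *index*
of an SQE, and io_get_sqe() range-checks it before use, dropping bad
entries. A sketch of the producer side this code expects, assuming the
standard mmap'd layout:

/* Sketch: userspace enqueue matching io_get_sqe()'s expectations. */
#include <stdatomic.h>

struct sq_ring {
	_Atomic unsigned *ktail;	/* mmap'd rings->sq.tail */
	unsigned *array;		/* mmap'd sq_array */
	unsigned ring_mask;		/* sq_entries - 1 */
};

static void sq_push(struct sq_ring *sq, unsigned sqe_index)
{
	unsigned tail = atomic_load_explicit(sq->ktail, memory_order_relaxed);

	sq->array[tail & sq->ring_mask] = sqe_index;	/* must be < sq_entries */
	/* release: SQE and array slot become visible before the new tail */
	atomic_store_explicit(sq->ktail, tail + 1, memory_order_release);
}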
-
-static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
- __must_hold(&ctx->uring_lock)
-{
- unsigned int entries = io_sqring_entries(ctx);
- int submitted = 0;
-
- if (unlikely(!entries))
- return 0;
- /* make sure SQ entry isn't read before tail */
- nr = min3(nr, ctx->sq_entries, entries);
- io_get_task_refs(nr);
-
- io_submit_state_start(&ctx->submit_state, nr);
- do {
- const struct io_uring_sqe *sqe;
- struct io_kiocb *req;
-
- if (unlikely(!io_alloc_req_refill(ctx))) {
- if (!submitted)
- submitted = -EAGAIN;
- break;
- }
- req = io_alloc_req(ctx);
- sqe = io_get_sqe(ctx);
- if (unlikely(!sqe)) {
- wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
- break;
- }
- /* will complete beyond this point, count as submitted */
- submitted++;
- if (io_submit_sqe(ctx, req, sqe)) {
- /*
- * Continue submitting even for sqe failure if the
- * ring was set up with IORING_SETUP_SUBMIT_ALL
- */
- if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL))
- break;
- }
- } while (submitted < nr);
-
- if (unlikely(submitted != nr)) {
- int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
- int unused = nr - ref_used;
-
- current->io_uring->cached_refs += unused;
- }
-
- io_submit_state_end(ctx);
- /* Commit SQ ring head once we've consumed and submitted all SQEs */
- io_commit_sqring(ctx);
-
- return submitted;
-}
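
The early break on a failed SQE is skipped when the ring was created
with IORING_SETUP_SUBMIT_ALL. A one-line liburing sketch of opting in:

/* Sketch: keep submitting the batch even if one SQE fails init. */
#include <liburing.h>

static int make_ring(struct io_uring *ring)
{
	return io_uring_queue_init(64, ring, IORING_SETUP_SUBMIT_ALL);
}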
-
-static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
-{
- return READ_ONCE(sqd->state);
-}
-
-static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
-{
- /* Tell userspace we may need a wakeup call */
- spin_lock(&ctx->completion_lock);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
- spin_unlock(&ctx->completion_lock);
-}
-
-static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
-{
- spin_lock(&ctx->completion_lock);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
- spin_unlock(&ctx->completion_lock);
-}
-
-static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
-{
- unsigned int to_submit;
- int ret = 0;
-
- to_submit = io_sqring_entries(ctx);
- /* if we're handling multiple rings, cap submit size for fairness */
- if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
- to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
-
- if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
- const struct cred *creds = NULL;
-
- if (ctx->sq_creds != current_cred())
- creds = override_creds(ctx->sq_creds);
-
- mutex_lock(&ctx->uring_lock);
- if (!wq_list_empty(&ctx->iopoll_list))
- io_do_iopoll(ctx, true);
-
- /*
- * Don't submit if refs are dying: that's good for io_uring_register(),
- * and it is also relied upon by io_ring_exit_work()
- */
- if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
- !(ctx->flags & IORING_SETUP_R_DISABLED))
- ret = io_submit_sqes(ctx, to_submit);
- mutex_unlock(&ctx->uring_lock);
-#ifdef CONFIG_NET_RX_BUSY_POLL
- spin_lock(&ctx->napi_lock);
- if (!list_empty(&ctx->napi_list) &&
- io_napi_busy_loop(&ctx->napi_list))
- ++ret;
- spin_unlock(&ctx->napi_lock);
-#endif
- if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
- wake_up(&ctx->sqo_sq_wait);
- if (creds)
- revert_creds(creds);
- }
-
- return ret;
-}
-
-static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
-{
- struct io_ring_ctx *ctx;
- unsigned sq_thread_idle = 0;
-
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
- sqd->sq_thread_idle = sq_thread_idle;
-}
-
-static bool io_sqd_handle_event(struct io_sq_data *sqd)
-{
- bool did_sig = false;
- struct ksignal ksig;
-
- if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
- signal_pending(current)) {
- mutex_unlock(&sqd->lock);
- if (signal_pending(current))
- did_sig = get_signal(&ksig);
- cond_resched();
- mutex_lock(&sqd->lock);
- }
- return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
-}
-
-static int io_sq_thread(void *data)
-{
- struct io_sq_data *sqd = data;
- struct io_ring_ctx *ctx;
- unsigned long timeout = 0;
- char buf[TASK_COMM_LEN];
- DEFINE_WAIT(wait);
-
- snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
- set_task_comm(current, buf);
-
- if (sqd->sq_cpu != -1)
- set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
- else
- set_cpus_allowed_ptr(current, cpu_online_mask);
- current->flags |= PF_NO_SETAFFINITY;
-
- audit_alloc_kernel(current);
-
- mutex_lock(&sqd->lock);
- while (1) {
- bool cap_entries, sqt_spin = false;
-
- if (io_sqd_events_pending(sqd) || signal_pending(current)) {
- if (io_sqd_handle_event(sqd))
- break;
- timeout = jiffies + sqd->sq_thread_idle;
- }
-
- cap_entries = !list_is_singular(&sqd->ctx_list);
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- int ret = __io_sq_thread(ctx, cap_entries);
-
- if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
- sqt_spin = true;
- }
- if (io_run_task_work())
- sqt_spin = true;
-
- if (sqt_spin || !time_after(jiffies, timeout)) {
- cond_resched();
- if (sqt_spin)
- timeout = jiffies + sqd->sq_thread_idle;
- continue;
- }
-
- prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
- if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
- bool needs_sched = true;
-
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- io_ring_set_wakeup_flag(ctx);
-
- if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !wq_list_empty(&ctx->iopoll_list)) {
- needs_sched = false;
- break;
- }
-
- /*
- * Ensure the store of the wakeup flag is not
- * reordered with the load of the SQ tail
- */
- smp_mb();
-
- if (io_sqring_entries(ctx)) {
- needs_sched = false;
- break;
- }
- }
-
- if (needs_sched) {
- mutex_unlock(&sqd->lock);
- schedule();
- mutex_lock(&sqd->lock);
- }
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_ring_clear_wakeup_flag(ctx);
- }
-
- finish_wait(&sqd->wait, &wait);
- timeout = jiffies + sqd->sq_thread_idle;
- }
-
- io_uring_cancel_generic(true, sqd);
- sqd->thread = NULL;
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_ring_set_wakeup_flag(ctx);
- io_run_task_work();
- mutex_unlock(&sqd->lock);
-
- audit_free(current);
-
- complete(&sqd->exited);
- do_exit(0);
-}
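
io_sq_thread() and io_ring_set_wakeup_flag() cooperate with userspace
through IORING_SQ_NEED_WAKEUP: once the poller idles, a submission
needs io_uring_enter() to wake it. A hedged sketch of the userspace
check (liburing's io_uring_submit() performs the equivalent test
internally; a real implementation would use an acquire load):

/* Sketch: wake an idle SQPOLL thread; ring set up with IORING_SETUP_SQPOLL. */
#include <liburing.h>

static void flush_sqpoll(struct io_uring *ring)
{
	if (*ring->sq.kflags & IORING_SQ_NEED_WAKEUP)
		io_uring_submit(ring);	/* ends up in io_uring_enter(SQ_WAKEUP) */
}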
-
-struct io_wait_queue {
- struct wait_queue_entry wq;
- struct io_ring_ctx *ctx;
- unsigned cq_tail;
- unsigned nr_timeouts;
-#ifdef CONFIG_NET_RX_BUSY_POLL
- unsigned busy_poll_to;
-#endif
-};
-
-static inline bool io_should_wake(struct io_wait_queue *iowq)
-{
- struct io_ring_ctx *ctx = iowq->ctx;
- int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
-
- /*
- * Wake up if we have enough events, or if a timeout occurred since we
- * started waiting. For timeouts, we always want to return to userspace,
- * regardless of event count.
- */
- return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
-}
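
The signed distance keeps io_should_wake() correct when the CQ tail
wraps around UINT_MAX. A small worked example of that arithmetic
(two's complement assumed):

/* Sketch: wraparound-safe distance, as in io_should_wake(). */
#include <assert.h>

int main(void)
{
	unsigned cached_cq_tail = 3;		/* tail wrapped past UINT_MAX */
	unsigned cq_tail = 0xfffffffeu;		/* waiter's target tail */
	int dist = cached_cq_tail - (int)cq_tail;

	assert(dist == 5);	/* five CQEs arrived despite the wrap */
	return 0;
}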
-
-static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
- int wake_flags, void *key)
-{
- struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
- wq);
-
- /*
- * Cannot safely flush overflowed CQEs from here, ensure we wake up
- * the task, and the next invocation will do it.
- */
- if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
- return autoremove_wake_function(curr, mode, wake_flags, key);
- return -1;
-}
-
-static int io_run_task_work_sig(void)
-{
- if (io_run_task_work())
- return 1;
- if (test_thread_flag(TIF_NOTIFY_SIGNAL))
- return -ERESTARTSYS;
- if (task_sigpending(current))
- return -EINTR;
- return 0;
-}
-
-/* if this returns > 0, the caller should retry */
-static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
- struct io_wait_queue *iowq,
- ktime_t timeout)
-{
- int ret;
-
- /* make sure we run task_work before checking for signals */
- ret = io_run_task_work_sig();
- if (ret || io_should_wake(iowq))
- return ret;
- /* let the caller flush overflows, retry */
- if (test_bit(0, &ctx->check_cq_overflow))
- return 1;
-
- if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
- return -ETIME;
- return 1;
-}
-
-#ifdef CONFIG_NET_RX_BUSY_POLL
-static void io_adjust_busy_loop_timeout(struct timespec64 *ts,
- struct io_wait_queue *iowq)
-{
- unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
- struct timespec64 pollto = ns_to_timespec64(1000 * (s64)busy_poll_to);
-
- if (timespec64_compare(ts, &pollto) > 0) {
- *ts = timespec64_sub(*ts, pollto);
- iowq->busy_poll_to = busy_poll_to;
- } else {
- u64 to = timespec64_to_ns(ts);
-
- do_div(to, 1000);
- iowq->busy_poll_to = to;
- ts->tv_sec = 0;
- ts->tv_nsec = 0;
- }
-}
-
-static inline bool io_busy_loop_timeout(unsigned long start_time,
- unsigned long bp_usec)
-{
- if (bp_usec) {
- unsigned long end_time = start_time + bp_usec;
- unsigned long now = busy_loop_current_time();
-
- return time_after(now, end_time);
- }
- return true;
-}
-
-static bool io_busy_loop_end(void *p, unsigned long start_time)
-{
- struct io_wait_queue *iowq = p;
-
- return signal_pending(current) ||
- io_should_wake(iowq) ||
- io_busy_loop_timeout(start_time, iowq->busy_poll_to);
-}
-
-static void io_blocking_napi_busy_loop(struct list_head *napi_list,
- struct io_wait_queue *iowq)
-{
- unsigned long start_time =
- list_is_singular(napi_list) ? 0 :
- busy_loop_current_time();
-
- do {
- if (list_is_singular(napi_list)) {
- struct napi_entry *ne =
- list_first_entry(napi_list,
- struct napi_entry, list);
-
- napi_busy_loop(ne->napi_id, io_busy_loop_end, iowq,
- true, BUSY_POLL_BUDGET);
- io_check_napi_entry_timeout(ne);
- break;
- }
- } while (io_napi_busy_loop(napi_list) &&
- !io_busy_loop_end(iowq, start_time));
-}
-
-static void io_putback_napi_list(struct io_ring_ctx *ctx,
- struct list_head *napi_list)
-{
- struct napi_entry *cne, *lne;
-
- spin_lock(&ctx->napi_lock);
- list_for_each_entry(cne, &ctx->napi_list, list)
- list_for_each_entry(lne, napi_list, list)
- if (cne->napi_id == lne->napi_id) {
- list_del(&lne->list);
- kfree(lne);
- break;
- }
- list_splice(napi_list, &ctx->napi_list);
- spin_unlock(&ctx->napi_lock);
-}
-#endif /* CONFIG_NET_RX_BUSY_POLL */
-
-/*
- * Wait until events become available, if we don't already have some. The
- * application must reap them itself, as they reside on the shared cq ring.
- */
-static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
- const sigset_t __user *sig, size_t sigsz,
- struct __kernel_timespec __user *uts)
-{
- struct io_wait_queue iowq;
- struct io_rings *rings = ctx->rings;
- ktime_t timeout = KTIME_MAX;
- int ret;
-#ifdef CONFIG_NET_RX_BUSY_POLL
- LIST_HEAD(local_napi_list);
-#endif
-
- do {
- io_cqring_overflow_flush(ctx);
- if (io_cqring_events(ctx) >= min_events)
- return 0;
- if (!io_run_task_work())
- break;
- } while (1);
-
- if (sig) {
-#ifdef CONFIG_COMPAT
- if (in_compat_syscall())
- ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
- sigsz);
- else
-#endif
- ret = set_user_sigmask(sig, sigsz);
-
- if (ret)
- return ret;
- }
-
-#ifdef CONFIG_NET_RX_BUSY_POLL
- iowq.busy_poll_to = 0;
- if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
- spin_lock(&ctx->napi_lock);
- list_splice_init(&ctx->napi_list, &local_napi_list);
- spin_unlock(&ctx->napi_lock);
- }
-#endif
- if (uts) {
- struct timespec64 ts;
-
- if (get_timespec64(&ts, uts))
- return -EFAULT;
-#ifdef CONFIG_NET_RX_BUSY_POLL
- if (!list_empty(&local_napi_list))
- io_adjust_busy_loop_timeout(&ts, &iowq);
-#endif
- timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
- }
-#ifdef CONFIG_NET_RX_BUSY_POLL
- else if (!list_empty(&local_napi_list))
- iowq.busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
-#endif
-
- init_waitqueue_func_entry(&iowq.wq, io_wake_function);
- iowq.wq.private = current;
- INIT_LIST_HEAD(&iowq.wq.entry);
- iowq.ctx = ctx;
- iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
- iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
-
- trace_io_uring_cqring_wait(ctx, min_events);
-#ifdef CONFIG_NET_RX_BUSY_POLL
- if (iowq.busy_poll_to)
- io_blocking_napi_busy_loop(&local_napi_list, &iowq);
- if (!list_empty(&local_napi_list))
- io_putback_napi_list(ctx, &local_napi_list);
-#endif
- do {
- /* if we can't even flush overflow, don't wait for more */
- if (!io_cqring_overflow_flush(ctx)) {
- ret = -EBUSY;
- break;
- }
- prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
- TASK_INTERRUPTIBLE);
- ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
- finish_wait(&ctx->cq_wait, &iowq.wq);
- cond_resched();
- } while (ret > 0);
-
- restore_saved_sigmask_unless(ret == -EINTR);
-
- return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
-}
-
-static void io_free_page_table(void **table, size_t size)
-{
- unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
-
- for (i = 0; i < nr_tables; i++)
- kfree(table[i]);
- kfree(table);
-}
-
-static __cold void **io_alloc_page_table(size_t size)
-{
- unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
- size_t init_size = size;
- void **table;
-
- table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
- if (!table)
- return NULL;
-
- for (i = 0; i < nr_tables; i++) {
- unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
-
- table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
- if (!table[i]) {
- io_free_page_table(table, init_size);
- return NULL;
- }
- size -= this_size;
- }
- return table;
-}
-
-static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
-{
- percpu_ref_exit(&ref_node->refs);
- kfree(ref_node);
-}
-
-static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
-{
- struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
- struct io_ring_ctx *ctx = node->rsrc_data->ctx;
- unsigned long flags;
- bool first_add = false;
- unsigned long delay = HZ;
-
- spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
- node->done = true;
-
- /* if we are mid-quiesce then do not delay */
- if (node->rsrc_data->quiesce)
- delay = 0;
-
- while (!list_empty(&ctx->rsrc_ref_list)) {
- node = list_first_entry(&ctx->rsrc_ref_list,
- struct io_rsrc_node, node);
- /* recycle ref nodes in order */
- if (!node->done)
- break;
- list_del(&node->node);
- first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
- }
- spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
-
- if (first_add)
- mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
-}
-
-static struct io_rsrc_node *io_rsrc_node_alloc(void)
-{
- struct io_rsrc_node *ref_node;
-
- ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
- if (!ref_node)
- return NULL;
-
- if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
- 0, GFP_KERNEL)) {
- kfree(ref_node);
- return NULL;
- }
- INIT_LIST_HEAD(&ref_node->node);
- INIT_LIST_HEAD(&ref_node->rsrc_list);
- ref_node->done = false;
- return ref_node;
-}
-
-static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
- struct io_rsrc_data *data_to_kill)
- __must_hold(&ctx->uring_lock)
-{
- WARN_ON_ONCE(!ctx->rsrc_backup_node);
- WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
-
- io_rsrc_refs_drop(ctx);
-
- if (data_to_kill) {
- struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
-
- rsrc_node->rsrc_data = data_to_kill;
- spin_lock_irq(&ctx->rsrc_ref_lock);
- list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
- spin_unlock_irq(&ctx->rsrc_ref_lock);
-
- atomic_inc(&data_to_kill->refs);
- percpu_ref_kill(&rsrc_node->refs);
- ctx->rsrc_node = NULL;
- }
-
- if (!ctx->rsrc_node) {
- ctx->rsrc_node = ctx->rsrc_backup_node;
- ctx->rsrc_backup_node = NULL;
- }
-}
-
-static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
-{
- if (ctx->rsrc_backup_node)
- return 0;
- ctx->rsrc_backup_node = io_rsrc_node_alloc();
- return ctx->rsrc_backup_node ? 0 : -ENOMEM;
-}
-
-static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
- struct io_ring_ctx *ctx)
-{
- int ret;
-
- /* As we may drop ->uring_lock, another task may have started a quiesce */
- if (data->quiesce)
- return -ENXIO;
-
- data->quiesce = true;
- do {
- ret = io_rsrc_node_switch_start(ctx);
- if (ret)
- break;
- io_rsrc_node_switch(ctx, data);
-
- /* kill initial ref, already quiesced if zero */
- if (atomic_dec_and_test(&data->refs))
- break;
- mutex_unlock(&ctx->uring_lock);
- flush_delayed_work(&ctx->rsrc_put_work);
- ret = wait_for_completion_interruptible(&data->done);
- if (!ret) {
- mutex_lock(&ctx->uring_lock);
- if (atomic_read(&data->refs) > 0) {
- /*
- * it has been revived by another thread while
- * we were unlocked
- */
- mutex_unlock(&ctx->uring_lock);
- } else {
- break;
- }
- }
-
- atomic_inc(&data->refs);
- /* wait for any work potentially completing data->done */
- flush_delayed_work(&ctx->rsrc_put_work);
- reinit_completion(&data->done);
-
- ret = io_run_task_work_sig();
- mutex_lock(&ctx->uring_lock);
- } while (ret >= 0);
- data->quiesce = false;
-
- return ret;
-}
-
-static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
-{
- unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
- unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
-
- return &data->tags[table_idx][off];
-}
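
io_get_tag_slot() addresses the two-level table built by
io_alloc_page_table(): high index bits pick a page, low bits a slot
within it. A worked sketch assuming 4K pages, where
IO_RSRC_TAG_TABLE_SHIFT would be PAGE_SHIFT - 3 = 9 (512 u64 tags per
page):

/* Sketch: the index split performed by io_get_tag_slot(). */
#include <assert.h>

#define TAG_TABLE_SHIFT	9			/* assumed: PAGE_SHIFT - 3 */
#define TAG_TABLE_MASK	((1u << TAG_TABLE_SHIFT) - 1)

int main(void)
{
	unsigned idx = 1234;

	assert((idx >> TAG_TABLE_SHIFT) == 2);	/* third tag page */
	assert((idx & TAG_TABLE_MASK) == 210);	/* slot within that page */
	return 0;
}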
-
-static void io_rsrc_data_free(struct io_rsrc_data *data)
-{
- size_t size = data->nr * sizeof(data->tags[0][0]);
-
- if (data->tags)
- io_free_page_table((void **)data->tags, size);
- kfree(data);
-}
-
-static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
- u64 __user *utags, unsigned nr,
- struct io_rsrc_data **pdata)
-{
- struct io_rsrc_data *data;
- int ret = -ENOMEM;
- unsigned i;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
- data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
- if (!data->tags) {
- kfree(data);
- return -ENOMEM;
- }
-
- data->nr = nr;
- data->ctx = ctx;
- data->do_put = do_put;
- if (utags) {
- ret = -EFAULT;
- for (i = 0; i < nr; i++) {
- u64 *tag_slot = io_get_tag_slot(data, i);
-
- if (copy_from_user(tag_slot, &utags[i],
- sizeof(*tag_slot)))
- goto fail;
- }
- }
-
- atomic_set(&data->refs, 1);
- init_completion(&data->done);
- *pdata = data;
- return 0;
-fail:
- io_rsrc_data_free(data);
- return ret;
-}
-
-static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
-{
- table->files = kvcalloc(nr_files, sizeof(table->files[0]),
- GFP_KERNEL_ACCOUNT);
- return !!table->files;
-}
-
-static void io_free_file_tables(struct io_file_table *table)
-{
- kvfree(table->files);
- table->files = NULL;
-}
-
-static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
-{
-#if defined(CONFIG_UNIX)
- if (ctx->ring_sock) {
- struct sock *sock = ctx->ring_sock->sk;
- struct sk_buff *skb;
-
- while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
- kfree_skb(skb);
- }
-#else
- int i;
-
- for (i = 0; i < ctx->nr_user_files; i++) {
- struct file *file;
-
- file = io_file_from_index(ctx, i);
- if (file)
- fput(file);
- }
-#endif
- io_free_file_tables(&ctx->file_table);
- io_rsrc_data_free(ctx->file_data);
- ctx->file_data = NULL;
- ctx->nr_user_files = 0;
-}
-
-static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
-{
- int ret;
-
- if (!ctx->file_data)
- return -ENXIO;
- ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
- if (!ret)
- __io_sqe_files_unregister(ctx);
- return ret;
-}
-
-static void io_sq_thread_unpark(struct io_sq_data *sqd)
- __releases(&sqd->lock)
-{
- WARN_ON_ONCE(sqd->thread == current);
-
- /*
- * Do the dance, but don't use a conditional clear_bit(), because that
- * would race with other threads incrementing park_pending and setting
- * the bit.
- */
- clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
- if (atomic_dec_return(&sqd->park_pending))
- set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
- mutex_unlock(&sqd->lock);
-}
-
-static void io_sq_thread_park(struct io_sq_data *sqd)
- __acquires(&sqd->lock)
-{
- WARN_ON_ONCE(sqd->thread == current);
-
- atomic_inc(&sqd->park_pending);
- set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
- mutex_lock(&sqd->lock);
- if (sqd->thread)
- wake_up_process(sqd->thread);
-}
-
-static void io_sq_thread_stop(struct io_sq_data *sqd)
-{
- WARN_ON_ONCE(sqd->thread == current);
- WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
-
- set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
- mutex_lock(&sqd->lock);
- if (sqd->thread)
- wake_up_process(sqd->thread);
- mutex_unlock(&sqd->lock);
- wait_for_completion(&sqd->exited);
-}
-
-static void io_put_sq_data(struct io_sq_data *sqd)
-{
- if (refcount_dec_and_test(&sqd->refs)) {
- WARN_ON_ONCE(atomic_read(&sqd->park_pending));
-
- io_sq_thread_stop(sqd);
- kfree(sqd);
- }
-}
-
-static void io_sq_thread_finish(struct io_ring_ctx *ctx)
-{
- struct io_sq_data *sqd = ctx->sq_data;
-
- if (sqd) {
- io_sq_thread_park(sqd);
- list_del_init(&ctx->sqd_list);
- io_sqd_update_thread_idle(sqd);
- io_sq_thread_unpark(sqd);
-
- io_put_sq_data(sqd);
- ctx->sq_data = NULL;
- }
-}
-
-static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
-{
- struct io_ring_ctx *ctx_attach;
- struct io_sq_data *sqd;
- struct fd f;
-
- f = fdget(p->wq_fd);
- if (!f.file)
- return ERR_PTR(-ENXIO);
- if (f.file->f_op != &io_uring_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
-
- ctx_attach = f.file->private_data;
- sqd = ctx_attach->sq_data;
- if (!sqd) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
- if (sqd->task_tgid != current->tgid) {
- fdput(f);
- return ERR_PTR(-EPERM);
- }
-
- refcount_inc(&sqd->refs);
- fdput(f);
- return sqd;
-}
-
-static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
- bool *attached)
-{
- struct io_sq_data *sqd;
-
- *attached = false;
- if (p->flags & IORING_SETUP_ATTACH_WQ) {
- sqd = io_attach_sq_data(p);
- if (!IS_ERR(sqd)) {
- *attached = true;
- return sqd;
- }
- /* fall through for the EPERM case, set up a new sqd/task */
- if (PTR_ERR(sqd) != -EPERM)
- return sqd;
- }
-
- sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
- if (!sqd)
- return ERR_PTR(-ENOMEM);
-
- atomic_set(&sqd->park_pending, 0);
- refcount_set(&sqd->refs, 1);
- INIT_LIST_HEAD(&sqd->ctx_list);
- mutex_init(&sqd->lock);
- init_waitqueue_head(&sqd->wait);
- init_completion(&sqd->exited);
- return sqd;
-}
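
io_attach_sq_data() lets several rings share one SQPOLL thread via
IORING_SETUP_ATTACH_WQ, keyed by the first ring's fd. A hedged
liburing sketch of attaching a second ring:

/* Sketch: share the first ring's SQPOLL thread with a second ring. */
#include <liburing.h>
#include <string.h>

static int attach_ring(struct io_uring *first, struct io_uring *second)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_ATTACH_WQ;
	p.wq_fd = first->ring_fd;	/* resolved by io_attach_sq_data() */

	return io_uring_queue_init_params(64, second, &p);
}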
-
-#if defined(CONFIG_UNIX)
-/*
- * Ensure the UNIX gc is aware of our file set, so we are certain that
- * the io_uring can be safely unregistered on process exit, even if we
- * have reference loops among the files.
- */
-static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
-{
- struct sock *sk = ctx->ring_sock->sk;
- struct scm_fp_list *fpl;
- struct sk_buff *skb;
- int i, nr_files;
-
- fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
- if (!fpl)
- return -ENOMEM;
-
- skb = alloc_skb(0, GFP_KERNEL);
- if (!skb) {
- kfree(fpl);
- return -ENOMEM;
- }
-
- skb->sk = sk;
-
- nr_files = 0;
- fpl->user = get_uid(current_user());
- for (i = 0; i < nr; i++) {
- struct file *file = io_file_from_index(ctx, i + offset);
-
- if (!file)
- continue;
- fpl->fp[nr_files] = get_file(file);
- unix_inflight(fpl->user, fpl->fp[nr_files]);
- nr_files++;
- }
-
- if (nr_files) {
- fpl->max = SCM_MAX_FD;
- fpl->count = nr_files;
- UNIXCB(skb).fp = fpl;
- skb->destructor = unix_destruct_scm;
- refcount_add(skb->truesize, &sk->sk_wmem_alloc);
- skb_queue_head(&sk->sk_receive_queue, skb);
-
- for (i = 0; i < nr_files; i++)
- fput(fpl->fp[i]);
- } else {
- kfree_skb(skb);
- free_uid(fpl->user);
- kfree(fpl);
- }
-
- return 0;
-}
-
-/*
- * If UNIX sockets are enabled, fd passing can cause a reference cycle which
- * causes regular reference counting to break down. We rely on the UNIX
- * garbage collection to take care of this problem for us.
- */
-static int io_sqe_files_scm(struct io_ring_ctx *ctx)
-{
- unsigned left, total;
- int ret = 0;
-
- total = 0;
- left = ctx->nr_user_files;
- while (left) {
- unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
-
- ret = __io_sqe_files_scm(ctx, this_files, total);
- if (ret)
- break;
- left -= this_files;
- total += this_files;
- }
-
- if (!ret)
- return 0;
-
- while (total < ctx->nr_user_files) {
- struct file *file = io_file_from_index(ctx, total);
-
- if (file)
- fput(file);
- total++;
- }
-
- return ret;
-}
-#else
-static int io_sqe_files_scm(struct io_ring_ctx *ctx)
-{
- return 0;
-}
-#endif
-
-static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
-{
- struct file *file = prsrc->file;
-#if defined(CONFIG_UNIX)
- struct sock *sock = ctx->ring_sock->sk;
- struct sk_buff_head list, *head = &sock->sk_receive_queue;
- struct sk_buff *skb;
- int i;
-
- __skb_queue_head_init(&list);
-
- /*
- * Find the skb that holds this file in its SCM_RIGHTS. When found,
- * remove this entry and rearrange the file array.
- */
- skb = skb_dequeue(head);
- while (skb) {
- struct scm_fp_list *fp;
-
- fp = UNIXCB(skb).fp;
- for (i = 0; i < fp->count; i++) {
- int left;
-
- if (fp->fp[i] != file)
- continue;
-
- unix_notinflight(fp->user, fp->fp[i]);
- left = fp->count - 1 - i;
- if (left) {
- memmove(&fp->fp[i], &fp->fp[i + 1],
- left * sizeof(struct file *));
- }
- fp->count--;
- if (!fp->count) {
- kfree_skb(skb);
- skb = NULL;
- } else {
- __skb_queue_tail(&list, skb);
- }
- fput(file);
- file = NULL;
- break;
- }
-
- if (!file)
- break;
-
- __skb_queue_tail(&list, skb);
-
- skb = skb_dequeue(head);
- }
-
- if (skb_peek(&list)) {
- spin_lock_irq(&head->lock);
- while ((skb = __skb_dequeue(&list)) != NULL)
- __skb_queue_tail(head, skb);
- spin_unlock_irq(&head->lock);
- }
-#else
- fput(file);
-#endif
-}
-
-static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
-{
- struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
- struct io_ring_ctx *ctx = rsrc_data->ctx;
- struct io_rsrc_put *prsrc, *tmp;
-
- list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
- list_del(&prsrc->list);
-
- if (prsrc->tag) {
- bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
-
- io_ring_submit_lock(ctx, lock_ring);
- spin_lock(&ctx->completion_lock);
- io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
- io_ring_submit_unlock(ctx, lock_ring);
- }
-
- rsrc_data->do_put(ctx, prsrc);
- kfree(prsrc);
- }
-
- io_rsrc_node_destroy(ref_node);
- if (atomic_dec_and_test(&rsrc_data->refs))
- complete(&rsrc_data->done);
-}
-
-static void io_rsrc_put_work(struct work_struct *work)
-{
- struct io_ring_ctx *ctx;
- struct llist_node *node;
-
- ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
- node = llist_del_all(&ctx->rsrc_put_llist);
-
- while (node) {
- struct io_rsrc_node *ref_node;
- struct llist_node *next = node->next;
-
- ref_node = llist_entry(node, struct io_rsrc_node, llist);
- __io_rsrc_put_work(ref_node);
- node = next;
- }
-}
-
-static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args, u64 __user *tags)
-{
- __s32 __user *fds = (__s32 __user *) arg;
- struct file *file;
- int fd, ret;
- unsigned i;
-
- if (ctx->file_data)
- return -EBUSY;
- if (!nr_args)
- return -EINVAL;
- if (nr_args > IORING_MAX_FIXED_FILES)
- return -EMFILE;
- if (nr_args > rlimit(RLIMIT_NOFILE))
- return -EMFILE;
- ret = io_rsrc_node_switch_start(ctx);
- if (ret)
- return ret;
- ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
- &ctx->file_data);
- if (ret)
- return ret;
-
- ret = -ENOMEM;
- if (!io_alloc_file_tables(&ctx->file_table, nr_args))
- goto out_free;
-
- for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
- if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
- ret = -EFAULT;
- goto out_fput;
- }
- /* allow sparse sets */
- if (fd == -1) {
- ret = -EINVAL;
- if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
- goto out_fput;
- continue;
- }
-
- file = fget(fd);
- ret = -EBADF;
- if (unlikely(!file))
- goto out_fput;
-
- /*
- * Don't allow io_uring instances to be registered. If UNIX
- * isn't enabled, then this causes a reference cycle and this
- * instance can never get freed. If UNIX is enabled we'll
- * handle it just fine, but there's still no point in allowing
- * a ring fd as it doesn't support regular read/write anyway.
- */
- if (file->f_op == &io_uring_fops) {
- fput(file);
- goto out_fput;
- }
- io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
- }
-
- ret = io_sqe_files_scm(ctx);
- if (ret) {
- __io_sqe_files_unregister(ctx);
- return ret;
- }
-
- io_rsrc_node_switch(ctx, NULL);
- return ret;
-out_fput:
- for (i = 0; i < ctx->nr_user_files; i++) {
- file = io_file_from_index(ctx, i);
- if (file)
- fput(file);
- }
- io_free_file_tables(&ctx->file_table);
- ctx->nr_user_files = 0;
-out_free:
- io_rsrc_data_free(ctx->file_data);
- ctx->file_data = NULL;
- return ret;
-}
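
io_sqe_files_register() is the backend of IORING_REGISTER_FILES; an fd
of -1 leaves a sparse slot that can be filled later. A minimal
liburing sketch:

/* Sketch: register a partially sparse fixed-file table. */
#include <liburing.h>

static int register_files(struct io_uring *ring, int fd0, int fd1)
{
	int fds[4] = { fd0, fd1, -1, -1 };	/* -1 == sparse slot */

	return io_uring_register_files(ring, fds, 4);
}

Registered slots are then usable from SQEs with IOSQE_FIXED_FILE,
bypassing per-request fget()/fput().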
-
-static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
- int index)
-{
-#if defined(CONFIG_UNIX)
- struct sock *sock = ctx->ring_sock->sk;
- struct sk_buff_head *head = &sock->sk_receive_queue;
- struct sk_buff *skb;
-
- /*
- * See if we can merge this file into an existing skb SCM_RIGHTS
- * file set. If there's no room, fall back to allocating a new skb
- * and filling it in.
- */
- spin_lock_irq(&head->lock);
- skb = skb_peek(head);
- if (skb) {
- struct scm_fp_list *fpl = UNIXCB(skb).fp;
-
- if (fpl->count < SCM_MAX_FD) {
- __skb_unlink(skb, head);
- spin_unlock_irq(&head->lock);
- fpl->fp[fpl->count] = get_file(file);
- unix_inflight(fpl->user, fpl->fp[fpl->count]);
- fpl->count++;
- spin_lock_irq(&head->lock);
- __skb_queue_head(head, skb);
- } else {
- skb = NULL;
- }
- }
- spin_unlock_irq(&head->lock);
-
- if (skb) {
- fput(file);
- return 0;
- }
-
- return __io_sqe_files_scm(ctx, 1, index);
-#else
- return 0;
-#endif
-}
-
-static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
- struct io_rsrc_node *node, void *rsrc)
-{
- struct io_rsrc_put *prsrc;
-
- prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
- if (!prsrc)
- return -ENOMEM;
-
- prsrc->tag = *io_get_tag_slot(data, idx);
- prsrc->rsrc = rsrc;
- list_add(&prsrc->list, &node->rsrc_list);
- return 0;
-}
-
-static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
- unsigned int issue_flags, u32 slot_index)
-{
- struct io_ring_ctx *ctx = req->ctx;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- bool needs_switch = false;
- struct io_fixed_file *file_slot;
- int ret = -EBADF;
-
- io_ring_submit_lock(ctx, needs_lock);
- if (file->f_op == &io_uring_fops)
- goto err;
- ret = -ENXIO;
- if (!ctx->file_data)
- goto err;
- ret = -EINVAL;
- if (slot_index >= ctx->nr_user_files)
- goto err;
-
- slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
-
- if (file_slot->file_ptr) {
- struct file *old_file;
-
- ret = io_rsrc_node_switch_start(ctx);
- if (ret)
- goto err;
-
- old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
- ctx->rsrc_node, old_file);
- if (ret)
- goto err;
- file_slot->file_ptr = 0;
- needs_switch = true;
- }
-
- *io_get_tag_slot(ctx->file_data, slot_index) = 0;
- io_fixed_file_set(file_slot, file);
- ret = io_sqe_file_register(ctx, file, slot_index);
- if (ret) {
- file_slot->file_ptr = 0;
- goto err;
- }
-
- ret = 0;
-err:
- if (needs_switch)
- io_rsrc_node_switch(ctx, ctx->file_data);
- io_ring_submit_unlock(ctx, needs_lock);
- if (ret)
- fput(file);
- return ret;
-}
-
-static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
-{
- unsigned int offset = req->close.file_slot - 1;
- struct io_ring_ctx *ctx = req->ctx;
- bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
- struct io_fixed_file *file_slot;
- struct file *file;
- int ret, i;
-
- io_ring_submit_lock(ctx, needs_lock);
- ret = -ENXIO;
- if (unlikely(!ctx->file_data))
- goto out;
- ret = -EINVAL;
- if (offset >= ctx->nr_user_files)
- goto out;
- ret = io_rsrc_node_switch_start(ctx);
- if (ret)
- goto out;
-
- i = array_index_nospec(offset, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(&ctx->file_table, i);
- ret = -EBADF;
- if (!file_slot->file_ptr)
- goto out;
-
- file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
- if (ret)
- goto out;
-
- file_slot->file_ptr = 0;
- io_rsrc_node_switch(ctx, ctx->file_data);
- ret = 0;
-out:
- io_ring_submit_unlock(ctx, needs_lock);
- return ret;
-}
-
-static int __io_sqe_files_update(struct io_ring_ctx *ctx,
- struct io_uring_rsrc_update2 *up,
- unsigned nr_args)
-{
- u64 __user *tags = u64_to_user_ptr(up->tags);
- __s32 __user *fds = u64_to_user_ptr(up->data);
- struct io_rsrc_data *data = ctx->file_data;
- struct io_fixed_file *file_slot;
- struct file *file;
- int fd, i, err = 0;
- unsigned int done;
- bool needs_switch = false;
-
- if (!ctx->file_data)
- return -ENXIO;
- if (up->offset + nr_args > ctx->nr_user_files)
- return -EINVAL;
-
- for (done = 0; done < nr_args; done++) {
- u64 tag = 0;
-
- if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
- copy_from_user(&fd, &fds[done], sizeof(fd))) {
- err = -EFAULT;
- break;
- }
- if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
- err = -EINVAL;
- break;
- }
- if (fd == IORING_REGISTER_FILES_SKIP)
- continue;
-
- i = array_index_nospec(up->offset + done, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(&ctx->file_table, i);
-
- if (file_slot->file_ptr) {
- file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- err = io_queue_rsrc_removal(data, up->offset + done,
- ctx->rsrc_node, file);
- if (err)
- break;
- file_slot->file_ptr = 0;
- needs_switch = true;
- }
- if (fd != -1) {
- file = fget(fd);
- if (!file) {
- err = -EBADF;
- break;
- }
- /*
- * Don't allow io_uring instances to be registered. If
- * UNIX isn't enabled, then this causes a reference
- * cycle and this instance can never get freed. If UNIX
- * is enabled we'll handle it just fine, but there's
- * still no point in allowing a ring fd as it doesn't
- * support regular read/write anyway.
- */
- if (file->f_op == &io_uring_fops) {
- fput(file);
- err = -EBADF;
- break;
- }
- *io_get_tag_slot(data, up->offset + done) = tag;
- io_fixed_file_set(file_slot, file);
- err = io_sqe_file_register(ctx, file, i);
- if (err) {
- file_slot->file_ptr = 0;
- fput(file);
- break;
- }
- }
- }
-
- if (needs_switch)
- io_rsrc_node_switch(ctx, data);
- return done ? done : err;
-}
-
-static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
- struct task_struct *task)
-{
- struct io_wq_hash *hash;
- struct io_wq_data data;
- unsigned int concurrency;
-
- mutex_lock(&ctx->uring_lock);
- hash = ctx->hash_map;
- if (!hash) {
- hash = kzalloc(sizeof(*hash), GFP_KERNEL);
- if (!hash) {
- mutex_unlock(&ctx->uring_lock);
- return ERR_PTR(-ENOMEM);
- }
- refcount_set(&hash->refs, 1);
- init_waitqueue_head(&hash->wait);
- ctx->hash_map = hash;
- }
- mutex_unlock(&ctx->uring_lock);
-
- data.hash = hash;
- data.task = task;
- data.free_work = io_wq_free_work;
- data.do_work = io_wq_submit_work;
-
- /* Do QD, or 4 * CPUS, whichever is smallest */
- concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
-
- return io_wq_create(concurrency, &data);
-}
-
-static __cold int io_uring_alloc_task_context(struct task_struct *task,
- struct io_ring_ctx *ctx)
-{
- struct io_uring_task *tctx;
- int ret;
-
- tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
- if (unlikely(!tctx))
- return -ENOMEM;
-
- tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
- sizeof(struct file *), GFP_KERNEL);
- if (unlikely(!tctx->registered_rings)) {
- kfree(tctx);
- return -ENOMEM;
- }
-
- ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
- if (unlikely(ret)) {
- kfree(tctx->registered_rings);
- kfree(tctx);
- return ret;
- }
-
- tctx->io_wq = io_init_wq_offload(ctx, task);
- if (IS_ERR(tctx->io_wq)) {
- ret = PTR_ERR(tctx->io_wq);
- percpu_counter_destroy(&tctx->inflight);
- kfree(tctx->registered_rings);
- kfree(tctx);
- return ret;
- }
-
- xa_init(&tctx->xa);
- init_waitqueue_head(&tctx->wait);
- atomic_set(&tctx->in_idle, 0);
- atomic_set(&tctx->inflight_tracked, 0);
- task->io_uring = tctx;
- spin_lock_init(&tctx->task_lock);
- INIT_WQ_LIST(&tctx->task_list);
- INIT_WQ_LIST(&tctx->prior_task_list);
- init_task_work(&tctx->task_work, tctx_task_work);
- return 0;
-}
-
-void __io_uring_free(struct task_struct *tsk)
-{
- struct io_uring_task *tctx = tsk->io_uring;
-
- WARN_ON_ONCE(!xa_empty(&tctx->xa));
- WARN_ON_ONCE(tctx->io_wq);
- WARN_ON_ONCE(tctx->cached_refs);
-
- kfree(tctx->registered_rings);
- percpu_counter_destroy(&tctx->inflight);
- kfree(tctx);
- tsk->io_uring = NULL;
-}
-
-static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
-{
- int ret;
-
- /* Retain compatibility with failing for an invalid attach attempt */
- if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
- IORING_SETUP_ATTACH_WQ) {
- struct fd f;
-
- f = fdget(p->wq_fd);
- if (!f.file)
- return -ENXIO;
- if (f.file->f_op != &io_uring_fops) {
- fdput(f);
- return -EINVAL;
- }
- fdput(f);
- }
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- struct task_struct *tsk;
- struct io_sq_data *sqd;
- bool attached;
-
- ret = security_uring_sqpoll();
- if (ret)
- return ret;
-
- sqd = io_get_sq_data(p, &attached);
- if (IS_ERR(sqd)) {
- ret = PTR_ERR(sqd);
- goto err;
- }
-
- ctx->sq_creds = get_current_cred();
- ctx->sq_data = sqd;
- ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
- if (!ctx->sq_thread_idle)
- ctx->sq_thread_idle = HZ;
-
- io_sq_thread_park(sqd);
- list_add(&ctx->sqd_list, &sqd->ctx_list);
- io_sqd_update_thread_idle(sqd);
- /* don't attach to a dying SQPOLL thread, would be racy */
- ret = (attached && !sqd->thread) ? -ENXIO : 0;
- io_sq_thread_unpark(sqd);
-
- if (ret < 0)
- goto err;
- if (attached)
- return 0;
-
- if (p->flags & IORING_SETUP_SQ_AFF) {
- int cpu = p->sq_thread_cpu;
-
- ret = -EINVAL;
- if (cpu >= nr_cpu_ids || !cpu_online(cpu))
- goto err_sqpoll;
- sqd->sq_cpu = cpu;
- } else {
- sqd->sq_cpu = -1;
- }
-
- sqd->task_pid = current->pid;
- sqd->task_tgid = current->tgid;
- tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
- if (IS_ERR(tsk)) {
- ret = PTR_ERR(tsk);
- goto err_sqpoll;
- }
-
- sqd->thread = tsk;
- ret = io_uring_alloc_task_context(tsk, ctx);
- wake_up_new_task(tsk);
- if (ret)
- goto err;
- } else if (p->flags & IORING_SETUP_SQ_AFF) {
- /* Can't have SQ_AFF without SQPOLL */
- ret = -EINVAL;
- goto err;
- }
-
- return 0;
-err_sqpoll:
- complete(&ctx->sq_data->exited);
-err:
- io_sq_thread_finish(ctx);
- return ret;
-}
-
-static inline void __io_unaccount_mem(struct user_struct *user,
- unsigned long nr_pages)
-{
- atomic_long_sub(nr_pages, &user->locked_vm);
-}
-
-static inline int __io_account_mem(struct user_struct *user,
- unsigned long nr_pages)
-{
- unsigned long page_limit, cur_pages, new_pages;
-
- /* Don't allow more pages than we can safely lock */
- page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
- do {
- cur_pages = atomic_long_read(&user->locked_vm);
- new_pages = cur_pages + nr_pages;
- if (new_pages > page_limit)
- return -ENOMEM;
- } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
- new_pages) != cur_pages);
-
- return 0;
-}
-
-static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
-{
- if (ctx->user)
- __io_unaccount_mem(ctx->user, nr_pages);
-
- if (ctx->mm_account)
- atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
-}
-
-static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
-{
- int ret;
-
- if (ctx->user) {
- ret = __io_account_mem(ctx->user, nr_pages);
- if (ret)
- return ret;
- }
-
- if (ctx->mm_account)
- atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
-
- return 0;
-}
-
-static void io_mem_free(void *ptr)
-{
- struct page *page;
-
- if (!ptr)
- return;
-
- page = virt_to_head_page(ptr);
- if (put_page_testzero(page))
- free_compound_page(page);
-}
-
-static void *io_mem_alloc(size_t size)
-{
- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
-
- return (void *) __get_free_pages(gfp, get_order(size));
-}
-
-static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
- size_t *sq_offset)
-{
- struct io_rings *rings;
- size_t off, sq_array_size;
-
- off = struct_size(rings, cqes, cq_entries);
- if (off == SIZE_MAX)
- return SIZE_MAX;
-
-#ifdef CONFIG_SMP
- off = ALIGN(off, SMP_CACHE_BYTES);
- if (off == 0)
- return SIZE_MAX;
-#endif
-
- if (sq_offset)
- *sq_offset = off;
-
- sq_array_size = array_size(sizeof(u32), sq_entries);
- if (sq_array_size == SIZE_MAX)
- return SIZE_MAX;
-
- if (check_add_overflow(off, sq_array_size, &off))
- return SIZE_MAX;
-
- return off;
-}
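
rings_size() packs the io_rings header, the CQE array, cache-line
padding, and the trailing u32 sq_array into one allocation. A worked
sketch with assumed sizes (64-byte header, 16-byte CQEs, 64-byte cache
lines):

/* Sketch: the layout arithmetic of rings_size(), assumed sizes. */
#include <assert.h>
#include <stddef.h>

int main(void)
{
	size_t hdr = 64, cqe_size = 16;		/* assumptions, not ABI */
	unsigned sq_entries = 8, cq_entries = 16;
	size_t off = hdr + cq_entries * cqe_size;	/* struct_size(...) */

	off = (off + 63) & ~(size_t)63;		/* ALIGN(off, SMP_CACHE_BYTES) */
	assert(off + sq_entries * sizeof(unsigned) == 352);
	return 0;
}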
-
-static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
-{
- struct io_mapped_ubuf *imu = *slot;
- unsigned int i;
-
- if (imu != ctx->dummy_ubuf) {
- for (i = 0; i < imu->nr_bvecs; i++)
- unpin_user_page(imu->bvec[i].bv_page);
- if (imu->acct_pages)
- io_unaccount_mem(ctx, imu->acct_pages);
- kvfree(imu);
- }
- *slot = NULL;
-}
-
-static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
-{
- io_buffer_unmap(ctx, &prsrc->buf);
- prsrc->buf = NULL;
-}
-
-static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
-{
- unsigned int i;
-
- for (i = 0; i < ctx->nr_user_bufs; i++)
- io_buffer_unmap(ctx, &ctx->user_bufs[i]);
- kfree(ctx->user_bufs);
- io_rsrc_data_free(ctx->buf_data);
- ctx->user_bufs = NULL;
- ctx->buf_data = NULL;
- ctx->nr_user_bufs = 0;
-}
-
-static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
-{
- int ret;
-
- if (!ctx->buf_data)
- return -ENXIO;
-
- ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
- if (!ret)
- __io_sqe_buffers_unregister(ctx);
- return ret;
-}
-
-static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
- void __user *arg, unsigned index)
-{
- struct iovec __user *src;
-
-#ifdef CONFIG_COMPAT
- if (ctx->compat) {
- struct compat_iovec __user *ciovs;
- struct compat_iovec ciov;
-
- ciovs = (struct compat_iovec __user *) arg;
- if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
- return -EFAULT;
-
- dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
- dst->iov_len = ciov.iov_len;
- return 0;
- }
-#endif
- src = (struct iovec __user *) arg;
- if (copy_from_user(dst, &src[index], sizeof(*dst)))
- return -EFAULT;
- return 0;
-}
-
-/*
- * Not super efficient, but this only runs at registration time. And we do cache
- * the last compound head, so generally we'll only do a full search if we don't
- * match that one.
- *
- * We check if the given compound head page has already been accounted, to
- * avoid double accounting it. This allows us to account the full size of the
- * page, not just the constituent pages of a huge page.
- */
-static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
- int nr_pages, struct page *hpage)
-{
- int i, j;
-
- /* check current page array */
- for (i = 0; i < nr_pages; i++) {
- if (!PageCompound(pages[i]))
- continue;
- if (compound_head(pages[i]) == hpage)
- return true;
- }
-
- /* check previously registered pages */
- for (i = 0; i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *imu = ctx->user_bufs[i];
-
- for (j = 0; j < imu->nr_bvecs; j++) {
- if (!PageCompound(imu->bvec[j].bv_page))
- continue;
- if (compound_head(imu->bvec[j].bv_page) == hpage)
- return true;
- }
- }
-
- return false;
-}
-
-static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
- int nr_pages, struct io_mapped_ubuf *imu,
- struct page **last_hpage)
-{
- int i, ret;
-
- imu->acct_pages = 0;
- for (i = 0; i < nr_pages; i++) {
- if (!PageCompound(pages[i])) {
- imu->acct_pages++;
- } else {
- struct page *hpage;
-
- hpage = compound_head(pages[i]);
- if (hpage == *last_hpage)
- continue;
- *last_hpage = hpage;
- if (headpage_already_acct(ctx, pages, i, hpage))
- continue;
- imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
- }
- }
-
- if (!imu->acct_pages)
- return 0;
-
- ret = io_account_mem(ctx, imu->acct_pages);
- if (ret)
- imu->acct_pages = 0;
- return ret;
-}
-
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
- struct io_mapped_ubuf **pimu,
- struct page **last_hpage)
-{
- struct io_mapped_ubuf *imu = NULL;
- struct vm_area_struct **vmas = NULL;
- struct page **pages = NULL;
- unsigned long off, start, end, ubuf;
- size_t size;
- int ret, pret, nr_pages, i;
-
- if (!iov->iov_base) {
- *pimu = ctx->dummy_ubuf;
- return 0;
- }
-
- ubuf = (unsigned long) iov->iov_base;
- end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = ubuf >> PAGE_SHIFT;
- nr_pages = end - start;
-
- *pimu = NULL;
- ret = -ENOMEM;
-
- pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!pages)
- goto done;
-
- vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
- GFP_KERNEL);
- if (!vmas)
- goto done;
-
- imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
- if (!imu)
- goto done;
-
- ret = 0;
- mmap_read_lock(current->mm);
- pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
- pages, vmas);
- if (pret == nr_pages) {
- /* don't support file-backed memory */
- for (i = 0; i < nr_pages; i++) {
- struct vm_area_struct *vma = vmas[i];
-
- if (vma_is_shmem(vma))
- continue;
- if (vma->vm_file &&
- !is_file_hugepages(vma->vm_file)) {
- ret = -EOPNOTSUPP;
- break;
- }
- }
- } else {
- ret = pret < 0 ? pret : -EFAULT;
- }
- mmap_read_unlock(current->mm);
- if (ret) {
- /*
- * if we did a partial map, or found file-backed vmas,
- * release any pages we did get
- */
- if (pret > 0)
- unpin_user_pages(pages, pret);
- goto done;
- }
-
- ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
- if (ret) {
- unpin_user_pages(pages, pret);
- goto done;
- }
-
- off = ubuf & ~PAGE_MASK;
- size = iov->iov_len;
- for (i = 0; i < nr_pages; i++) {
- size_t vec_len;
-
- vec_len = min_t(size_t, size, PAGE_SIZE - off);
- imu->bvec[i].bv_page = pages[i];
- imu->bvec[i].bv_len = vec_len;
- imu->bvec[i].bv_offset = off;
- off = 0;
- size -= vec_len;
- }
- /* store original address for later verification */
- imu->ubuf = ubuf;
- imu->ubuf_end = ubuf + iov->iov_len;
- imu->nr_bvecs = nr_pages;
- *pimu = imu;
- ret = 0;
-done:
- if (ret)
- kvfree(imu);
- kvfree(pages);
- kvfree(vmas);
- return ret;
-}
-
-static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
-{
- ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
- return ctx->user_bufs ? 0 : -ENOMEM;
-}
-
-static int io_buffer_validate(struct iovec *iov)
-{
- unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
-
- /*
- * Don't impose further limits on the size and buffer
- * constraints here; we'll return -EINVAL later, when the
- * IO is submitted, if they are wrong.
- */
- if (!iov->iov_base)
- return iov->iov_len ? -EFAULT : 0;
- if (!iov->iov_len)
- return -EFAULT;
-
- /* arbitrary limit, but we need something */
- if (iov->iov_len > SZ_1G)
- return -EFAULT;
-
- if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
- return -EOVERFLOW;
-
- return 0;
-}
-
-static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int nr_args, u64 __user *tags)
-{
- struct page *last_hpage = NULL;
- struct io_rsrc_data *data;
- int i, ret;
- struct iovec iov;
-
- if (ctx->user_bufs)
- return -EBUSY;
- if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
- return -EINVAL;
- ret = io_rsrc_node_switch_start(ctx);
- if (ret)
- return ret;
- ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
- if (ret)
- return ret;
- ret = io_buffers_map_alloc(ctx, nr_args);
- if (ret) {
- io_rsrc_data_free(data);
- return ret;
- }
-
- for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
- ret = io_copy_iov(ctx, &iov, arg, i);
- if (ret)
- break;
- ret = io_buffer_validate(&iov);
- if (ret)
- break;
- if (!iov.iov_base && *io_get_tag_slot(data, i)) {
- ret = -EINVAL;
- break;
- }
-
- ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
- &last_hpage);
- if (ret)
- break;
- }
-
- WARN_ON_ONCE(ctx->buf_data);
-
- ctx->buf_data = data;
- if (ret)
- __io_sqe_buffers_unregister(ctx);
- else
- io_rsrc_node_switch(ctx, NULL);
- return ret;
-}
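
io_sqe_buffers_register() backs IORING_REGISTER_BUFFERS, pinning each
iovec's pages through io_sqe_buffer_register(). A hedged liburing
sketch of registering a single fixed buffer:

/* Sketch: pin one page-aligned buffer for read_fixed/write_fixed. */
#include <liburing.h>
#include <stdlib.h>
#include <sys/uio.h>

static int setup_fixed_buf(struct io_uring *ring, size_t len, void **out)
{
	struct iovec iov;

	if (posix_memalign(out, 4096, len))
		return -1;
	iov.iov_base = *out;
	iov.iov_len = len;
	/* pages get pinned and accounted against RLIMIT_MEMLOCK */
	return io_uring_register_buffers(ring, &iov, 1);
}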
-
-static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
- struct io_uring_rsrc_update2 *up,
- unsigned int nr_args)
-{
- u64 __user *tags = u64_to_user_ptr(up->tags);
- struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
- struct page *last_hpage = NULL;
- bool needs_switch = false;
- __u32 done;
- int i, err;
-
- if (!ctx->buf_data)
- return -ENXIO;
- if (up->offset + nr_args > ctx->nr_user_bufs)
- return -EINVAL;
-
- for (done = 0; done < nr_args; done++) {
- struct io_mapped_ubuf *imu;
- int offset = up->offset + done;
- u64 tag = 0;
-
- err = io_copy_iov(ctx, &iov, iovs, done);
- if (err)
- break;
- if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
- err = -EFAULT;
- break;
- }
- err = io_buffer_validate(&iov);
- if (err)
- break;
- if (!iov.iov_base && tag) {
- err = -EINVAL;
- break;
- }
- err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
- if (err)
- break;
-
- i = array_index_nospec(offset, ctx->nr_user_bufs);
- if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
- err = io_queue_rsrc_removal(ctx->buf_data, offset,
- ctx->rsrc_node, ctx->user_bufs[i]);
- if (unlikely(err)) {
- io_buffer_unmap(ctx, &imu);
- break;
- }
- ctx->user_bufs[i] = NULL;
- needs_switch = true;
- }
-
- ctx->user_bufs[i] = imu;
- *io_get_tag_slot(ctx->buf_data, offset) = tag;
- }
-
- if (needs_switch)
- io_rsrc_node_switch(ctx, ctx->buf_data);
- return done ? done : err;
-}
-
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int eventfd_async)
-{
- struct io_ev_fd *ev_fd;
- __s32 __user *fds = arg;
- int fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd)
- return -EBUSY;
-
- if (copy_from_user(&fd, fds, sizeof(*fds)))
- return -EFAULT;
-
- ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
- if (!ev_fd)
- return -ENOMEM;
-
- ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ev_fd->cq_ev_fd)) {
- int ret = PTR_ERR(ev_fd->cq_ev_fd);
- kfree(ev_fd);
- return ret;
- }
- ev_fd->eventfd_async = eventfd_async;
- ctx->has_evfd = true;
- rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
- return 0;
-}
-
-static void io_eventfd_put(struct rcu_head *rcu)
-{
- struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
-
- eventfd_ctx_put(ev_fd->cq_ev_fd);
- kfree(ev_fd);
-}
-
-static int io_eventfd_unregister(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd) {
- ctx->has_evfd = false;
- rcu_assign_pointer(ctx->io_ev_fd, NULL);
- call_rcu(&ev_fd->rcu, io_eventfd_put);
- return 0;
- }
-
- return -ENXIO;
-}
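
/*
 * Sketch of the matching userspace call for the IORING_REGISTER_EVENTFD path
 * above. Not part of this patch; ring_register_eventfd() is a hypothetical
 * helper, and the raw syscall(2) wrapper is used in place of liburing.
 */
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <unistd.h>

static int ring_register_eventfd(int ring_fd)
{
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd < 0)
		return -1;
	/* the kernel copies a single __s32 fd from arg (nr_args == 1) */
	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_EVENTFD, &efd, 1) < 0) {
		close(efd);
		return -1;
	}
	return efd;	/* reads on efd now block until CQEs are posted */
}
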
-
-static void io_destroy_buffers(struct io_ring_ctx *ctx)
-{
- int i;
-
- for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
- struct list_head *list = &ctx->io_buffers[i];
-
- while (!list_empty(list)) {
- struct io_buffer_list *bl;
-
- bl = list_first_entry(list, struct io_buffer_list, list);
- __io_remove_buffers(ctx, bl, -1U);
- list_del(&bl->list);
- kfree(bl);
- }
- }
-
- while (!list_empty(&ctx->io_buffers_pages)) {
- struct page *page;
-
- page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
- list_del_init(&page->lru);
- __free_page(page);
- }
-}
-
-static void io_req_caches_free(struct io_ring_ctx *ctx)
-{
- struct io_submit_state *state = &ctx->submit_state;
- int nr = 0;
-
- mutex_lock(&ctx->uring_lock);
- io_flush_cached_locked_reqs(ctx, state);
-
- while (state->free_list.next) {
- struct io_wq_work_node *node;
- struct io_kiocb *req;
-
- node = wq_stack_extract(&state->free_list);
- req = container_of(node, struct io_kiocb, comp_list);
- kmem_cache_free(req_cachep, req);
- nr++;
- }
- if (nr)
- percpu_ref_put_many(&ctx->refs, nr);
- mutex_unlock(&ctx->uring_lock);
-}
-
-static void io_wait_rsrc_data(struct io_rsrc_data *data)
-{
- if (data && !atomic_dec_and_test(&data->refs))
- wait_for_completion(&data->done);
-}
-
-static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
-{
- struct async_poll *apoll;
-
- while (!list_empty(&ctx->apoll_cache)) {
- apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
- poll.wait.entry);
- list_del(&apoll->poll.wait.entry);
- kfree(apoll);
- }
-}
-
-static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
-{
- io_sq_thread_finish(ctx);
-
- if (ctx->mm_account) {
- mmdrop(ctx->mm_account);
- ctx->mm_account = NULL;
- }
-
- io_rsrc_refs_drop(ctx);
- /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
- io_wait_rsrc_data(ctx->buf_data);
- io_wait_rsrc_data(ctx->file_data);
-
- mutex_lock(&ctx->uring_lock);
- if (ctx->buf_data)
- __io_sqe_buffers_unregister(ctx);
- if (ctx->file_data)
- __io_sqe_files_unregister(ctx);
- if (ctx->rings)
- __io_cqring_overflow_flush(ctx, true);
- io_eventfd_unregister(ctx);
- io_flush_apoll_cache(ctx);
- mutex_unlock(&ctx->uring_lock);
- io_destroy_buffers(ctx);
- if (ctx->sq_creds)
- put_cred(ctx->sq_creds);
-
- /* there are no registered resources left, nobody uses it */
- if (ctx->rsrc_node)
- io_rsrc_node_destroy(ctx->rsrc_node);
- if (ctx->rsrc_backup_node)
- io_rsrc_node_destroy(ctx->rsrc_backup_node);
- flush_delayed_work(&ctx->rsrc_put_work);
- flush_delayed_work(&ctx->fallback_work);
-
- WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
- WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
-
-#if defined(CONFIG_UNIX)
- if (ctx->ring_sock) {
- ctx->ring_sock->file = NULL; /* so that iput() is called */
- sock_release(ctx->ring_sock);
- }
-#endif
- WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
-
- io_mem_free(ctx->rings);
- io_mem_free(ctx->sq_sqes);
-
- percpu_ref_exit(&ctx->refs);
- free_uid(ctx->user);
- io_req_caches_free(ctx);
- if (ctx->hash_map)
- io_wq_put_hash(ctx->hash_map);
- io_free_napi_list(ctx);
- kfree(ctx->cancel_hash);
- kfree(ctx->dummy_ubuf);
- kfree(ctx->io_buffers);
- kfree(ctx);
-}
-
-static __poll_t io_uring_poll(struct file *file, poll_table *wait)
-{
- struct io_ring_ctx *ctx = file->private_data;
- __poll_t mask = 0;
-
- poll_wait(file, &ctx->cq_wait, wait);
- /*
- * synchronizes with barrier from wq_has_sleeper call in
- * io_commit_cqring
- */
- smp_rmb();
- if (!io_sqring_full(ctx))
- mask |= EPOLLOUT | EPOLLWRNORM;
-
- /*
- * Don't flush cqring overflow list here, just do a simple check.
-	 * Otherwise there could possibly be an ABBA deadlock:
- * CPU0 CPU1
- * ---- ----
- * lock(&ctx->uring_lock);
- * lock(&ep->mtx);
- * lock(&ctx->uring_lock);
- * lock(&ep->mtx);
- *
-	 * Users may get EPOLLIN while seeing nothing in the cqring; this
-	 * pushes them to do the flush.
- */
- if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
- mask |= EPOLLIN | EPOLLRDNORM;
-
- return mask;
-}
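
/*
 * Sketch of a waiter built on the poll semantics above: EPOLLIN may be
 * driven by the overflow flag while the CQ ring itself looks empty, so the
 * waiter still enters the kernel to flush and reap. Not part of this patch;
 * wait_for_cqes() is a hypothetical helper.
 */
#include <sys/epoll.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <unistd.h>

static long wait_for_cqes(int epfd, int ring_fd)
{
	struct epoll_event ev;
	int n = epoll_wait(epfd, &ev, 1, -1);

	if (n <= 0)
		return n;
	/* this enter is what flushes any overflowed CQEs to the ring */
	return syscall(__NR_io_uring_enter, ring_fd, 0, 1,
		       IORING_ENTER_GETEVENTS, NULL, 0);
}
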
-
-static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
-{
- const struct cred *creds;
-
- creds = xa_erase(&ctx->personalities, id);
- if (creds) {
- put_cred(creds);
- return 0;
- }
-
- return -EINVAL;
-}
-
-struct io_tctx_exit {
- struct callback_head task_work;
- struct completion completion;
- struct io_ring_ctx *ctx;
-};
-
-static __cold void io_tctx_exit_cb(struct callback_head *cb)
-{
- struct io_uring_task *tctx = current->io_uring;
- struct io_tctx_exit *work;
-
- work = container_of(cb, struct io_tctx_exit, task_work);
- /*
- * When @in_idle, we're in cancellation and it's racy to remove the
-	 * node. It'll be removed by the end of cancellation; just ignore it.
- */
- if (!atomic_read(&tctx->in_idle))
- io_uring_del_tctx_node((unsigned long)work->ctx);
- complete(&work->completion);
-}
-
-static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
- return req->ctx == data;
-}
-
-static __cold void io_ring_exit_work(struct work_struct *work)
-{
- struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
- unsigned long timeout = jiffies + HZ * 60 * 5;
- unsigned long interval = HZ / 20;
- struct io_tctx_exit exit;
- struct io_tctx_node *node;
- int ret;
-
- /*
- * If we're doing polled IO and end up having requests being
- * submitted async (out-of-line), then completions can come in while
- * we're waiting for refs to drop. We need to reap these manually,
- * as nobody else will be looking for them.
- */
- do {
- io_uring_try_cancel_requests(ctx, NULL, true);
- if (ctx->sq_data) {
- struct io_sq_data *sqd = ctx->sq_data;
- struct task_struct *tsk;
-
- io_sq_thread_park(sqd);
- tsk = sqd->thread;
- if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
- io_wq_cancel_cb(tsk->io_uring->io_wq,
- io_cancel_ctx_cb, ctx, true);
- io_sq_thread_unpark(sqd);
- }
-
- io_req_caches_free(ctx);
-
- if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
- /* there is little hope left, don't run it too often */
- interval = HZ * 60;
- }
- } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
-
- init_completion(&exit.completion);
- init_task_work(&exit.task_work, io_tctx_exit_cb);
- exit.ctx = ctx;
- /*
-	 * Some may use the context even when all refs and requests have been
-	 * put, and they are free to do so while still holding uring_lock or
-	 * completion_lock, see io_req_task_submit(). Apart from other work,
-	 * this lock/unlock section also waits for them to finish.
- */
- mutex_lock(&ctx->uring_lock);
- while (!list_empty(&ctx->tctx_list)) {
- WARN_ON_ONCE(time_after(jiffies, timeout));
-
- node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
- ctx_node);
- /* don't spin on a single task if cancellation failed */
- list_rotate_left(&ctx->tctx_list);
- ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
- if (WARN_ON_ONCE(ret))
- continue;
-
- mutex_unlock(&ctx->uring_lock);
- wait_for_completion(&exit.completion);
- mutex_lock(&ctx->uring_lock);
- }
- mutex_unlock(&ctx->uring_lock);
- spin_lock(&ctx->completion_lock);
- spin_unlock(&ctx->completion_lock);
-
- io_ring_ctx_free(ctx);
-}
-
-/* Returns true if we found and killed one or more timeouts */
-static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
- struct task_struct *tsk, bool cancel_all)
-{
- struct io_kiocb *req, *tmp;
- int canceled = 0;
-
- spin_lock(&ctx->completion_lock);
- spin_lock_irq(&ctx->timeout_lock);
- list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
- if (io_match_task(req, tsk, cancel_all)) {
- io_kill_timeout(req, -ECANCELED);
- canceled++;
- }
- }
- spin_unlock_irq(&ctx->timeout_lock);
- if (canceled != 0)
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- if (canceled != 0)
- io_cqring_ev_posted(ctx);
- return canceled != 0;
-}
-
-static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
-{
- unsigned long index;
- struct creds *creds;
-
- mutex_lock(&ctx->uring_lock);
- percpu_ref_kill(&ctx->refs);
- if (ctx->rings)
- __io_cqring_overflow_flush(ctx, true);
- xa_for_each(&ctx->personalities, index, creds)
- io_unregister_personality(ctx, index);
- mutex_unlock(&ctx->uring_lock);
-
- io_kill_timeouts(ctx, NULL, true);
- io_poll_remove_all(ctx, NULL, true);
-
- /* if we failed setting up the ctx, we might not have any rings */
- io_iopoll_try_reap_events(ctx);
-
- INIT_WORK(&ctx->exit_work, io_ring_exit_work);
- /*
- * Use system_unbound_wq to avoid spawning tons of event kworkers
- * if we're exiting a ton of rings at the same time. It just adds
-	 * noise and overhead; there's no discernible change in runtime
- * over using system_wq.
- */
- queue_work(system_unbound_wq, &ctx->exit_work);
-}
-
-static int io_uring_release(struct inode *inode, struct file *file)
-{
- struct io_ring_ctx *ctx = file->private_data;
-
- file->private_data = NULL;
- io_ring_ctx_wait_and_kill(ctx);
- return 0;
-}
-
-struct io_task_cancel {
- struct task_struct *task;
- bool all;
-};
-
-static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct io_task_cancel *cancel = data;
-
- return io_match_task_safe(req, cancel->task, cancel->all);
-}
-
-static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
- struct task_struct *task,
- bool cancel_all)
-{
- struct io_defer_entry *de;
- LIST_HEAD(list);
-
- spin_lock(&ctx->completion_lock);
- list_for_each_entry_reverse(de, &ctx->defer_list, list) {
- if (io_match_task_safe(de->req, task, cancel_all)) {
- list_cut_position(&list, &ctx->defer_list, &de->list);
- break;
- }
- }
- spin_unlock(&ctx->completion_lock);
- if (list_empty(&list))
- return false;
-
- while (!list_empty(&list)) {
- de = list_first_entry(&list, struct io_defer_entry, list);
- list_del_init(&de->list);
- io_req_complete_failed(de->req, -ECANCELED);
- kfree(de);
- }
- return true;
-}
-
-static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
-{
- struct io_tctx_node *node;
- enum io_wq_cancel cret;
- bool ret = false;
-
- mutex_lock(&ctx->uring_lock);
- list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
- struct io_uring_task *tctx = node->task->io_uring;
-
- /*
- * io_wq will stay alive while we hold uring_lock, because it's
-		 * killed after ctx nodes, which requires taking the lock.
- */
- if (!tctx || !tctx->io_wq)
- continue;
- cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
- ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
- }
- mutex_unlock(&ctx->uring_lock);
-
- return ret;
-}
-
-static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
- bool cancel_all)
-{
- struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
- struct io_uring_task *tctx = task ? task->io_uring : NULL;
-
- while (1) {
- enum io_wq_cancel cret;
- bool ret = false;
-
- if (!task) {
- ret |= io_uring_try_cancel_iowq(ctx);
- } else if (tctx && tctx->io_wq) {
- /*
- * Cancels requests of all rings, not only @ctx, but
- * it's fine as the task is in exit/exec.
- */
- cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
- &cancel, true);
- ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
- }
-
- /* SQPOLL thread does its own polling */
- if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
- (ctx->sq_data && ctx->sq_data->thread == current)) {
- while (!wq_list_empty(&ctx->iopoll_list)) {
- io_iopoll_try_reap_events(ctx);
- ret = true;
- }
- }
-
- ret |= io_cancel_defer_files(ctx, task, cancel_all);
- ret |= io_poll_remove_all(ctx, task, cancel_all);
- ret |= io_kill_timeouts(ctx, task, cancel_all);
- if (task)
- ret |= io_run_task_work();
- if (!ret)
- break;
- cond_resched();
- }
-}
-
-static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
-{
- struct io_uring_task *tctx = current->io_uring;
- struct io_tctx_node *node;
- int ret;
-
- if (unlikely(!tctx)) {
- ret = io_uring_alloc_task_context(current, ctx);
- if (unlikely(ret))
- return ret;
-
- tctx = current->io_uring;
- if (ctx->iowq_limits_set) {
- unsigned int limits[2] = { ctx->iowq_limits[0],
- ctx->iowq_limits[1], };
-
- ret = io_wq_max_workers(tctx->io_wq, limits);
- if (ret)
- return ret;
- }
- }
- if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
- node = kmalloc(sizeof(*node), GFP_KERNEL);
- if (!node)
- return -ENOMEM;
- node->ctx = ctx;
- node->task = current;
-
- ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
- node, GFP_KERNEL));
- if (ret) {
- kfree(node);
- return ret;
- }
-
- mutex_lock(&ctx->uring_lock);
- list_add(&node->ctx_node, &ctx->tctx_list);
- mutex_unlock(&ctx->uring_lock);
- }
- tctx->last = ctx;
- return 0;
-}
-
-/*
- * Note that this task has used io_uring. We use it for cancellation purposes.
- */
-static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
-{
- struct io_uring_task *tctx = current->io_uring;
-
- if (likely(tctx && tctx->last == ctx))
- return 0;
- return __io_uring_add_tctx_node(ctx);
-}
-
-/*
- * Remove this io_uring_file -> task mapping.
- */
-static __cold void io_uring_del_tctx_node(unsigned long index)
-{
- struct io_uring_task *tctx = current->io_uring;
- struct io_tctx_node *node;
-
- if (!tctx)
- return;
- node = xa_erase(&tctx->xa, index);
- if (!node)
- return;
-
- WARN_ON_ONCE(current != node->task);
- WARN_ON_ONCE(list_empty(&node->ctx_node));
-
- mutex_lock(&node->ctx->uring_lock);
- list_del(&node->ctx_node);
- mutex_unlock(&node->ctx->uring_lock);
-
- if (tctx->last == node->ctx)
- tctx->last = NULL;
- kfree(node);
-}
-
-static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
-{
- struct io_wq *wq = tctx->io_wq;
- struct io_tctx_node *node;
- unsigned long index;
-
- xa_for_each(&tctx->xa, index, node) {
- io_uring_del_tctx_node(index);
- cond_resched();
- }
- if (wq) {
- /*
- * Must be after io_uring_del_tctx_node() (removes nodes under
- * uring_lock) to avoid race with io_uring_try_cancel_iowq().
- */
- io_wq_put_and_exit(wq);
- tctx->io_wq = NULL;
- }
-}
-
-static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
-{
- if (tracked)
- return atomic_read(&tctx->inflight_tracked);
- return percpu_counter_sum(&tctx->inflight);
-}
-
-/*
- * Find any io_uring ctx that this task has registered or done IO on, and cancel
- * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
- */
-static __cold void io_uring_cancel_generic(bool cancel_all,
- struct io_sq_data *sqd)
-{
- struct io_uring_task *tctx = current->io_uring;
- struct io_ring_ctx *ctx;
- s64 inflight;
- DEFINE_WAIT(wait);
-
- WARN_ON_ONCE(sqd && sqd->thread != current);
-
- if (!current->io_uring)
- return;
- if (tctx->io_wq)
- io_wq_exit_start(tctx->io_wq);
-
- atomic_inc(&tctx->in_idle);
- do {
- io_uring_drop_tctx_refs(current);
- /* read completions before cancelations */
- inflight = tctx_inflight(tctx, !cancel_all);
- if (!inflight)
- break;
-
- if (!sqd) {
- struct io_tctx_node *node;
- unsigned long index;
-
- xa_for_each(&tctx->xa, index, node) {
- /* sqpoll task will cancel all its requests */
- if (node->ctx->sq_data)
- continue;
- io_uring_try_cancel_requests(node->ctx, current,
- cancel_all);
- }
- } else {
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_uring_try_cancel_requests(ctx, current,
- cancel_all);
- }
-
- prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
- io_run_task_work();
- io_uring_drop_tctx_refs(current);
-
- /*
- * If we've seen completions, retry without waiting. This
- * avoids a race where a completion comes in before we did
- * prepare_to_wait().
- */
- if (inflight == tctx_inflight(tctx, !cancel_all))
- schedule();
- finish_wait(&tctx->wait, &wait);
- } while (1);
-
- io_uring_clean_tctx(tctx);
- if (cancel_all) {
- /*
- * We shouldn't run task_works after cancel, so just leave
- * ->in_idle set for normal exit.
- */
- atomic_dec(&tctx->in_idle);
- /* for exec all current's requests should be gone, kill tctx */
- __io_uring_free(current);
- }
-}
-
-void __io_uring_cancel(bool cancel_all)
-{
- io_uring_cancel_generic(cancel_all, NULL);
-}
-
-void io_uring_unreg_ringfd(void)
-{
- struct io_uring_task *tctx = current->io_uring;
- int i;
-
- for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
- if (tctx->registered_rings[i]) {
- fput(tctx->registered_rings[i]);
- tctx->registered_rings[i] = NULL;
- }
- }
-}
-
-static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
- int start, int end)
-{
- struct file *file;
- int offset;
-
- for (offset = start; offset < end; offset++) {
- offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
- if (tctx->registered_rings[offset])
- continue;
-
- file = fget(fd);
- if (!file) {
- return -EBADF;
- } else if (file->f_op != &io_uring_fops) {
- fput(file);
- return -EOPNOTSUPP;
- }
- tctx->registered_rings[offset] = file;
- return offset;
- }
-
- return -EBUSY;
-}
-
-/*
- * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
- * invocation. User passes in an array of struct io_uring_rsrc_update
- * with ->data set to the ring_fd, and ->offset given for the desired
- * index. If no index is desired, application may set ->offset == -1U
- * and we'll find an available index. Returns number of entries
- * successfully processed, or < 0 on error if none were processed.
- */
-static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
- unsigned nr_args)
-{
- struct io_uring_rsrc_update __user *arg = __arg;
- struct io_uring_rsrc_update reg;
- struct io_uring_task *tctx;
- int ret, i;
-
- if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
- return -EINVAL;
-
- mutex_unlock(&ctx->uring_lock);
- ret = io_uring_add_tctx_node(ctx);
- mutex_lock(&ctx->uring_lock);
- if (ret)
- return ret;
-
- tctx = current->io_uring;
- for (i = 0; i < nr_args; i++) {
- int start, end;
-
- if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
- ret = -EFAULT;
- break;
- }
-
- if (reg.offset == -1U) {
- start = 0;
- end = IO_RINGFD_REG_MAX;
- } else {
- if (reg.offset >= IO_RINGFD_REG_MAX) {
- ret = -EINVAL;
- break;
- }
- start = reg.offset;
- end = start + 1;
- }
-
- ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
- if (ret < 0)
- break;
-
- reg.offset = ret;
- if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
- fput(tctx->registered_rings[reg.offset]);
- tctx->registered_rings[reg.offset] = NULL;
- ret = -EFAULT;
- break;
- }
- }
-
- return i ? i : ret;
-}
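
/*
 * Sketch of the userspace side of io_ringfd_register() above, using the
 * offset == -1U "any free slot" convention. Not part of this patch;
 * ring_register_self() is a hypothetical helper.
 */
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <unistd.h>

static int ring_register_self(int ring_fd)
{
	struct io_uring_rsrc_update reg = {
		.offset	= -1U,	/* let the kernel pick a free index */
		.data	= (unsigned long)ring_fd,
	};
	long ret = syscall(__NR_io_uring_register, ring_fd,
			   IORING_REGISTER_RING_FDS, &reg, 1);

	/*
	 * On success reg.offset holds the registered index; pass it as the
	 * fd together with IORING_ENTER_REGISTERED_RING on later enters.
	 */
	return ret == 1 ? (int)reg.offset : -1;
}
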
-
-static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
- unsigned nr_args)
-{
- struct io_uring_rsrc_update __user *arg = __arg;
- struct io_uring_task *tctx = current->io_uring;
- struct io_uring_rsrc_update reg;
- int ret = 0, i;
-
- if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
- return -EINVAL;
- if (!tctx)
- return 0;
-
- for (i = 0; i < nr_args; i++) {
- if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
- ret = -EFAULT;
- break;
- }
- if (reg.offset >= IO_RINGFD_REG_MAX) {
- ret = -EINVAL;
- break;
- }
-
- reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
- if (tctx->registered_rings[reg.offset]) {
- fput(tctx->registered_rings[reg.offset]);
- tctx->registered_rings[reg.offset] = NULL;
- }
- }
-
- return i ? i : ret;
-}
-
-static void *io_uring_validate_mmap_request(struct file *file,
- loff_t pgoff, size_t sz)
-{
- struct io_ring_ctx *ctx = file->private_data;
- loff_t offset = pgoff << PAGE_SHIFT;
- struct page *page;
- void *ptr;
-
- switch (offset) {
- case IORING_OFF_SQ_RING:
- case IORING_OFF_CQ_RING:
- ptr = ctx->rings;
- break;
- case IORING_OFF_SQES:
- ptr = ctx->sq_sqes;
- break;
- default:
- return ERR_PTR(-EINVAL);
- }
-
- page = virt_to_head_page(ptr);
- if (sz > page_size(page))
- return ERR_PTR(-EINVAL);
-
- return ptr;
-}
-
-#ifdef CONFIG_MMU
-
-static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
-{
- size_t sz = vma->vm_end - vma->vm_start;
- unsigned long pfn;
- void *ptr;
-
- ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
-
- pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
- return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
-}
-
-#else /* !CONFIG_MMU */
-
-static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
-{
- return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
-}
-
-static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
-{
- return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
-}
-
-static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
-{
- void *ptr;
-
- ptr = io_uring_validate_mmap_request(file, pgoff, len);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
-
- return (unsigned long) ptr;
-}
-
-#endif /* !CONFIG_MMU */
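
/*
 * Sketch of the userspace mappings served by the handlers above. With
 * IORING_FEAT_SINGLE_MMAP one mapping at IORING_OFF_SQ_RING covers both
 * rings; the SQE array is always a separate mapping at IORING_OFF_SQES.
 * Not part of this patch; error handling is trimmed.
 */
#include <stdint.h>
#include <sys/mman.h>
#include <linux/io_uring.h>

static void *map_sq_ring(int ring_fd, const struct io_uring_params *p)
{
	/* the array of SQE indices is the last thing in the SQ ring */
	size_t sz = p->sq_off.array + p->sq_entries * sizeof(uint32_t);

	return mmap(NULL, sz, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
}

static void *map_sqes(int ring_fd, const struct io_uring_params *p)
{
	return mmap(NULL, p->sq_entries * sizeof(struct io_uring_sqe),
		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		    ring_fd, IORING_OFF_SQES);
}
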
-
-static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
-{
- DEFINE_WAIT(wait);
-
- do {
- if (!io_sqring_full(ctx))
- break;
- prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
-
- if (!io_sqring_full(ctx))
- break;
- schedule();
- } while (!signal_pending(current));
-
- finish_wait(&ctx->sqo_sq_wait, &wait);
- return 0;
-}
-
-static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
- struct __kernel_timespec __user **ts,
- const sigset_t __user **sig)
-{
- struct io_uring_getevents_arg arg;
-
- /*
- * If EXT_ARG isn't set, then we have no timespec and the argp pointer
- * is just a pointer to the sigset_t.
- */
- if (!(flags & IORING_ENTER_EXT_ARG)) {
- *sig = (const sigset_t __user *) argp;
- *ts = NULL;
- return 0;
- }
-
- /*
- * EXT_ARG is set - ensure we agree on the size of it and copy in our
- * timespec and sigset_t pointers if good.
- */
- if (*argsz != sizeof(arg))
- return -EINVAL;
- if (copy_from_user(&arg, argp, sizeof(arg)))
- return -EFAULT;
- *sig = u64_to_user_ptr(arg.sigmask);
- *argsz = arg.sigmask_sz;
- *ts = u64_to_user_ptr(arg.ts);
- return 0;
-}
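
/*
 * Sketch of the userspace payload decoded by io_get_ext_arg() above: a
 * struct io_uring_getevents_arg carrying both a sigmask and a timeout.
 * Not part of this patch; enter_with_timeout() is a hypothetical helper.
 */
#include <signal.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <unistd.h>

static long enter_with_timeout(int ring_fd, unsigned min_complete,
			       const sigset_t *mask,
			       const struct __kernel_timespec *ts)
{
	struct io_uring_getevents_arg arg = {
		.sigmask	= (unsigned long)mask,
		.sigmask_sz	= _NSIG / 8,
		.ts		= (unsigned long)ts,
	};

	/* argsz must be exactly sizeof(arg), or the kernel says -EINVAL */
	return syscall(__NR_io_uring_enter, ring_fd, 0, min_complete,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}
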
-
-SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
- u32, min_complete, u32, flags, const void __user *, argp,
- size_t, argsz)
-{
- struct io_ring_ctx *ctx;
- int submitted = 0;
- struct fd f;
- long ret;
-
- io_run_task_work();
-
- if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
- IORING_ENTER_REGISTERED_RING)))
- return -EINVAL;
-
- /*
-	 * Ring fd has been registered via IORING_REGISTER_RING_FDS; we
-	 * need only dereference our task-private array to find it.
- */
- if (flags & IORING_ENTER_REGISTERED_RING) {
- struct io_uring_task *tctx = current->io_uring;
-
- if (!tctx || fd >= IO_RINGFD_REG_MAX)
- return -EINVAL;
- fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
- f.file = tctx->registered_rings[fd];
- if (unlikely(!f.file))
- return -EBADF;
- } else {
- f = fdget(fd);
- if (unlikely(!f.file))
- return -EBADF;
- }
-
- ret = -EOPNOTSUPP;
- if (unlikely(f.file->f_op != &io_uring_fops))
- goto out_fput;
-
- ret = -ENXIO;
- ctx = f.file->private_data;
- if (unlikely(!percpu_ref_tryget(&ctx->refs)))
- goto out_fput;
-
- ret = -EBADFD;
- if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
- goto out;
-
- /*
- * For SQ polling, the thread will do all submissions and completions.
- * Just return the requested submit count, and wake the thread if
- * we were asked to.
- */
- ret = 0;
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- io_cqring_overflow_flush(ctx);
-
- if (unlikely(ctx->sq_data->thread == NULL)) {
- ret = -EOWNERDEAD;
- goto out;
- }
- if (flags & IORING_ENTER_SQ_WAKEUP)
- wake_up(&ctx->sq_data->wait);
- if (flags & IORING_ENTER_SQ_WAIT) {
- ret = io_sqpoll_wait_sq(ctx);
- if (ret)
- goto out;
- }
- submitted = to_submit;
- } else if (to_submit) {
- ret = io_uring_add_tctx_node(ctx);
- if (unlikely(ret))
- goto out;
- mutex_lock(&ctx->uring_lock);
- submitted = io_submit_sqes(ctx, to_submit);
- mutex_unlock(&ctx->uring_lock);
-
- if (submitted != to_submit)
- goto out;
- }
- if (flags & IORING_ENTER_GETEVENTS) {
- const sigset_t __user *sig;
- struct __kernel_timespec __user *ts;
-
- ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
- if (unlikely(ret))
- goto out;
-
- min_complete = min(min_complete, ctx->cq_entries);
-
- /*
- * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
- * space applications don't need to do io completion events
- * polling again, they can rely on io_sq_thread to do polling
- * work, which can reduce cpu usage and uring_lock contention.
- */
- if (ctx->flags & IORING_SETUP_IOPOLL &&
- !(ctx->flags & IORING_SETUP_SQPOLL)) {
- ret = io_iopoll_check(ctx, min_complete);
- } else {
- ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
- }
- }
-
-out:
- percpu_ref_put(&ctx->refs);
-out_fput:
- if (!(flags & IORING_ENTER_REGISTERED_RING))
- fdput(f);
- return submitted ? submitted : ret;
-}
-
-#ifdef CONFIG_PROC_FS
-static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
- const struct cred *cred)
-{
- struct user_namespace *uns = seq_user_ns(m);
- struct group_info *gi;
- kernel_cap_t cap;
- unsigned __capi;
- int g;
-
- seq_printf(m, "%5d\n", id);
- seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
- seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
- seq_puts(m, "\n\tGroups:\t");
- gi = cred->group_info;
- for (g = 0; g < gi->ngroups; g++) {
- seq_put_decimal_ull(m, g ? " " : "",
- from_kgid_munged(uns, gi->gid[g]));
- }
- seq_puts(m, "\n\tCapEff:\t");
- cap = cred->cap_effective;
- CAP_FOR_EACH_U32(__capi)
- seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
- seq_putc(m, '\n');
- return 0;
-}
-
-static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
- struct seq_file *m)
-{
- struct io_sq_data *sq = NULL;
- struct io_overflow_cqe *ocqe;
- struct io_rings *r = ctx->rings;
- unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
- unsigned int sq_head = READ_ONCE(r->sq.head);
- unsigned int sq_tail = READ_ONCE(r->sq.tail);
- unsigned int cq_head = READ_ONCE(r->cq.head);
- unsigned int cq_tail = READ_ONCE(r->cq.tail);
- unsigned int sq_entries, cq_entries;
- bool has_lock;
- unsigned int i;
-
- /*
-	 * We may get imprecise sqe and cqe info if the ring is actively
-	 * running, since we read cached_sq_head and cached_cq_tail without
-	 * uring_lock while sq_tail and cq_head are changed by userspace. But
-	 * that's OK, since we usually only use this info when the ring is stuck.
- */
- seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
- seq_printf(m, "SqHead:\t%u\n", sq_head);
- seq_printf(m, "SqTail:\t%u\n", sq_tail);
- seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
- seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
- seq_printf(m, "CqHead:\t%u\n", cq_head);
- seq_printf(m, "CqTail:\t%u\n", cq_tail);
- seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
- seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
- sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
- for (i = 0; i < sq_entries; i++) {
- unsigned int entry = i + sq_head;
- unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
- struct io_uring_sqe *sqe;
-
- if (sq_idx > sq_mask)
- continue;
- sqe = &ctx->sq_sqes[sq_idx];
- seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
- sq_idx, sqe->opcode, sqe->fd, sqe->flags,
- sqe->user_data);
- }
- seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
- cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
- for (i = 0; i < cq_entries; i++) {
- unsigned int entry = i + cq_head;
- struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
-
- seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
- entry & cq_mask, cqe->user_data, cqe->res,
- cqe->flags);
- }
-
- /*
- * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
-	 * since the fdinfo case grabs it in the opposite direction of normal use
- * cases. If we fail to get the lock, we just don't iterate any
- * structures that could be going away outside the io_uring mutex.
- */
- has_lock = mutex_trylock(&ctx->uring_lock);
-
- if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
- sq = ctx->sq_data;
- if (!sq->thread)
- sq = NULL;
- }
-
- seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
- seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
- seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
- for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
- struct file *f = io_file_from_index(ctx, i);
-
- if (f)
- seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
- else
- seq_printf(m, "%5u: <none>\n", i);
- }
- seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
- for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *buf = ctx->user_bufs[i];
- unsigned int len = buf->ubuf_end - buf->ubuf;
-
- seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
- }
- if (has_lock && !xa_empty(&ctx->personalities)) {
- unsigned long index;
- const struct cred *cred;
-
- seq_printf(m, "Personalities:\n");
- xa_for_each(&ctx->personalities, index, cred)
- io_uring_show_cred(m, index, cred);
- }
- if (has_lock)
- mutex_unlock(&ctx->uring_lock);
-
- seq_puts(m, "PollList:\n");
- spin_lock(&ctx->completion_lock);
- for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
- struct hlist_head *list = &ctx->cancel_hash[i];
- struct io_kiocb *req;
-
- hlist_for_each_entry(req, list, hash_node)
- seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
- task_work_pending(req->task));
- }
-
- seq_puts(m, "CqOverflowList:\n");
- list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
- struct io_uring_cqe *cqe = &ocqe->cqe;
-
- seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
- cqe->user_data, cqe->res, cqe->flags);
-
- }
-
- spin_unlock(&ctx->completion_lock);
-}
-
-static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
-{
- struct io_ring_ctx *ctx = f->private_data;
-
- if (percpu_ref_tryget(&ctx->refs)) {
- __io_uring_show_fdinfo(ctx, m);
- percpu_ref_put(&ctx->refs);
- }
-}
-#endif
-
-static const struct file_operations io_uring_fops = {
- .release = io_uring_release,
- .mmap = io_uring_mmap,
-#ifndef CONFIG_MMU
- .get_unmapped_area = io_uring_nommu_get_unmapped_area,
- .mmap_capabilities = io_uring_nommu_mmap_capabilities,
-#endif
- .poll = io_uring_poll,
-#ifdef CONFIG_PROC_FS
- .show_fdinfo = io_uring_show_fdinfo,
-#endif
-};
-
-static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
- struct io_uring_params *p)
-{
- struct io_rings *rings;
- size_t size, sq_array_offset;
-
- /* make sure these are sane, as we already accounted them */
- ctx->sq_entries = p->sq_entries;
- ctx->cq_entries = p->cq_entries;
-
- size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
- if (size == SIZE_MAX)
- return -EOVERFLOW;
-
- rings = io_mem_alloc(size);
- if (!rings)
- return -ENOMEM;
-
- ctx->rings = rings;
- ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
- rings->sq_ring_mask = p->sq_entries - 1;
- rings->cq_ring_mask = p->cq_entries - 1;
- rings->sq_ring_entries = p->sq_entries;
- rings->cq_ring_entries = p->cq_entries;
-
- size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
- if (size == SIZE_MAX) {
- io_mem_free(ctx->rings);
- ctx->rings = NULL;
- return -EOVERFLOW;
- }
-
- ctx->sq_sqes = io_mem_alloc(size);
- if (!ctx->sq_sqes) {
- io_mem_free(ctx->rings);
- ctx->rings = NULL;
- return -ENOMEM;
- }
-
- return 0;
-}
-
-static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
-{
- int ret, fd;
-
- fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
- if (fd < 0)
- return fd;
-
- ret = io_uring_add_tctx_node(ctx);
- if (ret) {
- put_unused_fd(fd);
- return ret;
- }
- fd_install(fd, file);
- return fd;
-}
-
-/*
- * Allocate an anonymous fd; this is what constitutes the
- * application-visible backing of an io_uring instance. The application
- * mmaps this fd to gain access to the SQ/CQ ring details. If UNIX
- * sockets are enabled, we have to tie this fd to a socket for file
- * garbage collection purposes.
- */
-static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
-{
- struct file *file;
-#if defined(CONFIG_UNIX)
- int ret;
-
- ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
- &ctx->ring_sock);
- if (ret)
- return ERR_PTR(ret);
-#endif
-
- file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
- O_RDWR | O_CLOEXEC, NULL);
-#if defined(CONFIG_UNIX)
- if (IS_ERR(file)) {
- sock_release(ctx->ring_sock);
- ctx->ring_sock = NULL;
- } else {
- ctx->ring_sock->file = file;
- }
-#endif
- return file;
-}
-
-static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
- struct io_uring_params __user *params)
-{
- struct io_ring_ctx *ctx;
- struct file *file;
- int ret;
-
- if (!entries)
- return -EINVAL;
- if (entries > IORING_MAX_ENTRIES) {
- if (!(p->flags & IORING_SETUP_CLAMP))
- return -EINVAL;
- entries = IORING_MAX_ENTRIES;
- }
-
- /*
- * Use twice as many entries for the CQ ring. It's possible for the
- * application to drive a higher depth than the size of the SQ ring,
- * since the sqes are only used at submission time. This allows for
- * some flexibility in overcommitting a bit. If the application has
- * set IORING_SETUP_CQSIZE, it will have passed in the desired number
- * of CQ ring entries manually.
- */
- p->sq_entries = roundup_pow_of_two(entries);
- if (p->flags & IORING_SETUP_CQSIZE) {
- /*
- * If IORING_SETUP_CQSIZE is set, we do the same roundup
- * to a power-of-two, if it isn't already. We do NOT impose
- * any cq vs sq ring sizing.
- */
- if (!p->cq_entries)
- return -EINVAL;
- if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
- if (!(p->flags & IORING_SETUP_CLAMP))
- return -EINVAL;
- p->cq_entries = IORING_MAX_CQ_ENTRIES;
- }
- p->cq_entries = roundup_pow_of_two(p->cq_entries);
- if (p->cq_entries < p->sq_entries)
- return -EINVAL;
- } else {
- p->cq_entries = 2 * p->sq_entries;
- }
-
- ctx = io_ring_ctx_alloc(p);
- if (!ctx)
- return -ENOMEM;
- ctx->compat = in_compat_syscall();
- if (!capable(CAP_IPC_LOCK))
- ctx->user = get_uid(current_user());
-
- /*
- * This is just grabbed for accounting purposes. When a process exits,
- * the mm is exited and dropped before the files, hence we need to hang
- * on to this mm purely for the purposes of being able to unaccount
- * memory (locked/pinned vm). It's not used for anything else.
- */
- mmgrab(current->mm);
- ctx->mm_account = current->mm;
-
- ret = io_allocate_scq_urings(ctx, p);
- if (ret)
- goto err;
-
- ret = io_sq_offload_create(ctx, p);
- if (ret)
- goto err;
- /* always set a rsrc node */
- ret = io_rsrc_node_switch_start(ctx);
- if (ret)
- goto err;
- io_rsrc_node_switch(ctx, NULL);
-
- memset(&p->sq_off, 0, sizeof(p->sq_off));
- p->sq_off.head = offsetof(struct io_rings, sq.head);
- p->sq_off.tail = offsetof(struct io_rings, sq.tail);
- p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
- p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
- p->sq_off.flags = offsetof(struct io_rings, sq_flags);
- p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
- p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
-
- memset(&p->cq_off, 0, sizeof(p->cq_off));
- p->cq_off.head = offsetof(struct io_rings, cq.head);
- p->cq_off.tail = offsetof(struct io_rings, cq.tail);
- p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
- p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
- p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
- p->cq_off.cqes = offsetof(struct io_rings, cqes);
- p->cq_off.flags = offsetof(struct io_rings, cq_flags);
-
- p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
- IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
- IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
- IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
- IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
- IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP;
-
- if (copy_to_user(params, p, sizeof(*p))) {
- ret = -EFAULT;
- goto err;
- }
-
- file = io_uring_get_file(ctx);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
- goto err;
- }
-
- /*
- * Install ring fd as the very last thing, so we don't risk someone
- * having closed it before we finish setup
- */
- ret = io_uring_install_fd(ctx, file);
- if (ret < 0) {
- /* fput will clean it up */
- fput(file);
- return ret;
- }
-
- trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
- return ret;
-err:
- io_ring_ctx_wait_and_kill(ctx);
- return ret;
-}
-
-/*
- * Sets up an io_uring context and returns the fd. The application asks for a
- * ring size; we return the actual sq/cq ring sizes (among other things) in the
- * params structure passed in.
- */
-static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
-{
- struct io_uring_params p;
- int i;
-
- if (copy_from_user(&p, params, sizeof(p)))
- return -EFAULT;
- for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
- if (p.resv[i])
- return -EINVAL;
- }
-
- if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
- IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
- IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
- IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))
- return -EINVAL;
-
- return io_uring_create(entries, &p, params);
-}
-
-SYSCALL_DEFINE2(io_uring_setup, u32, entries,
- struct io_uring_params __user *, params)
-{
- return io_uring_setup(entries, params);
-}
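
/*
 * Sketch of the matching userspace call, requesting an oversized CQ ring via
 * IORING_SETUP_CQSIZE as described above. Not part of this patch;
 * ring_setup() is a hypothetical helper.
 */
#include <string.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <unistd.h>

static int ring_setup(unsigned entries, unsigned cq_entries,
		      struct io_uring_params *p)
{
	memset(p, 0, sizeof(*p));	/* resv fields must be zero */
	p->flags = IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP;
	p->cq_entries = cq_entries;	/* must round up to >= sq_entries */
	/* both ring sizes come back rounded to powers of two in *p */
	return syscall(__NR_io_uring_setup, entries, p);
}
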
-
-static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
-{
- struct io_uring_probe *p;
- size_t size;
- int i, ret;
-
- size = struct_size(p, ops, nr_args);
- if (size == SIZE_MAX)
- return -EOVERFLOW;
- p = kzalloc(size, GFP_KERNEL);
- if (!p)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(p, arg, size))
- goto out;
- ret = -EINVAL;
- if (memchr_inv(p, 0, size))
- goto out;
-
- p->last_op = IORING_OP_LAST - 1;
- if (nr_args > IORING_OP_LAST)
- nr_args = IORING_OP_LAST;
-
- for (i = 0; i < nr_args; i++) {
- p->ops[i].op = i;
- if (!io_op_defs[i].not_supported)
- p->ops[i].flags = IO_URING_OP_SUPPORTED;
- }
- p->ops_len = i;
-
- ret = 0;
- if (copy_to_user(arg, p, size))
- ret = -EFAULT;
-out:
- kfree(p);
- return ret;
-}
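
/*
 * Sketch of querying opcode support through the IORING_REGISTER_PROBE
 * handler above. Not part of this patch; opcode_supported() is a
 * hypothetical helper.
 */
#include <stdlib.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <unistd.h>

static int opcode_supported(int ring_fd, unsigned opcode)
{
	size_t len = sizeof(struct io_uring_probe) +
		     256 * sizeof(struct io_uring_probe_op);
	struct io_uring_probe *p = calloc(1, len);	/* must be zeroed */
	int ret = -1;

	if (!p)
		return -1;
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
		    p, 256) == 0)
		ret = opcode <= p->last_op &&
		      (p->ops[opcode].flags & IO_URING_OP_SUPPORTED);
	free(p);
	return ret;
}
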
-
-static int io_register_personality(struct io_ring_ctx *ctx)
-{
- const struct cred *creds;
- u32 id;
- int ret;
-
- creds = get_current_cred();
-
- ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
- XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
- if (ret < 0) {
- put_cred(creds);
- return ret;
- }
- return id;
-}
-
-static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
- void __user *arg, unsigned int nr_args)
-{
- struct io_uring_restriction *res;
- size_t size;
- int i, ret;
-
- /* Restrictions allowed only if rings started disabled */
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- /* We allow only a single restrictions registration */
- if (ctx->restrictions.registered)
- return -EBUSY;
-
- if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
- return -EINVAL;
-
- size = array_size(nr_args, sizeof(*res));
- if (size == SIZE_MAX)
- return -EOVERFLOW;
-
- res = memdup_user(arg, size);
- if (IS_ERR(res))
- return PTR_ERR(res);
-
- ret = 0;
-
- for (i = 0; i < nr_args; i++) {
- switch (res[i].opcode) {
- case IORING_RESTRICTION_REGISTER_OP:
- if (res[i].register_op >= IORING_REGISTER_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].register_op,
- ctx->restrictions.register_op);
- break;
- case IORING_RESTRICTION_SQE_OP:
- if (res[i].sqe_op >= IORING_OP_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
- break;
- case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
- ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
- break;
- case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
- ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
- break;
- default:
- ret = -EINVAL;
- goto out;
- }
- }
-
-out:
- /* Reset all restrictions if an error happened */
- if (ret != 0)
- memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
- else
- ctx->restrictions.registered = true;
-
- kfree(res);
- return ret;
-}
-
-static int io_register_enable_rings(struct io_ring_ctx *ctx)
-{
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- if (ctx->restrictions.registered)
- ctx->restricted = 1;
-
- ctx->flags &= ~IORING_SETUP_R_DISABLED;
- if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
- return 0;
-}
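
/*
 * Sketch of the intended sequence for the two handlers above: create the
 * ring with IORING_SETUP_R_DISABLED, register restrictions while it is
 * still disabled, then enable it. Not part of this patch;
 * restrict_to_reads() is a hypothetical helper.
 */
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <unistd.h>

static int restrict_to_reads(int ring_fd)
{
	struct io_uring_restriction res[2] = {
		{ .opcode = IORING_RESTRICTION_SQE_OP,
		  .sqe_op = IORING_OP_READ },
		{ .opcode = IORING_RESTRICTION_SQE_OP,
		  .sqe_op = IORING_OP_READV },
	};

	/* only accepted while IORING_SETUP_R_DISABLED is still set */
	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_RESTRICTIONS, res, 2) < 0)
		return -1;
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ENABLE_RINGS, NULL, 0);
}
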
-
-static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
- struct io_uring_rsrc_update2 *up,
- unsigned nr_args)
-{
- __u32 tmp;
- int err;
-
- if (up->resv)
- return -EINVAL;
- if (check_add_overflow(up->offset, nr_args, &tmp))
- return -EOVERFLOW;
- err = io_rsrc_node_switch_start(ctx);
- if (err)
- return err;
-
- switch (type) {
- case IORING_RSRC_FILE:
- return __io_sqe_files_update(ctx, up, nr_args);
- case IORING_RSRC_BUFFER:
- return __io_sqe_buffers_update(ctx, up, nr_args);
- }
- return -EINVAL;
-}
-
-static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
-{
- struct io_uring_rsrc_update2 up;
-
- if (!nr_args)
- return -EINVAL;
- memset(&up, 0, sizeof(up));
- if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
- return -EFAULT;
- return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
-}
-
-static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
- unsigned size, unsigned type)
-{
- struct io_uring_rsrc_update2 up;
-
- if (size != sizeof(up))
- return -EINVAL;
- if (copy_from_user(&up, arg, sizeof(up)))
- return -EFAULT;
- if (!up.nr || up.resv)
- return -EINVAL;
- return __io_register_rsrc_update(ctx, type, &up, up.nr);
-}
-
-static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int size, unsigned int type)
-{
- struct io_uring_rsrc_register rr;
-
- /* keep it extendible */
- if (size != sizeof(rr))
- return -EINVAL;
-
- memset(&rr, 0, sizeof(rr));
- if (copy_from_user(&rr, arg, size))
- return -EFAULT;
- if (!rr.nr || rr.resv || rr.resv2)
- return -EINVAL;
-
- switch (type) {
- case IORING_RSRC_FILE:
- return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
- rr.nr, u64_to_user_ptr(rr.tags));
- case IORING_RSRC_BUFFER:
- return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
- rr.nr, u64_to_user_ptr(rr.tags));
- }
- return -EINVAL;
-}
-
-static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
- void __user *arg, unsigned len)
-{
- struct io_uring_task *tctx = current->io_uring;
- cpumask_var_t new_mask;
- int ret;
-
- if (!tctx || !tctx->io_wq)
- return -EINVAL;
-
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
- return -ENOMEM;
-
- cpumask_clear(new_mask);
- if (len > cpumask_size())
- len = cpumask_size();
-
- if (copy_from_user(new_mask, arg, len)) {
- free_cpumask_var(new_mask);
- return -EFAULT;
- }
-
- ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
- free_cpumask_var(new_mask);
- return ret;
-}
-
-static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
-{
- struct io_uring_task *tctx = current->io_uring;
-
- if (!tctx || !tctx->io_wq)
- return -EINVAL;
-
- return io_wq_cpu_affinity(tctx->io_wq, NULL);
-}
-
-static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
- void __user *arg)
- __must_hold(&ctx->uring_lock)
-{
- struct io_tctx_node *node;
- struct io_uring_task *tctx = NULL;
- struct io_sq_data *sqd = NULL;
- __u32 new_count[2];
- int i, ret;
-
- if (copy_from_user(new_count, arg, sizeof(new_count)))
- return -EFAULT;
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- if (new_count[i] > INT_MAX)
- return -EINVAL;
-
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- sqd = ctx->sq_data;
- if (sqd) {
- /*
- * Observe the correct sqd->lock -> ctx->uring_lock
- * ordering. Fine to drop uring_lock here, we hold
- * a ref to the ctx.
- */
- refcount_inc(&sqd->refs);
- mutex_unlock(&ctx->uring_lock);
- mutex_lock(&sqd->lock);
- mutex_lock(&ctx->uring_lock);
- if (sqd->thread)
- tctx = sqd->thread->io_uring;
- }
- } else {
- tctx = current->io_uring;
- }
-
- BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
-
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- if (new_count[i])
- ctx->iowq_limits[i] = new_count[i];
- ctx->iowq_limits_set = true;
-
- if (tctx && tctx->io_wq) {
- ret = io_wq_max_workers(tctx->io_wq, new_count);
- if (ret)
- goto err;
- } else {
- memset(new_count, 0, sizeof(new_count));
- }
-
- if (sqd) {
- mutex_unlock(&sqd->lock);
- io_put_sq_data(sqd);
- }
-
- if (copy_to_user(arg, new_count, sizeof(new_count)))
- return -EFAULT;
-
- /* that's it for SQPOLL, only the SQPOLL task creates requests */
- if (sqd)
- return 0;
-
- /* now propagate the restriction to all registered users */
- list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
- struct io_uring_task *tctx = node->task->io_uring;
-
- if (WARN_ON_ONCE(!tctx->io_wq))
- continue;
-
- for (i = 0; i < ARRAY_SIZE(new_count); i++)
- new_count[i] = ctx->iowq_limits[i];
- /* ignore errors, it always returns zero anyway */
- (void)io_wq_max_workers(tctx->io_wq, new_count);
- }
- return 0;
-err:
- if (sqd) {
- mutex_unlock(&sqd->lock);
- io_put_sq_data(sqd);
- }
- return ret;
-}
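
/*
 * Sketch of the two-element array consumed by the handler above: index 0
 * caps bounded io-wq workers, index 1 unbounded ones. A zero leaves that
 * limit untouched, and the previous values are copied back out. Not part of
 * this patch; cap_iowq_workers() is a hypothetical helper.
 */
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <unistd.h>

static long cap_iowq_workers(int ring_fd, unsigned bounded, unsigned unbounded)
{
	unsigned int counts[2] = { bounded, unbounded };

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
}
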
-
-static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
- void __user *arg, unsigned nr_args)
- __releases(ctx->uring_lock)
- __acquires(ctx->uring_lock)
-{
- int ret;
-
- /*
- * We're inside the ring mutex, if the ref is already dying, then
- * someone else killed the ctx or is already going through
- * io_uring_register().
- */
- if (percpu_ref_is_dying(&ctx->refs))
- return -ENXIO;
-
- if (ctx->restricted) {
- if (opcode >= IORING_REGISTER_LAST)
- return -EINVAL;
- opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
- if (!test_bit(opcode, ctx->restrictions.register_op))
- return -EACCES;
- }
-
- switch (opcode) {
- case IORING_REGISTER_BUFFERS:
- ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
- break;
- case IORING_UNREGISTER_BUFFERS:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_sqe_buffers_unregister(ctx);
- break;
- case IORING_REGISTER_FILES:
- ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
- break;
- case IORING_UNREGISTER_FILES:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_sqe_files_unregister(ctx);
- break;
- case IORING_REGISTER_FILES_UPDATE:
- ret = io_register_files_update(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_EVENTFD:
- ret = -EINVAL;
- if (nr_args != 1)
- break;
- ret = io_eventfd_register(ctx, arg, 0);
- break;
- case IORING_REGISTER_EVENTFD_ASYNC:
- ret = -EINVAL;
- if (nr_args != 1)
- break;
- ret = io_eventfd_register(ctx, arg, 1);
- break;
- case IORING_UNREGISTER_EVENTFD:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_eventfd_unregister(ctx);
- break;
- case IORING_REGISTER_PROBE:
- ret = -EINVAL;
- if (!arg || nr_args > 256)
- break;
- ret = io_probe(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_PERSONALITY:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_register_personality(ctx);
- break;
- case IORING_UNREGISTER_PERSONALITY:
- ret = -EINVAL;
- if (arg)
- break;
- ret = io_unregister_personality(ctx, nr_args);
- break;
- case IORING_REGISTER_ENABLE_RINGS:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_register_enable_rings(ctx);
- break;
- case IORING_REGISTER_RESTRICTIONS:
- ret = io_register_restrictions(ctx, arg, nr_args);
- break;
- case IORING_REGISTER_FILES2:
- ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
- break;
- case IORING_REGISTER_FILES_UPDATE2:
- ret = io_register_rsrc_update(ctx, arg, nr_args,
- IORING_RSRC_FILE);
- break;
- case IORING_REGISTER_BUFFERS2:
- ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
- break;
- case IORING_REGISTER_BUFFERS_UPDATE:
- ret = io_register_rsrc_update(ctx, arg, nr_args,
- IORING_RSRC_BUFFER);
- break;
- case IORING_REGISTER_IOWQ_AFF:
- ret = -EINVAL;
- if (!arg || !nr_args)
- break;
- ret = io_register_iowq_aff(ctx, arg, nr_args);
- break;
- case IORING_UNREGISTER_IOWQ_AFF:
- ret = -EINVAL;
- if (arg || nr_args)
- break;
- ret = io_unregister_iowq_aff(ctx);
- break;
- case IORING_REGISTER_IOWQ_MAX_WORKERS:
- ret = -EINVAL;
- if (!arg || nr_args != 2)
- break;
- ret = io_register_iowq_max_workers(ctx, arg);
- break;
- case IORING_REGISTER_RING_FDS:
- ret = io_ringfd_register(ctx, arg, nr_args);
- break;
- case IORING_UNREGISTER_RING_FDS:
- ret = io_ringfd_unregister(ctx, arg, nr_args);
- break;
- default:
- ret = -EINVAL;
- break;
- }
-
- return ret;
-}
-
-SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
- void __user *, arg, unsigned int, nr_args)
-{
- struct io_ring_ctx *ctx;
- long ret = -EBADF;
- struct fd f;
-
- f = fdget(fd);
- if (!f.file)
- return -EBADF;
-
- ret = -EOPNOTSUPP;
- if (f.file->f_op != &io_uring_fops)
- goto out_fput;
-
- ctx = f.file->private_data;
-
- io_run_task_work();
-
- mutex_lock(&ctx->uring_lock);
- ret = __io_uring_register(ctx, opcode, arg, nr_args);
- mutex_unlock(&ctx->uring_lock);
- trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
-out_fput:
- fdput(f);
- return ret;
-}
-
-static int __init io_uring_init(void)
-{
-#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
- BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
- BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
-} while (0)
-
-#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
- __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
- BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
- BUILD_BUG_SQE_ELEM(0, __u8, opcode);
- BUILD_BUG_SQE_ELEM(1, __u8, flags);
- BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
- BUILD_BUG_SQE_ELEM(4, __s32, fd);
- BUILD_BUG_SQE_ELEM(8, __u64, off);
- BUILD_BUG_SQE_ELEM(8, __u64, addr2);
- BUILD_BUG_SQE_ELEM(16, __u64, addr);
- BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
- BUILD_BUG_SQE_ELEM(24, __u32, len);
- BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
- BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
- BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
- BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
- BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
- BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
- BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
- BUILD_BUG_SQE_ELEM(32, __u64, user_data);
- BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
- BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
- BUILD_BUG_SQE_ELEM(42, __u16, personality);
- BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
- BUILD_BUG_SQE_ELEM(44, __u32, file_index);
-
- BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
- sizeof(struct io_uring_rsrc_update));
- BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
- sizeof(struct io_uring_rsrc_update2));
-
- /* ->buf_index is u16 */
- BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
-
- /* should fit into one byte */
- BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
- BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
- BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
-
- BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
- BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
-
- req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
- SLAB_ACCOUNT);
- return 0;
-};
-__initcall(io_uring_init);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 8ce8720093b9..91ee0b308e13 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -44,20 +44,28 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
static struct bio_set iomap_ioend_bioset;
static struct iomap_page *
-iomap_page_create(struct inode *inode, struct folio *folio)
+iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
{
struct iomap_page *iop = to_iomap_page(folio);
unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+ gfp_t gfp;
if (iop || nr_blocks <= 1)
return iop;
+ if (flags & IOMAP_NOWAIT)
+ gfp = GFP_NOWAIT;
+ else
+ gfp = GFP_NOFS | __GFP_NOFAIL;
+
iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
- GFP_NOFS | __GFP_NOFAIL);
- spin_lock_init(&iop->uptodate_lock);
- if (folio_test_uptodate(folio))
- bitmap_fill(iop->uptodate, nr_blocks);
- folio_attach_private(folio, iop);
+ gfp);
+ if (iop) {
+ spin_lock_init(&iop->uptodate_lock);
+ if (folio_test_uptodate(folio))
+ bitmap_fill(iop->uptodate, nr_blocks);
+ folio_attach_private(folio, iop);
+ }
return iop;
}
@@ -154,9 +162,6 @@ static void iomap_iop_set_range_uptodate(struct folio *folio,
static void iomap_set_range_uptodate(struct folio *folio,
struct iomap_page *iop, size_t off, size_t len)
{
- if (folio_test_error(folio))
- return;
-
if (iop)
iomap_iop_set_range_uptodate(folio, iop, off, len);
else
@@ -226,7 +231,7 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
if (WARN_ON_ONCE(size > iomap->length))
return -EIO;
if (offset > 0)
- iop = iomap_page_create(iter->inode, folio);
+ iop = iomap_page_create(iter->inode, folio, iter->flags);
else
iop = to_iomap_page(folio);
@@ -264,7 +269,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
return iomap_read_inline_data(iter, folio);
/* zero post-eof blocks as the page may be mapped */
- iop = iomap_page_create(iter->inode, folio);
+ iop = iomap_page_create(iter->inode, folio, iter->flags);
iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
if (plen == 0)
goto done;
@@ -297,7 +302,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
/*
* If the bio_alloc fails, try it again for a single page to
* avoid having to deal with partial page reads. This emulates
- * what do_mpage_readpage does.
+ * what do_mpage_read_folio does.
*/
if (!ctx->bio) {
ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
@@ -320,10 +325,8 @@ done:
return pos - orig_pos + plen;
}
-int
-iomap_readpage(struct page *page, const struct iomap_ops *ops)
+int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
{
- struct folio *folio = page_folio(page);
struct iomap_iter iter = {
.inode = folio->mapping->host,
.pos = folio_pos(folio),
@@ -351,13 +354,13 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
}
/*
- * Just like mpage_readahead and block_read_full_page, we always
- * return 0 and just mark the page as PageError on errors. This
+ * Just like mpage_readahead and block_read_full_folio, we always
+ * return 0 and just set the folio error flag on errors. This
* should be cleaned up throughout the stack eventually.
*/
return 0;
}
-EXPORT_SYMBOL_GPL(iomap_readpage);
+EXPORT_SYMBOL_GPL(iomap_read_folio);
static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx)
@@ -454,25 +457,23 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
-int
-iomap_releasepage(struct page *page, gfp_t gfp_mask)
+bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- struct folio *folio = page_folio(page);
-
- trace_iomap_releasepage(folio->mapping->host, folio_pos(folio),
+ trace_iomap_release_folio(folio->mapping->host, folio_pos(folio),
folio_size(folio));
/*
- * mm accommodates an old ext3 case where clean pages might not have had
- * the dirty bit cleared. Thus, it can send actual dirty pages to
- * ->releasepage() via shrink_active_list(); skip those here.
+ * mm accommodates an old ext3 case where clean folios might
+ * not have had the dirty bit cleared. Thus, it can send actual
+ * dirty folios to ->release_folio() via shrink_active_list();
+ * skip those here.
*/
if (folio_test_dirty(folio) || folio_test_writeback(folio))
- return 0;
+ return false;
iomap_page_release(folio);
- return 1;
+ return true;
}
-EXPORT_SYMBOL_GPL(iomap_releasepage);
+EXPORT_SYMBOL_GPL(iomap_release_folio);
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
{
@@ -496,31 +497,6 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
}
EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
-#ifdef CONFIG_MIGRATION
-int
-iomap_migrate_page(struct address_space *mapping, struct page *newpage,
- struct page *page, enum migrate_mode mode)
-{
- struct folio *folio = page_folio(page);
- struct folio *newfolio = page_folio(newpage);
- int ret;
-
- ret = folio_migrate_mapping(mapping, newfolio, folio, 0);
- if (ret != MIGRATEPAGE_SUCCESS)
- return ret;
-
- if (folio_test_private(folio))
- folio_attach_private(newfolio, folio_detach_private(folio));
-
- if (mode != MIGRATE_SYNC_NO_COPY)
- folio_migrate_copy(newfolio, folio);
- else
- folio_migrate_flags(newfolio, folio);
- return MIGRATEPAGE_SUCCESS;
-}
-EXPORT_SYMBOL_GPL(iomap_migrate_page);
-#endif /* CONFIG_MIGRATION */
-
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
@@ -531,7 +507,8 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
* write started inside the existing inode size.
*/
if (pos + len > i_size)
- truncate_pagecache_range(inode, max(pos, i_size), pos + len);
+ truncate_pagecache_range(inode, max(pos, i_size),
+ pos + len - 1);
}
static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
@@ -550,10 +527,11 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t len, struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- struct iomap_page *iop = iomap_page_create(iter->inode, folio);
+ struct iomap_page *iop;
loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size);
loff_t block_end = round_up(pos + len, block_size);
+ unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
size_t from = offset_in_folio(folio, pos), to = from + len;
size_t poff, plen;
@@ -561,6 +539,10 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
return 0;
folio_clear_error(folio);
+ iop = iomap_page_create(iter->inode, folio, iter->flags);
+ if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
+ return -EAGAIN;
+
do {
iomap_adjust_read_range(iter->inode, folio, &block_start,
block_end - block_start, &poff, &plen);
@@ -577,7 +559,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
return -EIO;
folio_zero_segments(folio, poff, from, to, poff + plen);
} else {
- int status = iomap_read_folio_sync(block_start, folio,
+ int status;
+
+ if (iter->flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+
+ status = iomap_read_folio_sync(block_start, folio,
poff, plen, srcmap);
if (status)
return status;
@@ -606,6 +593,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
int status = 0;
+ if (iter->flags & IOMAP_NOWAIT)
+ fgp |= FGP_NOWAIT;
+
BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
if (srcmap != &iter->iomap)
BUG_ON(pos + len > srcmap->offset + srcmap->length);
@@ -625,7 +615,7 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
fgp, mapping_gfp_mask(iter->inode->i_mapping));
if (!folio) {
- status = -ENOMEM;
+ status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
goto out_no_page;
}
if (pos + len > folio_pos(folio) + folio_size(folio))
@@ -663,10 +653,10 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
/*
* The blocks that were entirely written will now be uptodate, so we
- * don't have to worry about a readpage reading them and overwriting a
+ * don't have to worry about a read_folio reading them and overwriting a
* partial write. However, if we've encountered a short write and only
* partially written into a block, it will not be marked uptodate, so a
- * readpage might come in and destroy our partial write.
+ * read_folio might come in and destroy our partial write.
*
* Do the simplest thing and just treat any short write to a
* non-uptodate page as a zero-length write, and force the caller to
@@ -733,7 +723,7 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
folio_put(folio);
if (ret < len)
- iomap_write_failed(iter->inode, pos, len);
+ iomap_write_failed(iter->inode, pos + ret, len - ret);
return ret;
}
@@ -743,6 +733,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
loff_t pos = iter->pos;
ssize_t written = 0;
long status = 0;
+ struct address_space *mapping = iter->inode->i_mapping;
+ unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
do {
struct folio *folio;
@@ -755,6 +747,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_count(i));
again:
+ status = balance_dirty_pages_ratelimited_flags(mapping,
+ bdp_flags);
+ if (unlikely(status))
+ break;
+
if (bytes > length)
bytes = length;
@@ -763,6 +760,10 @@ again:
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
+ *
+ * For async buffered writes the assumption is that the user
+ * page has already been faulted in. This could be optimized
+ * by faulting in the user page only when it is needed.
*/
if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
status = -EFAULT;
@@ -774,7 +775,7 @@ again:
break;
page = folio_file_page(folio, pos >> PAGE_SHIFT);
- if (mapping_writably_mapped(iter->inode->i_mapping))
+ if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
copied = copy_page_from_iter_atomic(page, offset, bytes, i);
@@ -799,10 +800,12 @@ again:
pos += status;
written += status;
length -= status;
-
- balance_dirty_pages_ratelimited(iter->inode->i_mapping);
} while (iov_iter_count(i) && length);
+ if (status == -EAGAIN) {
+ iov_iter_revert(i, written);
+ return -EAGAIN;
+ }
return written ? written : status;
}
@@ -818,6 +821,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
};
int ret;
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ iter.flags |= IOMAP_NOWAIT;
+
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_write_iter(&iter, i);
if (iter.pos == iocb->ki_pos)
@@ -920,10 +926,10 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
pos += bytes;
length -= bytes;
written += bytes;
- if (did_zero)
- *did_zero = true;
} while (length > 0);
+ if (did_zero)
+ *did_zero = true;
return written;
}
@@ -1332,7 +1338,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct inode *inode,
struct folio *folio, u64 end_pos)
{
- struct iomap_page *iop = iomap_page_create(inode, folio);
+ struct iomap_page *iop = iomap_page_create(inode, folio, 0);
struct iomap_ioend *ioend, *next;
unsigned len = i_blocksize(inode);
unsigned nblocks = i_blocks_per_folio(inode, folio);
@@ -1354,6 +1360,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
error = wpc->ops->map_blocks(wpc, inode, pos);
if (error)
break;
+ trace_iomap_writepage_map(inode, &wpc->iomap);
if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
continue;
if (wpc->iomap.type == IOMAP_HOLE)
@@ -1386,7 +1393,6 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
if (wpc->ops->discard_folio)
wpc->ops->discard_folio(folio, pos);
if (!count) {
- folio_clear_uptodate(folio);
folio_unlock(folio);
goto done;
}
@@ -1416,7 +1422,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
if (!count)
folio_end_writeback(folio);
done:
- mapping_set_error(folio->mapping, error);
+ mapping_set_error(inode->i_mapping, error);
return error;
}
@@ -1482,10 +1488,10 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
pgoff_t end_index = isize >> PAGE_SHIFT;
/*
- * Skip the page if it's fully outside i_size, e.g. due to a
- * truncate operation that's in progress. We must redirty the
- * page so that reclaim stops reclaiming it. Otherwise
- * iomap_vm_releasepage() is called on it and gets confused.
+ * Skip the page if it's fully outside i_size, e.g.
+ * due to a truncate operation that's in progress. We've
+ * cleaned this page and truncate will finish things off for
+ * us.
*
* Note that the end_index is unsigned long. If the given
* offset is greater than 16TB on a 32-bit system then if we
@@ -1500,7 +1506,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
*/
if (folio->index > end_index ||
(folio->index == end_index && poff == 0))
- goto redirty;
+ goto unlock;
/*
* The page straddles i_size. It must be zeroed out on each
@@ -1518,26 +1524,12 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
redirty:
folio_redirty_for_writepage(wbc, folio);
+unlock:
folio_unlock(folio);
return 0;
}
int
-iomap_writepage(struct page *page, struct writeback_control *wbc,
- struct iomap_writepage_ctx *wpc,
- const struct iomap_writeback_ops *ops)
-{
- int ret;
-
- wpc->ops = ops;
- ret = iomap_do_writepage(page, wbc, wpc);
- if (!wpc->ioend)
- return ret;
- return iomap_submit_ioend(wpc, wpc->ioend, ret);
-}
-EXPORT_SYMBOL_GPL(iomap_writepage);
-
-int
iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
struct iomap_writepage_ctx *wpc,
const struct iomap_writeback_ops *ops)
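Taken together, the buffered-io hunks above thread IOMAP_NOWAIT (set from IOCB_NOWAIT callers) through the page-cache write path: sub-page state allocation drops to GFP_NOWAIT, folio lookup gains FGP_NOWAIT, and every point that would otherwise block, including a required read-in of a partial block, returns -EAGAIN so the write can be retried from a context that may sleep. A minimal userspace sketch of that pattern; the flag values and toy_alloc() are stand-ins, not the kernel API:

#include <errno.h>
#include <stddef.h>
#include <stdlib.h>

typedef unsigned int gfp_t;
#define GFP_NOWAIT    0x1u  /* stub: may fail, never sleeps */
#define GFP_NOFS      0x2u  /* stub: may sleep for reclaim */
#define __GFP_NOFAIL  0x4u  /* stub: retries until success */
#define IOMAP_NOWAIT  0x8u  /* stub iterator flag */

/* Toy allocator: pretend non-blocking allocations can fail under load. */
static void *toy_alloc(size_t size, gfp_t gfp, int mem_tight)
{
        if ((gfp & GFP_NOWAIT) && mem_tight)
                return NULL;
        return calloc(1, size);
}

/* Same shape as iomap_page_create() plus __iomap_write_begin(): pick a
 * non-blocking allocation when the caller cannot sleep, and convert
 * failure into -EAGAIN instead of waiting. */
static int begin_write(unsigned int flags, int mem_tight, void **state)
{
        gfp_t gfp = (flags & IOMAP_NOWAIT) ? GFP_NOWAIT
                                           : (GFP_NOFS | __GFP_NOFAIL);

        *state = toy_alloc(64, gfp, mem_tight);
        if ((flags & IOMAP_NOWAIT) && !*state)
                return -EAGAIN; /* caller retries from a blocking context */
        return 0;
}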
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index b08f5dc31780..4eb559a16c9e 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -51,12 +51,22 @@ struct iomap_dio {
};
};
+static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
+ struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf)
+{
+ if (dio->dops && dio->dops->bio_set)
+ return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf,
+ GFP_KERNEL, dio->dops->bio_set);
+ return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);
+}
+
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
atomic_inc(&dio->ref);
- if (dio->iocb->ki_flags & IOCB_HIPRI) {
+ /* Sync dio can't be polled reliably */
+ if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) {
bio_set_polled(bio, dio->iocb);
dio->submit.poll_bio = bio;
}
@@ -144,7 +154,7 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
cmpxchg(&dio->error, 0, ret);
}
-static void iomap_dio_bio_end_io(struct bio *bio)
+void iomap_dio_bio_end_io(struct bio *bio)
{
struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
@@ -176,16 +186,16 @@ static void iomap_dio_bio_end_io(struct bio *bio)
bio_put(bio);
}
}
+EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
struct inode *inode = file_inode(dio->iocb->ki_filp);
struct page *page = ZERO_PAGE(0);
- int flags = REQ_SYNC | REQ_IDLE;
struct bio *bio;
- bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL);
+ bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
@@ -202,10 +212,10 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
* mapping, and whether or not we want FUA. Note that we can end up
* clearing the WRITE_FUA flag in the dio request.
*/
-static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio,
+static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
const struct iomap *iomap, bool use_fua)
{
- unsigned int opflags = REQ_SYNC | REQ_IDLE;
+ blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
if (!(dio->flags & IOMAP_DIO_WRITE)) {
WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND);
@@ -232,10 +242,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
struct inode *inode = iter->inode;
unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
unsigned int fs_block_size = i_blocksize(inode), pad;
- unsigned int align = iov_iter_alignment(dio->submit.iter);
loff_t length = iomap_length(iter);
loff_t pos = iter->pos;
- unsigned int bio_opf;
+ blk_opf_t bio_opf;
struct bio *bio;
bool need_zeroout = false;
bool use_fua = false;
@@ -243,7 +252,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
size_t copied = 0;
size_t orig_count;
- if ((pos | length | align) & ((1 << blkbits) - 1))
+ if ((pos | length) & ((1 << blkbits) - 1) ||
+ !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;
if (iomap->type == IOMAP_UNWRITTEN) {
@@ -265,8 +275,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* cache flushes on IO completion.
*/
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
- (dio->flags & IOMAP_DIO_WRITE_FUA) &&
- blk_queue_fua(bdev_get_queue(iomap->bdev)))
+ (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev))
use_fua = true;
}
@@ -311,7 +320,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
- bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL);
+ bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
@@ -474,7 +483,7 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter,
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- unsigned int dio_flags, size_t done_before)
+ unsigned int dio_flags, void *private, size_t done_before)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
@@ -483,6 +492,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
.pos = iocb->ki_pos,
.len = iov_iter_count(iter),
.flags = IOMAP_DIRECT,
+ .private = private,
};
loff_t end = iomi.pos + iomi.len - 1, ret = 0;
bool wait_for_completion =
@@ -523,7 +533,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_NOWAIT;
}
- if (iter_is_iovec(iter))
+ if (user_backed_iter(iter))
dio->flags |= IOMAP_DIO_DIRTY;
} else {
iomi.flags |= IOMAP_WRITE;
@@ -538,17 +548,18 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
}
/* for data sync or sync, we need sync completion processing */
- if (iocb->ki_flags & IOCB_DSYNC)
+ if (iocb_is_dsync(iocb) && !(dio_flags & IOMAP_DIO_NOSYNC)) {
dio->flags |= IOMAP_DIO_NEED_SYNC;
- /*
- * For datasync only writes, we optimistically try using FUA for
- * this IO. Any non-FUA write that occurs will clear this flag,
- * hence we know before completion whether a cache flush is
- * necessary.
- */
- if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
- dio->flags |= IOMAP_DIO_WRITE_FUA;
+ /*
+ * For datasync only writes, we optimistically try
+ * using FUA for this IO. Any non-FUA write that
+ * occurs will clear this flag, hence we know before
+ * completion whether a cache flush is necessary.
+ */
+ if (!(iocb->ki_flags & IOCB_SYNC))
+ dio->flags |= IOMAP_DIO_WRITE_FUA;
+ }
}
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
@@ -654,9 +665,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!READ_ONCE(dio->submit.waiter))
break;
- if (!dio->submit.poll_bio ||
- !bio_poll(dio->submit.poll_bio, NULL, 0))
- blk_io_schedule();
+ blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
}
@@ -674,11 +683,12 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- unsigned int dio_flags, size_t done_before)
+ unsigned int dio_flags, void *private, size_t done_before)
{
struct iomap_dio *dio;
- dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
+ dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private,
+ done_before);
if (IS_ERR_OR_NULL(dio))
return PTR_ERR_OR_ZERO(dio);
return iomap_dio_complete(dio);
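One behavioural detail in the direct-io hunk above is the restructured sync handling: NEED_SYNC is now skipped when the caller passes IOMAP_DIO_NOSYNC, and the optimistic FUA upgrade applies only to datasync-only writes (O_DSYNC without O_SYNC). A self-contained sketch of that decision, treating iocb_is_dsync() as a plain flag test for brevity; the flag values are illustrative:

#include <assert.h>

#define IOCB_DSYNC     0x01u  /* stub flag values */
#define IOCB_SYNC      0x02u
#define DIO_NOSYNC     0x04u
#define DIO_NEED_SYNC  0x08u
#define DIO_WRITE_FUA  0x10u

/* Mirrors the hunk above: sync completion only without DIO_NOSYNC, and
 * the optimistic FUA attempt only for datasync-only (not O_SYNC) writes. */
static unsigned int dio_sync_flags(unsigned int ki_flags, unsigned int dio_flags)
{
        unsigned int out = 0;

        if ((ki_flags & IOCB_DSYNC) && !(dio_flags & DIO_NOSYNC)) {
                out |= DIO_NEED_SYNC;
                if (!(ki_flags & IOCB_SYNC))
                        out |= DIO_WRITE_FUA;
        }
        return out;
}

int main(void)
{
        assert(dio_sync_flags(IOCB_DSYNC, 0) == (DIO_NEED_SYNC | DIO_WRITE_FUA));
        assert(dio_sync_flags(IOCB_DSYNC | IOCB_SYNC, 0) == DIO_NEED_SYNC);
        assert(dio_sync_flags(IOCB_DSYNC, DIO_NOSYNC) == 0);
        return 0;
}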
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index a6689a563c6e..f6ea9540d082 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -80,7 +80,7 @@ DEFINE_EVENT(iomap_range_class, name, \
TP_PROTO(struct inode *inode, loff_t off, u64 len),\
TP_ARGS(inode, off, len))
DEFINE_RANGE_EVENT(iomap_writepage);
-DEFINE_RANGE_EVENT(iomap_releasepage);
+DEFINE_RANGE_EVENT(iomap_release_folio);
DEFINE_RANGE_EVENT(iomap_invalidate_folio);
DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
@@ -148,6 +148,7 @@ DEFINE_EVENT(iomap_class, name, \
TP_ARGS(inode, iomap))
DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
DEFINE_IOMAP_EVENT(iomap_iter_srcmap);
+DEFINE_IOMAP_EVENT(iomap_writepage_map);
TRACE_EVENT(iomap_iter,
TP_PROTO(struct iomap_iter *iter, const void *ops,
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index bc12ac7e2312..b466172eec25 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -82,7 +82,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
return 0;
}
haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks);
- ll_rw_block(REQ_OP_READ, 0, haveblocks, bhs);
+ ll_rw_block(REQ_OP_READ, haveblocks, bhs);
curbh = 0;
curpage = 0;
@@ -296,8 +296,9 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
* per reference. We inject the additional pages into the page
* cache as a form of readahead.
*/
-static int zisofs_readpage(struct file *file, struct page *page)
+static int zisofs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
int err;
@@ -369,7 +370,7 @@ static int zisofs_readpage(struct file *file, struct page *page)
}
const struct address_space_operations zisofs_aops = {
- .readpage = zisofs_readpage,
+ .read_folio = zisofs_read_folio,
/* No bmap operation supported */
};
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d7491692aea3..df9d70588b60 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1174,9 +1174,9 @@ struct buffer_head *isofs_bread(struct inode *inode, sector_t block)
return sb_bread(inode->i_sb, blknr);
}
-static int isofs_readpage(struct file *file, struct page *page)
+static int isofs_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, isofs_get_block);
+ return mpage_read_folio(folio, isofs_get_block);
}
static void isofs_readahead(struct readahead_control *rac)
@@ -1190,7 +1190,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
}
static const struct address_space_operations isofs_aops = {
- .readpage = isofs_readpage,
+ .read_folio = isofs_read_folio,
.readahead = isofs_readahead,
.bmap = _isofs_bmap
};
@@ -1277,13 +1277,11 @@ static int isofs_read_level3_size(struct inode *inode)
} while (more_entries);
out:
kfree(tmpde);
- if (bh)
- brelse(bh);
+ brelse(bh);
return 0;
out_nomem:
- if (bh)
- brelse(bh);
+ brelse(bh);
return -ENOMEM;
out_noread:
@@ -1486,8 +1484,7 @@ static int isofs_read_inode(struct inode *inode, int relocated)
ret = 0;
out:
kfree(tmpde);
- if (bh)
- brelse(bh);
+ brelse(bh);
return ret;
out_badread:
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 4880146babaf..48f58c6c9e69 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -687,11 +687,12 @@ int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode,
}
/*
- * readpage() for symlinks: reads symlink contents into the page and either
+ * read_folio() for symlinks: reads symlink contents into the folio and either
* makes it uptodate and returns 0 or returns error (-EIO)
*/
-static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
+static int rock_ridge_symlink_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct iso_inode_info *ei = ISOFS_I(inode);
struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
@@ -804,5 +805,5 @@ error:
}
const struct address_space_operations isofs_symlink_aops = {
- .readpage = rock_ridge_symlink_readpage
+ .read_folio = rock_ridge_symlink_read_folio
};
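The isofs conversions above, like the jffs2 ones later in this diff, take the interim shape used throughout this series: the page-based body stays and the new read_folio() entry point bridges to it through the folio's head page. A sketch with toy types; in the kernel, struct folio really does embed its head struct page:

#include <stdio.h>

struct page  { int index; };
struct folio { struct page page; };  /* head page embedded, as in the kernel */

static int legacy_read_page(struct page *page)
{
        return page->index;  /* stand-in for the real page-based body */
}

/* New-style entry point: forward to the old body until it is converted
 * to operate on folios natively. */
static int read_folio(struct folio *folio)
{
        return legacy_read_page(&folio->page);
}

int main(void)
{
        struct folio f = { { 42 } };
        printf("%d\n", read_folio(&f));
        return 0;
}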
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 746132998c57..51bd38da21cd 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -203,7 +203,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
tid_t this_tid;
int result, batch_count = 0;
- jbd_debug(1, "Start checkpoint\n");
+ jbd2_debug(1, "Start checkpoint\n");
/*
* First thing: if there are any transactions in the log which
@@ -212,7 +212,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
*/
result = jbd2_cleanup_journal_tail(journal);
trace_jbd2_checkpoint(journal, result);
- jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
+ jbd2_debug(1, "cleanup_journal_tail returned %d\n", result);
if (result <= 0)
return result;
@@ -804,5 +804,5 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
trace_jbd2_drop_transaction(journal, transaction);
- jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
+ jbd2_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 5b9408e3b370..885a7a6cc53e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -62,6 +62,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
*/
static void release_buffer_page(struct buffer_head *bh)
{
+ struct folio *folio;
struct page *page;
if (buffer_dirty(bh))
@@ -71,18 +72,19 @@ static void release_buffer_page(struct buffer_head *bh)
page = bh->b_page;
if (!page)
goto nope;
- if (page->mapping)
+ folio = page_folio(page);
+ if (folio->mapping)
goto nope;
/* OK, it's a truncated page */
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto nope;
- get_page(page);
+ folio_get(folio);
__brelse(bh);
- try_to_free_buffers(page);
- unlock_page(page);
- put_page(page);
+ try_to_free_buffers(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return;
nope:
@@ -120,8 +122,8 @@ static int journal_submit_commit_record(journal_t *journal,
{
struct commit_header *tmp;
struct buffer_head *bh;
- int ret;
struct timespec64 now;
+ blk_opf_t write_flags = REQ_OP_WRITE | REQ_SYNC;
*cbh = NULL;
@@ -153,13 +155,11 @@ static int journal_submit_commit_record(journal_t *journal,
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_has_feature_async_commit(journal))
- ret = submit_bh(REQ_OP_WRITE,
- REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
- else
- ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
+ write_flags |= REQ_PREFLUSH | REQ_FUA;
+ submit_bh(write_flags, bh);
*cbh = bh;
- return ret;
+ return 0;
}
/*
@@ -419,7 +419,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
/* Do we need to erase the effects of a prior jbd2_journal_flush? */
if (journal->j_flags & JBD2_FLUSHED) {
- jbd_debug(3, "super block updated\n");
+ jbd2_debug(3, "super block updated\n");
mutex_lock_io(&journal->j_checkpoint_mutex);
/*
* We hold j_checkpoint_mutex so tail cannot change under us.
@@ -433,7 +433,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
REQ_SYNC);
mutex_unlock(&journal->j_checkpoint_mutex);
} else {
- jbd_debug(3, "superblock not updated\n");
+ jbd2_debug(3, "superblock not updated\n");
}
J_ASSERT(journal->j_running_transaction != NULL);
@@ -465,7 +465,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
commit_transaction = journal->j_running_transaction;
trace_jbd2_start_commit(journal, commit_transaction);
- jbd_debug(1, "JBD2: starting commit of transaction %d\n",
+ jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
commit_transaction->t_tid);
write_lock(&journal->j_state_lock);
@@ -488,7 +488,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_wait_updates(journal);
commit_transaction->t_state = T_SWITCH;
- write_unlock(&journal->j_state_lock);
J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
journal->j_max_transaction_buffers);
@@ -508,6 +507,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* has reserved. This is consistent with the existing behaviour
* that multiple jbd2_journal_get_write_access() calls to the same
* buffer are perfectly permissible.
+ * We use journal->j_state_lock here to serialize processing of
+ * t_reserved_list with eviction of buffers from journal_unmap_buffer().
*/
while (commit_transaction->t_reserved_list) {
jh = commit_transaction->t_reserved_list;
@@ -527,6 +528,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_refile_buffer(journal, jh);
}
+ write_unlock(&journal->j_state_lock);
/*
* Now try to drop any written-back buffers from the journal's
* checkpoint lists. We do this *before* commit because it potentially
@@ -536,7 +538,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
__jbd2_journal_clean_checkpoint_list(journal, false);
spin_unlock(&journal->j_list_lock);
- jbd_debug(3, "JBD2: commit phase 1\n");
+ jbd2_debug(3, "JBD2: commit phase 1\n");
/*
* Clear revoked flag to reflect there is no revoked buffers
@@ -549,13 +551,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
jbd2_journal_switch_revoke_table(journal);
+ write_lock(&journal->j_state_lock);
/*
* Reserved credits cannot be claimed anymore, free them
*/
atomic_sub(atomic_read(&journal->j_reserved_credits),
&commit_transaction->t_outstanding_credits);
- write_lock(&journal->j_state_lock);
trace_jbd2_commit_flushing(journal, commit_transaction);
stats.run.rs_flushing = jiffies;
stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -566,10 +568,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
journal->j_running_transaction = NULL;
start_time = ktime_get();
commit_transaction->t_log_start = journal->j_head;
- wake_up(&journal->j_wait_transaction_locked);
+ wake_up_all(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
- jbd_debug(3, "JBD2: commit phase 2a\n");
+ jbd2_debug(3, "JBD2: commit phase 2a\n");
/*
* Now start flushing things to disk, in the order they appear
@@ -582,7 +584,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
blk_start_plug(&plug);
jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
- jbd_debug(3, "JBD2: commit phase 2b\n");
+ jbd2_debug(3, "JBD2: commit phase 2b\n");
/*
* Way to go: we have now written out all of the data for a
@@ -638,7 +640,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (!descriptor) {
J_ASSERT (bufs == 0);
- jbd_debug(4, "JBD2: get descriptor\n");
+ jbd2_debug(4, "JBD2: get descriptor\n");
descriptor = jbd2_journal_get_descriptor_buffer(
commit_transaction,
@@ -648,7 +650,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
continue;
}
- jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
+ jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
(unsigned long long)descriptor->b_blocknr,
descriptor->b_data);
tagp = &descriptor->b_data[sizeof(journal_header_t)];
@@ -733,7 +735,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
commit_transaction->t_buffers == NULL ||
space_left < tag_bytes + 16 + csum_size) {
- jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
+ jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);
/* Write an end-of-descriptor marker before
submitting the IOs. "tag" still points to
@@ -759,7 +761,7 @@ start_journal_io:
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
+ submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
}
cond_resched();
@@ -835,7 +837,7 @@ start_journal_io:
so we incur less scheduling load.
*/
- jbd_debug(3, "JBD2: commit phase 3\n");
+ jbd2_debug(3, "JBD2: commit phase 3\n");
while (!list_empty(&io_bufs)) {
struct buffer_head *bh = list_entry(io_bufs.prev,
@@ -878,7 +880,7 @@ start_journal_io:
J_ASSERT (commit_transaction->t_shadow_list == NULL);
- jbd_debug(3, "JBD2: commit phase 4\n");
+ jbd2_debug(3, "JBD2: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
while (!list_empty(&log_bufs)) {
@@ -902,7 +904,7 @@ start_journal_io:
if (err)
jbd2_journal_abort(journal, err);
- jbd_debug(3, "JBD2: commit phase 5\n");
+ jbd2_debug(3, "JBD2: commit phase 5\n");
write_lock(&journal->j_state_lock);
J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
commit_transaction->t_state = T_COMMIT_JFLUSH;
@@ -941,7 +943,7 @@ start_journal_io:
transaction can be removed from any checkpoint list it was on
before. */
- jbd_debug(3, "JBD2: commit phase 6\n");
+ jbd2_debug(3, "JBD2: commit phase 6\n");
J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
@@ -1118,7 +1120,7 @@ restart_loop:
/* Done with this transaction! */
- jbd_debug(3, "JBD2: commit phase 7\n");
+ jbd2_debug(3, "JBD2: commit phase 7\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
@@ -1160,7 +1162,7 @@ restart_loop:
journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);
trace_jbd2_end_commit(journal, commit_transaction);
- jbd_debug(1, "JBD2: commit %d complete, head %d\n",
+ jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
write_lock(&journal->j_state_lock);
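Besides the jbd_debug() to jbd2_debug() renames, the commit.c hunk reworks journal_submit_commit_record() to build its write flags up front and to drop the unused submit_bh() return value. A self-contained sketch of the flag selection; the REQ_* values here are stand-ins:

#include <assert.h>
#include <stdbool.h>

#define REQ_OP_WRITE  0x01u
#define REQ_SYNC      0x02u
#define REQ_PREFLUSH  0x04u
#define REQ_FUA       0x08u

/* Mirrors the hunk above: start from WRITE|SYNC and strengthen to
 * PREFLUSH|FUA only when the journal uses barriers and async commit
 * is not in effect. */
static unsigned int commit_write_flags(bool barrier, bool async_commit)
{
        unsigned int flags = REQ_OP_WRITE | REQ_SYNC;

        if (barrier && !async_commit)
                flags |= REQ_PREFLUSH | REQ_FUA;
        return flags;
}

int main(void)
{
        assert(commit_write_flags(true, false) ==
               (REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA));
        assert(commit_write_flags(true, true) == (REQ_OP_WRITE | REQ_SYNC));
        assert(commit_write_flags(false, false) == (REQ_OP_WRITE | REQ_SYNC));
        return 0;
}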
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fcacafa4510d..bc8270e0d7d0 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -49,8 +49,7 @@
#include <asm/page.h>
#ifdef CONFIG_JBD2_DEBUG
-ushort jbd2_journal_enable_debug __read_mostly;
-EXPORT_SYMBOL(jbd2_journal_enable_debug);
+static ushort jbd2_journal_enable_debug __read_mostly;
module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
@@ -81,7 +80,6 @@ EXPORT_SYMBOL(jbd2_journal_errno);
EXPORT_SYMBOL(jbd2_journal_ack_err);
EXPORT_SYMBOL(jbd2_journal_clear_err);
EXPORT_SYMBOL(jbd2_log_wait_commit);
-EXPORT_SYMBOL(jbd2_log_start_commit);
EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
@@ -115,7 +113,6 @@ void __jbd2_debug(int level, const char *file, const char *func,
printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
va_end(args);
}
-EXPORT_SYMBOL(__jbd2_debug);
#endif
/* Checksumming functions */
@@ -203,11 +200,11 @@ loop:
if (journal->j_flags & JBD2_UNMOUNT)
goto end_loop;
- jbd_debug(1, "commit_sequence=%u, commit_request=%u\n",
+ jbd2_debug(1, "commit_sequence=%u, commit_request=%u\n",
journal->j_commit_sequence, journal->j_commit_request);
if (journal->j_commit_sequence != journal->j_commit_request) {
- jbd_debug(1, "OK, requests differ\n");
+ jbd2_debug(1, "OK, requests differ\n");
write_unlock(&journal->j_state_lock);
del_timer_sync(&journal->j_commit_timer);
jbd2_journal_commit_transaction(journal);
@@ -222,7 +219,7 @@ loop:
* good idea, because that depends on threads that may
* be already stopped.
*/
- jbd_debug(1, "Now suspending kjournald2\n");
+ jbd2_debug(1, "Now suspending kjournald2\n");
write_unlock(&journal->j_state_lock);
try_to_freeze();
write_lock(&journal->j_state_lock);
@@ -252,7 +249,7 @@ loop:
finish_wait(&journal->j_wait_commit, &wait);
}
- jbd_debug(1, "kjournald2 wakes\n");
+ jbd2_debug(1, "kjournald2 wakes\n");
/*
* Were we woken up by a commit wakeup event?
@@ -260,7 +257,7 @@ loop:
transaction = journal->j_running_transaction;
if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
journal->j_commit_request = transaction->t_tid;
- jbd_debug(1, "woke because of timeout\n");
+ jbd2_debug(1, "woke because of timeout\n");
}
goto loop;
@@ -268,7 +265,7 @@ end_loop:
del_timer_sync(&journal->j_commit_timer);
journal->j_task = NULL;
wake_up(&journal->j_wait_done_commit);
- jbd_debug(1, "Journal thread exiting.\n");
+ jbd2_debug(1, "Journal thread exiting.\n");
write_unlock(&journal->j_state_lock);
return 0;
}
@@ -481,7 +478,7 @@ repeat:
* Called with j_state_lock locked for writing.
* Returns true if a transaction commit was started.
*/
-int __jbd2_log_start_commit(journal_t *journal, tid_t target)
+static int __jbd2_log_start_commit(journal_t *journal, tid_t target)
{
/* Return if the txn has already requested to be committed */
if (journal->j_commit_request == target)
@@ -500,7 +497,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
*/
journal->j_commit_request = target;
- jbd_debug(1, "JBD2: requesting commit %u/%u\n",
+ jbd2_debug(1, "JBD2: requesting commit %u/%u\n",
journal->j_commit_request,
journal->j_commit_sequence);
journal->j_running_transaction->t_requested = jiffies;
@@ -705,7 +702,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
}
#endif
while (tid_gt(tid, journal->j_commit_sequence)) {
- jbd_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
+ jbd2_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
tid, journal->j_commit_sequence);
read_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_commit);
@@ -926,10 +923,16 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) {
bh = journal->j_fc_wbuf[i];
wait_on_buffer(bh);
+ /*
+ * Update j_fc_off so jbd2_fc_release_bufs can release the
+ * remaining buffer heads.
+ */
+ if (unlikely(!buffer_uptodate(bh))) {
+ journal->j_fc_off = i + 1;
+ return -EIO;
+ }
put_bh(bh);
journal->j_fc_wbuf[i] = NULL;
- if (unlikely(!buffer_uptodate(bh)))
- return -EIO;
}
return 0;
@@ -1117,7 +1120,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
freed += journal->j_last - journal->j_first;
trace_jbd2_update_log_tail(journal, tid, block, freed);
- jbd_debug(1,
+ jbd2_debug(1,
"Cleaning journal tail from %u to %u (offset %lu), "
"freeing %lu\n",
journal->j_tail_sequence, tid, block, freed);
@@ -1418,7 +1421,8 @@ static journal_t *journal_init_common(struct block_device *bdev,
if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL))
goto err_cleanup;
- if (register_shrinker(&journal->j_shrinker)) {
+ if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)",
+ MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) {
percpu_counter_destroy(&journal->j_checkpoint_jh_count);
goto err_cleanup;
}
@@ -1465,7 +1469,8 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev,
if (!journal)
return NULL;
- bdevname(journal->j_dev, journal->j_devname);
+ snprintf(journal->j_devname, sizeof(journal->j_devname),
+ "%pg", journal->j_dev);
strreplace(journal->j_devname, '/', '!');
jbd2_stats_proc_init(journal);
@@ -1496,7 +1501,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
return NULL;
}
- jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
+ jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
@@ -1507,7 +1512,8 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
return NULL;
journal->j_inode = inode;
- bdevname(journal->j_dev, journal->j_devname);
+ snprintf(journal->j_devname, sizeof(journal->j_devname),
+ "%pg", journal->j_dev);
p = strreplace(journal->j_devname, '/', '!');
sprintf(p, "-%lu", journal->j_inode->i_ino);
jbd2_stats_proc_init(journal);
@@ -1575,7 +1581,7 @@ static int journal_reset(journal_t *journal)
* attempting a write to a potential-readonly device.
*/
if (sb->s_start == 0) {
- jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
+ jbd2_debug(1, "JBD2: Skipping superblock update on recovered sb "
"(start %ld, seq %u, errno %d)\n",
journal->j_tail, journal->j_tail_sequence,
journal->j_errno);
@@ -1602,11 +1608,11 @@ static int journal_reset(journal_t *journal)
* This function expects that the caller will have locked the journal
* buffer head, and will return with it unlocked
*/
-static int jbd2_write_superblock(journal_t *journal, int write_flags)
+static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
{
struct buffer_head *bh = journal->j_sb_buffer;
journal_superblock_t *sb = journal->j_superblock;
- int ret;
+ int ret = 0;
/* Buffer got discarded which means block device got invalidated */
if (!buffer_mapped(bh)) {
@@ -1636,7 +1642,7 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
sb->s_checksum = jbd2_superblock_csum(journal, sb);
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
+ submit_bh(REQ_OP_WRITE | write_flags, bh);
wait_on_buffer(bh);
if (buffer_write_io_error(bh)) {
clear_buffer_write_io_error(bh);
@@ -1644,9 +1650,8 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
ret = -EIO;
}
if (ret) {
- printk(KERN_ERR "JBD2: Error %d detected when updating "
- "journal superblock for %s.\n", ret,
- journal->j_devname);
+ printk(KERN_ERR "JBD2: I/O error when updating journal superblock for %s.\n",
+ journal->j_devname);
if (!is_journal_aborted(journal))
jbd2_journal_abort(journal, ret);
}
@@ -1659,13 +1664,14 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
* @journal: The journal to update.
* @tail_tid: TID of the new transaction at the tail of the log
* @tail_block: The first block of the transaction at the tail of the log
- * @write_op: With which operation should we write the journal sb
+ * @write_flags: Flags for the journal sb write operation
*
* Update a journal's superblock information about log tail and write it to
* disk, waiting for the IO to complete.
*/
int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
- unsigned long tail_block, int write_op)
+ unsigned long tail_block,
+ blk_opf_t write_flags)
{
journal_superblock_t *sb = journal->j_superblock;
int ret;
@@ -1678,14 +1684,14 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
}
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
- jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
+ jbd2_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
tail_block, tail_tid);
lock_buffer(journal->j_sb_buffer);
sb->s_sequence = cpu_to_be32(tail_tid);
sb->s_start = cpu_to_be32(tail_block);
- ret = jbd2_write_superblock(journal, write_op);
+ ret = jbd2_write_superblock(journal, write_flags);
if (ret)
goto out;
@@ -1702,12 +1708,12 @@ out:
/**
* jbd2_mark_journal_empty() - Mark on disk journal as empty.
* @journal: The journal to update.
- * @write_op: With which operation should we write the journal sb
+ * @write_flags: Flags for the journal sb write operation
*
* Update a journal's dynamic superblock fields to show that journal is empty.
* Write updated superblock to disk waiting for IO to complete.
*/
-static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
+static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags)
{
journal_superblock_t *sb = journal->j_superblock;
bool had_fast_commit = false;
@@ -1719,7 +1725,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
return;
}
- jbd_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
+ jbd2_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
journal->j_tail_sequence);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1733,7 +1739,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
had_fast_commit = true;
}
- jbd2_write_superblock(journal, write_op);
+ jbd2_write_superblock(journal, write_flags);
if (had_fast_commit)
jbd2_set_feature_fast_commit(journal);
@@ -1762,7 +1768,6 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
unsigned long block, log_offset; /* logical */
unsigned long long phys_block, block_start, block_stop; /* physical */
loff_t byte_start, byte_stop, byte_count;
- struct request_queue *q = bdev_get_queue(journal->j_dev);
/* flags must be set to either discard or zeroout */
if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags ||
@@ -1770,10 +1775,8 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
(flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
return -EINVAL;
- if (!q)
- return -ENXIO;
-
- if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q))
+ if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ !bdev_max_discard_sectors(journal->j_dev))
return -EOPNOTSUPP;
/*
@@ -1828,7 +1831,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
err = blkdev_issue_discard(journal->j_dev,
byte_start >> SECTOR_SHIFT,
byte_count >> SECTOR_SHIFT,
- GFP_NOFS, 0);
+ GFP_NOFS);
} else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) {
err = blkdev_issue_zeroout(journal->j_dev,
byte_start >> SECTOR_SHIFT,
@@ -1865,7 +1868,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
errcode = journal->j_errno;
if (errcode == -ESHUTDOWN)
errcode = 0;
- jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
+ jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
sb->s_errno = cpu_to_be32(errcode);
jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
@@ -1901,7 +1904,7 @@ static int journal_get_superblock(journal_t *journal)
J_ASSERT(bh != NULL);
if (!buffer_uptodate(bh)) {
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
printk(KERN_ERR
@@ -2337,7 +2340,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
compat & JBD2_FEATURE_COMPAT_CHECKSUM)
compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
- jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
+ jbd2_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
compat, ro, incompat);
sb = journal->j_superblock;
@@ -2406,7 +2409,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
{
journal_superblock_t *sb;
- jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
+ jbd2_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
compat, ro, incompat);
sb = journal->j_superblock;
@@ -2863,7 +2866,7 @@ static struct journal_head *journal_alloc_journal_head(void)
#endif
ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
if (!ret) {
- jbd_debug(1, "out of memory for journal_head\n");
+ jbd2_debug(1, "out of memory for journal_head\n");
pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
ret = kmem_cache_zalloc(jbd2_journal_head_cache,
GFP_NOFS | __GFP_NOFAIL);
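The jbd2_fc_wait_bufs() fix above records how far the wait loop got before an I/O error, so that jbd2_fc_release_bufs() can later drop the buffer references this function never reached. A sketch of that record-progress-then-bail pattern with a toy buffer type:

#include <errno.h>
#include <stdbool.h>

struct buf { int refs; bool uptodate; };

struct fc_state {
        struct buf *wbuf[16];
        int off;                  /* one past the last slot written */
};

/* Shape of the fixed jbd2_fc_wait_bufs(): on error, record in ->off how
 * far the loop got, so a later release pass still sees the references
 * that were never dropped here. */
static int wait_bufs(struct fc_state *s, int num)
{
        int bound = s->off - num;

        for (int i = s->off - 1; i >= bound; i--) {
                struct buf *b = s->wbuf[i];

                if (!b->uptodate) {
                        s->off = i + 1;   /* release pass covers [0, i] */
                        return -EIO;
                }
                b->refs--;                /* put_bh() */
                s->wbuf[i] = NULL;
        }
        return 0;
}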
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 8ca3527189f8..3688d16fe83b 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -100,7 +100,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
if (nbufs == MAXBUF) {
- ll_rw_block(REQ_OP_READ, 0, nbufs, bufs);
+ ll_rw_block(REQ_OP_READ, nbufs, bufs);
journal_brelse_array(bufs, nbufs);
nbufs = 0;
}
@@ -109,7 +109,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
}
if (nbufs)
- ll_rw_block(REQ_OP_READ, 0, nbufs, bufs);
+ ll_rw_block(REQ_OP_READ, nbufs, bufs);
err = 0;
failed:
@@ -245,17 +245,18 @@ static int fc_do_one_pass(journal_t *journal,
return 0;
while (next_fc_block <= journal->j_fc_last) {
- jbd_debug(3, "Fast commit replay: next block %ld\n",
+ jbd2_debug(3, "Fast commit replay: next block %ld\n",
next_fc_block);
err = jread(&bh, journal, next_fc_block);
if (err) {
- jbd_debug(3, "Fast commit replay: read error\n");
+ jbd2_debug(3, "Fast commit replay: read error\n");
break;
}
err = journal->j_fc_replay_callback(journal, bh, pass,
next_fc_block - journal->j_fc_first,
expected_commit_id);
+ brelse(bh);
next_fc_block++;
if (err < 0 || err == JBD2_FC_REPLAY_STOP)
break;
@@ -263,7 +264,7 @@ static int fc_do_one_pass(journal_t *journal,
}
if (err)
- jbd_debug(3, "Fast commit replay failed, err = %d\n", err);
+ jbd2_debug(3, "Fast commit replay failed, err = %d\n", err);
return err;
}
@@ -297,7 +298,7 @@ int jbd2_journal_recover(journal_t *journal)
*/
if (!sb->s_start) {
- jbd_debug(1, "No recovery required, last transaction %d\n",
+ jbd2_debug(1, "No recovery required, last transaction %d\n",
be32_to_cpu(sb->s_sequence));
journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
return 0;
@@ -309,10 +310,10 @@ int jbd2_journal_recover(journal_t *journal)
if (!err)
err = do_one_pass(journal, &info, PASS_REPLAY);
- jbd_debug(1, "JBD2: recovery, exit status %d, "
+ jbd2_debug(1, "JBD2: recovery, exit status %d, "
"recovered transactions %u to %u\n",
err, info.start_transaction, info.end_transaction);
- jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
+ jbd2_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
/* Restart the log at the next transaction ID, thus invalidating
@@ -362,7 +363,7 @@ int jbd2_journal_skip_recovery(journal_t *journal)
#ifdef CONFIG_JBD2_DEBUG
int dropped = info.end_transaction -
be32_to_cpu(journal->j_superblock->s_sequence);
- jbd_debug(1,
+ jbd2_debug(1,
"JBD2: ignoring %d transaction%s from the journal.\n",
dropped, (dropped == 1) ? "" : "s");
#endif
@@ -484,7 +485,7 @@ static int do_one_pass(journal_t *journal,
if (pass == PASS_SCAN)
info->start_transaction = first_commit_ID;
- jbd_debug(1, "Starting recovery pass %d\n", pass);
+ jbd2_debug(1, "Starting recovery pass %d\n", pass);
/*
* Now we walk through the log, transaction by transaction,
@@ -510,7 +511,7 @@ static int do_one_pass(journal_t *journal,
if (tid_geq(next_commit_ID, info->end_transaction))
break;
- jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+ jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
next_commit_ID, next_log_block,
jbd2_has_feature_fast_commit(journal) ?
journal->j_fc_last : journal->j_last);
@@ -519,7 +520,7 @@ static int do_one_pass(journal_t *journal,
* either the next descriptor block or the final commit
* record. */
- jbd_debug(3, "JBD2: checking block %ld\n", next_log_block);
+ jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block);
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
@@ -542,7 +543,7 @@ static int do_one_pass(journal_t *journal,
blocktype = be32_to_cpu(tmp->h_blocktype);
sequence = be32_to_cpu(tmp->h_sequence);
- jbd_debug(3, "Found magic %d, sequence %d\n",
+ jbd2_debug(3, "Found magic %d, sequence %d\n",
blocktype, sequence);
if (sequence != next_commit_ID) {
@@ -575,7 +576,7 @@ static int do_one_pass(journal_t *journal,
goto failed;
}
need_check_commit_time = true;
- jbd_debug(1,
+ jbd2_debug(1,
"invalid descriptor block found in %lu\n",
next_log_block);
}
@@ -758,7 +759,7 @@ static int do_one_pass(journal_t *journal,
* It likely does not belong to same journal,
* just end this recovery with success.
*/
- jbd_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
+ jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
next_commit_ID);
brelse(bh);
goto done;
@@ -826,7 +827,7 @@ static int do_one_pass(journal_t *journal,
if (pass == PASS_SCAN &&
!jbd2_descriptor_block_csum_verify(journal,
bh->b_data)) {
- jbd_debug(1, "JBD2: invalid revoke block found in %lu\n",
+ jbd2_debug(1, "JBD2: invalid revoke block found in %lu\n",
next_log_block);
need_check_commit_time = true;
}
@@ -845,7 +846,7 @@ static int do_one_pass(journal_t *journal,
continue;
default:
- jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+ jbd2_debug(3, "Unrecognised magic %d, end of scan.\n",
blocktype);
brelse(bh);
goto done;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index fa608788b93d..4556e4689024 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -398,7 +398,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
}
handle->h_revoke_credits--;
- jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
+ jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
handle->h_transaction->t_tid);
BUFFER_TRACE(bh_in, "exit");
@@ -428,7 +428,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
int did_revoke = 0; /* akpm: debug */
struct buffer_head *bh = jh2bh(jh);
- jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
+ jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh);
/* Is the existing Revoke bit valid? If so, we trust it, and
* only perform the full cancel if the revoke bit is set. If
@@ -444,7 +444,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
if (need_cancel) {
record = find_revoke_record(journal, bh->b_blocknr);
if (record) {
- jbd_debug(4, "cancelled existing revoke on "
+ jbd2_debug(4, "cancelled existing revoke on "
"blocknr %llu\n", (unsigned long long)bh->b_blocknr);
spin_lock(&journal->j_revoke_lock);
list_del(&record->hash);
@@ -560,7 +560,7 @@ void jbd2_journal_write_revoke_records(transaction_t *transaction,
}
if (descriptor)
flush_descriptor(journal, descriptor, offset);
- jbd_debug(1, "Wrote %d revoke records\n", count);
+ jbd2_debug(1, "Wrote %d revoke records\n", count);
}
/*
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index fcb9175016a5..6a404ac1c178 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -168,7 +168,7 @@ static void wait_transaction_locked(journal_t *journal)
int need_to_start;
tid_t tid = journal->j_running_transaction->t_tid;
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
TASK_UNINTERRUPTIBLE);
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
@@ -194,7 +194,7 @@ static void wait_transaction_switching(journal_t *journal)
read_unlock(&journal->j_state_lock);
return;
}
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
TASK_UNINTERRUPTIBLE);
read_unlock(&journal->j_state_lock);
/*
@@ -373,7 +373,7 @@ alloc_transaction:
return -ENOMEM;
}
- jbd_debug(3, "New handle %p going live.\n", handle);
+ jbd2_debug(3, "New handle %p going live.\n", handle);
/*
* We need to hold j_state_lock until t_updates has been incremented,
@@ -453,7 +453,7 @@ repeat:
handle->h_start_jiffies = jiffies;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
- jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+ jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
handle, blocks,
atomic_read(&transaction->t_outstanding_credits),
jbd2_log_space_left(journal));
@@ -674,7 +674,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
/* Don't extend a locked-down transaction! */
if (transaction->t_state != T_RUNNING) {
- jbd_debug(3, "denied handle %p %d blocks: "
+ jbd2_debug(3, "denied handle %p %d blocks: "
"transaction not running\n", handle, nblocks);
goto error_out;
}
@@ -689,7 +689,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
&transaction->t_outstanding_credits);
if (wanted > journal->j_max_transaction_buffers) {
- jbd_debug(3, "denied handle %p %d blocks: "
+ jbd2_debug(3, "denied handle %p %d blocks: "
"transaction too large\n", handle, nblocks);
atomic_sub(nblocks, &transaction->t_outstanding_credits);
goto error_out;
@@ -707,7 +707,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
handle->h_revoke_credits_requested += revoke_records;
result = 0;
- jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
+ jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks);
error_out:
read_unlock(&journal->j_state_lock);
return result;
@@ -795,7 +795,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
* First unlink the handle from its current transaction, and start the
* commit on that.
*/
- jbd_debug(2, "restarting handle %p\n", handle);
+ jbd2_debug(2, "restarting handle %p\n", handle);
stop_this_handle(handle);
handle->h_transaction = NULL;
@@ -920,7 +920,7 @@ void jbd2_journal_unlock_updates (journal_t *journal)
write_lock(&journal->j_state_lock);
--journal->j_barrier_count;
write_unlock(&journal->j_state_lock);
- wake_up(&journal->j_wait_transaction_locked);
+ wake_up_all(&journal->j_wait_transaction_locked);
}
static void warn_dirty_buffer(struct buffer_head *bh)
@@ -979,7 +979,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
journal = transaction->t_journal;
- jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
+ jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
JBUFFER_TRACE(jh, "entry");
repeat:
@@ -1271,7 +1271,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh = jbd2_journal_add_journal_head(bh);
int err;
- jbd_debug(5, "journal_head %p\n", jh);
+ jbd2_debug(5, "journal_head %p\n", jh);
err = -EROFS;
if (is_handle_aborted(handle))
goto out;
@@ -1486,8 +1486,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
int ret = 0;
- if (is_handle_aborted(handle))
- return -EROFS;
if (!buffer_jbd(bh))
return -EUCLEAN;
@@ -1496,7 +1494,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
* of the running transaction.
*/
jh = bh2jh(bh);
- jbd_debug(5, "journal_head %p\n", jh);
+ jbd2_debug(5, "journal_head %p\n", jh);
JBUFFER_TRACE(jh, "entry");
/*
@@ -1534,6 +1532,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
journal = transaction->t_journal;
spin_lock(&jh->b_state_lock);
+ if (is_handle_aborted(handle)) {
+ /*
+ * Check for journal abort with @jh->b_state_lock held,
+ * since 'jh->b_transaction' could be replaced with
+ * 'jh->b_next_transaction' while the old transaction is
+ * committing on an aborted journal, which may trip the
+ * assertion on 'jh->b_frozen_data == NULL'.
+ */
+ ret = -EROFS;
+ goto out_unlock_bh;
+ }
+
if (jh->b_modified == 0) {
/*
* This buffer's got modified and becoming part
@@ -1818,7 +1828,7 @@ int jbd2_journal_stop(handle_t *handle)
pid_t pid;
if (--handle->h_ref > 0) {
- jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+ jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
handle->h_ref);
if (is_handle_aborted(handle))
return -EIO;
@@ -1838,7 +1848,7 @@ int jbd2_journal_stop(handle_t *handle)
if (is_handle_aborted(handle))
err = -EIO;
- jbd_debug(4, "Handle %p going down\n", handle);
+ jbd2_debug(4, "Handle %p going down\n", handle);
trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
tid, handle->h_type, handle->h_line_no,
jiffies - handle->h_start_jiffies,
@@ -1916,7 +1926,7 @@ int jbd2_journal_stop(handle_t *handle)
* completes the commit thread, it just doesn't write
* anything to disk. */
- jbd_debug(2, "transaction too old, requesting commit for "
+ jbd2_debug(2, "transaction too old, requesting commit for "
"handle %p\n", handle);
/* This is non-blocking */
jbd2_log_start_commit(journal, tid);
@@ -2114,7 +2124,7 @@ out:
/**
* jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
- * @page: to try and free
+ * @folio: Folio to detach data from.
*
* For all the buffers on this page,
* if they are fully written out ordered data, move them onto BUF_CLEAN
@@ -2143,17 +2153,17 @@ out:
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
*
- * Return 0 on failure, 1 on success
+ * Return false on failure, true on success
*/
-int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page)
+bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
{
struct buffer_head *head;
struct buffer_head *bh;
- int ret = 0;
+ bool ret = false;
- J_ASSERT(PageLocked(page));
+ J_ASSERT(folio_test_locked(folio));
- head = page_buffers(page);
+ head = folio_buffers(folio);
bh = head;
do {
struct journal_head *jh;
@@ -2175,7 +2185,7 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page)
goto busy;
} while ((bh = bh->b_this_page) != head);
- ret = try_to_free_buffers(page);
+ ret = try_to_free_buffers(folio);
busy:
return ret;
}
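After the conversion the function takes a folio and reports success as a bool. A minimal sketch of how a journalled filesystem's ->release_folio hook consumes it (get_journal() is a hypothetical stand-in for the fs-specific lookup; the shape follows the ext4 pattern):

    static bool example_release_folio(struct folio *folio, gfp_t gfp)
    {
            /* hypothetical helper: journal backing folio->mapping->host */
            journal_t *journal = get_journal(folio->mapping->host);

            if (journal)
                    return jbd2_journal_try_to_free_buffers(journal, folio);
            return try_to_free_buffers(folio);
    }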
@@ -2482,7 +2492,7 @@ int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio,
} while (bh != head);
if (!partial_page) {
- if (may_free && try_to_free_buffers(&folio->page))
+ if (may_free && try_to_free_buffers(folio))
J_ASSERT(!folio_buffers(folio));
}
return 0;
@@ -2662,7 +2672,7 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
return -EROFS;
journal = transaction->t_journal;
- jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+ jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
transaction->t_tid);
spin_lock(&journal->j_list_lock);
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 7e9abdb89712..acd32f05b519 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -43,9 +43,9 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
jffs2_dbg(1, "%s(): erase block %#08x (range %#08x-%#08x)\n",
__func__,
jeb->offset, jeb->offset, jeb->offset + c->sector_size);
- instr = kmalloc(sizeof(struct erase_info), GFP_KERNEL);
+ instr = kzalloc(sizeof(struct erase_info), GFP_KERNEL);
if (!instr) {
- pr_warn("kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n");
+ pr_warn("kzalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n");
mutex_lock(&c->erase_free_sem);
spin_lock(&c->erase_completion_lock);
list_move(&jeb->list, &c->erase_pending_list);
@@ -57,8 +57,6 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
return;
}
- memset(instr, 0, sizeof(*instr));
-
instr->addr = jeb->offset;
instr->len = c->sector_size;
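kzalloc() is kmalloc() followed by zeroing, which is why the explicit memset() can be dropped in the hunk above; the two forms are equivalent (sketch, error path elided):

    /* before: allocate, then zero by hand */
    instr = kmalloc(sizeof(*instr), GFP_KERNEL);
    if (instr)
            memset(instr, 0, sizeof(*instr));

    /* after: one call, no window where the struct holds uninitialized bytes */
    instr = kzalloc(sizeof(*instr), GFP_KERNEL);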
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index bd7d58d27bfc..ba86acbe12d3 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -25,9 +25,9 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *pg, void *fsdata);
static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata);
-static int jffs2_readpage (struct file *filp, struct page *pg);
+static int jffs2_read_folio(struct file *filp, struct folio *folio);
int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
@@ -72,7 +72,7 @@ const struct inode_operations jffs2_file_inode_operations =
const struct address_space_operations jffs2_file_address_operations =
{
- .readpage = jffs2_readpage,
+ .read_folio = jffs2_read_folio,
.write_begin = jffs2_write_begin,
.write_end = jffs2_write_end,
};
@@ -110,27 +110,26 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
return ret;
}
-int jffs2_do_readpage_unlock(void *data, struct page *pg)
+int __jffs2_read_folio(struct file *file, struct folio *folio)
{
- int ret = jffs2_do_readpage_nolock(data, pg);
- unlock_page(pg);
+ int ret = jffs2_do_readpage_nolock(folio->mapping->host, &folio->page);
+ folio_unlock(folio);
return ret;
}
-
-static int jffs2_readpage (struct file *filp, struct page *pg)
+static int jffs2_read_folio(struct file *file, struct folio *folio)
{
- struct jffs2_inode_info *f = JFFS2_INODE_INFO(pg->mapping->host);
+ struct jffs2_inode_info *f = JFFS2_INODE_INFO(folio->mapping->host);
int ret;
mutex_lock(&f->sem);
- ret = jffs2_do_readpage_unlock(pg->mapping->host, pg);
+ ret = __jffs2_read_folio(file, folio);
mutex_unlock(&f->sem);
return ret;
}
static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct page *pg;
@@ -213,7 +212,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
* page in read_cache_page(), which causes a deadlock.
*/
mutex_lock(&c->alloc_sem);
- pg = grab_cache_page_write_begin(mapping, index, flags);
+ pg = grab_cache_page_write_begin(mapping, index);
if (!pg) {
ret = -ENOMEM;
goto release_sem;
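Two API changes from the same series meet here: ->readpage becomes ->read_folio, and ->write_begin (together with grab_cache_page_write_begin()) loses its flags argument. The minimal shape of a read_folio implementation, mirroring __jffs2_read_folio above; fill_data() is a hypothetical stand-in for the fs-specific read path:

    static int example_read_folio(struct file *file, struct folio *folio)
    {
            int err = fill_data(folio->mapping->host, &folio->page);

            folio_unlock(folio);    /* ->read_folio must unlock the folio */
            return err;
    }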
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 71f03a5d36ed..39cec28096a7 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -178,7 +178,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
jffs2_complete_reservation(c);
/* We have to do the truncate_setsize() without f->sem held, since
- some pages may be locked and waiting for it in readpage().
+ some pages may be locked and waiting for it in read_folio().
We are protected from a simultaneous write() extending i_size
back past iattr->ia_size, because do_truncate() holds the
generic inode semaphore. */
@@ -604,6 +604,7 @@ out_root:
jffs2_free_raw_node_refs(c);
kvfree(c->blocks);
jffs2_clear_xattr_subsystem(c);
+ jffs2_sum_exit(c);
out_inohash:
kfree(c->inocache_list);
out_wbuf:
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 373b3b7c9f44..5c6602f3c189 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -1327,7 +1327,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
* trying to write out, read_cache_page() will not deadlock. */
mutex_unlock(&f->sem);
page = read_cache_page(inode->i_mapping, start >> PAGE_SHIFT,
- jffs2_do_readpage_unlock, inode);
+ __jffs2_read_folio, NULL);
if (IS_ERR(page)) {
pr_warn("read_cache_page() returned error: %ld\n",
PTR_ERR(page));
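read_cache_page()'s filler callback changed type in the same series, which is why __jffs2_read_folio can be passed directly and the opaque data pointer becomes a struct file * (NULL here). Roughly, from the 5.19-era linux/pagemap.h:

    typedef int filler_t(struct file *, struct folio *);

    struct page *read_cache_page(struct address_space *mapping, pgoff_t index,
                                 filler_t *filler, struct file *file);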
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 173eccac691d..921d782583d6 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -155,7 +155,7 @@ extern const struct file_operations jffs2_file_operations;
extern const struct inode_operations jffs2_file_inode_operations;
extern const struct address_space_operations jffs2_file_address_operations;
int jffs2_fsync(struct file *, loff_t, loff_t, int);
-int jffs2_do_readpage_unlock(void *data, struct page *pg);
+int __jffs2_read_folio(struct file *file, struct folio *folio);
/* ioctl.c */
long jffs2_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index c6821a509481..4061e0ba7010 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1035,7 +1035,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
{
int i, ret;
int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
ops.mode = MTD_OPS_AUTO_OOB;
ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail;
@@ -1076,7 +1076,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
struct jffs2_eraseblock *jeb)
{
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
ops.mode = MTD_OPS_AUTO_OOB;
@@ -1101,7 +1101,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
struct jffs2_eraseblock *jeb)
{
int ret;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
ops.mode = MTD_OPS_AUTO_OOB;
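An empty initializer zeroes every member, so fields these helpers never assign, notably ops.datbuf, start out NULL rather than as uninitialized stack contents. A sketch of the resulting pattern (buffer name illustrative):

    unsigned char oobbuf[64];               /* illustrative OOB buffer */
    struct mtd_oob_ops ops = { };           /* all members zeroed */

    ops.mode   = MTD_OPS_AUTO_OOB;
    ops.ooblen = sizeof(oobbuf);
    ops.oobbuf = oobbuf;
    /* ops.datbuf stays NULL instead of pointing at stack garbage */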
diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile
index 285ec189ed5c..7156d2c218c7 100644
--- a/fs/jfs/Makefile
+++ b/fs/jfs/Makefile
@@ -13,5 +13,3 @@ jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \
resize.o xattr.o ioctl.o
jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o
-
-ccflags-y := -D_JFS_4K
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 1d732fd223d4..332dc9ac47a9 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -95,14 +95,14 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (rc)
return rc;
- if (is_quota_modification(inode, iattr)) {
+ if (is_quota_modification(mnt_userns, inode, iattr)) {
rc = dquot_initialize(inode);
if (rc)
return rc;
}
if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
(iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
- rc = dquot_transfer(inode, iattr);
+ rc = dquot_transfer(mnt_userns, inode, iattr);
if (rc)
return rc;
}
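The quota helpers gained a user-namespace argument in this series so id checks and transfers resolve correctly on idmapped mounts. The prototypes being called above are approximately (5.19-era linux/quotaops.h):

    bool is_quota_modification(struct user_namespace *mnt_userns,
                               const struct inode *inode, struct iattr *ia);
    int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode,
                       struct iattr *iattr);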
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index d1943a7b4b04..d1ec920aa030 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -224,18 +224,9 @@ int jfs_get_block(struct inode *ip, sector_t lblock,
* this as a hole
*/
goto unlock;
-#ifdef _JFS_4K
XADoffset(&xad, lblock64);
XADlength(&xad, xlen);
XADaddress(&xad, xaddr);
-#else /* _JFS_4K */
- /*
- * As long as block size = 4K, this isn't a problem.
- * We should mark the whole page not ABNR, but how
- * will we know to mark the other blocks BH_New?
- */
- BUG();
-#endif /* _JFS_4K */
rc = extRecord(ip, &xad);
if (rc)
goto unlock;
@@ -252,7 +243,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock,
/*
* Allocate a new block
*/
-#ifdef _JFS_4K
if ((rc = extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad)))
goto unlock;
rc = extAlloc(ip, xlen, lblock64, &xad, false);
@@ -263,14 +253,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock,
map_bh(bh_result, ip->i_sb, addressXAD(&xad));
bh_result->b_size = lengthXAD(&xad) << ip->i_blkbits;
-#else /* _JFS_4K */
- /*
- * We need to do whatever it takes to keep all but the last buffers
- * in 4K pages - see jfs_write.c
- */
- BUG();
-#endif /* _JFS_4K */
-
unlock:
/*
* Release lock on inode
@@ -293,9 +275,9 @@ static int jfs_writepages(struct address_space *mapping,
return mpage_writepages(mapping, wbc, jfs_get_block);
}
-static int jfs_readpage(struct file *file, struct page *page)
+static int jfs_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, jfs_get_block);
+ return mpage_read_folio(folio, jfs_get_block);
}
static void jfs_readahead(struct readahead_control *rac)
@@ -314,19 +296,30 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to)
}
static int jfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
- jfs_get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, jfs_get_block);
if (unlikely(ret))
jfs_write_failed(mapping, pos + len);
return ret;
}
+static int jfs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied, struct page *page,
+ void *fsdata)
+{
+ int ret;
+
+ ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ if (ret < len)
+ jfs_write_failed(mapping, pos + len);
+ return ret;
+}
+
static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping, block, jfs_get_block);
@@ -360,12 +353,12 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
const struct address_space_operations jfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = jfs_readpage,
+ .read_folio = jfs_read_folio,
.readahead = jfs_readahead,
.writepage = jfs_writepage,
.writepages = jfs_writepages,
.write_begin = jfs_write_begin,
- .write_end = nobh_write_end,
+ .write_end = jfs_write_end,
.bmap = jfs_bmap,
.direct_IO = jfs_direct_IO,
};
@@ -418,7 +411,7 @@ void jfs_truncate(struct inode *ip)
{
jfs_info("jfs_truncate: size = 0x%lx", (ulong) ip->i_size);
- nobh_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block);
+ block_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block);
IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
jfs_truncate_nolock(ip, ip->i_size);
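With the nobh_* helpers removed from the kernel, jfs falls back to the ordinary buffer_head based path. generic_write_end() returns the bytes actually copied; when that falls short of len, the new jfs_write_end() above trims any blocks instantiated past the old EOF via jfs_write_failed(). For reference, the replacement helpers have roughly these shapes (5.19-era linux/buffer_head.h):

    int block_write_begin(struct address_space *mapping, loff_t pos,
                          unsigned len, struct page **pagep,
                          get_block_t *get_block);
    int generic_write_end(struct file *file, struct address_space *mapping,
                          loff_t pos, unsigned len, unsigned copied,
                          struct page *page, void *fsdata);
    int block_truncate_page(struct address_space *mapping, loff_t from,
                            get_block_t *get_block);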
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 03a845ab4f00..1e7b177ece60 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -110,14 +110,13 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case FITRIM:
{
struct super_block *sb = inode->i_sb;
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
s64 ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q)) {
+ if (!bdev_max_discard_sectors(sb->s_bdev)) {
jfs_warn("FITRIM not supported on device");
return -EOPNOTSUPP;
}
@@ -127,7 +126,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -EFAULT;
range.minlen = max_t(unsigned int, range.minlen,
- q->limits.discard_granularity);
+ bdev_discard_granularity(sb->s_bdev));
ret = jfs_ioc_trim(inode, &range);
if (ret < 0)
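blk_queue_discard() and direct q->limits access are gone; the block layer now answers both questions through per-bdev helpers, with a non-zero bdev_max_discard_sectors() meaning the device supports discard. A minimal FITRIM preparation in the new style (sketch, using only the helpers shown in the hunk):

    static int example_fitrim_prepare(struct super_block *sb,
                                      struct fstrim_range *range)
    {
            if (!bdev_max_discard_sectors(sb->s_bdev))
                    return -EOPNOTSUPP;

            range->minlen = max_t(unsigned int, range->minlen,
                                  bdev_discard_granularity(sb->s_bdev));
            return 0;
    }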
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index d8502f4989d9..6b838d3ae7c2 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -385,7 +385,8 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
}
/* write the last buffer. */
- write_metapage(mp);
+ if (mp)
+ write_metapage(mp);
IREAD_UNLOCK(ipbmap);
@@ -868,74 +869,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
return (rc);
}
-#ifdef _NOTYET
-/*
- * NAME: dbAllocExact()
- *
- * FUNCTION: try to allocate the requested extent;
- *
- * PARAMETERS:
- * ip - pointer to in-core inode;
- * blkno - extent address;
- * nblocks - extent length;
- *
- * RETURN VALUES:
- * 0 - success
- * -ENOSPC - insufficient disk resources
- * -EIO - i/o error
- */
-int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
-{
- int rc;
- struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
- struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
- struct dmap *dp;
- s64 lblkno;
- struct metapage *mp;
-
- IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
-
- /*
- * validate extent request:
- *
- * note: defragfs policy:
- * max 64 blocks will be moved.
- * allocation request size must be satisfied from a single dmap.
- */
- if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) {
- IREAD_UNLOCK(ipbmap);
- return -EINVAL;
- }
-
- if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) {
- /* the free space is no longer available */
- IREAD_UNLOCK(ipbmap);
- return -ENOSPC;
- }
-
- /* read in the dmap covering the extent */
- lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
- mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
- if (mp == NULL) {
- IREAD_UNLOCK(ipbmap);
- return -EIO;
- }
- dp = (struct dmap *) mp->data;
-
- /* try to allocate the requested extent */
- rc = dbAllocNext(bmp, dp, blkno, nblocks);
-
- IREAD_UNLOCK(ipbmap);
-
- if (rc == 0)
- mark_metapage_dirty(mp);
-
- release_metapage(mp);
-
- return (rc);
-}
-#endif /* _NOTYET */
-
/*
* NAME: dbReAlloc()
*
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 837d42f61464..92b7c533407c 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -2423,304 +2423,6 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
return 0;
}
-#ifdef _NOTYET
-/*
- * NAME: dtRelocate()
- *
- * FUNCTION: relocate dtpage (internal or leaf) of directory;
- * This function is mainly used by defragfs utility.
- */
-int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
- s64 nxaddr)
-{
- int rc = 0;
- struct metapage *mp, *pmp, *lmp, *rmp;
- dtpage_t *p, *pp, *rp = 0, *lp= 0;
- s64 bn;
- int index;
- struct btstack btstack;
- pxd_t *pxd;
- s64 oxaddr, nextbn, prevbn;
- int xlen, xsize;
- struct tlock *tlck;
- struct dt_lock *dtlck;
- struct pxd_lock *pxdlock;
- s8 *stbl;
- struct lv *lv;
-
- oxaddr = addressPXD(opxd);
- xlen = lengthPXD(opxd);
-
- jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d",
- (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr,
- xlen);
-
- /*
- * 1. get the internal parent dtpage covering
- * router entry for the tartget page to be relocated;
- */
- rc = dtSearchNode(ip, lmxaddr, opxd, &btstack);
- if (rc)
- return rc;
-
- /* retrieve search result */
- DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
- jfs_info("dtRelocate: parent router entry validated.");
-
- /*
- * 2. relocate the target dtpage
- */
- /* read in the target page from src extent */
- DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
- if (rc) {
- /* release the pinned parent page */
- DT_PUTPAGE(pmp);
- return rc;
- }
-
- /*
- * read in sibling pages if any to update sibling pointers;
- */
- rmp = NULL;
- if (p->header.next) {
- nextbn = le64_to_cpu(p->header.next);
- DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
- if (rc) {
- DT_PUTPAGE(mp);
- DT_PUTPAGE(pmp);
- return (rc);
- }
- }
-
- lmp = NULL;
- if (p->header.prev) {
- prevbn = le64_to_cpu(p->header.prev);
- DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
- if (rc) {
- DT_PUTPAGE(mp);
- DT_PUTPAGE(pmp);
- if (rmp)
- DT_PUTPAGE(rmp);
- return (rc);
- }
- }
-
- /* at this point, all xtpages to be updated are in memory */
-
- /*
- * update sibling pointers of sibling dtpages if any;
- */
- if (lmp) {
- tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK);
- dtlck = (struct dt_lock *) & tlck->lock;
- /* linelock header */
- ASSERT(dtlck->index == 0);
- lv = & dtlck->lv[0];
- lv->offset = 0;
- lv->length = 1;
- dtlck->index++;
-
- lp->header.next = cpu_to_le64(nxaddr);
- DT_PUTPAGE(lmp);
- }
-
- if (rmp) {
- tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK);
- dtlck = (struct dt_lock *) & tlck->lock;
- /* linelock header */
- ASSERT(dtlck->index == 0);
- lv = & dtlck->lv[0];
- lv->offset = 0;
- lv->length = 1;
- dtlck->index++;
-
- rp->header.prev = cpu_to_le64(nxaddr);
- DT_PUTPAGE(rmp);
- }
-
- /*
- * update the target dtpage to be relocated
- *
- * write LOG_REDOPAGE of LOG_NEW type for dst page
- * for the whole target page (logredo() will apply
- * after image and update bmap for allocation of the
- * dst extent), and update bmap for allocation of
- * the dst extent;
- */
- tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW);
- dtlck = (struct dt_lock *) & tlck->lock;
- /* linelock header */
- ASSERT(dtlck->index == 0);
- lv = & dtlck->lv[0];
-
- /* update the self address in the dtpage header */
- pxd = &p->header.self;
- PXDaddress(pxd, nxaddr);
-
- /* the dst page is the same as the src page, i.e.,
- * linelock for afterimage of the whole page;
- */
- lv->offset = 0;
- lv->length = p->header.maxslot;
- dtlck->index++;
-
- /* update the buffer extent descriptor of the dtpage */
- xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
-
- /* unpin the relocated page */
- DT_PUTPAGE(mp);
- jfs_info("dtRelocate: target dtpage relocated.");
-
- /* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec
- * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec
- * will also force a bmap update ).
- */
-
- /*
- * 3. acquire maplock for the source extent to be freed;
- */
- /* for dtpage relocation, write a LOG_NOREDOPAGE record
- * for the source dtpage (logredo() will init NoRedoPage
- * filter and will also update bmap for free of the source
- * dtpage), and update bmap for free of the source dtpage;
- */
- tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
- pxdlock = (struct pxd_lock *) & tlck->lock;
- pxdlock->flag = mlckFREEPXD;
- PXDaddress(&pxdlock->pxd, oxaddr);
- PXDlength(&pxdlock->pxd, xlen);
- pxdlock->index = 1;
-
- /*
- * 4. update the parent router entry for relocation;
- *
- * acquire tlck for the parent entry covering the target dtpage;
- * write LOG_REDOPAGE to apply after image only;
- */
- jfs_info("dtRelocate: update parent router entry.");
- tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
- dtlck = (struct dt_lock *) & tlck->lock;
- lv = & dtlck->lv[dtlck->index];
-
- /* update the PXD with the new address */
- stbl = DT_GETSTBL(pp);
- pxd = (pxd_t *) & pp->slot[stbl[index]];
- PXDaddress(pxd, nxaddr);
- lv->offset = stbl[index];
- lv->length = 1;
- dtlck->index++;
-
- /* unpin the parent dtpage */
- DT_PUTPAGE(pmp);
-
- return rc;
-}
-
-/*
- * NAME: dtSearchNode()
- *
- * FUNCTION: Search for a dtpage containing a specified address
- * This function is mainly used by defragfs utility.
- *
- * NOTE: Search result on stack, the found page is pinned at exit.
- * The result page must be an internal dtpage.
- * lmxaddr give the address of the left most page of the
- * dtree level, in which the required dtpage resides.
- */
-static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
- struct btstack * btstack)
-{
- int rc = 0;
- s64 bn;
- struct metapage *mp;
- dtpage_t *p;
- int psize = 288; /* initial in-line directory */
- s8 *stbl;
- int i;
- pxd_t *pxd;
- struct btframe *btsp;
-
- BT_CLR(btstack); /* reset stack */
-
- /*
- * descend tree to the level with specified leftmost page
- *
- * by convention, root bn = 0.
- */
- for (bn = 0;;) {
- /* get/pin the page to search */
- DT_GETPAGE(ip, bn, mp, psize, p, rc);
- if (rc)
- return rc;
-
- /* does the xaddr of the leftmost page of the level
- * match the level search key?
- */
- if (p->header.flag & BT_ROOT) {
- if (lmxaddr == 0)
- break;
- } else if (addressPXD(&p->header.self) == lmxaddr)
- break;
-
- /*
- * descend down to leftmost child page
- */
- if (p->header.flag & BT_LEAF) {
- DT_PUTPAGE(mp);
- return -ESTALE;
- }
-
- /* get the leftmost entry */
- stbl = DT_GETSTBL(p);
- pxd = (pxd_t *) & p->slot[stbl[0]];
-
- /* get the child page block address */
- bn = addressPXD(pxd);
- psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
- /* unpin the parent page */
- DT_PUTPAGE(mp);
- }
-
- /*
- * search each page at the current level
- */
- loop:
- stbl = DT_GETSTBL(p);
- for (i = 0; i < p->header.nextindex; i++) {
- pxd = (pxd_t *) & p->slot[stbl[i]];
-
- /* found the specified router entry */
- if (addressPXD(pxd) == addressPXD(kpxd) &&
- lengthPXD(pxd) == lengthPXD(kpxd)) {
- btsp = btstack->top;
- btsp->bn = bn;
- btsp->index = i;
- btsp->mp = mp;
-
- return 0;
- }
- }
-
- /* get the right sibling page if any */
- if (p->header.next)
- bn = le64_to_cpu(p->header.next);
- else {
- DT_PUTPAGE(mp);
- return -ESTALE;
- }
-
- /* unpin current page */
- DT_PUTPAGE(mp);
-
- /* get the right sibling page */
- DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
-
- goto loop;
-}
-#endif /* _NOTYET */
-
/*
* dtRelink()
*
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index bb4a342a193d..ae99a7e232ee 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -16,9 +16,6 @@
* forward references
*/
static int extBalloc(struct inode *, s64, s64 *, s64 *);
-#ifdef _NOTYET
-static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *);
-#endif
static s64 extRoundDown(s64 nb);
#define DPD(a) (printk("(a): %d\n",(a)))
@@ -177,162 +174,6 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
return (0);
}
-
-#ifdef _NOTYET
-/*
- * NAME: extRealloc()
- *
- * FUNCTION: extend the allocation of a file extent containing a
- * partial back last page.
- *
- * PARAMETERS:
- * ip - the inode of the file.
- * cp - cbuf for the partial backed last page.
- * xlen - request size of the resulting extent.
- * xp - pointer to an xad. on successful exit, the xad
- * describes the newly allocated extent.
- * abnr - bool indicating whether the newly allocated extent
- * should be marked as allocated but not recorded.
- *
- * RETURN VALUES:
- * 0 - success
- * -EIO - i/o error.
- * -ENOSPC - insufficient disk resources.
- */
-int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
-{
- struct super_block *sb = ip->i_sb;
- s64 xaddr, xlen, nxaddr, delta, xoff;
- s64 ntail, nextend, ninsert;
- int rc, nbperpage = JFS_SBI(sb)->nbperpage;
- int xflag;
-
- /* This blocks if we are low on resources */
- txBeginAnon(ip->i_sb);
-
- mutex_lock(&JFS_IP(ip)->commit_mutex);
- /* validate extent length */
- if (nxlen > MAXXLEN)
- nxlen = MAXXLEN;
-
- /* get the extend (partial) page's disk block address and
- * number of blocks.
- */
- xaddr = addressXAD(xp);
- xlen = lengthXAD(xp);
- xoff = offsetXAD(xp);
-
- /* if the extend page is abnr and if the request is for
- * the extent to be allocated and recorded,
- * make the page allocated and recorded.
- */
- if ((xp->flag & XAD_NOTRECORDED) && !abnr) {
- xp->flag = 0;
- if ((rc = xtUpdate(0, ip, xp)))
- goto exit;
- }
-
- /* try to allocate the requested number of blocks for the
- * extent. dbReAlloc() first tries to satisfy the request
- * by extending the allocation in place. otherwise, it will
- * try to allocate a new set of blocks large enough for the
- * request. in satisfying a request, dbReAlloc() may allocate
- * less than what was requested but will always allocate enough
- * space to satisfy the extend page.
- */
- if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr)))
- goto exit;
-
- /* Allocate blocks to quota. */
- rc = dquot_alloc_block(ip, nxlen);
- if (rc) {
- dbFree(ip, nxaddr, (s64) nxlen);
- mutex_unlock(&JFS_IP(ip)->commit_mutex);
- return rc;
- }
-
- delta = nxlen - xlen;
-
- /* check if the extend page is not abnr but the request is abnr
- * and the allocated disk space is for more than one page. if this
- * is the case, there is a mismatch of abnr between the extend page
- * and the one or more pages following the extend page. as a result,
- * two extents will have to be manipulated. the first will be that
- * of the extent of the extend page and will be manipulated thru
- * an xtExtend() or an xtTailgate(), depending upon whether the
- * disk allocation occurred as an inplace extension. the second
- * extent will be manipulated (created) through an xtInsert() and
- * will be for the pages following the extend page.
- */
- if (abnr && (!(xp->flag & XAD_NOTRECORDED)) && (nxlen > nbperpage)) {
- ntail = nbperpage;
- nextend = ntail - xlen;
- ninsert = nxlen - nbperpage;
-
- xflag = XAD_NOTRECORDED;
- } else {
- ntail = nxlen;
- nextend = delta;
- ninsert = 0;
-
- xflag = xp->flag;
- }
-
- /* if we were able to extend the disk allocation in place,
- * extend the extent. otherwise, move the extent to a
- * new disk location.
- */
- if (xaddr == nxaddr) {
- /* extend the extent */
- if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
- dbFree(ip, xaddr + xlen, delta);
- dquot_free_block(ip, nxlen);
- goto exit;
- }
- } else {
- /*
- * move the extent to a new location:
- *
- * xtTailgate() accounts for relocated tail extent;
- */
- if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
- dbFree(ip, nxaddr, nxlen);
- dquot_free_block(ip, nxlen);
- goto exit;
- }
- }
-
-
- /* check if we need to also insert a new extent */
- if (ninsert) {
- /* perform the insert. if it fails, free the blocks
- * to be inserted and make it appear that we only did
- * the xtExtend() or xtTailgate() above.
- */
- xaddr = nxaddr + ntail;
- if (xtInsert (0, ip, xflag, xoff + ntail, (int) ninsert,
- &xaddr, 0)) {
- dbFree(ip, xaddr, (s64) ninsert);
- delta = nextend;
- nxlen = ntail;
- xflag = 0;
- }
- }
-
- /* set the return results */
- XADaddress(xp, nxaddr);
- XADlength(xp, nxlen);
- XADoffset(xp, xoff);
- xp->flag = xflag;
-
- mark_inode_dirty(ip);
-exit:
- mutex_unlock(&JFS_IP(ip)->commit_mutex);
- return (rc);
-}
-#endif /* _NOTYET */
-
-
/*
* NAME: extHint()
*
@@ -423,44 +264,6 @@ int extRecord(struct inode *ip, xad_t * xp)
return rc;
}
-
-#ifdef _NOTYET
-/*
- * NAME: extFill()
- *
- * FUNCTION: allocate disk space for a file page that represents
- * a file hole.
- *
- * PARAMETERS:
- * ip - the inode of the file.
- * cp - cbuf of the file page represent the hole.
- *
- * RETURN VALUES:
- * 0 - success
- * -EIO - i/o error.
- * -ENOSPC - insufficient disk resources.
- */
-int extFill(struct inode *ip, xad_t * xp)
-{
- int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage;
- s64 blkno = offsetXAD(xp) >> ip->i_blkbits;
-
-// assert(ISSPARSE(ip));
-
- /* initialize the extent allocation hint */
- XADaddress(xp, 0);
-
- /* allocate an extent to fill the hole */
- if ((rc = extAlloc(ip, nbperpage, blkno, xp, false)))
- return (rc);
-
- assert(lengthPXD(xp) == nbperpage);
-
- return (0);
-}
-#endif /* _NOTYET */
-
-
/*
* NAME: extBalloc()
*
@@ -550,64 +353,6 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
return (0);
}
-
-#ifdef _NOTYET
-/*
- * NAME: extBrealloc()
- *
- * FUNCTION: attempt to extend an extent's allocation.
- *
- * Initially, we will try to extend the extent's allocation
- * in place. If this fails, we'll try to move the extent
- * to a new set of blocks. If moving the extent, we initially
- * will try to allocate disk blocks for the requested size
- * (newnblks). if this fails (new contiguous free blocks not
- * available), we'll try to allocate a smaller number of
- * blocks (producing a smaller extent), with this smaller
- * number of blocks consisting of the requested number of
- * blocks rounded down to the next smaller power of 2
- * number (i.e. 16 -> 8). We'll continue to round down and
- * retry the allocation until the number of blocks to allocate
- * is smaller than the number of blocks per page.
- *
- * PARAMETERS:
- * ip - the inode of the file.
- * blkno - starting block number of the extents current allocation.
- * nblks - number of blocks within the extents current allocation.
- * newnblks - pointer to a s64 value. on entry, this value is the
- * new desired extent size (number of blocks). on
- * successful exit, this value is set to the extent's actual
- * new size (new number of blocks).
- * newblkno - the starting block number of the extents new allocation.
- *
- * RETURN VALUES:
- * 0 - success
- * -EIO - i/o error.
- * -ENOSPC - insufficient disk resources.
- */
-static int
-extBrealloc(struct inode *ip,
- s64 blkno, s64 nblks, s64 * newnblks, s64 * newblkno)
-{
- int rc;
-
- /* try to extend in place */
- if ((rc = dbExtend(ip, blkno, nblks, *newnblks - nblks)) == 0) {
- *newblkno = blkno;
- return (0);
- } else {
- if (rc != -ENOSPC)
- return (rc);
- }
-
- /* in place extension not possible.
- * try to move the extent to a new set of blocks.
- */
- return (extBalloc(ip, blkno, newnblks, newblkno));
-}
-#endif /* _NOTYET */
-
-
/*
* NAME: extRoundDown()
*
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 997c81fcea34..695415cbfe98 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -388,14 +388,6 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot;
linelock = (struct linelock *) & tlck->lock;
}
-#ifdef _JFS_WIP
- else if (tlck->flag & tlckINLINELOCK) {
-
- inlinelock = (struct inlinelock *) & tlck;
- p = (caddr_t) & inlinelock->pxd;
- linelock = (struct linelock *) & tlck;
- }
-#endif /* _JFS_WIP */
else {
jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck);
return 0; /* Probably should trap */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index c4220ccdedef..2e8461ce74de 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -467,8 +467,9 @@ err_out:
return -EIO;
}
-static int metapage_readpage(struct file *fp, struct page *page)
+static int metapage_read_folio(struct file *fp, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct bio *bio = NULL;
int block_offset;
@@ -523,29 +524,29 @@ add_failed:
return -EIO;
}
-static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
+static bool metapage_release_folio(struct folio *folio, gfp_t gfp_mask)
{
struct metapage *mp;
- int ret = 1;
+ bool ret = true;
int offset;
for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) {
- mp = page_to_mp(page, offset);
+ mp = page_to_mp(&folio->page, offset);
if (!mp)
continue;
- jfs_info("metapage_releasepage: mp = 0x%p", mp);
+ jfs_info("metapage_release_folio: mp = 0x%p", mp);
if (mp->count || mp->nohomeok ||
test_bit(META_dirty, &mp->flag)) {
jfs_info("count = %ld, nohomeok = %d", mp->count,
mp->nohomeok);
- ret = 0;
+ ret = false;
continue;
}
if (mp->lsn)
remove_from_logsync(mp);
- remove_metapage(page, mp);
+ remove_metapage(&folio->page, mp);
INCREMENT(mpStat.pagefree);
free_metapage(mp);
}
@@ -559,13 +560,13 @@ static void metapage_invalidate_folio(struct folio *folio, size_t offset,
BUG_ON(folio_test_writeback(folio));
- metapage_releasepage(&folio->page, 0);
+ metapage_release_folio(folio, 0);
}
const struct address_space_operations jfs_metapage_aops = {
- .readpage = metapage_readpage,
+ .read_folio = metapage_read_folio,
.writepage = metapage_writepage,
- .releasepage = metapage_releasepage,
+ .release_folio = metapage_release_folio,
.invalidate_folio = metapage_invalidate_folio,
.dirty_folio = filemap_dirty_folio,
};
@@ -617,7 +618,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
SetPageUptodate(page);
} else {
page = read_mapping_page(mapping, page_index, NULL);
- if (IS_ERR(page) || !PageUptodate(page)) {
+ if (IS_ERR(page)) {
jfs_err("read_mapping_page failed!");
return NULL;
}
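read_mapping_page() now returns an ERR_PTR when the read fails instead of handing back a page that is !PageUptodate, so the separate uptodate test above became dead code; the error check collapses to (sketch, mirroring __get_metapage's NULL return):

    page = read_mapping_page(mapping, page_index, NULL);
    if (IS_ERR(page))       /* a failed read surfaces as -EIO here too */
            return NULL;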
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index aa4ff7bcaff2..48d1f70f786c 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -307,13 +307,11 @@ static int chkSuper(struct super_block *sb)
}
bsize = le32_to_cpu(j_sb->s_bsize);
-#ifdef _JFS_4K
if (bsize != PSIZE) {
- jfs_err("Currently only 4K block size supported!");
+ jfs_err("Only 4K block size supported!");
rc = -EINVAL;
goto out;
}
-#endif /* _JFS_4K */
jfs_info("superblock: flag:0x%08x state:0x%08x size:0x%Lx",
le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state),
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 042bbe6d8ac2..ffd4feece078 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1490,40 +1490,6 @@ static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd,
tlck->flag |= tlckWRITEPAGE;
} else
jfs_err("diLog: UFO type tlck:0x%p", tlck);
-#ifdef _JFS_WIP
- /*
- * alloc/free external EA extent
- *
- * a maplock for txUpdateMap() to update bPWMAP for alloc/free
- * of the extent has been formatted at txLock() time;
- */
- else {
- assert(tlck->type & tlckEA);
-
- /* log LOG_UPDATEMAP for logredo() to update bmap for
- * alloc of new (and free of old) external EA extent;
- */
- lrd->type = cpu_to_le16(LOG_UPDATEMAP);
- pxdlock = (struct pxd_lock *) & tlck->lock;
- nlock = pxdlock->index;
- for (i = 0; i < nlock; i++, pxdlock++) {
- if (pxdlock->flag & mlckALLOCPXD)
- lrd->log.updatemap.type =
- cpu_to_le16(LOG_ALLOCPXD);
- else
- lrd->log.updatemap.type =
- cpu_to_le16(LOG_FREEPXD);
- lrd->log.updatemap.nxd = cpu_to_le16(1);
- lrd->log.updatemap.pxd = pxdlock->pxd;
- lrd->backchain =
- cpu_to_le32(lmLog(log, tblk, lrd, NULL));
- }
-
- /* update bmap */
- tlck->flag |= tlckUPDATEMAP;
- }
-#endif /* _JFS_WIP */
-
return;
}
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 3148e9b35f3b..2d304cee884c 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -114,17 +114,6 @@ static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split,
static int xtSplitRoot(tid_t tid, struct inode *ip,
struct xtsplit * split, struct metapage ** rmpp);
-#ifdef _STILL_TO_PORT
-static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
- xtpage_t * fp, struct btstack * btstack);
-
-static int xtSearchNode(struct inode *ip,
- xad_t * xad,
- int *cmpp, struct btstack * btstack, int flag);
-
-static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp);
-#endif /* _STILL_TO_PORT */
-
/*
* xtLookup()
*
@@ -1493,189 +1482,6 @@ int xtExtend(tid_t tid, /* transaction id */
return rc;
}
-#ifdef _NOTYET
-/*
- * xtTailgate()
- *
- * function: split existing 'tail' extent
- * (split offset >= start offset of tail extent), and
- * relocate and extend the split tail half;
- *
- * note: existing extent may or may not have been committed.
- * caller is responsible for pager buffer cache update, and
- * working block allocation map update;
- * update pmap: free old split tail extent, alloc new extent;
- */
-int xtTailgate(tid_t tid, /* transaction id */
- struct inode *ip, s64 xoff, /* split/new extent offset */
- s32 xlen, /* new extent length */
- s64 xaddr, /* new extent address */
- int flag)
-{
- int rc = 0;
- int cmp;
- struct metapage *mp; /* meta-page buffer */
- xtpage_t *p; /* base B+-tree index page */
- s64 bn;
- int index, nextindex, llen, rlen;
- struct btstack btstack; /* traverse stack */
- struct xtsplit split; /* split information */
- xad_t *xad;
- struct tlock *tlck;
- struct xtlock *xtlck = 0;
- struct tlock *mtlck;
- struct maplock *pxdlock;
-
-/*
-printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
- (ulong)xoff, xlen, (ulong)xaddr);
-*/
-
- /* there must exist extent to be tailgated */
- if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, XT_INSERT)))
- return rc;
-
- /* retrieve search result */
- XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
-
- if (cmp != 0) {
- XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "couldn't find extent\n");
- return -EIO;
- }
-
- /* entry found must be last entry */
- nextindex = le16_to_cpu(p->header.nextindex);
- if (index != nextindex - 1) {
- XT_PUTPAGE(mp);
- jfs_error(ip->i_sb, "the entry found is not the last entry\n");
- return -EIO;
- }
-
- BT_MARK_DIRTY(mp, ip);
- /*
- * acquire tlock of the leaf page containing original entry
- */
- if (!test_cflag(COMMIT_Nolink, ip)) {
- tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
- xtlck = (struct xtlock *) & tlck->lock;
- }
-
- /* completely replace extent ? */
- xad = &p->xad[index];
-/*
-printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
- (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad));
-*/
- if ((llen = xoff - offsetXAD(xad)) == 0)
- goto updateOld;
-
- /*
- * partially replace extent: insert entry for new extent
- */
-//insertNew:
- /*
- * if the leaf page is full, insert the new entry and
- * propagate up the router entry for the new page from split
- *
- * The xtSplitUp() will insert the entry and unpin the leaf page.
- */
- if (nextindex == le16_to_cpu(p->header.maxentry)) {
- /* xtSpliUp() unpins leaf pages */
- split.mp = mp;
- split.index = index + 1;
- split.flag = XAD_NEW;
- split.off = xoff; /* split offset */
- split.len = xlen;
- split.addr = xaddr;
- split.pxdlist = NULL;
- if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
- return rc;
-
- /* get back old page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
- /*
- * if leaf root has been split, original root has been
- * copied to new child page, i.e., original entry now
- * resides on the new child page;
- */
- if (p->header.flag & BT_INTERNAL) {
- ASSERT(p->header.nextindex ==
- cpu_to_le16(XTENTRYSTART + 1));
- xad = &p->xad[XTENTRYSTART];
- bn = addressXAD(xad);
- XT_PUTPAGE(mp);
-
- /* get new child page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
-
- BT_MARK_DIRTY(mp, ip);
- if (!test_cflag(COMMIT_Nolink, ip)) {
- tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
- xtlck = (struct xtlock *) & tlck->lock;
- }
- }
- }
- /*
- * insert the new entry into the leaf page
- */
- else {
- /* insert the new entry: mark the entry NEW */
- xad = &p->xad[index + 1];
- XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
-
- /* advance next available entry index */
- le16_add_cpu(&p->header.nextindex, 1);
- }
-
- /* get back old XAD */
- xad = &p->xad[index];
-
- /*
- * truncate/relocate old extent at split offset
- */
- updateOld:
- /* update dmap for old/committed/truncated extent */
- rlen = lengthXAD(xad) - llen;
- if (!(xad->flag & XAD_NEW)) {
- /* free from PWMAP at commit */
- if (!test_cflag(COMMIT_Nolink, ip)) {
- mtlck = txMaplock(tid, ip, tlckMAP);
- pxdlock = (struct maplock *) & mtlck->lock;
- pxdlock->flag = mlckFREEPXD;
- PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen);
- PXDlength(&pxdlock->pxd, rlen);
- pxdlock->index = 1;
- }
- } else
- /* free from WMAP */
- dbFree(ip, addressXAD(xad) + llen, (s64) rlen);
-
- if (llen)
- /* truncate */
- XADlength(xad, llen);
- else
- /* replace */
- XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
-
- if (!test_cflag(COMMIT_Nolink, ip)) {
- xtlck->lwm.offset = (xtlck->lwm.offset) ?
- min(index, (int)xtlck->lwm.offset) : index;
- xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
- xtlck->lwm.offset;
- }
-
- /* unpin the leaf page */
- XT_PUTPAGE(mp);
-
- return rc;
-}
-#endif /* _NOTYET */
-
/*
* xtUpdate()
*
@@ -1753,32 +1559,12 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
newindex = index + 1;
nextindex = le16_to_cpu(p->header.nextindex);
-#ifdef _JFS_WIP_NOCOALESCE
- if (xoff < nxoff)
- goto updateRight;
-
- /*
- * replace XAD with nXAD
- */
- replace: /* (nxoff == xoff) */
- if (nxlen == xlen) {
- /* replace XAD with nXAD:recorded */
- *xad = *nxad;
- xad->flag = xflag & ~XAD_NOTRECORDED;
-
- goto out;
- } else /* (nxlen < xlen) */
- goto updateLeft;
-#endif /* _JFS_WIP_NOCOALESCE */
-
-/* #ifdef _JFS_WIP_COALESCE */
if (xoff < nxoff)
goto coalesceRight;
/*
* coalesce with left XAD
*/
-//coalesceLeft: /* (xoff == nxoff) */
/* is XAD first entry of page ? */
if (index == XTENTRYSTART)
goto replace;
@@ -1897,7 +1683,6 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
jfs_error(ip->i_sb, "xoff >= nxoff\n");
return -EIO;
}
-/* #endif _JFS_WIP_COALESCE */
/*
* split XAD into (lXAD, nXAD):
@@ -2305,752 +2090,6 @@ int xtAppend(tid_t tid, /* transaction id */
return rc;
}
-#ifdef _STILL_TO_PORT
-
-/* - TBD for defragmentation/reorganization -
- *
- * xtDelete()
- *
- * function:
- * delete the entry with the specified key.
- *
- * N.B.: whole extent of the entry is assumed to be deleted.
- *
- * parameter:
- *
- * return:
- * ENOENT: if the entry is not found.
- *
- * exception:
- */
-int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
-{
- int rc = 0;
- struct btstack btstack;
- int cmp;
- s64 bn;
- struct metapage *mp;
- xtpage_t *p;
- int index, nextindex;
- struct tlock *tlck;
- struct xtlock *xtlck;
-
- /*
- * find the matching entry; xtSearch() pins the page
- */
- if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0)))
- return rc;
-
- XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
- if (cmp) {
- /* unpin the leaf page */
- XT_PUTPAGE(mp);
- return -ENOENT;
- }
-
- /*
- * delete the entry from the leaf page
- */
- nextindex = le16_to_cpu(p->header.nextindex);
- le16_add_cpu(&p->header.nextindex, -1);
-
- /*
- * if the leaf page becomes empty, free the page
- */
- if (p->header.nextindex == cpu_to_le16(XTENTRYSTART))
- return (xtDeleteUp(tid, ip, mp, p, &btstack));
-
- BT_MARK_DIRTY(mp, ip);
- /*
- * acquire a transaction lock on the leaf page;
- *
- * action:xad deletion;
- */
- tlck = txLock(tid, ip, mp, tlckXTREE);
- xtlck = (struct xtlock *) & tlck->lock;
- xtlck->lwm.offset =
- (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index;
-
- /* if delete from middle, shift left/compact the remaining entries */
- if (index < nextindex - 1)
- memmove(&p->xad[index], &p->xad[index + 1],
- (nextindex - index - 1) * sizeof(xad_t));
-
- XT_PUTPAGE(mp);
-
- return 0;
-}
-
-
-/* - TBD for defragmentation/reorganization -
- *
- * xtDeleteUp()
- *
- * function:
- * free empty pages as propagating deletion up the tree
- *
- * parameter:
- *
- * return:
- */
-static int
-xtDeleteUp(tid_t tid, struct inode *ip,
- struct metapage * fmp, xtpage_t * fp, struct btstack * btstack)
-{
- int rc = 0;
- struct metapage *mp;
- xtpage_t *p;
- int index, nextindex;
- s64 xaddr;
- int xlen;
- struct btframe *parent;
- struct tlock *tlck;
- struct xtlock *xtlck;
-
- /*
- * keep root leaf page which has become empty
- */
- if (fp->header.flag & BT_ROOT) {
- /* keep the root page */
- fp->header.flag &= ~BT_INTERNAL;
- fp->header.flag |= BT_LEAF;
- fp->header.nextindex = cpu_to_le16(XTENTRYSTART);
-
- /* XT_PUTPAGE(fmp); */
-
- return 0;
- }
-
- /*
- * free non-root leaf page
- */
- if ((rc = xtRelink(tid, ip, fp))) {
- XT_PUTPAGE(fmp);
- return rc;
- }
-
- xaddr = addressPXD(&fp->header.self);
- xlen = lengthPXD(&fp->header.self);
- /* free the page extent */
- dbFree(ip, xaddr, (s64) xlen);
-
- /* free the buffer page */
- discard_metapage(fmp);
-
- /*
- * propagate page deletion up the index tree
- *
- * If the delete from the parent page makes it empty,
- * continue all the way up the tree.
- * stop if the root page is reached (which is never deleted) or
- * if the entry deletion does not empty the page.
- */
- while ((parent = BT_POP(btstack)) != NULL) {
- /* get/pin the parent page <sp> */
- XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
-
- index = parent->index;
-
- /* delete the entry for the freed child page from parent.
- */
- nextindex = le16_to_cpu(p->header.nextindex);
-
- /*
- * the parent has the single entry being deleted:
- * free the parent page which has become empty.
- */
- if (nextindex == 1) {
- if (p->header.flag & BT_ROOT) {
- /* keep the root page */
- p->header.flag &= ~BT_INTERNAL;
- p->header.flag |= BT_LEAF;
- p->header.nextindex =
- cpu_to_le16(XTENTRYSTART);
-
- /* XT_PUTPAGE(mp); */
-
- break;
- } else {
- /* free the parent page */
- if ((rc = xtRelink(tid, ip, p)))
- return rc;
-
- xaddr = addressPXD(&p->header.self);
- /* free the page extent */
- dbFree(ip, xaddr,
- (s64) JFS_SBI(ip->i_sb)->nbperpage);
-
- /* unpin/free the buffer page */
- discard_metapage(mp);
-
- /* propagate up */
- continue;
- }
- }
- /*
- * the parent has other entries remaining:
- * delete the router entry from the parent page.
- */
- else {
- BT_MARK_DIRTY(mp, ip);
- /*
- * acquire a transaction lock on the leaf page;
- *
- * action:xad deletion;
- */
- tlck = txLock(tid, ip, mp, tlckXTREE);
- xtlck = (struct xtlock *) & tlck->lock;
- xtlck->lwm.offset =
- (xtlck->lwm.offset) ? min(index,
- xtlck->lwm.
- offset) : index;
-
- /* if delete from middle,
- * shift left/compact the remaining entries in the page
- */
- if (index < nextindex - 1)
- memmove(&p->xad[index], &p->xad[index + 1],
- (nextindex - index -
- 1) << L2XTSLOTSIZE);
-
- le16_add_cpu(&p->header.nextindex, -1);
- jfs_info("xtDeleteUp(entry): 0x%lx[%d]",
- (ulong) parent->bn, index);
- }
-
- /* unpin the parent page */
- XT_PUTPAGE(mp);
-
- /* exit propagation up */
- break;
- }
-
- return 0;
-}
-
-
-/*
- * NAME: xtRelocate()
- *
- * FUNCTION: relocate xtpage or data extent of regular file;
- * This function is mainly used by defragfs utility.
- *
- * NOTE: This routine does not have the logic to handle
- * uncommitted allocated extent. The caller should call
- * txCommit() to commit all the allocation before call
- * this routine.
- */
-int
-xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
- s64 nxaddr, /* new xaddr */
- int xtype)
-{ /* extent type: XTPAGE or DATAEXT */
- int rc = 0;
- struct tblock *tblk;
- struct tlock *tlck;
- struct xtlock *xtlck;
- struct metapage *mp, *pmp, *lmp, *rmp; /* meta-page buffer */
- xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */
- xad_t *xad;
- pxd_t *pxd;
- s64 xoff, xsize;
- int xlen;
- s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn;
- cbuf_t *cp;
- s64 offset, nbytes, nbrd, pno;
- int nb, npages, nblks;
- s64 bn;
- int cmp;
- int index;
- struct pxd_lock *pxdlock;
- struct btstack btstack; /* traverse stack */
-
- xtype = xtype & EXTENT_TYPE;
-
- xoff = offsetXAD(oxad);
- oxaddr = addressXAD(oxad);
- xlen = lengthXAD(oxad);
-
- /* validate extent offset */
- offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
- if (offset >= ip->i_size)
- return -ESTALE; /* stale extent */
-
- jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx",
- xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr);
-
- /*
- * 1. get and validate the parent xtpage/xad entry
- * covering the source extent to be relocated;
- */
- if (xtype == DATAEXT) {
- /* search in leaf entry */
- rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0);
- if (rc)
- return rc;
-
- /* retrieve search result */
- XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
-
- if (cmp) {
- XT_PUTPAGE(pmp);
- return -ESTALE;
- }
-
- /* validate for exact match with a single entry */
- xad = &pp->xad[index];
- if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) {
- XT_PUTPAGE(pmp);
- return -ESTALE;
- }
- } else { /* (xtype == XTPAGE) */
-
- /* search in internal entry */
- rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0);
- if (rc)
- return rc;
-
- /* retrieve search result */
- XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
-
- if (cmp) {
- XT_PUTPAGE(pmp);
- return -ESTALE;
- }
-
- /* xtSearchNode() validated for exact match with a single entry
- */
- xad = &pp->xad[index];
- }
- jfs_info("xtRelocate: parent xad entry validated.");
-
- /*
- * 2. relocate the extent
- */
- if (xtype == DATAEXT) {
- /* if the extent is allocated-but-not-recorded
- * there is no real data to be moved in this extent,
- */
- if (xad->flag & XAD_NOTRECORDED)
- goto out;
- else
- /* release xtpage for cmRead()/xtLookup() */
- XT_PUTPAGE(pmp);
-
- /*
- * cmRelocate()
- *
- * copy target data pages to be relocated;
- *
- * data extent must start at page boundary and
- * multiple of page size (except the last data extent);
- * read in each page of the source data extent into cbuf,
- * update the cbuf extent descriptor of the page to be
- * homeward bound to new dst data extent
- * copy the data from the old extent to new extent.
- * copy is essential for compressed files to avoid problems
- * that can arise if there was a change in compression
- * algorithms.
- * it is a good strategy because it may disrupt cache
- * policy to keep the pages in memory afterwards.
- */
- offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
- assert((offset & CM_OFFSET) == 0);
- nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize;
- pno = offset >> CM_L2BSIZE;
- npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE;
-/*
- npages = ((offset + nbytes - 1) >> CM_L2BSIZE) -
- (offset >> CM_L2BSIZE) + 1;
-*/
- sxaddr = oxaddr;
- dxaddr = nxaddr;
-
- /* process the request one cache buffer at a time */
- for (nbrd = 0; nbrd < nbytes; nbrd += nb,
- offset += nb, pno++, npages--) {
- /* compute page size */
- nb = min(nbytes - nbrd, CM_BSIZE);
-
- /* get the cache buffer of the page */
- if (rc = cmRead(ip, offset, npages, &cp))
- break;
-
- assert(addressPXD(&cp->cm_pxd) == sxaddr);
- assert(!cp->cm_modified);
-
- /* bind buffer with the new extent address */
- nblks = nb >> JFS_IP(ip->i_sb)->l2bsize;
- cmSetXD(ip, cp, pno, dxaddr, nblks);
-
- /* release the cbuf, mark it as modified */
- cmPut(cp, true);
-
- dxaddr += nblks;
- sxaddr += nblks;
- }
-
- /* get back parent page */
- if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0)))
- return rc;
-
- XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
- jfs_info("xtRelocate: target data extent relocated.");
- } else { /* (xtype == XTPAGE) */
-
- /*
- * read in the target xtpage from the source extent;
- */
- XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
- if (rc) {
- XT_PUTPAGE(pmp);
- return rc;
- }
-
- /*
- * read in sibling pages if any to update sibling pointers;
- */
- rmp = NULL;
- if (p->header.next) {
- nextbn = le64_to_cpu(p->header.next);
- XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
- if (rc) {
- XT_PUTPAGE(pmp);
- XT_PUTPAGE(mp);
- return (rc);
- }
- }
-
- lmp = NULL;
- if (p->header.prev) {
- prevbn = le64_to_cpu(p->header.prev);
- XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
- if (rc) {
- XT_PUTPAGE(pmp);
- XT_PUTPAGE(mp);
- if (rmp)
- XT_PUTPAGE(rmp);
- return (rc);
- }
- }
-
- /* at this point, all xtpages to be updated are in memory */
-
- /*
- * update sibling pointers of sibling xtpages if any;
- */
- if (lmp) {
- BT_MARK_DIRTY(lmp, ip);
- tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
- lp->header.next = cpu_to_le64(nxaddr);
- XT_PUTPAGE(lmp);
- }
-
- if (rmp) {
- BT_MARK_DIRTY(rmp, ip);
- tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
- rp->header.prev = cpu_to_le64(nxaddr);
- XT_PUTPAGE(rmp);
- }
-
- /*
- * update the target xtpage to be relocated
- *
- * update the self address of the target page
- * and write to destination extent;
- * redo image covers the whole xtpage since it is new page
- * to the destination extent;
- * update of bmap for the free of source extent
- * of the target xtpage itself:
- * update of bmap for the allocation of destination extent
- * of the target xtpage itself:
- * update of bmap for the extents covered by xad entries in
- * the target xtpage is not necessary since they are not
- * updated;
- * if not committed before this relocation,
- * target page may contain XAD_NEW entries which must
- * be scanned for bmap update (logredo() always
- * scan xtpage REDOPAGE image for bmap update);
- * if committed before this relocation (tlckRELOCATE),
- * scan may be skipped by commit() and logredo();
- */
- BT_MARK_DIRTY(mp, ip);
- /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */
- tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW);
- xtlck = (struct xtlock *) & tlck->lock;
-
- /* update the self address in the xtpage header */
- pxd = &p->header.self;
- PXDaddress(pxd, nxaddr);
-
- /* linelock for the after image of the whole page */
- xtlck->lwm.length =
- le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
-
- /* update the buffer extent descriptor of target xtpage */
- xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
- bmSetXD(mp, nxaddr, xsize);
-
- /* unpin the target page to new homeward bound */
- XT_PUTPAGE(mp);
- jfs_info("xtRelocate: target xtpage relocated.");
- }
-
- /*
- * 3. acquire maplock for the source extent to be freed;
- *
- * acquire a maplock saving the src relocated extent address;
- * to free of the extent at commit time;
- */
- out:
- /* if DATAEXT relocation, write a LOG_UPDATEMAP record for
- * free PXD of the source data extent (logredo() will update
- * bmap for free of source data extent), and update bmap for
- * free of the source data extent;
- */
- if (xtype == DATAEXT)
- tlck = txMaplock(tid, ip, tlckMAP);
- /* if XTPAGE relocation, write a LOG_NOREDOPAGE record
- * for the source xtpage (logredo() will init NoRedoPage
- * filter and will also update bmap for free of the source
- * xtpage), and update bmap for free of the source xtpage;
- * N.B. We use tlckMAP instead of tlkcXTREE because there
- * is no buffer associated with this lock since the buffer
- * has been redirected to the target location.
- */
- else /* (xtype == XTPAGE) */
- tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE);
-
- pxdlock = (struct pxd_lock *) & tlck->lock;
- pxdlock->flag = mlckFREEPXD;
- PXDaddress(&pxdlock->pxd, oxaddr);
- PXDlength(&pxdlock->pxd, xlen);
- pxdlock->index = 1;
-
- /*
- * 4. update the parent xad entry for relocation;
- *
- * acquire tlck for the parent entry with XAD_NEW as entry
- * update which will write LOG_REDOPAGE and update bmap for
- * allocation of XAD_NEW destination extent;
- */
- jfs_info("xtRelocate: update parent xad entry.");
- BT_MARK_DIRTY(pmp, ip);
- tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW);
- xtlck = (struct xtlock *) & tlck->lock;
-
- /* update the XAD with the new destination extent; */
- xad = &pp->xad[index];
- xad->flag |= XAD_NEW;
- XADaddress(xad, nxaddr);
-
- xtlck->lwm.offset = min(index, xtlck->lwm.offset);
- xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) -
- xtlck->lwm.offset;
-
- /* unpin the parent xtpage */
- XT_PUTPAGE(pmp);
-
- return rc;
-}
-
-
-/*
- * xtSearchNode()
- *
- * function: search for the internal xad entry covering specified extent.
- * This function is mainly used by defragfs utility.
- *
- * parameters:
- * ip - file object;
- * xad - extent to find;
- * cmpp - comparison result:
- * btstack - traverse stack;
- * flag - search process flag;
- *
- * returns:
- * btstack contains (bn, index) of search path traversed to the entry.
- * *cmpp is set to result of comparison with the entry returned.
- * the page containing the entry is pinned at exit.
- */
-static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
- int *cmpp, struct btstack * btstack, int flag)
-{
- int rc = 0;
- s64 xoff, xaddr;
- int xlen;
- int cmp = 1; /* init for empty page */
- s64 bn; /* block number */
- struct metapage *mp; /* meta-page buffer */
- xtpage_t *p; /* page */
- int base, index, lim;
- struct btframe *btsp;
- s64 t64;
-
- BT_CLR(btstack);
-
- xoff = offsetXAD(xad);
- xlen = lengthXAD(xad);
- xaddr = addressXAD(xad);
-
- /*
- * search down tree from root:
- *
- * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
- * internal page, child page Pi contains entry with k, Ki <= K < Kj.
- *
- * if entry with search key K is not found
- * internal page search find the entry with largest key Ki
- * less than K which point to the child page to search;
- * leaf page search find the entry with smallest key Kj
- * greater than K so that the returned index is the position of
- * the entry to be shifted right for insertion of new entry.
- * for empty tree, search key is greater than any key of the tree.
- *
- * by convention, root bn = 0.
- */
- for (bn = 0;;) {
- /* get/pin the page to search */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
- if (p->header.flag & BT_LEAF) {
- XT_PUTPAGE(mp);
- return -ESTALE;
- }
-
- lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
-
- /*
- * binary search with search key K on the current page
- */
- for (base = XTENTRYSTART; lim; lim >>= 1) {
- index = base + (lim >> 1);
-
- XT_CMP(cmp, xoff, &p->xad[index], t64);
- if (cmp == 0) {
- /*
- * search hit
- *
- * verify for exact match;
- */
- if (xaddr == addressXAD(&p->xad[index]) &&
- xoff == offsetXAD(&p->xad[index])) {
- *cmpp = cmp;
-
- /* save search result */
- btsp = btstack->top;
- btsp->bn = bn;
- btsp->index = index;
- btsp->mp = mp;
-
- return 0;
- }
-
- /* descend/search its child page */
- goto next;
- }
-
- if (cmp > 0) {
- base = index + 1;
- --lim;
- }
- }
-
- /*
- * search miss - non-leaf page:
- *
- * base is the smallest index with key (Kj) greater than
- * search key (K) and may be zero or maxentry index.
- * if base is non-zero, decrement base by one to get the parent
- * entry of the child page to search.
- */
- index = base ? base - 1 : base;
-
- /*
- * go down to child page
- */
- next:
- /* get the child page block number */
- bn = addressXAD(&p->xad[index]);
-
- /* unpin the parent page */
- XT_PUTPAGE(mp);
- }
-}
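
The descent loop above implements the classic "largest key <= K" rule for
B+-tree internal pages: the binary search either hits the exact offset or
leaves base at the smallest entry greater than the search key, in which
case the entry just before base points to the child to descend into. A
minimal stand-alone sketch of that index selection (the keys[] array is a
hypothetical stand-in for the page's xad entries):

    /*
     * Return the child slot to follow for search key k: the entry with
     * the largest key <= k, assuming keys[] is sorted ascending and
     * keys[0] <= k for a non-empty page.
     */
    static int pick_child_slot(const long *keys, int nkeys, long k)
    {
        int base = 0, lim, index = 0;

        for (lim = nkeys; lim; lim >>= 1) {
            index = base + (lim >> 1);
            if (keys[index] == k)
                return index;        /* exact hit */
            if (keys[index] < k) {
                base = index + 1;    /* continue in upper half */
                --lim;
            }
        }
        /* miss: base is the smallest index with key > k */
        return base ? base - 1 : base;
    }
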
-
-
-/*
- * xtRelink()
- *
- * function:
- * link around a freed page.
- *
- * Parameter:
- * int tid,
- * struct inode *ip,
- * xtpage_t *p)
- *
- * returns:
- */
-static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p)
-{
- int rc = 0;
- struct metapage *mp;
- s64 nextbn, prevbn;
- struct tlock *tlck;
-
- nextbn = le64_to_cpu(p->header.next);
- prevbn = le64_to_cpu(p->header.prev);
-
- /* update prev pointer of the next page */
- if (nextbn != 0) {
- XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
-
- /*
- * acquire a transaction lock on the page;
- *
- * action: update prev pointer;
- */
- BT_MARK_DIRTY(mp, ip);
- tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
-
- /* the page may already have been tlock'd */
-
- p->header.prev = cpu_to_le64(prevbn);
-
- XT_PUTPAGE(mp);
- }
-
- /* update next pointer of the previous page */
- if (prevbn != 0) {
- XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
-
- /*
- * acquire a transaction lock on the page;
- *
- * action: update next pointer;
- */
- BT_MARK_DIRTY(mp, ip);
- tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
-
- /* the page may already have been tlock'd */
-
- p->header.next = le64_to_cpu(nextbn);
-
- XT_PUTPAGE(mp);
- }
-
- return 0;
-}
-#endif /* _STILL_TO_PORT */
-
/*
* xtInitRoot()
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 5f51be8596b3..142caafc73b1 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -95,10 +95,6 @@ extern int xtInsert(tid_t tid, struct inode *ip,
int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag);
extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen,
int flag);
-#ifdef _NOTYET
-extern int xtTailgate(tid_t tid, struct inode *ip,
- s64 xoff, int xlen, s64 xaddr, int flag);
-#endif
extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad);
extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen,
int flag);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index f1a13a74cddf..85d4f44f2ac4 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -372,19 +372,16 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
}
case Opt_discard:
- {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
/* if set to 1, even copying files will cause
* trimming :O
* -> user has more control over the online trimming
*/
sbi->minblks_trim = 64;
- if (blk_queue_discard(q))
+ if (bdev_max_discard_sectors(sb->s_bdev))
*flag |= JFS_DISCARD;
else
pr_err("JFS: discard option not supported on device\n");
break;
- }
case Opt_nodiscard:
*flag &= ~JFS_DISCARD;
@@ -392,10 +389,9 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
case Opt_discard_minblk:
{
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
char *minblks_trim = args[0].from;
int rc;
- if (blk_queue_discard(q)) {
+ if (bdev_max_discard_sectors(sb->s_bdev)) {
*flag |= JFS_DISCARD;
rc = kstrtouint(minblks_trim, 0,
&sbi->minblks_trim);
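
Both hunks replace the old blk_queue_discard(q) predicate with
bdev_max_discard_sectors(), which returns 0 when the device cannot discard,
so the truth test is equivalent and the request_queue lookup becomes
unnecessary. The migration pattern, sketched:

    /* before: needed the request_queue */
    struct request_queue *q = bdev_get_queue(bdev);

    if (blk_queue_discard(q))
        enable_discard();

    /* after: ask the block device directly */
    if (bdev_max_discard_sectors(bdev))
        enable_discard();
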
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 1b07550485b9..5d826274570c 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -29,15 +29,15 @@
* change between calls to kernel_read_file().
*
* Returns number of bytes read (no single read will be bigger
- * than INT_MAX), or negative on error.
+ * than SSIZE_MAX), or negative on error.
*
*/
-int kernel_read_file(struct file *file, loff_t offset, void **buf,
- size_t buf_size, size_t *file_size,
- enum kernel_read_file_id id)
+ssize_t kernel_read_file(struct file *file, loff_t offset, void **buf,
+ size_t buf_size, size_t *file_size,
+ enum kernel_read_file_id id)
{
loff_t i_size, pos;
- size_t copied;
+ ssize_t copied;
void *allocated = NULL;
bool whole_file;
int ret;
@@ -58,7 +58,7 @@ int kernel_read_file(struct file *file, loff_t offset, void **buf,
goto out;
}
/* The file is too big for sane activities. */
- if (i_size > INT_MAX) {
+ if (i_size > SSIZE_MAX) {
ret = -EFBIG;
goto out;
}
@@ -124,12 +124,12 @@ out:
}
EXPORT_SYMBOL_GPL(kernel_read_file);
-int kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
- size_t buf_size, size_t *file_size,
- enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
+ size_t buf_size, size_t *file_size,
+ enum kernel_read_file_id id)
{
struct file *file;
- int ret;
+ ssize_t ret;
if (!path || !*path)
return -EINVAL;
@@ -144,14 +144,14 @@ int kernel_read_file_from_path(const char *path, loff_t offset, void **buf,
}
EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
-int kernel_read_file_from_path_initns(const char *path, loff_t offset,
- void **buf, size_t buf_size,
- size_t *file_size,
- enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_path_initns(const char *path, loff_t offset,
+ void **buf, size_t buf_size,
+ size_t *file_size,
+ enum kernel_read_file_id id)
{
struct file *file;
struct path root;
- int ret;
+ ssize_t ret;
if (!path || !*path)
return -EINVAL;
@@ -171,12 +171,12 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset,
}
EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns);
-int kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
- size_t buf_size, size_t *file_size,
- enum kernel_read_file_id id)
+ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
+ size_t buf_size, size_t *file_size,
+ enum kernel_read_file_id id)
{
struct fd f = fdget(fd);
- int ret = -EBADF;
+ ssize_t ret = -EBADF;
if (!f.file || !(f.file->f_mode & FMODE_READ))
goto out;
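
With the return type widened from int to ssize_t, callers must keep the
result in an ssize_t rather than truncating it back into an int. A hedged
caller sketch (the path, size cap, and firmware id are illustrative only):

    ssize_t len;
    void *buf = NULL;
    size_t file_size;

    len = kernel_read_file_from_path("/lib/firmware/example.bin", 0,
                                     &buf, SZ_16M, &file_size,
                                     READING_FIRMWARE);
    if (len < 0)
        return len;        /* negative errno */
    /* ... consume len bytes at buf ... */
    vfree(buf);            /* the helper vmalloc()s when *buf is NULL */
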
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 61a8edc4ba8b..3990f3e270cb 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -18,7 +18,15 @@
#include "kernfs-internal.h"
static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */
-static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */
+/*
+ * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to
+ * call pr_cont() while holding rename_lock, because pr_cont() can
+ * perform wakeups when releasing console_sem, and holding rename_lock
+ * across those wakeups would deadlock if the scheduler reads a kernfs
+ * name in the wakeup path.
+ */
+static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
+static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */
static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
@@ -229,12 +237,12 @@ void pr_cont_kernfs_name(struct kernfs_node *kn)
{
unsigned long flags;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
+ spin_lock_irqsave(&kernfs_pr_cont_lock, flags);
- kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
+ kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
pr_cont("%s", kernfs_pr_cont_buf);
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags);
}
/**
@@ -248,10 +256,10 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
unsigned long flags;
int sz;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
+ spin_lock_irqsave(&kernfs_pr_cont_lock, flags);
- sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf,
- sizeof(kernfs_pr_cont_buf));
+ sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf,
+ sizeof(kernfs_pr_cont_buf));
if (sz < 0) {
pr_cont("(error)");
goto out;
@@ -265,7 +273,7 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
pr_cont("%s", kernfs_pr_cont_buf);
out:
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags);
}
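
The new lock decouples the console path from the rename path: the static
scratch buffer is serialized by kernfs_pr_cont_lock, while kernfs_name()
and kernfs_path_from_node() take kernfs_rename_lock internally only for
the short, pr_cont()-free copy. A hedged sketch of the same pattern for
any shared scratch buffer printed via pr_cont():

    static DEFINE_SPINLOCK(scratch_lock);
    static char scratch[PATH_MAX];

    static void pr_cont_object_name(struct kernfs_node *kn)
    {
        unsigned long flags;

        spin_lock_irqsave(&scratch_lock, flags);
        kernfs_name(kn, scratch, sizeof(scratch)); /* no console work */
        pr_cont("%s", scratch);    /* wakeups happen only under
                                      scratch_lock, never rename_lock */
        spin_unlock_irqrestore(&scratch_lock, flags);
    }
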
/**
@@ -464,6 +472,16 @@ static void kernfs_drain(struct kernfs_node *kn)
lockdep_assert_held_write(&root->kernfs_rwsem);
WARN_ON_ONCE(kernfs_active(kn));
+ /*
+ * Skip draining if already fully drained. This avoids draining and its
+ * lockdep annotations for nodes which have never been activated
+ * allowing embedding kernfs_remove() in create error paths without
+ * worrying about draining.
+ */
+ if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS &&
+ !kernfs_should_drain_open_files(kn))
+ return;
+
up_write(&root->kernfs_rwsem);
if (kernfs_lockdep(kn)) {
@@ -472,7 +490,6 @@ static void kernfs_drain(struct kernfs_node *kn)
lock_contended(&kn->dep_map, _RET_IP_);
}
- /* but everyone should wait for draining */
wait_event(root->deactivate_waitq,
atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
@@ -481,7 +498,8 @@ static void kernfs_drain(struct kernfs_node *kn)
rwsem_release(&kn->dep_map, _RET_IP_);
}
- kernfs_drain_open_files(kn);
+ if (kernfs_should_drain_open_files(kn))
+ kernfs_drain_open_files(kn);
down_write(&root->kernfs_rwsem);
}
@@ -687,13 +705,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
goto err_unlock;
}
- /*
- * ACTIVATED is protected with kernfs_mutex but it was clear when
- * @kn was added to idr and we just wanna see it set. No need to
- * grab kernfs_mutex.
- */
- if (unlikely(!(kn->flags & KERNFS_ACTIVATED) ||
- !atomic_inc_not_zero(&kn->count)))
+ if (unlikely(!kernfs_active(kn) || !atomic_inc_not_zero(&kn->count)))
goto err_unlock;
spin_unlock(&kernfs_idr_lock);
@@ -735,10 +747,7 @@ int kernfs_add_one(struct kernfs_node *kn)
goto out_unlock;
ret = -ENOENT;
- if (parent->flags & KERNFS_EMPTY_DIR)
- goto out_unlock;
-
- if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
+ if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR))
goto out_unlock;
kn->hash = kernfs_name_hash(kn->name, kn->ns);
@@ -823,13 +832,12 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
- /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
- spin_lock_irq(&kernfs_rename_lock);
+ spin_lock_irq(&kernfs_pr_cont_lock);
len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
if (len >= sizeof(kernfs_pr_cont_buf)) {
- spin_unlock_irq(&kernfs_rename_lock);
+ spin_unlock_irq(&kernfs_pr_cont_lock);
return NULL;
}
@@ -841,7 +849,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
parent = kernfs_find_ns(parent, name, ns);
}
- spin_unlock_irq(&kernfs_rename_lock);
+ spin_unlock_irq(&kernfs_pr_cont_lock);
return parent;
}
@@ -1297,6 +1305,21 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
return pos->parent;
}
+static void kernfs_activate_one(struct kernfs_node *kn)
+{
+ lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);
+
+ kn->flags |= KERNFS_ACTIVATED;
+
+ if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING)))
+ return;
+
+ WARN_ON_ONCE(kn->parent && RB_EMPTY_NODE(&kn->rb));
+ WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
+
+ atomic_sub(KN_DEACTIVATED_BIAS, &kn->active);
+}
+
/**
* kernfs_activate - activate a node which started deactivated
* @kn: kernfs_node whose subtree is to be activated
@@ -1318,15 +1341,42 @@ void kernfs_activate(struct kernfs_node *kn)
down_write(&root->kernfs_rwsem);
pos = NULL;
- while ((pos = kernfs_next_descendant_post(pos, kn))) {
- if (pos->flags & KERNFS_ACTIVATED)
- continue;
+ while ((pos = kernfs_next_descendant_post(pos, kn)))
+ kernfs_activate_one(pos);
- WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
- WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);
+ up_write(&root->kernfs_rwsem);
+}
- atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
- pos->flags |= KERNFS_ACTIVATED;
+/**
+ * kernfs_show - show or hide a node
+ * @kn: kernfs_node to show or hide
+ * @show: whether to show or hide
+ *
+ * If @show is %false, @kn is marked hidden and deactivated. A hidden node is
+ * ignored in future activations. If %true, the mark is removed and activation
+ * state is restored. This function won't implicitly activate a new node in a
+ * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet.
+ *
+ * To avoid recursion complexities, directories aren't supported for now.
+ */
+void kernfs_show(struct kernfs_node *kn, bool show)
+{
+ struct kernfs_root *root = kernfs_root(kn);
+
+ if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR))
+ return;
+
+ down_write(&root->kernfs_rwsem);
+
+ if (show) {
+ kn->flags &= ~KERNFS_HIDDEN;
+ if (kn->flags & KERNFS_ACTIVATED)
+ kernfs_activate_one(kn);
+ } else {
+ kn->flags |= KERNFS_HIDDEN;
+ if (kernfs_active(kn))
+ atomic_add(KN_DEACTIVATED_BIAS, &kn->active);
+ kernfs_drain(kn);
}
up_write(&root->kernfs_rwsem);
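
kernfs_show() gives callers a revertible alternative to removal for
non-directory nodes. A hedged usage sketch (the parent/name are
illustrative; error handling elided):

    struct kernfs_node *kn;

    kn = kernfs_find_and_get(parent, "status"); /* takes a reference */
    if (kn) {
        kernfs_show(kn, false);  /* hide: deactivate and drain */
        /* ... node is invisible to new opens ... */
        kernfs_show(kn, true);   /* unhide; reactivates only if it had
                                    been activated before */
        kernfs_put(kn);
    }
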
@@ -1336,46 +1386,42 @@ static void __kernfs_remove(struct kernfs_node *kn)
{
struct kernfs_node *pos;
+ /* Short-circuit if non-root @kn has already finished removal. */
+ if (!kn)
+ return;
+
lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);
/*
- * Short-circuit if non-root @kn has already finished removal.
* This is for kernfs_remove_self() which plays with active ref
* after removal.
*/
- if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb)))
+ if (kn->parent && RB_EMPTY_NODE(&kn->rb))
return;
pr_debug("kernfs %s: removing\n", kn->name);
- /* prevent any new usage under @kn by deactivating all nodes */
+ /* prevent new usage by marking all nodes removing and deactivating */
pos = NULL;
- while ((pos = kernfs_next_descendant_post(pos, kn)))
+ while ((pos = kernfs_next_descendant_post(pos, kn))) {
+ pos->flags |= KERNFS_REMOVING;
if (kernfs_active(pos))
atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
+ }
/* deactivate and unlink the subtree node-by-node */
do {
pos = kernfs_leftmost_descendant(kn);
/*
- * kernfs_drain() drops kernfs_rwsem temporarily and @pos's
+ * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's
* base ref could have been put by someone else by the time
* the function returns. Make sure it doesn't go away
* underneath us.
*/
kernfs_get(pos);
- /*
- * Drain iff @kn was activated. This avoids draining and
- * its lockdep annotations for nodes which have never been
- * activated and allows embedding kernfs_remove() in create
- * error paths without worrying about draining.
- */
- if (kn->flags & KERNFS_ACTIVATED)
- kernfs_drain(pos);
- else
- WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
+ kernfs_drain(pos);
/*
* kernfs_unlink_sibling() succeeds once per node. Use it
@@ -1406,7 +1452,12 @@ static void __kernfs_remove(struct kernfs_node *kn)
*/
void kernfs_remove(struct kernfs_node *kn)
{
- struct kernfs_root *root = kernfs_root(kn);
+ struct kernfs_root *root;
+
+ if (!kn)
+ return;
+
+ root = kernfs_root(kn);
down_write(&root->kernfs_rwsem);
__kernfs_remove(kn);
@@ -1570,8 +1621,11 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
down_write(&root->kernfs_rwsem);
kn = kernfs_find_ns(parent, name, ns);
- if (kn)
+ if (kn) {
+ kernfs_get(kn);
__kernfs_remove(kn);
+ kernfs_put(kn);
+ }
up_write(&root->kernfs_rwsem);
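
The added get/put pair pins @kn across __kernfs_remove(): removal can
drop the node's base reference once it is unlinked (and kernfs_drain()
may temporarily release the rwsem), so without the pin the node found by
kernfs_find_ns() could be freed while this function still holds a
pointer to it. The pattern, sketched:

    kernfs_get(kn);      /* extra ref: keep kn alive across removal */
    __kernfs_remove(kn); /* unlink; base reference may be dropped */
    kernfs_put(kn);      /* drop our pin; kn may be freed here */
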
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 88423069407c..9ab6c92e02da 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -18,25 +18,13 @@
#include "kernfs-internal.h"
-/*
- * There's one kernfs_open_file for each open file and one kernfs_open_node
- * for each kernfs_node with one or more open files.
- *
- * kernfs_node->attr.open points to kernfs_open_node. attr.open is
- * protected by kernfs_open_node_lock.
- *
- * filp->private_data points to seq_file whose ->private points to
- * kernfs_open_file. kernfs_open_files are chained at
- * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
- */
-static DEFINE_SPINLOCK(kernfs_open_node_lock);
-static DEFINE_MUTEX(kernfs_open_file_mutex);
-
struct kernfs_open_node {
- atomic_t refcnt;
+ struct rcu_head rcu_head;
atomic_t event;
wait_queue_head_t poll;
struct list_head files; /* goes through kernfs_open_file.list */
+ unsigned int nr_mmapped;
+ unsigned int nr_to_release;
};
/*
@@ -52,6 +40,56 @@ struct kernfs_open_node {
static DEFINE_SPINLOCK(kernfs_notify_lock);
static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
+static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn)
+{
+ int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+
+ return &kernfs_locks->open_file_mutex[idx];
+}
+
+static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn)
+{
+ struct mutex *lock;
+
+ lock = kernfs_open_file_mutex_ptr(kn);
+
+ mutex_lock(lock);
+
+ return lock;
+}
+
+/**
+ * of_on - Return the kernfs_open_node of the specified kernfs_open_file
+ * @of: target kernfs_open_file
+ */
+static struct kernfs_open_node *of_on(struct kernfs_open_file *of)
+{
+ return rcu_dereference_protected(of->kn->attr.open,
+ !list_empty(&of->list));
+}
+
+/**
+ * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn
+ *
+ * @kn: target kernfs_node.
+ *
+ * Fetch and return ->attr.open of @kn when caller holds the
+ * kernfs_open_file_mutex_ptr(kn).
+ *
+ * Update of ->attr.open happens under kernfs_open_file_mutex_ptr(kn). So when
+ * the caller guarantees that this mutex is being held, other updaters can't
+ * change ->attr.open and this means that we can safely deref ->attr.open
+ * outside RCU read-side critical section.
+ *
+ * The caller needs to make sure that kernfs_open_file_mutex is held.
+ */
+static struct kernfs_open_node *
+kernfs_deref_open_node_locked(struct kernfs_node *kn)
+{
+ return rcu_dereference_protected(kn->attr.open,
+ lockdep_is_held(kernfs_open_file_mutex_ptr(kn)));
+}
+
static struct kernfs_open_file *kernfs_of(struct file *file)
{
return ((struct seq_file *)file->private_data)->private;
@@ -158,7 +196,7 @@ static int kernfs_seq_show(struct seq_file *sf, void *v)
{
struct kernfs_open_file *of = sf->private;
- of->event = atomic_read(&of->kn->attr.open->event);
+ of->event = atomic_read(&of_on(of)->event);
return of->kn->attr.ops->seq_show(sf, v);
}
@@ -202,7 +240,8 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
goto out_free;
}
- of->event = atomic_read(&of->kn->attr.open->event);
+ of->event = atomic_read(&of_on(of)->event);
+
ops = kernfs_ops(of->kn);
if (ops->read)
len = ops->read(of, buf, len, iocb->ki_pos);
@@ -244,7 +283,7 @@ static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* There is no easy way for us to know if userspace is only doing a partial
* write, so we don't support them. We expect the entire buffer to come on
* the first write. Hint: if you're writing a value, first read the file,
- * modify only the the value you're changing, then write entire buffer
+ * modify only the value you're changing, then write entire buffer
* back.
*/
static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -485,12 +524,12 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
* It is not possible to successfully wrap close.
* So error if someone is trying to use close.
*/
- rc = -EINVAL;
if (vma->vm_ops && vma->vm_ops->close)
goto out_put;
rc = 0;
of->mmapped = true;
+ of_on(of)->nr_mmapped++;
of->vm_ops = vma->vm_ops;
vma->vm_ops = &kernfs_vm_ops;
out_put:
@@ -518,75 +557,79 @@ out_unlock:
static int kernfs_get_open_node(struct kernfs_node *kn,
struct kernfs_open_file *of)
{
- struct kernfs_open_node *on, *new_on = NULL;
-
- retry:
- mutex_lock(&kernfs_open_file_mutex);
- spin_lock_irq(&kernfs_open_node_lock);
-
- if (!kn->attr.open && new_on) {
- kn->attr.open = new_on;
- new_on = NULL;
- }
-
- on = kn->attr.open;
- if (on) {
- atomic_inc(&on->refcnt);
- list_add_tail(&of->list, &on->files);
- }
+ struct kernfs_open_node *on;
+ struct mutex *mutex;
- spin_unlock_irq(&kernfs_open_node_lock);
- mutex_unlock(&kernfs_open_file_mutex);
+ mutex = kernfs_open_file_mutex_lock(kn);
+ on = kernfs_deref_open_node_locked(kn);
- if (on) {
- kfree(new_on);
- return 0;
+ if (!on) {
+ /* not there, initialize a new one */
+ on = kzalloc(sizeof(*on), GFP_KERNEL);
+ if (!on) {
+ mutex_unlock(mutex);
+ return -ENOMEM;
+ }
+ atomic_set(&on->event, 1);
+ init_waitqueue_head(&on->poll);
+ INIT_LIST_HEAD(&on->files);
+ rcu_assign_pointer(kn->attr.open, on);
}
- /* not there, initialize a new one and retry */
- new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
- if (!new_on)
- return -ENOMEM;
+ list_add_tail(&of->list, &on->files);
+ if (kn->flags & KERNFS_HAS_RELEASE)
+ on->nr_to_release++;
- atomic_set(&new_on->refcnt, 0);
- atomic_set(&new_on->event, 1);
- init_waitqueue_head(&new_on->poll);
- INIT_LIST_HEAD(&new_on->files);
- goto retry;
+ mutex_unlock(mutex);
+ return 0;
}
/**
- * kernfs_put_open_node - put kernfs_open_node
- * @kn: target kernfs_nodet
+ * kernfs_unlink_open_file - Unlink @of from @kn.
+ *
+ * @kn: target kernfs_node
* @of: associated kernfs_open_file
+ * @open_failed: ->open() failed, cancel ->release()
*
- * Put @kn->attr.open and unlink @of from the files list. If
- * reference count reaches zero, disassociate and free it.
+ * Unlink @of from list of @kn's associated open files. If list of
+ * associated open files becomes empty, disassociate and free
+ * kernfs_open_node.
*
* LOCKING:
* None.
*/
-static void kernfs_put_open_node(struct kernfs_node *kn,
- struct kernfs_open_file *of)
+static void kernfs_unlink_open_file(struct kernfs_node *kn,
+ struct kernfs_open_file *of,
+ bool open_failed)
{
- struct kernfs_open_node *on = kn->attr.open;
- unsigned long flags;
+ struct kernfs_open_node *on;
+ struct mutex *mutex;
- mutex_lock(&kernfs_open_file_mutex);
- spin_lock_irqsave(&kernfs_open_node_lock, flags);
+ mutex = kernfs_open_file_mutex_lock(kn);
- if (of)
- list_del(&of->list);
+ on = kernfs_deref_open_node_locked(kn);
+ if (!on) {
+ mutex_unlock(mutex);
+ return;
+ }
- if (atomic_dec_and_test(&on->refcnt))
- kn->attr.open = NULL;
- else
- on = NULL;
+ if (of) {
+ if (kn->flags & KERNFS_HAS_RELEASE) {
+ WARN_ON_ONCE(of->released == open_failed);
+ if (open_failed)
+ on->nr_to_release--;
+ }
+ if (of->mmapped)
+ on->nr_mmapped--;
+ list_del(&of->list);
+ }
- spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
- mutex_unlock(&kernfs_open_file_mutex);
+ if (list_empty(&on->files)) {
+ rcu_assign_pointer(kn->attr.open, NULL);
+ kfree_rcu(on, rcu_head);
+ }
- kfree(on);
+ mutex_unlock(mutex);
}
static int kernfs_fop_open(struct inode *inode, struct file *file)
@@ -706,7 +749,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
return 0;
err_put_node:
- kernfs_put_open_node(kn, of);
+ kernfs_unlink_open_file(kn, of, true);
err_seq_release:
seq_release(inode, file);
err_free:
@@ -724,11 +767,11 @@ static void kernfs_release_file(struct kernfs_node *kn,
/*
* @of is guaranteed to have no other file operations in flight and
* we just want to synchronize release and drain paths.
- * @kernfs_open_file_mutex is enough. @of->mutex can't be used
+ * @kernfs_open_file_mutex_ptr(kn) is enough. @of->mutex can't be used
* here because drain path may be called from places which can
* cause circular dependency.
*/
- lockdep_assert_held(&kernfs_open_file_mutex);
+ lockdep_assert_held(kernfs_open_file_mutex_ptr(kn));
if (!of->released) {
/*
@@ -738,6 +781,7 @@ static void kernfs_release_file(struct kernfs_node *kn,
*/
kn->attr.ops->release(of);
of->released = true;
+ of_on(of)->nr_to_release--;
}
}
@@ -747,12 +791,14 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
struct kernfs_open_file *of = kernfs_of(filp);
if (kn->flags & KERNFS_HAS_RELEASE) {
- mutex_lock(&kernfs_open_file_mutex);
+ struct mutex *mutex;
+
+ mutex = kernfs_open_file_mutex_lock(kn);
kernfs_release_file(kn, of);
- mutex_unlock(&kernfs_open_file_mutex);
+ mutex_unlock(mutex);
}
- kernfs_put_open_node(kn, of);
+ kernfs_unlink_open_file(kn, of, false);
seq_release(inode, filp);
kfree(of->prealloc_buf);
kfree(of);
@@ -760,37 +806,53 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp)
return 0;
}
+bool kernfs_should_drain_open_files(struct kernfs_node *kn)
+{
+ struct kernfs_open_node *on;
+ bool ret;
+
+ /*
+ * @kn being deactivated guarantees that @kn->attr.open can't change
+ * beneath us making the lockless test below safe.
+ */
+ WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
+
+ rcu_read_lock();
+ on = rcu_dereference(kn->attr.open);
+ ret = on && (on->nr_mmapped || on->nr_to_release);
+ rcu_read_unlock();
+
+ return ret;
+}
+
void kernfs_drain_open_files(struct kernfs_node *kn)
{
struct kernfs_open_node *on;
struct kernfs_open_file *of;
+ struct mutex *mutex;
- if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
- return;
-
- spin_lock_irq(&kernfs_open_node_lock);
- on = kn->attr.open;
- if (on)
- atomic_inc(&on->refcnt);
- spin_unlock_irq(&kernfs_open_node_lock);
- if (!on)
+ mutex = kernfs_open_file_mutex_lock(kn);
+ on = kernfs_deref_open_node_locked(kn);
+ if (!on) {
+ mutex_unlock(mutex);
return;
-
- mutex_lock(&kernfs_open_file_mutex);
+ }
list_for_each_entry(of, &on->files, list) {
struct inode *inode = file_inode(of->file);
- if (kn->flags & KERNFS_HAS_MMAP)
+ if (of->mmapped) {
unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+ of->mmapped = false;
+ on->nr_mmapped--;
+ }
if (kn->flags & KERNFS_HAS_RELEASE)
kernfs_release_file(kn, of);
}
- mutex_unlock(&kernfs_open_file_mutex);
-
- kernfs_put_open_node(kn, NULL);
+ WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release);
+ mutex_unlock(mutex);
}
/*
@@ -809,8 +871,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
*/
__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
{
- struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
- struct kernfs_open_node *on = kn->attr.open;
+ struct kernfs_open_node *on = of_on(of);
poll_wait(of->file, &on->poll, wait);
@@ -917,13 +978,13 @@ void kernfs_notify(struct kernfs_node *kn)
return;
/* kick poll immediately */
- spin_lock_irqsave(&kernfs_open_node_lock, flags);
- on = kn->attr.open;
+ rcu_read_lock();
+ on = rcu_dereference(kn->attr.open);
if (on) {
atomic_inc(&on->event);
wake_up_interruptible(&on->poll);
}
- spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+ rcu_read_unlock();
/* schedule work to kick fsnotify */
spin_lock_irqsave(&kernfs_notify_lock, flags);
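
kernfs_notify() is now a pure RCU reader: updates of ->attr.open happen
under the per-node open_file mutex and retire the old structure with
kfree_rcu(), so the event path needs only rcu_read_lock(). The full
publish/read/retire lifecycle, sketched generically (struct and field
names are illustrative):

    struct attach {
        struct rcu_head rcu;
        atomic_t event;
    };

    /* publish (writer, under obj->mutex) */
    a = kzalloc(sizeof(*a), GFP_KERNEL);
    atomic_set(&a->event, 1);
    rcu_assign_pointer(obj->attach, a);

    /* lockless reader */
    rcu_read_lock();
    a = rcu_dereference(obj->attach);
    if (a)
        atomic_inc(&a->event);
    rcu_read_unlock();

    /* retire (writer, under obj->mutex) */
    old = rcu_dereference_protected(obj->attach,
                                    lockdep_is_held(&obj->mutex));
    rcu_assign_pointer(obj->attach, NULL);
    kfree_rcu(old, rcu);
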
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index eeaa779b929c..fc5821effd97 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -157,6 +157,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
*/
extern const struct file_operations kernfs_file_fops;
+bool kernfs_should_drain_open_files(struct kernfs_node *kn);
void kernfs_drain_open_files(struct kernfs_node *kn);
/*
@@ -164,4 +165,8 @@ void kernfs_drain_open_files(struct kernfs_node *kn);
*/
extern const struct inode_operations kernfs_symlink_iops;
+/*
+ * kernfs locks
+ */
+extern struct kernfs_global_locks *kernfs_locks;
#endif /* __KERNFS_INTERNAL_H */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index cfa79715fc1a..d0859f72d2d6 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -20,6 +20,7 @@
#include "kernfs-internal.h"
struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
+struct kernfs_global_locks *kernfs_locks;
static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
{
@@ -387,6 +388,22 @@ void kernfs_kill_sb(struct super_block *sb)
kfree(info);
}
+static void __init kernfs_mutex_init(void)
+{
+ int count;
+
+ for (count = 0; count < NR_KERNFS_LOCKS; count++)
+ mutex_init(&kernfs_locks->open_file_mutex[count]);
+}
+
+static void __init kernfs_lock_init(void)
+{
+ kernfs_locks = kmalloc(sizeof(struct kernfs_global_locks), GFP_KERNEL);
+ WARN_ON(!kernfs_locks);
+
+ kernfs_mutex_init();
+}
+
void __init kernfs_init(void)
{
kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
@@ -397,4 +414,6 @@ void __init kernfs_init(void)
kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache",
sizeof(struct kernfs_iattrs),
0, SLAB_PANIC, NULL);
+
+ kernfs_lock_init();
}
diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c
index 911444d21267..2a39ffb8423b 100644
--- a/fs/ksmbd/auth.c
+++ b/fs/ksmbd/auth.c
@@ -121,8 +121,8 @@ out:
return rc;
}
-static int calc_ntlmv2_hash(struct ksmbd_session *sess, char *ntlmv2_hash,
- char *dname)
+static int calc_ntlmv2_hash(struct ksmbd_conn *conn, struct ksmbd_session *sess,
+ char *ntlmv2_hash, char *dname)
{
int ret, len, conv_len;
wchar_t *domain = NULL;
@@ -158,7 +158,7 @@ static int calc_ntlmv2_hash(struct ksmbd_session *sess, char *ntlmv2_hash,
}
conv_len = smb_strtoUTF16(uniname, user_name(sess->user), len,
- sess->conn->local_nls);
+ conn->local_nls);
if (conv_len < 0 || conv_len > len) {
ret = -EINVAL;
goto out;
@@ -182,7 +182,7 @@ static int calc_ntlmv2_hash(struct ksmbd_session *sess, char *ntlmv2_hash,
}
conv_len = smb_strtoUTF16((__le16 *)domain, dname, len,
- sess->conn->local_nls);
+ conn->local_nls);
if (conv_len < 0 || conv_len > len) {
ret = -EINVAL;
goto out;
@@ -215,8 +215,9 @@ out:
*
* Return: 0 on success, error number on error
*/
-int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2,
- int blen, char *domain_name, char *cryptkey)
+int ksmbd_auth_ntlmv2(struct ksmbd_conn *conn, struct ksmbd_session *sess,
+ struct ntlmv2_resp *ntlmv2, int blen, char *domain_name,
+ char *cryptkey)
{
char ntlmv2_hash[CIFS_ENCPWD_SIZE];
char ntlmv2_rsp[CIFS_HMAC_MD5_HASH_SIZE];
@@ -230,7 +231,7 @@ int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2,
return -ENOMEM;
}
- rc = calc_ntlmv2_hash(sess, ntlmv2_hash, domain_name);
+ rc = calc_ntlmv2_hash(conn, sess, ntlmv2_hash, domain_name);
if (rc) {
ksmbd_debug(AUTH, "could not get v2 hash rc %d\n", rc);
goto out;
@@ -333,7 +334,8 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
/* process NTLMv2 authentication */
ksmbd_debug(AUTH, "decode_ntlmssp_authenticate_blob dname%s\n",
domain_name);
- ret = ksmbd_auth_ntlmv2(sess, (struct ntlmv2_resp *)((char *)authblob + nt_off),
+ ret = ksmbd_auth_ntlmv2(conn, sess,
+ (struct ntlmv2_resp *)((char *)authblob + nt_off),
nt_len - CIFS_ENCPWD_SIZE,
domain_name, conn->ntlmssp.cryptkey);
kfree(domain_name);
@@ -422,6 +424,9 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob,
NTLMSSP_NEGOTIATE_56);
}
+ if (cflags & NTLMSSP_NEGOTIATE_SEAL && smb3_encryption_negotiated(conn))
+ flags |= NTLMSSP_NEGOTIATE_SEAL;
+
if (cflags & NTLMSSP_NEGOTIATE_ALWAYS_SIGN)
flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
@@ -659,8 +664,9 @@ struct derivation {
bool binding;
};
-static int generate_key(struct ksmbd_session *sess, struct kvec label,
- struct kvec context, __u8 *key, unsigned int key_size)
+static int generate_key(struct ksmbd_conn *conn, struct ksmbd_session *sess,
+ struct kvec label, struct kvec context, __u8 *key,
+ unsigned int key_size)
{
unsigned char zero = 0x0;
__u8 i[4] = {0, 0, 0, 1};
@@ -720,8 +726,8 @@ static int generate_key(struct ksmbd_session *sess, struct kvec label,
goto smb3signkey_ret;
}
- if (sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_CCM ||
- sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
+ if (conn->cipher_type == SMB2_ENCRYPTION_AES256_CCM ||
+ conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), L256, 4);
else
rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), L128, 4);
@@ -756,17 +762,17 @@ static int generate_smb3signingkey(struct ksmbd_session *sess,
if (!chann)
return 0;
- if (sess->conn->dialect >= SMB30_PROT_ID && signing->binding)
+ if (conn->dialect >= SMB30_PROT_ID && signing->binding)
key = chann->smb3signingkey;
else
key = sess->smb3signingkey;
- rc = generate_key(sess, signing->label, signing->context, key,
+ rc = generate_key(conn, sess, signing->label, signing->context, key,
SMB3_SIGN_KEY_SIZE);
if (rc)
return rc;
- if (!(sess->conn->dialect >= SMB30_PROT_ID && signing->binding))
+ if (!(conn->dialect >= SMB30_PROT_ID && signing->binding))
memcpy(chann->smb3signingkey, key, SMB3_SIGN_KEY_SIZE);
ksmbd_debug(AUTH, "dumping generated AES signing keys\n");
@@ -820,30 +826,31 @@ struct derivation_twin {
struct derivation decryption;
};
-static int generate_smb3encryptionkey(struct ksmbd_session *sess,
+static int generate_smb3encryptionkey(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess,
const struct derivation_twin *ptwin)
{
int rc;
- rc = generate_key(sess, ptwin->encryption.label,
+ rc = generate_key(conn, sess, ptwin->encryption.label,
ptwin->encryption.context, sess->smb3encryptionkey,
SMB3_ENC_DEC_KEY_SIZE);
if (rc)
return rc;
- rc = generate_key(sess, ptwin->decryption.label,
+ rc = generate_key(conn, sess, ptwin->decryption.label,
ptwin->decryption.context,
sess->smb3decryptionkey, SMB3_ENC_DEC_KEY_SIZE);
if (rc)
return rc;
ksmbd_debug(AUTH, "dumping generated AES encryption keys\n");
- ksmbd_debug(AUTH, "Cipher type %d\n", sess->conn->cipher_type);
+ ksmbd_debug(AUTH, "Cipher type %d\n", conn->cipher_type);
ksmbd_debug(AUTH, "Session Id %llu\n", sess->id);
ksmbd_debug(AUTH, "Session Key %*ph\n",
SMB2_NTLMV2_SESSKEY_SIZE, sess->sess_key);
- if (sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_CCM ||
- sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM) {
+ if (conn->cipher_type == SMB2_ENCRYPTION_AES256_CCM ||
+ conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM) {
ksmbd_debug(AUTH, "ServerIn Key %*ph\n",
SMB3_GCM256_CRYPTKEY_SIZE, sess->smb3encryptionkey);
ksmbd_debug(AUTH, "ServerOut Key %*ph\n",
@@ -857,7 +864,8 @@ static int generate_smb3encryptionkey(struct ksmbd_session *sess,
return 0;
}
-int ksmbd_gen_smb30_encryptionkey(struct ksmbd_session *sess)
+int ksmbd_gen_smb30_encryptionkey(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess)
{
struct derivation_twin twin;
struct derivation *d;
@@ -874,10 +882,11 @@ int ksmbd_gen_smb30_encryptionkey(struct ksmbd_session *sess)
d->context.iov_base = "ServerIn ";
d->context.iov_len = 10;
- return generate_smb3encryptionkey(sess, &twin);
+ return generate_smb3encryptionkey(conn, sess, &twin);
}
-int ksmbd_gen_smb311_encryptionkey(struct ksmbd_session *sess)
+int ksmbd_gen_smb311_encryptionkey(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess)
{
struct derivation_twin twin;
struct derivation *d;
@@ -894,7 +903,7 @@ int ksmbd_gen_smb311_encryptionkey(struct ksmbd_session *sess)
d->context.iov_base = sess->Preauth_HashValue;
d->context.iov_len = 64;
- return generate_smb3encryptionkey(sess, &twin);
+ return generate_smb3encryptionkey(conn, sess, &twin);
}
int ksmbd_gen_preauth_integrity_hash(struct ksmbd_conn *conn, char *buf,
@@ -978,13 +987,16 @@ out:
return rc;
}
-static int ksmbd_get_encryption_key(struct ksmbd_conn *conn, __u64 ses_id,
+static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id,
int enc, u8 *key)
{
struct ksmbd_session *sess;
u8 *ses_enc_key;
- sess = ksmbd_session_lookup_all(conn, ses_id);
+ if (enc)
+ sess = work->sess;
+ else
+ sess = ksmbd_session_lookup_all(work->conn, ses_id);
if (!sess)
return -EINVAL;
@@ -1072,9 +1084,10 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec,
return sg;
}
-int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov,
+int ksmbd_crypt_message(struct ksmbd_work *work, struct kvec *iov,
unsigned int nvec, int enc)
{
+ struct ksmbd_conn *conn = work->conn;
struct smb2_transform_hdr *tr_hdr = smb2_get_msg(iov[0].iov_base);
unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20;
int rc;
@@ -1088,7 +1101,7 @@ int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov,
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
struct ksmbd_crypto_ctx *ctx;
- rc = ksmbd_get_encryption_key(conn,
+ rc = ksmbd_get_encryption_key(work,
le64_to_cpu(tr_hdr->SessionId),
enc,
key);
diff --git a/fs/ksmbd/auth.h b/fs/ksmbd/auth.h
index 95629651cf26..362b6159a6cf 100644
--- a/fs/ksmbd/auth.h
+++ b/fs/ksmbd/auth.h
@@ -33,13 +33,15 @@
struct ksmbd_session;
struct ksmbd_conn;
+struct ksmbd_work;
struct kvec;
-int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov,
+int ksmbd_crypt_message(struct ksmbd_work *work, struct kvec *iov,
unsigned int nvec, int enc);
void ksmbd_copy_gss_neg_header(void *buf);
-int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2,
- int blen, char *domain_name, char *cryptkey);
+int ksmbd_auth_ntlmv2(struct ksmbd_conn *conn, struct ksmbd_session *sess,
+ struct ntlmv2_resp *ntlmv2, int blen, char *domain_name,
+ char *cryptkey);
int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
int blob_len, struct ksmbd_conn *conn,
struct ksmbd_session *sess);
@@ -58,8 +60,10 @@ int ksmbd_gen_smb30_signingkey(struct ksmbd_session *sess,
struct ksmbd_conn *conn);
int ksmbd_gen_smb311_signingkey(struct ksmbd_session *sess,
struct ksmbd_conn *conn);
-int ksmbd_gen_smb30_encryptionkey(struct ksmbd_session *sess);
-int ksmbd_gen_smb311_encryptionkey(struct ksmbd_session *sess);
+int ksmbd_gen_smb30_encryptionkey(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess);
+int ksmbd_gen_smb311_encryptionkey(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess);
int ksmbd_gen_preauth_integrity_hash(struct ksmbd_conn *conn, char *buf,
__u8 *pi_hash);
int ksmbd_gen_sd_hash(struct ksmbd_conn *conn, char *sd_buf, int len,
diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c
index 208d2cff7bd3..12be8386446a 100644
--- a/fs/ksmbd/connection.c
+++ b/fs/ksmbd/connection.c
@@ -36,6 +36,7 @@ void ksmbd_conn_free(struct ksmbd_conn *conn)
list_del(&conn->conns_list);
write_unlock(&conn_list_lock);
+ xa_destroy(&conn->sessions);
kvfree(conn->request_buf);
kfree(conn->preauth_info);
kfree(conn);
@@ -59,19 +60,26 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
conn->local_nls = load_nls("utf8");
if (!conn->local_nls)
conn->local_nls = load_nls_default();
+ if (IS_ENABLED(CONFIG_UNICODE))
+ conn->um = utf8_load(UNICODE_AGE(12, 1, 0));
+ else
+ conn->um = ERR_PTR(-EOPNOTSUPP);
+ if (IS_ERR(conn->um))
+ conn->um = NULL;
atomic_set(&conn->req_running, 0);
atomic_set(&conn->r_count, 0);
conn->total_credits = 1;
- conn->outstanding_credits = 1;
+ conn->outstanding_credits = 0;
init_waitqueue_head(&conn->req_running_q);
+ init_waitqueue_head(&conn->r_count_q);
INIT_LIST_HEAD(&conn->conns_list);
- INIT_LIST_HEAD(&conn->sessions);
INIT_LIST_HEAD(&conn->requests);
INIT_LIST_HEAD(&conn->async_requests);
spin_lock_init(&conn->request_lock);
spin_lock_init(&conn->credits_lock);
ida_init(&conn->async_ida);
+ xa_init(&conn->sessions);
spin_lock_init(&conn->llist_lock);
INIT_LIST_HEAD(&conn->lock_list);
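
Replacing the session list with an xarray keyed by session id makes
lookups direct (and RCU-safe for readers) instead of a linear list walk.
The xarray operations the rest of this patch relies on, sketched:

    #include <linux/xarray.h>

    struct xarray sessions;
    unsigned long id;
    int err;

    xa_init(&sessions);

    /* insert, keyed by session id */
    err = xa_err(xa_store(&sessions, sess->id, sess, GFP_KERNEL));

    /* lookup */
    sess = xa_load(&sessions, id);

    /* iterate */
    xa_for_each(&sessions, id, sess)
        do_something(sess);    /* do_something() is illustrative */

    /* remove one entry / tear down the whole array */
    xa_erase(&sessions, id);
    xa_destroy(&sessions);
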
@@ -164,7 +172,6 @@ int ksmbd_conn_write(struct ksmbd_work *work)
struct kvec iov[3];
int iov_idx = 0;
- ksmbd_conn_try_dequeue_request(work);
if (!work->response_buf) {
pr_err("NULL response header\n");
return -EINVAL;
@@ -205,31 +212,31 @@ int ksmbd_conn_write(struct ksmbd_work *work)
return 0;
}
-int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf,
- unsigned int buflen, u32 remote_key, u64 remote_offset,
- u32 remote_len)
+int ksmbd_conn_rdma_read(struct ksmbd_conn *conn,
+ void *buf, unsigned int buflen,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len)
{
int ret = -EINVAL;
if (conn->transport->ops->rdma_read)
ret = conn->transport->ops->rdma_read(conn->transport,
buf, buflen,
- remote_key, remote_offset,
- remote_len);
+ desc, desc_len);
return ret;
}
-int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf,
- unsigned int buflen, u32 remote_key,
- u64 remote_offset, u32 remote_len)
+int ksmbd_conn_rdma_write(struct ksmbd_conn *conn,
+ void *buf, unsigned int buflen,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len)
{
int ret = -EINVAL;
if (conn->transport->ops->rdma_write)
ret = conn->transport->ops->rdma_write(conn->transport,
buf, buflen,
- remote_key, remote_offset,
- remote_len);
+ desc, desc_len);
return ret;
}
@@ -346,9 +353,11 @@ int ksmbd_conn_handler_loop(void *p)
out:
/* Wait till all reference dropped to the Server object*/
- while (atomic_read(&conn->r_count) > 0)
- schedule_timeout(HZ);
+ wait_event(conn->r_count_q, atomic_read(&conn->r_count) == 0);
+
+ if (IS_ENABLED(CONFIG_UNICODE))
+ utf8_unload(conn->um);
unload_nls(conn->local_nls);
if (default_conn_ops.terminate_fn)
default_conn_ops.terminate_fn(conn);
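
wait_event() replaces the schedule_timeout() loop, so teardown now sleeps
until the last reference is dropped instead of repeatedly rescheduling to
re-check the counter. The waker side (in the request handlers, not shown
in this hunk) must pair the decrement with a wake-up, along these lines:

    /* release a reference; wake the teardown path on the last put */
    if (atomic_dec_and_test(&conn->r_count))
        wake_up(&conn->r_count_q);
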
diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h
index 7a59aacb5daa..3643354a3fa7 100644
--- a/fs/ksmbd/connection.h
+++ b/fs/ksmbd/connection.h
@@ -14,19 +14,13 @@
#include <net/request_sock.h>
#include <linux/kthread.h>
#include <linux/nls.h>
+#include <linux/unicode.h>
#include "smb_common.h"
#include "ksmbd_work.h"
#define KSMBD_SOCKET_BACKLOG 16
-/*
- * WARNING
- *
- * This is nothing but a HACK. Session status should move to channel
- * or to session. As of now we have 1 tcp_conn : 1 ksmbd_session, but
- * we need to change it to 1 tcp_conn : N ksmbd_sessions.
- */
enum {
KSMBD_SESS_NEW = 0,
KSMBD_SESS_GOOD,
@@ -53,9 +47,10 @@ struct ksmbd_conn {
char *request_buf;
struct ksmbd_transport *transport;
struct nls_table *local_nls;
+ struct unicode_map *um;
struct list_head conns_list;
/* smb session 1 per user */
- struct list_head sessions;
+ struct xarray sessions;
unsigned long last_active;
/* How many request are running currently */
atomic_t req_running;
@@ -65,6 +60,7 @@ struct ksmbd_conn {
unsigned int outstanding_credits;
spinlock_t credits_lock;
wait_queue_head_t req_running_q;
+ wait_queue_head_t r_count_q;
/* Lock to protect requests list*/
spinlock_t request_lock;
struct list_head requests;
@@ -122,11 +118,14 @@ struct ksmbd_transport_ops {
int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov,
int size, bool need_invalidate_rkey,
unsigned int remote_key);
- int (*rdma_read)(struct ksmbd_transport *t, void *buf, unsigned int len,
- u32 remote_key, u64 remote_offset, u32 remote_len);
- int (*rdma_write)(struct ksmbd_transport *t, void *buf,
- unsigned int len, u32 remote_key, u64 remote_offset,
- u32 remote_len);
+ int (*rdma_read)(struct ksmbd_transport *t,
+ void *buf, unsigned int len,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len);
+ int (*rdma_write)(struct ksmbd_transport *t,
+ void *buf, unsigned int len,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len);
};
struct ksmbd_transport {
@@ -148,12 +147,14 @@ struct ksmbd_conn *ksmbd_conn_alloc(void);
void ksmbd_conn_free(struct ksmbd_conn *conn);
bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c);
int ksmbd_conn_write(struct ksmbd_work *work);
-int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf,
- unsigned int buflen, u32 remote_key, u64 remote_offset,
- u32 remote_len);
-int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf,
- unsigned int buflen, u32 remote_key, u64 remote_offset,
- u32 remote_len);
+int ksmbd_conn_rdma_read(struct ksmbd_conn *conn,
+ void *buf, unsigned int buflen,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len);
+int ksmbd_conn_rdma_write(struct ksmbd_conn *conn,
+ void *buf, unsigned int buflen,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len);
void ksmbd_conn_enqueue_request(struct ksmbd_work *work);
int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work);
void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops);
diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
index ebe6ca08467a..ff07c67f4565 100644
--- a/fs/ksmbd/ksmbd_netlink.h
+++ b/fs/ksmbd/ksmbd_netlink.h
@@ -104,7 +104,8 @@ struct ksmbd_startup_request {
*/
__u32 sub_auth[3]; /* Subauth value for Security ID */
__u32 smb2_max_credits; /* MAX credits */
- __u32 reserved[128]; /* Reserved room */
+ __u32 smbd_max_io_size; /* smbd read write size */
+ __u32 reserved[127]; /* Reserved room */
__u32 ifc_list_sz; /* interfaces list size */
__s8 ____payload[];
};
@@ -162,7 +163,8 @@ struct ksmbd_share_config_response {
__u16 force_directory_mode;
__u16 force_uid;
__u16 force_gid;
- __u32 reserved[128]; /* Reserved room */
+ __s8 share_name[KSMBD_REQ_MAX_SHARE_NAME];
+ __u32 reserved[112]; /* Reserved room */
__u32 veto_list_sz;
__s8 ____payload[];
};
@@ -348,6 +350,7 @@ enum KSMBD_TREE_CONN_STATUS {
#define KSMBD_SHARE_FLAG_STREAMS BIT(11)
#define KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS BIT(12)
#define KSMBD_SHARE_FLAG_ACL_XATTR BIT(13)
+#define KSMBD_SHARE_FLAG_UPDATE BIT(14)
/*
* Tree connect request flags.
@@ -363,6 +366,7 @@ enum KSMBD_TREE_CONN_STATUS {
#define KSMBD_TREE_CONN_FLAG_READ_ONLY BIT(1)
#define KSMBD_TREE_CONN_FLAG_WRITABLE BIT(2)
#define KSMBD_TREE_CONN_FLAG_ADMIN_ACCOUNT BIT(3)
+#define KSMBD_TREE_CONN_FLAG_UPDATE BIT(4)
/*
* RPC over IPC.
diff --git a/fs/ksmbd/mgmt/share_config.c b/fs/ksmbd/mgmt/share_config.c
index cb72d30f5b71..328a412259dc 100644
--- a/fs/ksmbd/mgmt/share_config.c
+++ b/fs/ksmbd/mgmt/share_config.c
@@ -16,6 +16,7 @@
#include "user_config.h"
#include "user_session.h"
#include "../transport_ipc.h"
+#include "../misc.h"
#define SHARE_HASH_BITS 3
static DEFINE_HASHTABLE(shares_table, SHARE_HASH_BITS);
@@ -26,7 +27,7 @@ struct ksmbd_veto_pattern {
struct list_head list;
};
-static unsigned int share_name_hash(char *name)
+static unsigned int share_name_hash(const char *name)
{
return jhash(name, strlen(name), 0);
}
@@ -51,12 +52,16 @@ static void kill_share(struct ksmbd_share_config *share)
kfree(share);
}
-void __ksmbd_share_config_put(struct ksmbd_share_config *share)
+void ksmbd_share_config_del(struct ksmbd_share_config *share)
{
down_write(&shares_table_lock);
hash_del(&share->hlist);
up_write(&shares_table_lock);
+}
+void __ksmbd_share_config_put(struct ksmbd_share_config *share)
+{
+ ksmbd_share_config_del(share);
kill_share(share);
}
@@ -68,7 +73,7 @@ __get_share_config(struct ksmbd_share_config *share)
return share;
}
-static struct ksmbd_share_config *__share_lookup(char *name)
+static struct ksmbd_share_config *__share_lookup(const char *name)
{
struct ksmbd_share_config *share;
unsigned int key = share_name_hash(name);
@@ -115,7 +120,8 @@ static int parse_veto_list(struct ksmbd_share_config *share,
return 0;
}
-static struct ksmbd_share_config *share_config_request(char *name)
+static struct ksmbd_share_config *share_config_request(struct unicode_map *um,
+ const char *name)
{
struct ksmbd_share_config_response *resp;
struct ksmbd_share_config *share = NULL;
@@ -129,6 +135,19 @@ static struct ksmbd_share_config *share_config_request(char *name)
if (resp->flags == KSMBD_SHARE_FLAG_INVALID)
goto out;
+ if (*resp->share_name) {
+ char *cf_resp_name;
+ bool equal;
+
+ cf_resp_name = ksmbd_casefold_sharename(um, resp->share_name);
+ if (IS_ERR(cf_resp_name))
+ goto out;
+ equal = !strcmp(cf_resp_name, name);
+ kfree(cf_resp_name);
+ if (!equal)
+ goto out;
+ }
+
share = kzalloc(sizeof(struct ksmbd_share_config), GFP_KERNEL);
if (!share)
goto out;
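
share_config_request() now verifies that the share name returned by
userspace casefolds to the requested name, using the connection's unicode
map. A hedged sketch of the utf8_casefold() call it builds on (buffer
size and error handling are illustrative):

    #include <linux/unicode.h>
    #include <linux/dcache.h>   /* struct qstr / QSTR_INIT */

    struct unicode_map *um = utf8_load(UNICODE_AGE(12, 1, 0));
    const struct qstr q = QSTR_INIT(name, strlen(name));
    unsigned char folded[64];
    int len;

    if (!IS_ERR(um)) {
        len = utf8_casefold(um, &q, folded, sizeof(folded));
        if (len > 0)
            /* folded[0..len) is the casefolded form of name */;
        utf8_unload(um);
    }
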
@@ -186,20 +205,11 @@ out:
return share;
}
-static void strtolower(char *share_name)
-{
- while (*share_name) {
- *share_name = tolower(*share_name);
- share_name++;
- }
-}
-
-struct ksmbd_share_config *ksmbd_share_config_get(char *name)
+struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um,
+ const char *name)
{
struct ksmbd_share_config *share;
- strtolower(name);
-
down_read(&shares_table_lock);
share = __share_lookup(name);
if (share)
@@ -208,7 +218,7 @@ struct ksmbd_share_config *ksmbd_share_config_get(char *name)
if (share)
return share;
- return share_config_request(name);
+ return share_config_request(um, name);
}
bool ksmbd_share_veto_filename(struct ksmbd_share_config *share,
@@ -222,17 +232,3 @@ bool ksmbd_share_veto_filename(struct ksmbd_share_config *share,
}
return false;
}
-
-void ksmbd_share_configs_cleanup(void)
-{
- struct ksmbd_share_config *share;
- struct hlist_node *tmp;
- int i;
-
- down_write(&shares_table_lock);
- hash_for_each_safe(shares_table, i, tmp, share, hlist) {
- hash_del(&share->hlist);
- kill_share(share);
- }
- up_write(&shares_table_lock);
-}
diff --git a/fs/ksmbd/mgmt/share_config.h b/fs/ksmbd/mgmt/share_config.h
index 953befc94e84..3fd338293942 100644
--- a/fs/ksmbd/mgmt/share_config.h
+++ b/fs/ksmbd/mgmt/share_config.h
@@ -9,6 +9,7 @@
#include <linux/workqueue.h>
#include <linux/hashtable.h>
#include <linux/path.h>
+#include <linux/unicode.h>
struct ksmbd_share_config {
char *name;
@@ -64,6 +65,7 @@ static inline int test_share_config_flag(struct ksmbd_share_config *share,
return share->flags & flag;
}
+void ksmbd_share_config_del(struct ksmbd_share_config *share);
void __ksmbd_share_config_put(struct ksmbd_share_config *share);
static inline void ksmbd_share_config_put(struct ksmbd_share_config *share)
@@ -73,9 +75,8 @@ static inline void ksmbd_share_config_put(struct ksmbd_share_config *share)
__ksmbd_share_config_put(share);
}
-struct ksmbd_share_config *ksmbd_share_config_get(char *name);
+struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um,
+ const char *name);
bool ksmbd_share_veto_filename(struct ksmbd_share_config *share,
const char *filename);
-void ksmbd_share_configs_cleanup(void);
-
#endif /* __SHARE_CONFIG_MANAGEMENT_H__ */
diff --git a/fs/ksmbd/mgmt/tree_connect.c b/fs/ksmbd/mgmt/tree_connect.c
index 0d28e723a28c..8ce17b3fb8da 100644
--- a/fs/ksmbd/mgmt/tree_connect.c
+++ b/fs/ksmbd/mgmt/tree_connect.c
@@ -16,16 +16,17 @@
#include "user_session.h"
struct ksmbd_tree_conn_status
-ksmbd_tree_conn_connect(struct ksmbd_session *sess, char *share_name)
+ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
+ const char *share_name)
{
- struct ksmbd_tree_conn_status status = {-EINVAL, NULL};
+ struct ksmbd_tree_conn_status status = {-ENOENT, NULL};
struct ksmbd_tree_connect_response *resp = NULL;
struct ksmbd_share_config *sc;
struct ksmbd_tree_connect *tree_conn = NULL;
struct sockaddr *peer_addr;
int ret;
- sc = ksmbd_share_config_get(share_name);
+ sc = ksmbd_share_config_get(conn->um, share_name);
if (!sc)
return status;
@@ -41,7 +42,7 @@ ksmbd_tree_conn_connect(struct ksmbd_session *sess, char *share_name)
goto out_error;
}
- peer_addr = KSMBD_TCP_PEER_SOCKADDR(sess->conn);
+ peer_addr = KSMBD_TCP_PEER_SOCKADDR(conn);
resp = ksmbd_ipc_tree_connect_request(sess,
sc,
tree_conn,
@@ -56,6 +57,20 @@ ksmbd_tree_conn_connect(struct ksmbd_session *sess, char *share_name)
goto out_error;
tree_conn->flags = resp->connection_flags;
+ if (test_tree_conn_flag(tree_conn, KSMBD_TREE_CONN_FLAG_UPDATE)) {
+ struct ksmbd_share_config *new_sc;
+
+ ksmbd_share_config_del(sc);
+ new_sc = ksmbd_share_config_get(conn->um, share_name);
+ if (!new_sc) {
+ pr_err("Failed to update stale share config\n");
+ status.ret = -ESTALE;
+ goto out_error;
+ }
+ ksmbd_share_config_put(sc);
+ sc = new_sc;
+ }
+
tree_conn->user = sess->user;
tree_conn->share_conf = sc;
status.tree_conn = tree_conn;
diff --git a/fs/ksmbd/mgmt/tree_connect.h b/fs/ksmbd/mgmt/tree_connect.h
index 18e2a996e0aa..0f97ddc1e39c 100644
--- a/fs/ksmbd/mgmt/tree_connect.h
+++ b/fs/ksmbd/mgmt/tree_connect.h
@@ -12,6 +12,7 @@
struct ksmbd_share_config;
struct ksmbd_user;
+struct ksmbd_conn;
struct ksmbd_tree_connect {
int id;
@@ -40,7 +41,8 @@ static inline int test_tree_conn_flag(struct ksmbd_tree_connect *tree_conn,
struct ksmbd_session;
struct ksmbd_tree_conn_status
-ksmbd_tree_conn_connect(struct ksmbd_session *sess, char *share_name);
+ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
+ const char *share_name);
int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
struct ksmbd_tree_connect *tree_conn);
diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c
index 8d8ffd8c6f19..3fa2139a0b30 100644
--- a/fs/ksmbd/mgmt/user_session.c
+++ b/fs/ksmbd/mgmt/user_session.c
@@ -32,11 +32,13 @@ static void free_channel_list(struct ksmbd_session *sess)
{
struct channel *chann, *tmp;
+ write_lock(&sess->chann_lock);
list_for_each_entry_safe(chann, tmp, &sess->ksmbd_chann_list,
chann_list) {
list_del(&chann->chann_list);
kfree(chann);
}
+ write_unlock(&sess->chann_lock);
}
static void __session_rpc_close(struct ksmbd_session *sess,
@@ -149,11 +151,6 @@ void ksmbd_session_destroy(struct ksmbd_session *sess)
if (!sess)
return;
- if (!atomic_dec_and_test(&sess->refcnt))
- return;
-
- list_del(&sess->sessions_entry);
-
down_write(&sessions_table_lock);
hash_del(&sess->hlist);
up_write(&sessions_table_lock);
@@ -181,53 +178,70 @@ static struct ksmbd_session *__session_lookup(unsigned long long id)
return NULL;
}
-void ksmbd_session_register(struct ksmbd_conn *conn,
- struct ksmbd_session *sess)
+int ksmbd_session_register(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess)
{
- sess->conn = conn;
- list_add(&sess->sessions_entry, &conn->sessions);
+ sess->dialect = conn->dialect;
+ memcpy(sess->ClientGUID, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE);
+ return xa_err(xa_store(&conn->sessions, sess->id, sess, GFP_KERNEL));
}
-void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
+static int ksmbd_chann_del(struct ksmbd_conn *conn, struct ksmbd_session *sess)
{
- struct ksmbd_session *sess;
-
- while (!list_empty(&conn->sessions)) {
- sess = list_entry(conn->sessions.next,
- struct ksmbd_session,
- sessions_entry);
+ struct channel *chann, *tmp;
- ksmbd_session_destroy(sess);
+ write_lock(&sess->chann_lock);
+ list_for_each_entry_safe(chann, tmp, &sess->ksmbd_chann_list,
+ chann_list) {
+ if (chann->conn == conn) {
+ list_del(&chann->chann_list);
+ kfree(chann);
+ write_unlock(&sess->chann_lock);
+ return 0;
+ }
}
-}
+ write_unlock(&sess->chann_lock);
-static bool ksmbd_session_id_match(struct ksmbd_session *sess,
- unsigned long long id)
-{
- return sess->id == id;
+ return -ENOENT;
}
-struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
- unsigned long long id)
+void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
{
- struct ksmbd_session *sess = NULL;
+ struct ksmbd_session *sess;
- list_for_each_entry(sess, &conn->sessions, sessions_entry) {
- if (ksmbd_session_id_match(sess, id))
- return sess;
+ if (conn->binding) {
+ int bkt;
+
+ down_write(&sessions_table_lock);
+ hash_for_each(sessions_table, bkt, sess, hlist) {
+ if (!ksmbd_chann_del(conn, sess)) {
+ up_write(&sessions_table_lock);
+ goto sess_destroy;
+ }
+ }
+ up_write(&sessions_table_lock);
+ } else {
+ unsigned long id;
+
+ xa_for_each(&conn->sessions, id, sess) {
+ if (!ksmbd_chann_del(conn, sess))
+ goto sess_destroy;
+ }
}
- return NULL;
-}
-int get_session(struct ksmbd_session *sess)
-{
- return atomic_inc_not_zero(&sess->refcnt);
+ return;
+
+sess_destroy:
+ if (list_empty(&sess->ksmbd_chann_list)) {
+ xa_erase(&conn->sessions, sess->id);
+ ksmbd_session_destroy(sess);
+ }
}
-void put_session(struct ksmbd_session *sess)
+struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
+ unsigned long long id)
{
- if (atomic_dec_and_test(&sess->refcnt))
- pr_err("get/%s seems to be mismatched.", __func__);
+ return xa_load(&conn->sessions, id);
}
struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id)
@@ -236,10 +250,6 @@ struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id)
down_read(&sessions_table_lock);
sess = __session_lookup(id);
- if (sess) {
- if (!get_session(sess))
- sess = NULL;
- }
up_read(&sessions_table_lock);
return sess;
@@ -253,6 +263,8 @@ struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn,
sess = ksmbd_session_lookup(conn, id);
if (!sess && conn->binding)
sess = ksmbd_session_lookup_slowpath(id);
+ if (sess && sess->state != SMB2_SESSION_VALID)
+ sess = NULL;
return sess;
}
@@ -314,12 +326,11 @@ static struct ksmbd_session *__session_create(int protocol)
goto error;
set_session_flag(sess, protocol);
- INIT_LIST_HEAD(&sess->sessions_entry);
xa_init(&sess->tree_conns);
INIT_LIST_HEAD(&sess->ksmbd_chann_list);
INIT_LIST_HEAD(&sess->rpc_handle_list);
sess->sequence_number = 1;
- atomic_set(&sess->refcnt, 1);
+ rwlock_init(&sess->chann_lock);
switch (protocol) {
case CIFDS_SESSION_FLAG_SMB2:
diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/ksmbd/mgmt/user_session.h
index e241f16a3851..8934b8ee275b 100644
--- a/fs/ksmbd/mgmt/user_session.h
+++ b/fs/ksmbd/mgmt/user_session.h
@@ -33,8 +33,10 @@ struct preauth_session {
struct ksmbd_session {
u64 id;
+ __u16 dialect;
+ char ClientGUID[SMB2_CLIENT_GUID_SIZE];
+
struct ksmbd_user *user;
- struct ksmbd_conn *conn;
unsigned int sequence_number;
unsigned int flags;
@@ -48,6 +50,7 @@ struct ksmbd_session {
char sess_key[CIFS_KEY_SIZE];
struct hlist_node hlist;
+ rwlock_t chann_lock;
struct list_head ksmbd_chann_list;
struct xarray tree_conns;
struct ida tree_conn_ida;
@@ -57,9 +60,7 @@ struct ksmbd_session {
__u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE];
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
- struct list_head sessions_entry;
struct ksmbd_file_table file_table;
- atomic_t refcnt;
};
static inline int test_session_flag(struct ksmbd_session *sess, int bit)
@@ -84,8 +85,8 @@ void ksmbd_session_destroy(struct ksmbd_session *sess);
struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id);
struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
unsigned long long id);
-void ksmbd_session_register(struct ksmbd_conn *conn,
- struct ksmbd_session *sess);
+int ksmbd_session_register(struct ksmbd_conn *conn,
+ struct ksmbd_session *sess);
void ksmbd_sessions_deregister(struct ksmbd_conn *conn);
struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn,
unsigned long long id);
@@ -100,6 +101,4 @@ void ksmbd_release_tree_conn_id(struct ksmbd_session *sess, int id);
int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name);
void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id);
int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id);
-int get_session(struct ksmbd_session *sess);
-void put_session(struct ksmbd_session *sess);
#endif /* __USER_SESSION_MANAGEMENT_H__ */
diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c
index 60e7ac62c917..9e8afaa686e3 100644
--- a/fs/ksmbd/misc.c
+++ b/fs/ksmbd/misc.c
@@ -7,6 +7,7 @@
#include <linux/kernel.h>
#include <linux/xattr.h>
#include <linux/fs.h>
+#include <linux/unicode.h>
#include "misc.h"
#include "smb_common.h"
@@ -20,7 +21,7 @@
* wildcard '*' and '?'
* TODO : implement consideration about DOS_DOT, DOS_QM and DOS_STAR
*
- * @string: string to compare with a pattern
+ * @str: string to compare with a pattern
* @len: string length
* @pattern: pattern string which might include wildcard '*' and '?'
*
@@ -152,25 +153,47 @@ out:
/**
* convert_to_nt_pathname() - extract and return windows path string
* whose share directory prefix was removed from file path
- * @filename : unix filename
- * @sharepath: share path string
+ * @share: ksmbd_share_config pointer
+ * @path: path to report
*
* Return : windows path string or error
*/
-char *convert_to_nt_pathname(char *filename)
+char *convert_to_nt_pathname(struct ksmbd_share_config *share,
+ const struct path *path)
{
- char *ab_pathname;
+ char *pathname, *ab_pathname, *nt_pathname;
+ int share_path_len = share->path_sz;
- if (strlen(filename) == 0)
- filename = "\\";
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!pathname)
+ return ERR_PTR(-EACCES);
- ab_pathname = kstrdup(filename, GFP_KERNEL);
- if (!ab_pathname)
- return NULL;
+ ab_pathname = d_path(path, pathname, PATH_MAX);
+ if (IS_ERR(ab_pathname)) {
+ nt_pathname = ERR_PTR(-EACCES);
+ goto free_pathname;
+ }
+
+ if (strncmp(ab_pathname, share->path, share_path_len)) {
+ nt_pathname = ERR_PTR(-EACCES);
+ goto free_pathname;
+ }
+
+ nt_pathname = kzalloc(strlen(&ab_pathname[share_path_len]) + 2, GFP_KERNEL);
+ if (!nt_pathname) {
+ nt_pathname = ERR_PTR(-ENOMEM);
+ goto free_pathname;
+ }
+ if (ab_pathname[share_path_len] == '\0')
+ strcpy(nt_pathname, "/");
+ strcat(nt_pathname, &ab_pathname[share_path_len]);
+
+ ksmbd_conv_path_to_windows(nt_pathname);
- ksmbd_conv_path_to_windows(ab_pathname);
- return ab_pathname;
+free_pathname:
+ kfree(pathname);
+ return nt_pathname;
}
int get_nlink(struct kstat *st)
@@ -204,32 +227,59 @@ void ksmbd_conv_path_to_windows(char *path)
strreplace(path, '/', '\\');
}
+char *ksmbd_casefold_sharename(struct unicode_map *um, const char *name)
+{
+ char *cf_name;
+ int cf_len;
+
+ cf_name = kzalloc(KSMBD_REQ_MAX_SHARE_NAME, GFP_KERNEL);
+ if (!cf_name)
+ return ERR_PTR(-ENOMEM);
+
+ if (IS_ENABLED(CONFIG_UNICODE) && um) {
+ const struct qstr q_name = {.name = name, .len = strlen(name)};
+
+ cf_len = utf8_casefold(um, &q_name, cf_name,
+ KSMBD_REQ_MAX_SHARE_NAME);
+ if (cf_len < 0)
+ goto out_ascii;
+
+ return cf_name;
+ }
+
+out_ascii:
+ cf_len = strscpy(cf_name, name, KSMBD_REQ_MAX_SHARE_NAME);
+ if (cf_len < 0) {
+ kfree(cf_name);
+ return ERR_PTR(-E2BIG);
+ }
+
+ for (; *cf_name; ++cf_name)
+ *cf_name = isascii(*cf_name) ? tolower(*cf_name) : *cf_name;
+ return cf_name - cf_len;
+}
+
/**
* ksmbd_extract_sharename() - get share name from tree connect request
* @treename: buffer containing tree name and share name
*
* Return: share name on success, otherwise error
*/
-char *ksmbd_extract_sharename(char *treename)
+char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename)
{
- char *name = treename;
- char *dst;
- char *pos = strrchr(name, '\\');
+ const char *name = treename, *pos = strrchr(name, '\\');
if (pos)
name = (pos + 1);
/* caller has to free the memory */
- dst = kstrdup(name, GFP_KERNEL);
- if (!dst)
- return ERR_PTR(-ENOMEM);
- return dst;
+ return ksmbd_casefold_sharename(um, name);
}
/**
* convert_to_unix_name() - convert windows name to unix format
- * @path: name to be converted
- * @tid: tree id of mathing share
+ * @share: ksmbd_share_config pointer
+ * @name: file name that is relative to share
*
* Return: converted name on success, otherwise NULL
*/
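
The new ksmbd_casefold_sharename() above prefers utf8_casefold() when CONFIG_UNICODE is enabled and falls back to plain ASCII lower-casing otherwise. A minimal userspace sketch of that fallback path, assuming only the standard C library (an illustration, not the kernel code itself):

	/* Illustrative sketch of the ASCII casefold fallback; 'max' plays
	 * the role of KSMBD_REQ_MAX_SHARE_NAME. Caller frees the result. */
	#include <ctype.h>
	#include <stdlib.h>
	#include <string.h>

	static char *ascii_casefold_dup(const char *name, size_t max)
	{
		size_t len = strnlen(name, max - 1);
		char *cf = calloc(1, max);

		if (!cf)
			return NULL;
		memcpy(cf, name, len);
		for (size_t i = 0; i < len; i++)
			if (isascii((unsigned char)cf[i]))
				cf[i] = tolower((unsigned char)cf[i]);
		return cf;
	}
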
diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h
index 253366bd0951..1facfcd21200 100644
--- a/fs/ksmbd/misc.h
+++ b/fs/ksmbd/misc.h
@@ -14,12 +14,14 @@ struct ksmbd_file;
int match_pattern(const char *str, size_t len, const char *pattern);
int ksmbd_validate_filename(char *filename);
int parse_stream_name(char *filename, char **stream_name, int *s_type);
-char *convert_to_nt_pathname(char *filename);
+char *convert_to_nt_pathname(struct ksmbd_share_config *share,
+ const struct path *path);
int get_nlink(struct kstat *st);
void ksmbd_conv_path_to_unix(char *path);
void ksmbd_strip_last_slash(char *path);
void ksmbd_conv_path_to_windows(char *path);
-char *ksmbd_extract_sharename(char *treename);
+char *ksmbd_casefold_sharename(struct unicode_map *um, const char *name);
+char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename);
char *convert_to_unix_name(struct ksmbd_share_config *share, const char *name);
#define KSMBD_DIR_INFO_ALIGNMENT 8
diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c
index 5052be9261d9..0ae8d08d85a8 100644
--- a/fs/ksmbd/ndr.c
+++ b/fs/ksmbd/ndr.c
@@ -345,6 +345,8 @@ int ndr_encode_posix_acl(struct ndr *n,
{
unsigned int ref_id = 0x00020000;
int ret;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
n->offset = 0;
n->length = 1024;
@@ -372,10 +374,12 @@ int ndr_encode_posix_acl(struct ndr *n,
if (ret)
return ret;
- ret = ndr_write_int64(n, from_kuid(&init_user_ns, i_uid_into_mnt(user_ns, inode)));
+ vfsuid = i_uid_into_vfsuid(user_ns, inode);
+ ret = ndr_write_int64(n, from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid)));
if (ret)
return ret;
- ret = ndr_write_int64(n, from_kgid(&init_user_ns, i_gid_into_mnt(user_ns, inode)));
+ vfsgid = i_gid_into_vfsgid(user_ns, inode);
+ ret = ndr_write_int64(n, from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid)));
if (ret)
return ret;
ret = ndr_write_int32(n, inode->i_mode);
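
The ndr.c hunk above follows the kernel-wide idmapped-mounts conversion: instead of i_uid_into_mnt()/i_gid_into_mnt() returning a kuid/kgid directly, the inode id is first mapped into a vfsuid_t/vfsgid_t and only then converted back for the wire encoding. The pattern in isolation (a fragment repeated here only for readability):

	/* Map the inode uid through the mount's idmapping, then
	 * translate into init_user_ns for on-the-wire encoding. */
	vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode);
	u64 wire_uid = from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid));
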
diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c
index 23871b18a429..d7d47b82451d 100644
--- a/fs/ksmbd/oplock.c
+++ b/fs/ksmbd/oplock.c
@@ -30,6 +30,7 @@ static DEFINE_RWLOCK(lease_list_lock);
static struct oplock_info *alloc_opinfo(struct ksmbd_work *work,
u64 id, __u16 Tid)
{
+ struct ksmbd_conn *conn = work->conn;
struct ksmbd_session *sess = work->sess;
struct oplock_info *opinfo;
@@ -38,7 +39,7 @@ static struct oplock_info *alloc_opinfo(struct ksmbd_work *work,
return NULL;
opinfo->sess = sess;
- opinfo->conn = sess->conn;
+ opinfo->conn = conn;
opinfo->level = SMB2_OPLOCK_LEVEL_NONE;
opinfo->op_state = OPLOCK_STATE_NONE;
opinfo->pending_break = 0;
@@ -615,18 +616,13 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
struct ksmbd_file *fp;
fp = ksmbd_lookup_durable_fd(br_info->fid);
- if (!fp) {
- atomic_dec(&conn->r_count);
- ksmbd_free_work_struct(work);
- return;
- }
+ if (!fp)
+ goto out;
if (allocate_oplock_break_buf(work)) {
pr_err("smb2_allocate_rsp_buf failed! ");
- atomic_dec(&conn->r_count);
ksmbd_fd_put(work, fp);
- ksmbd_free_work_struct(work);
- return;
+ goto out;
}
rsp_hdr = smb2_get_msg(work->response_buf);
@@ -667,8 +663,16 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
ksmbd_fd_put(work, fp);
ksmbd_conn_write(work);
+
+out:
ksmbd_free_work_struct(work);
- atomic_dec(&conn->r_count);
+ /*
+ * Check the waitqueue so pending requests can be dropped on
+ * disconnection. waitqueue_active() is safe here because the
+ * wait condition is updated with an atomic operation.
+ */
+ if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q))
+ wake_up(&conn->r_count_q);
}
/**
@@ -731,9 +735,7 @@ static void __smb2_lease_break_noti(struct work_struct *wk)
if (allocate_oplock_break_buf(work)) {
ksmbd_debug(OPLOCK, "smb2_allocate_rsp_buf failed! ");
- ksmbd_free_work_struct(work);
- atomic_dec(&conn->r_count);
- return;
+ goto out;
}
rsp_hdr = smb2_get_msg(work->response_buf);
@@ -771,8 +773,16 @@ static void __smb2_lease_break_noti(struct work_struct *wk)
inc_rfc1001_len(work->response_buf, 44);
ksmbd_conn_write(work);
+
+out:
ksmbd_free_work_struct(work);
- atomic_dec(&conn->r_count);
+ /*
+ * Check the waitqueue so pending requests can be dropped on
+ * disconnection. waitqueue_active() is safe here because the
+ * wait condition is updated with an atomic operation.
+ */
+ if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q))
+ wake_up(&conn->r_count_q);
}
/**
@@ -972,7 +982,7 @@ int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci,
}
list_for_each_entry(lb, &lease_table_list, l_entry) {
- if (!memcmp(lb->client_guid, sess->conn->ClientGUID,
+ if (!memcmp(lb->client_guid, sess->ClientGUID,
SMB2_CLIENT_GUID_SIZE))
goto found;
}
@@ -988,7 +998,7 @@ found:
rcu_read_unlock();
if (opinfo->o_fp->f_ci == ci)
goto op_next;
- err = compare_guid_key(opinfo, sess->conn->ClientGUID,
+ err = compare_guid_key(opinfo, sess->ClientGUID,
lctx->lease_key);
if (err) {
err = -EINVAL;
@@ -1122,7 +1132,7 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid,
struct oplock_info *m_opinfo;
/* is lease already granted ? */
- m_opinfo = same_client_has_lease(ci, sess->conn->ClientGUID,
+ m_opinfo = same_client_has_lease(ci, sess->ClientGUID,
lctx);
if (m_opinfo) {
copy_lease(m_opinfo, opinfo);
@@ -1240,7 +1250,7 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
{
struct oplock_info *op, *brk_op;
struct ksmbd_inode *ci;
- struct ksmbd_conn *conn = work->sess->conn;
+ struct ksmbd_conn *conn = work->conn;
if (!test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_OPLOCKS))
@@ -1599,12 +1609,18 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp)
struct create_posix_rsp *buf;
struct inode *inode = file_inode(fp->filp);
struct user_namespace *user_ns = file_mnt_user_ns(fp->filp);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode);
buf = (struct create_posix_rsp *)cc;
memset(buf, 0, sizeof(struct create_posix_rsp));
buf->ccontext.DataOffset = cpu_to_le16(offsetof
(struct create_posix_rsp, nlink));
- buf->ccontext.DataLength = cpu_to_le32(52);
+ /*
+ * DataLength = nlink(4) + reparse_tag(4) + mode(4) +
+ * domain sid(28) + unix group sid(16).
+ */
+ buf->ccontext.DataLength = cpu_to_le32(56);
buf->ccontext.NameOffset = cpu_to_le16(offsetof
(struct create_posix_rsp, Name));
buf->ccontext.NameLength = cpu_to_le16(POSIX_CTXT_DATA_LEN);
@@ -1628,13 +1644,18 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp)
buf->nlink = cpu_to_le32(inode->i_nlink);
buf->reparse_tag = cpu_to_le32(fp->volatile_id);
- buf->mode = cpu_to_le32(inode->i_mode);
- id_to_sid(from_kuid_munged(&init_user_ns,
- i_uid_into_mnt(user_ns, inode)),
- SIDNFS_USER, (struct smb_sid *)&buf->SidBuffer[0]);
- id_to_sid(from_kgid_munged(&init_user_ns,
- i_gid_into_mnt(user_ns, inode)),
- SIDNFS_GROUP, (struct smb_sid *)&buf->SidBuffer[20]);
+ buf->mode = cpu_to_le32(inode->i_mode & 0777);
+ /*
+ * SidBuffer(44) contains two sids (Domain sid(28), UNIX group sid(16)).
+ * Domain sid(28) = revision(1) + num_subauth(1) + authority(6) +
+ * sub_auth(4 * 4(num_subauth)) + RID(4).
+ * UNIX group sid(16) = revision(1) + num_subauth(1) + authority(6) +
+ * sub_auth(4 * 1(num_subauth)) + RID(4).
+ */
+ id_to_sid(from_kuid_munged(&init_user_ns, vfsuid_into_kuid(vfsuid)),
+ SIDOWNER, (struct smb_sid *)&buf->SidBuffer[0]);
+ id_to_sid(from_kgid_munged(&init_user_ns, vfsgid_into_kgid(vfsgid)),
+ SIDUNIX_GROUP, (struct smb_sid *)&buf->SidBuffer[28]);
}
/*
@@ -1694,33 +1715,3 @@ out:
read_unlock(&lease_list_lock);
return ret_op;
}
-
-int smb2_check_durable_oplock(struct ksmbd_file *fp,
- struct lease_ctx_info *lctx, char *name)
-{
- struct oplock_info *opinfo = opinfo_get(fp);
- int ret = 0;
-
- if (opinfo && opinfo->is_lease) {
- if (!lctx) {
- pr_err("open does not include lease\n");
- ret = -EBADF;
- goto out;
- }
- if (memcmp(opinfo->o_lease->lease_key, lctx->lease_key,
- SMB2_LEASE_KEY_SIZE)) {
- pr_err("invalid lease key\n");
- ret = -EBADF;
- goto out;
- }
- if (name && strcmp(fp->filename, name)) {
- pr_err("invalid name reconnect %s\n", name);
- ret = -EINVAL;
- goto out;
- }
- }
-out:
- if (opinfo)
- opinfo_put(opinfo);
- return ret;
-}
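
Both break-notification paths above (and handle_ksmbd_work() further down) now end in the same decrement-and-wake idiom so connection teardown can wait for in-flight work to drain. A sketch of the two halves; the wait side is an assumption about how teardown consumes r_count_q and is not shown in this diff:

	/* Work-item side: the last reference wakes any waiter. */
	if (!atomic_dec_return(&conn->r_count) &&
	    waitqueue_active(&conn->r_count_q))
		wake_up(&conn->r_count_q);

	/* Teardown side (assumed shape): block until all pending
	 * requests have dropped their reference. */
	wait_event(conn->r_count_q, atomic_read(&conn->r_count) == 0);
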
diff --git a/fs/ksmbd/oplock.h b/fs/ksmbd/oplock.h
index 0cf7a2b5bbc0..09753448f779 100644
--- a/fs/ksmbd/oplock.h
+++ b/fs/ksmbd/oplock.h
@@ -124,6 +124,4 @@ struct oplock_info *lookup_lease_in_table(struct ksmbd_conn *conn,
int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci,
struct lease_ctx_info *lctx);
void destroy_lease_table(struct ksmbd_conn *conn);
-int smb2_check_durable_oplock(struct ksmbd_file *fp,
- struct lease_ctx_info *lctx, char *name);
#endif /* __KSMBD_OPLOCK_H */
diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c
index 4cd03d661df0..a0d635304754 100644
--- a/fs/ksmbd/server.c
+++ b/fs/ksmbd/server.c
@@ -235,10 +235,8 @@ send:
if (work->sess && work->sess->enc && work->encrypted &&
conn->ops->encrypt_resp) {
rc = conn->ops->encrypt_resp(work);
- if (rc < 0) {
+ if (rc < 0)
conn->ops->set_rsp_status(work, STATUS_DATA_ERROR);
- goto send;
- }
}
ksmbd_conn_write(work);
@@ -261,7 +259,13 @@ static void handle_ksmbd_work(struct work_struct *wk)
ksmbd_conn_try_dequeue_request(work);
ksmbd_free_work_struct(work);
- atomic_dec(&conn->r_count);
+ /*
+ * Check the waitqueue so pending requests can be dropped on
+ * disconnection. waitqueue_active() is safe here because the
+ * wait condition is updated with an atomic operation.
+ */
+ if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q))
+ wake_up(&conn->r_count_q);
}
/**
diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c
index 4a9460153b59..6e25ace36568 100644
--- a/fs/ksmbd/smb2misc.c
+++ b/fs/ksmbd/smb2misc.c
@@ -90,11 +90,6 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
*off = 0;
*len = 0;
- /* error reqeusts do not have data area */
- if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
- (((struct smb2_err_rsp *)hdr)->StructureSize) == SMB2_ERROR_STRUCTURE_SIZE2_LE)
- return ret;
-
/*
* Following commands have data areas so we have to get the location
* of the data buffer offset and data buffer length for the particular
@@ -136,8 +131,11 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
*len = le16_to_cpu(((struct smb2_read_req *)hdr)->ReadChannelInfoLength);
break;
case SMB2_WRITE:
- if (((struct smb2_write_req *)hdr)->DataOffset) {
- *off = le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset);
+ if (((struct smb2_write_req *)hdr)->DataOffset ||
+ ((struct smb2_write_req *)hdr)->Length) {
+ *off = max_t(unsigned int,
+ le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset),
+ offsetof(struct smb2_write_req, Buffer));
*len = le32_to_cpu(((struct smb2_write_req *)hdr)->Length);
break;
}
@@ -338,7 +336,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn,
ret = 1;
}
- if ((u64)conn->outstanding_credits + credit_charge > conn->vals->max_credits) {
+ if ((u64)conn->outstanding_credits + credit_charge > conn->total_credits) {
ksmbd_debug(SMB, "Limits exceeding the maximum allowable outstanding requests, given : %u, pending : %u\n",
credit_charge, conn->outstanding_credits);
ret = 1;
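
The WRITE case above no longer trusts a zero DataOffset from the client: whenever DataOffset or Length is non-zero, the offset is clamped to at least the start of the fixed-size request header so later bounds checks start from a sane floor. The clamp, extracted for readability:

	/* A client-supplied data offset may never point inside the
	 * fixed part of the WRITE PDU. */
	*off = max_t(unsigned int,
		     le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset),
		     offsetof(struct smb2_write_req, Buffer));
	*len = le32_to_cpu(((struct smb2_write_req *)hdr)->Length);
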
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 3bf6c56c654c..b2fc85d440d0 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -11,6 +11,7 @@
#include <linux/statfs.h>
#include <linux/ethtool.h>
#include <linux/falloc.h>
+#include <linux/mount.h>
#include "glob.h"
#include "smbfsctl.h"
@@ -534,9 +535,10 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work)
struct smb2_query_info_req *req;
req = smb2_get_msg(work->request_buf);
- if (req->InfoType == SMB2_O_INFO_FILE &&
- (req->FileInfoClass == FILE_FULL_EA_INFORMATION ||
- req->FileInfoClass == FILE_ALL_INFORMATION))
+ if ((req->InfoType == SMB2_O_INFO_FILE &&
+ (req->FileInfoClass == FILE_FULL_EA_INFORMATION ||
+ req->FileInfoClass == FILE_ALL_INFORMATION)) ||
+ req->InfoType == SMB2_O_INFO_SECURITY)
sz = large_sz;
}
@@ -587,10 +589,12 @@ int smb2_check_user_session(struct ksmbd_work *work)
return -EINVAL;
}
-static void destroy_previous_session(struct ksmbd_user *user, u64 id)
+static void destroy_previous_session(struct ksmbd_conn *conn,
+ struct ksmbd_user *user, u64 id)
{
struct ksmbd_session *prev_sess = ksmbd_session_lookup_slowpath(id);
struct ksmbd_user *prev_user;
+ struct channel *chann;
if (!prev_sess)
return;
@@ -600,13 +604,14 @@ static void destroy_previous_session(struct ksmbd_user *user, u64 id)
if (!prev_user ||
strcmp(user->name, prev_user->name) ||
user->passkey_sz != prev_user->passkey_sz ||
- memcmp(user->passkey, prev_user->passkey, user->passkey_sz)) {
- put_session(prev_sess);
+ memcmp(user->passkey, prev_user->passkey, user->passkey_sz))
return;
- }
- put_session(prev_sess);
- ksmbd_session_destroy(prev_sess);
+ prev_sess->state = SMB2_SESSION_EXPIRED;
+ write_lock(&prev_sess->chann_lock);
+ list_for_each_entry(chann, &prev_sess->ksmbd_chann_list, chann_list)
+ chann->conn->status = KSMBD_SESS_EXITING;
+ write_unlock(&prev_sess->chann_lock);
}
/**
@@ -920,7 +925,7 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn,
*
* Return: true if connection should be encrypted, else false
*/
-static bool smb3_encryption_negotiated(struct ksmbd_conn *conn)
+bool smb3_encryption_negotiated(struct ksmbd_conn *conn)
{
if (!conn->ops->generate_encryptionkey)
return false;
@@ -1138,12 +1143,16 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
status);
rsp->hdr.Status = status;
rc = -EINVAL;
+ kfree(conn->preauth_info);
+ conn->preauth_info = NULL;
goto err_out;
}
rc = init_smb3_11_server(conn);
if (rc < 0) {
rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+ kfree(conn->preauth_info);
+ conn->preauth_info = NULL;
goto err_out;
}
@@ -1438,7 +1447,7 @@ static int ntlm_authenticate(struct ksmbd_work *work)
/* Check for previous session */
prev_id = le64_to_cpu(req->PreviousSessionId);
if (prev_id && prev_id != sess->id)
- destroy_previous_session(user, prev_id);
+ destroy_previous_session(conn, user, prev_id);
if (sess->state == SMB2_SESSION_VALID) {
/*
@@ -1492,7 +1501,7 @@ static int ntlm_authenticate(struct ksmbd_work *work)
if (smb3_encryption_negotiated(conn) &&
!(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) {
- rc = conn->ops->generate_encryptionkey(sess);
+ rc = conn->ops->generate_encryptionkey(conn, sess);
if (rc) {
ksmbd_debug(SMB,
"SMB3 encryption key generation failed\n");
@@ -1509,7 +1518,9 @@ static int ntlm_authenticate(struct ksmbd_work *work)
binding_session:
if (conn->dialect >= SMB30_PROT_ID) {
+ read_lock(&sess->chann_lock);
chann = lookup_chann_list(sess, conn);
+ read_unlock(&sess->chann_lock);
if (!chann) {
chann = kmalloc(sizeof(struct channel), GFP_KERNEL);
if (!chann)
@@ -1517,7 +1528,9 @@ binding_session:
chann->conn = conn;
INIT_LIST_HEAD(&chann->chann_list);
+ write_lock(&sess->chann_lock);
list_add(&chann->chann_list, &sess->ksmbd_chann_list);
+ write_unlock(&sess->chann_lock);
}
}
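
Both the NTLM and Kerberos paths now bracket channel handling with the new sess->chann_lock: a read lock around lookup_chann_list() and a write lock around list insertion, so a binding connection cannot race another channel joining the session. The locking shape, condensed from the hunks above:

	read_lock(&sess->chann_lock);
	chann = lookup_chann_list(sess, conn);		/* read side */
	read_unlock(&sess->chann_lock);

	if (!chann) {
		chann = kmalloc(sizeof(struct channel), GFP_KERNEL);
		if (!chann)
			return -ENOMEM;
		chann->conn = conn;
		write_lock(&sess->chann_lock);		/* write side */
		list_add(&chann->chann_list, &sess->ksmbd_chann_list);
		write_unlock(&sess->chann_lock);
	}
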
@@ -1560,7 +1573,7 @@ static int krb5_authenticate(struct ksmbd_work *work)
/* Check previous session */
prev_sess_id = le64_to_cpu(req->PreviousSessionId);
if (prev_sess_id && prev_sess_id != sess->id)
- destroy_previous_session(sess->user, prev_sess_id);
+ destroy_previous_session(conn, sess->user, prev_sess_id);
if (sess->state == SMB2_SESSION_VALID)
ksmbd_free_user(sess->user);
@@ -1579,7 +1592,7 @@ static int krb5_authenticate(struct ksmbd_work *work)
sess->sign = true;
if (smb3_encryption_negotiated(conn)) {
- retval = conn->ops->generate_encryptionkey(sess);
+ retval = conn->ops->generate_encryptionkey(conn, sess);
if (retval) {
ksmbd_debug(SMB,
"SMB3 encryption key generation failed\n");
@@ -1591,7 +1604,9 @@ static int krb5_authenticate(struct ksmbd_work *work)
}
if (conn->dialect >= SMB30_PROT_ID) {
+ read_lock(&sess->chann_lock);
chann = lookup_chann_list(sess, conn);
+ read_unlock(&sess->chann_lock);
if (!chann) {
chann = kmalloc(sizeof(struct channel), GFP_KERNEL);
if (!chann)
@@ -1599,7 +1614,9 @@ static int krb5_authenticate(struct ksmbd_work *work)
chann->conn = conn;
INIT_LIST_HEAD(&chann->chann_list);
+ write_lock(&sess->chann_lock);
list_add(&chann->chann_list, &sess->ksmbd_chann_list);
+ write_unlock(&sess->chann_lock);
}
}
@@ -1649,7 +1666,9 @@ int smb2_sess_setup(struct ksmbd_work *work)
goto out_err;
}
rsp->hdr.SessionId = cpu_to_le64(sess->id);
- ksmbd_session_register(conn, sess);
+ rc = ksmbd_session_register(conn, sess);
+ if (rc)
+ goto out_err;
} else if (conn->dialect >= SMB30_PROT_ID &&
(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) &&
req->Flags & SMB2_SESSION_REQ_FLAG_BINDING) {
@@ -1661,7 +1680,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
goto out_err;
}
- if (conn->dialect != sess->conn->dialect) {
+ if (conn->dialect != sess->dialect) {
rc = -EINVAL;
goto out_err;
}
@@ -1671,7 +1690,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
goto out_err;
}
- if (strncmp(conn->ClientGUID, sess->conn->ClientGUID,
+ if (strncmp(conn->ClientGUID, sess->ClientGUID,
SMB2_CLIENT_GUID_SIZE)) {
rc = -ENOENT;
goto out_err;
@@ -1827,6 +1846,7 @@ out_err:
if (sess->user && sess->user->flags & KSMBD_USER_FLAG_DELAY_SESSION)
try_delay = true;
+ xa_erase(&conn->sessions, sess->id);
ksmbd_session_destroy(sess);
work->sess = NULL;
if (try_delay)
@@ -1863,7 +1883,7 @@ int smb2_tree_connect(struct ksmbd_work *work)
goto out_err1;
}
- name = ksmbd_extract_sharename(treename);
+ name = ksmbd_extract_sharename(conn->um, treename);
if (IS_ERR(name)) {
status.ret = KSMBD_TREE_CONN_STATUS_ERROR;
goto out_err1;
@@ -1872,7 +1892,7 @@ int smb2_tree_connect(struct ksmbd_work *work)
ksmbd_debug(SMB, "tree connect request for tree %s treename %s\n",
name, treename);
- status = ksmbd_tree_conn_connect(sess, name);
+ status = ksmbd_tree_conn_connect(conn, sess, name);
if (status.ret == KSMBD_TREE_CONN_STATUS_OK)
rsp->hdr.Id.SyncId.TreeId = cpu_to_le32(status.tree_conn->id);
else
@@ -1924,8 +1944,10 @@ out_err1:
rsp->hdr.Status = STATUS_SUCCESS;
rc = 0;
break;
+ case -ESTALE:
+ case -ENOENT:
case KSMBD_TREE_CONN_STATUS_NO_SHARE:
- rsp->hdr.Status = STATUS_BAD_NETWORK_PATH;
+ rsp->hdr.Status = STATUS_BAD_NETWORK_NAME;
break;
case -ENOMEM:
case KSMBD_TREE_CONN_STATUS_NOMEM:
@@ -2038,6 +2060,7 @@ int smb2_tree_disconnect(struct ksmbd_work *work)
ksmbd_close_tree_conn_fds(work);
ksmbd_tree_conn_disconnect(sess, tcon);
+ work->tcon = NULL;
return 0;
}
@@ -2162,7 +2185,7 @@ out:
* Return: 0 on success, otherwise error
*/
static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
- struct path *path)
+ const struct path *path)
{
struct user_namespace *user_ns = mnt_user_ns(path->mnt);
char *attr_name = NULL, *value;
@@ -2249,7 +2272,7 @@ next:
return rc;
}
-static noinline int smb2_set_stream_name_xattr(struct path *path,
+static noinline int smb2_set_stream_name_xattr(const struct path *path,
struct ksmbd_file *fp,
char *stream_name, int s_type)
{
@@ -2288,7 +2311,7 @@ static noinline int smb2_set_stream_name_xattr(struct path *path,
return 0;
}
-static int smb2_remove_smb_xattrs(struct path *path)
+static int smb2_remove_smb_xattrs(const struct path *path)
{
struct user_namespace *user_ns = mnt_user_ns(path->mnt);
char *name, *xattr_list = NULL;
@@ -2307,22 +2330,22 @@ static int smb2_remove_smb_xattrs(struct path *path)
name += strlen(name) + 1) {
ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name));
- if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
- strncmp(&name[XATTR_USER_PREFIX_LEN], DOS_ATTRIBUTE_PREFIX,
- DOS_ATTRIBUTE_PREFIX_LEN) &&
- strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, STREAM_PREFIX_LEN))
- continue;
-
- err = ksmbd_vfs_remove_xattr(user_ns, path->dentry, name);
- if (err)
- ksmbd_debug(SMB, "remove xattr failed : %s\n", name);
+ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
+ !strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX,
+ STREAM_PREFIX_LEN)) {
+ err = ksmbd_vfs_remove_xattr(user_ns, path->dentry,
+ name);
+ if (err)
+ ksmbd_debug(SMB, "remove xattr failed : %s\n",
+ name);
+ }
}
out:
kvfree(xattr_list);
return err;
}
-static int smb2_create_truncate(struct path *path)
+static int smb2_create_truncate(const struct path *path)
{
int rc = vfs_truncate(path, 0);
@@ -2341,7 +2364,7 @@ static int smb2_create_truncate(struct path *path)
return rc;
}
-static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, struct path *path,
+static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path *path,
struct ksmbd_file *fp)
{
struct xattr_dos_attrib da = {0};
@@ -2364,7 +2387,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, struct path *path,
}
static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon,
- struct path *path, struct ksmbd_file *fp)
+ const struct path *path, struct ksmbd_file *fp)
{
struct xattr_dos_attrib da;
int rc;
@@ -2424,7 +2447,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name,
static int smb2_create_sd_buffer(struct ksmbd_work *work,
struct smb2_create_req *req,
- struct path *path)
+ const struct path *path)
{
struct create_context *context;
struct create_sd_buf_req *sd_buf;
@@ -2454,8 +2477,11 @@ static void ksmbd_acls_fattr(struct smb_fattr *fattr,
struct user_namespace *mnt_userns,
struct inode *inode)
{
- fattr->cf_uid = i_uid_into_mnt(mnt_userns, inode);
- fattr->cf_gid = i_gid_into_mnt(mnt_userns, inode);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+
+ fattr->cf_uid = vfsuid_into_kuid(vfsuid);
+ fattr->cf_gid = vfsgid_into_kgid(vfsgid);
fattr->cf_mode = inode->i_mode;
fattr->cf_acls = NULL;
fattr->cf_dacls = NULL;
@@ -2738,7 +2764,6 @@ int smb2_open(struct ksmbd_work *work)
} else {
file_present = true;
user_ns = mnt_user_ns(path.mnt);
- generic_fillattr(user_ns, d_inode(path.dentry), &stat);
}
if (stream_name) {
if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) {
@@ -2747,7 +2772,8 @@ int smb2_open(struct ksmbd_work *work)
rsp->hdr.Status = STATUS_NOT_A_DIRECTORY;
}
} else {
- if (S_ISDIR(stat.mode) && s_type == DATA_STREAM) {
+ if (file_present && S_ISDIR(d_inode(path.dentry)->i_mode) &&
+ s_type == DATA_STREAM) {
rc = -EIO;
rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY;
}
@@ -2764,7 +2790,8 @@ int smb2_open(struct ksmbd_work *work)
}
if (file_present && req->CreateOptions & FILE_NON_DIRECTORY_FILE_LE &&
- S_ISDIR(stat.mode) && !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) {
+ S_ISDIR(d_inode(path.dentry)->i_mode) &&
+ !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) {
ksmbd_debug(SMB, "open() argument is a directory: %s, %x\n",
name, req->CreateOptions);
rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY;
@@ -2774,7 +2801,7 @@ int smb2_open(struct ksmbd_work *work)
if (file_present && (req->CreateOptions & FILE_DIRECTORY_FILE_LE) &&
!(req->CreateDisposition == FILE_CREATE_LE) &&
- !S_ISDIR(stat.mode)) {
+ !S_ISDIR(d_inode(path.dentry)->i_mode)) {
rsp->hdr.Status = STATUS_NOT_A_DIRECTORY;
rc = -EIO;
goto err_out;
@@ -2918,7 +2945,6 @@ int smb2_open(struct ksmbd_work *work)
goto err_out;
}
- fp->filename = name;
fp->cdoption = req->CreateDisposition;
fp->daccess = daccess;
fp->saccess = req->ShareAccess;
@@ -2969,7 +2995,7 @@ int smb2_open(struct ksmbd_work *work)
goto err_out;
rc = build_sec_desc(user_ns,
- pntsd, NULL,
+ pntsd, NULL, 0,
OWNER_SECINFO |
GROUP_SECINFO |
DACL_SECINFO,
@@ -3022,12 +3048,6 @@ int smb2_open(struct ksmbd_work *work)
list_add(&fp->node, &fp->f_ci->m_fp_list);
write_unlock(&fp->f_ci->m_lock);
- rc = ksmbd_vfs_getattr(&path, &stat);
- if (rc) {
- generic_fillattr(user_ns, d_inode(path.dentry), &stat);
- rc = 0;
- }
-
/* Check delete pending among previous fp before oplock break */
if (ksmbd_inode_pending_delete(fp)) {
rc = -EBUSY;
@@ -3114,6 +3134,10 @@ int smb2_open(struct ksmbd_work *work)
}
}
+ rc = ksmbd_vfs_getattr(&path, &stat);
+ if (rc)
+ goto err_out;
+
if (stat.result_mask & STATX_BTIME)
fp->create_time = ksmbd_UnixTimeToNT(stat.btime);
else
@@ -3129,9 +3153,6 @@ int smb2_open(struct ksmbd_work *work)
memcpy(fp->client_guid, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE);
- generic_fillattr(user_ns, file_inode(fp->filp),
- &stat);
-
rsp->StructureSize = cpu_to_le16(89);
rcu_read_lock();
opinfo = rcu_dereference(fp->f_opinfo);
@@ -3270,14 +3291,13 @@ err_out1:
if (!rsp->hdr.Status)
rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR;
- if (!fp || !fp->filename)
- kfree(name);
if (fp)
ksmbd_fd_put(work, fp);
smb2_set_err_rsp(work);
ksmbd_debug(SMB, "Error response: %x\n", rsp->hdr.Status);
}
+ kfree(name);
kfree(lc);
return 0;
@@ -3545,17 +3565,22 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
posix_info->AllocationSize = cpu_to_le64(ksmbd_kstat->kstat->blocks << 9);
posix_info->DeviceId = cpu_to_le32(ksmbd_kstat->kstat->rdev);
posix_info->HardLinks = cpu_to_le32(ksmbd_kstat->kstat->nlink);
- posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode);
+ posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode & 0777);
posix_info->Inode = cpu_to_le64(ksmbd_kstat->kstat->ino);
posix_info->DosAttributes =
S_ISDIR(ksmbd_kstat->kstat->mode) ?
FILE_ATTRIBUTE_DIRECTORY_LE : FILE_ATTRIBUTE_ARCHIVE_LE;
if (d_info->hide_dot_file && d_info->name[0] == '.')
posix_info->DosAttributes |= FILE_ATTRIBUTE_HIDDEN_LE;
+ /*
+ * SidBuffer(32) contains two sids (UNIX user sid(16), UNIX group sid(16)).
+ * UNIX sid(16) = revision(1) + num_subauth(1) + authority(6) +
+ * sub_auth(4 * 1(num_subauth)) + RID(4).
+ */
id_to_sid(from_kuid_munged(&init_user_ns, ksmbd_kstat->kstat->uid),
- SIDNFS_USER, (struct smb_sid *)&posix_info->SidBuffer[0]);
+ SIDUNIX_USER, (struct smb_sid *)&posix_info->SidBuffer[0]);
id_to_sid(from_kgid_munged(&init_user_ns, ksmbd_kstat->kstat->gid),
- SIDNFS_GROUP, (struct smb_sid *)&posix_info->SidBuffer[20]);
+ SIDUNIX_GROUP, (struct smb_sid *)&posix_info->SidBuffer[16]);
memcpy(posix_info->name, conv_name, conv_len);
posix_info->name_len = cpu_to_le32(conv_len);
posix_info->NextEntryOffset = cpu_to_le32(next_entry_offset);
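
All of the SidBuffer comments in this patch reduce to the same wire-format arithmetic: an NT SID is an 8-byte header (revision, sub-authority count, 6-byte authority) plus 4 bytes per sub-authority plus a trailing 4-byte RID. A hypothetical helper, not present in the patch, that makes the two sizes explicit:

	/* Wire size of a SID as laid out in the comments above:
	 * header(8) + 4 * num_subauth + RID(4). Illustrative only. */
	static inline unsigned int smb_sid_wire_size(unsigned int num_subauth)
	{
		return 1 + 1 + 6 + 4 * num_subauth + 4;
	}
	/* smb_sid_wire_size(1) == 16 -> UNIX user/group sid
	 * smb_sid_wire_size(4) == 28 -> domain sid            */
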
@@ -3760,7 +3785,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info,
return 0;
}
-static int __query_dir(struct dir_context *ctx, const char *name, int namlen,
+static bool __query_dir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ksmbd_readdir_data *buf;
@@ -3774,27 +3799,20 @@ static int __query_dir(struct dir_context *ctx, const char *name, int namlen,
/* dot and dotdot entries are already reserved */
if (!strcmp(".", name) || !strcmp("..", name))
- return 0;
+ return true;
if (ksmbd_share_veto_filename(priv->work->tcon->share_conf, name))
- return 0;
+ return true;
if (!match_pattern(name, namlen, priv->search_pattern))
- return 0;
+ return true;
d_info->name = name;
d_info->name_len = namlen;
rc = reserve_populate_dentry(d_info, priv->info_level);
if (rc)
- return rc;
- if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY) {
+ return false;
+ if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY)
d_info->out_buf_len = 0;
- return 0;
- }
- return 0;
-}
-
-static void restart_ctx(struct dir_context *ctx)
-{
- ctx->pos = 0;
+ return true;
}
static int verify_info_level(int info_level)
@@ -3815,6 +3833,15 @@ static int verify_info_level(int info_level)
return 0;
}
+static int smb2_resp_buf_len(struct ksmbd_work *work, unsigned short hdr2_len)
+{
+ int free_len;
+
+ free_len = (int)(work->response_sz -
+ (get_rfc1002_len(work->response_buf) + 4)) - hdr2_len;
+ return free_len;
+}
+
static int smb2_calc_max_out_buf_len(struct ksmbd_work *work,
unsigned short hdr2_len,
unsigned int out_buf_len)
@@ -3824,9 +3851,7 @@ static int smb2_calc_max_out_buf_len(struct ksmbd_work *work,
if (out_buf_len > work->conn->vals->max_trans_size)
return -EINVAL;
- free_len = (int)(work->response_sz -
- (get_rfc1002_len(work->response_buf) + 4)) -
- hdr2_len;
+ free_len = smb2_resp_buf_len(work, hdr2_len);
if (free_len < 0)
return -EINVAL;
@@ -3871,8 +3896,7 @@ int smb2_query_dir(struct ksmbd_work *work)
inode_permission(file_mnt_user_ns(dir_fp->filp),
file_inode(dir_fp->filp),
MAY_READ | MAY_EXEC)) {
- pr_err("no right to enumerate directory (%pd)\n",
- dir_fp->filp->f_path.dentry);
+ pr_err("no right to enumerate directory (%pD)\n", dir_fp->filp);
rc = -EACCES;
goto err_out2;
}
@@ -3895,12 +3919,9 @@ int smb2_query_dir(struct ksmbd_work *work)
ksmbd_debug(SMB, "Search pattern is %s\n", srch_ptr);
}
- ksmbd_debug(SMB, "Directory name is %s\n", dir_fp->filename);
-
if (srch_flag & SMB2_REOPEN || srch_flag & SMB2_RESTART_SCANS) {
ksmbd_debug(SMB, "Restart directory scan\n");
generic_file_llseek(dir_fp->filp, 0, SEEK_SET);
- restart_ctx(&dir_fp->readdir_data.ctx);
}
memset(&d_info, 0, sizeof(struct ksmbd_dir_info));
@@ -3941,11 +3962,15 @@ int smb2_query_dir(struct ksmbd_work *work)
set_ctx_actor(&dir_fp->readdir_data.ctx, __query_dir);
rc = iterate_dir(dir_fp->filp, &dir_fp->readdir_data.ctx);
- if (rc == 0)
- restart_ctx(&dir_fp->readdir_data.ctx);
- if (rc == -ENOSPC)
+ /*
+ * If req->OutputBufferLength is too small to hold even one entry,
+ * return OutputBufferLength 0 to the client immediately.
+ */
+ if (!d_info.out_buf_len && !d_info.num_entry)
+ goto no_buf_len;
+ if (rc > 0 || rc == -ENOSPC)
rc = 0;
- if (rc)
+ else if (rc)
goto err_out;
d_info.wptr = d_info.rptr;
@@ -3967,10 +3992,12 @@ int smb2_query_dir(struct ksmbd_work *work)
rsp->Buffer[0] = 0;
inc_rfc1001_len(work->response_buf, 9);
} else {
+no_buf_len:
((struct file_directory_info *)
((char *)rsp->Buffer + d_info.last_entry_offset))
->NextEntryOffset = 0;
- d_info.data_count -= d_info.last_entry_off_align;
+ if (d_info.data_count >= d_info.last_entry_off_align)
+ d_info.data_count -= d_info.last_entry_off_align;
rsp->StructureSize = cpu_to_le16(9);
rsp->OutputBufferOffset = cpu_to_le16(72);
@@ -4000,6 +4027,8 @@ err_out2:
rsp->hdr.Status = STATUS_NO_MEMORY;
else if (rc == -EFAULT)
rsp->hdr.Status = STATUS_INVALID_INFO_CLASS;
+ else if (rc == -EIO)
+ rsp->hdr.Status = STATUS_FILE_CORRUPT_ERROR;
if (!rsp->hdr.Status)
rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR;
@@ -4129,7 +4158,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp,
int rc, name_len, value_len, xattr_list_len, idx;
ssize_t buf_free_len, alignment_bytes, next_offset, rsp_data_cnt = 0;
struct smb2_ea_info_req *ea_req = NULL;
- struct path *path;
+ const struct path *path;
struct user_namespace *user_ns = file_mnt_user_ns(fp->filp);
if (!(fp->daccess & FILE_READ_EA_LE)) {
@@ -4390,9 +4419,9 @@ static int get_file_all_info(struct ksmbd_work *work,
return -EACCES;
}
- filename = convert_to_nt_pathname(fp->filename);
- if (!filename)
- return -ENOMEM;
+ filename = convert_to_nt_pathname(work->tcon->share_conf, &fp->filp->f_path);
+ if (IS_ERR(filename))
+ return PTR_ERR(filename);
inode = file_inode(fp->filp);
generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat);
@@ -4466,7 +4495,7 @@ static void get_file_stream_info(struct ksmbd_work *work,
struct smb2_file_stream_info *file_info;
char *stream_name, *xattr_list = NULL, *stream_buf;
struct kstat stat;
- struct path *path = &fp->filp->f_path;
+ const struct path *path = &fp->filp->f_path;
ssize_t xattr_list_len;
int nbytes = 0, streamlen, stream_name_len, next, idx = 0;
int buf_free_len;
@@ -4691,7 +4720,11 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
{
struct smb311_posix_qinfo *file_info;
struct inode *inode = file_inode(fp->filp);
+ struct user_namespace *user_ns = file_mnt_user_ns(fp->filp);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(user_ns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(user_ns, inode);
u64 time;
+ int out_buf_len = sizeof(struct smb311_posix_qinfo) + 32;
file_info = (struct smb311_posix_qinfo *)rsp->Buffer;
file_info->CreationTime = cpu_to_le64(fp->create_time);
@@ -4706,12 +4739,22 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
file_info->EndOfFile = cpu_to_le64(inode->i_size);
file_info->AllocationSize = cpu_to_le64(inode->i_blocks << 9);
file_info->HardLinks = cpu_to_le32(inode->i_nlink);
- file_info->Mode = cpu_to_le32(inode->i_mode);
+ file_info->Mode = cpu_to_le32(inode->i_mode & 0777);
file_info->DeviceId = cpu_to_le32(inode->i_rdev);
- rsp->OutputBufferLength =
- cpu_to_le32(sizeof(struct smb311_posix_qinfo));
- inc_rfc1001_len(rsp_org, sizeof(struct smb311_posix_qinfo));
- return 0;
+
+ /*
+ * Sids(32) contains two sids (UNIX user sid(16), UNIX group sid(16)).
+ * UNIX sid(16) = revision(1) + num_subauth(1) + authority(6) +
+ * sub_auth(4 * 1(num_subauth)) + RID(4).
+ */
+ id_to_sid(from_kuid_munged(&init_user_ns, vfsuid_into_kuid(vfsuid)),
+ SIDUNIX_USER, (struct smb_sid *)&file_info->Sids[0]);
+ id_to_sid(from_kgid_munged(&init_user_ns, vfsgid_into_kgid(vfsgid)),
+ SIDUNIX_GROUP, (struct smb_sid *)&file_info->Sids[16]);
+
+ rsp->OutputBufferLength = cpu_to_le32(out_buf_len);
+ inc_rfc1001_len(rsp_org, out_buf_len);
+ return out_buf_len;
}
static int smb2_get_info_file(struct ksmbd_work *work,
@@ -4831,8 +4874,8 @@ static int smb2_get_info_file(struct ksmbd_work *work,
pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n");
rc = -EOPNOTSUPP;
} else {
- rc = find_file_posix_info(rsp, fp, work->response_buf);
- file_infoclass_size = sizeof(struct smb311_posix_qinfo);
+ file_infoclass_size = find_file_posix_info(rsp, fp,
+ work->response_buf);
}
break;
default:
@@ -4853,7 +4896,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
struct smb2_query_info_rsp *rsp)
{
struct ksmbd_session *sess = work->sess;
- struct ksmbd_conn *conn = sess->conn;
+ struct ksmbd_conn *conn = work->conn;
struct ksmbd_share_config *share = work->tcon->share_conf;
int fsinfoclass = 0;
struct kstatfs stfs;
@@ -4999,15 +5042,17 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
case FS_SECTOR_SIZE_INFORMATION:
{
struct smb3_fs_ss_info *info;
+ unsigned int sector_size =
+ min_t(unsigned int, path.mnt->mnt_sb->s_blocksize, 4096);
info = (struct smb3_fs_ss_info *)(rsp->Buffer);
- info->LogicalBytesPerSector = cpu_to_le32(stfs.f_bsize);
+ info->LogicalBytesPerSector = cpu_to_le32(sector_size);
info->PhysicalBytesPerSectorForAtomicity =
- cpu_to_le32(stfs.f_bsize);
- info->PhysicalBytesPerSectorForPerf = cpu_to_le32(stfs.f_bsize);
+ cpu_to_le32(sector_size);
+ info->PhysicalBytesPerSectorForPerf = cpu_to_le32(sector_size);
info->FSEffPhysicalBytesPerSectorForAtomicity =
- cpu_to_le32(stfs.f_bsize);
+ cpu_to_le32(sector_size);
info->Flags = cpu_to_le32(SSINFO_FLAGS_ALIGNED_DEVICE |
SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE);
info->ByteOffsetForSectorAlignment = 0;
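
FS_SECTOR_SIZE_INFORMATION now reports the superblock block size capped at 4096 rather than raw statfs f_bsize, presumably because SMB clients mishandle logical sector sizes above 4KiB on filesystems with larger block sizes. The clamp, extracted:

	/* Never advertise a logical sector size above 4KiB. */
	unsigned int sector_size =
		min_t(unsigned int, path.mnt->mnt_sb->s_blocksize, 4096);
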
@@ -5081,10 +5126,10 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
struct smb_ntsd *pntsd = (struct smb_ntsd *)rsp->Buffer, *ppntsd = NULL;
struct smb_fattr fattr = {{0}};
struct inode *inode;
- __u32 secdesclen;
+ __u32 secdesclen = 0;
unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID;
int addition_info = le32_to_cpu(req->AdditionalInformation);
- int rc;
+ int rc = 0, ppntsd_size = 0;
if (addition_info & ~(OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO |
PROTECTED_DACL_SECINFO |
@@ -5130,11 +5175,14 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
if (test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_ACL_XATTR))
- ksmbd_vfs_get_sd_xattr(work->conn, user_ns,
- fp->filp->f_path.dentry, &ppntsd);
-
- rc = build_sec_desc(user_ns, pntsd, ppntsd, addition_info,
- &secdesclen, &fattr);
+ ppntsd_size = ksmbd_vfs_get_sd_xattr(work->conn, user_ns,
+ fp->filp->f_path.dentry,
+ &ppntsd);
+
+ /* Check if sd buffer size exceeds response buffer size */
+ if (smb2_resp_buf_len(work, 8) > ppntsd_size)
+ rc = build_sec_desc(user_ns, pntsd, ppntsd, ppntsd_size,
+ addition_info, &secdesclen, &fattr);
posix_acl_release(fattr.cf_acls);
posix_acl_release(fattr.cf_dacls);
kfree(ppntsd);
@@ -5379,7 +5427,7 @@ static int smb2_rename(struct ksmbd_work *work,
if (!pathname)
return -ENOMEM;
- abs_oldname = d_path(&fp->filp->f_path, pathname, PATH_MAX);
+ abs_oldname = file_path(fp->filp, pathname, PATH_MAX);
if (IS_ERR(abs_oldname)) {
rc = -EINVAL;
goto out;
@@ -5514,7 +5562,7 @@ static int smb2_create_link(struct ksmbd_work *work,
}
ksmbd_debug(SMB, "link name is %s\n", link_name);
- target_name = d_path(&filp->f_path, pathname, PATH_MAX);
+ target_name = file_path(filp, pathname, PATH_MAX);
if (IS_ERR(target_name)) {
rc = -EINVAL;
goto out;
@@ -5683,8 +5731,7 @@ static int set_file_allocation_info(struct ksmbd_work *work,
size = i_size_read(inode);
rc = ksmbd_vfs_truncate(work, fp, alloc_blks * 512);
if (rc) {
- pr_err("truncate failed! filename : %s, err %d\n",
- fp->filename, rc);
+ pr_err("truncate failed!, err %d\n", rc);
return rc;
}
if (size < alloc_blks * 512)
@@ -5714,12 +5761,10 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp,
* truncated range.
*/
if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) {
- ksmbd_debug(SMB, "filename : %s truncated to newsize %lld\n",
- fp->filename, newsize);
+ ksmbd_debug(SMB, "truncated to newsize %lld\n", newsize);
rc = ksmbd_vfs_truncate(work, fp, newsize);
if (rc) {
- ksmbd_debug(SMB, "truncate failed! filename : %s err %d\n",
- fp->filename, rc);
+ ksmbd_debug(SMB, "truncate failed!, err %d\n", rc);
if (rc != -EAGAIN)
rc = -EBADF;
return rc;
@@ -5765,12 +5810,14 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
if (parent_fp) {
if (parent_fp->daccess & FILE_DELETE_LE) {
pr_err("parent dir is opened with delete access\n");
+ ksmbd_fd_put(work, parent_fp);
return -ESHARE;
}
+ ksmbd_fd_put(work, parent_fp);
}
next:
return smb2_rename(work, fp, user_ns, rename_info,
- work->sess->conn->local_nls);
+ work->conn->local_nls);
}
static int set_file_disposition_info(struct ksmbd_file *fp,
@@ -5902,7 +5949,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return smb2_create_link(work, work->tcon->share_conf,
(struct smb2_file_link_info *)req->Buffer,
buf_len, fp->filp,
- work->sess->conn->local_nls);
+ work->conn->local_nls);
}
case FILE_DISPOSITION_INFORMATION:
{
@@ -6118,7 +6165,6 @@ out:
static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work,
struct smb2_buffer_desc_v1 *desc,
__le32 Channel,
- __le16 ChannelInfoOffset,
__le16 ChannelInfoLength)
{
unsigned int i, ch_count;
@@ -6136,15 +6182,13 @@ static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work,
le32_to_cpu(desc[i].length));
}
}
- if (ch_count != 1) {
- ksmbd_debug(RDMA, "RDMA multiple buffer descriptors %d are not supported yet\n",
- ch_count);
+ if (!ch_count)
return -EINVAL;
- }
work->need_invalidate_rkey =
(Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE);
- work->remote_key = le32_to_cpu(desc->token);
+ if (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE)
+ work->remote_key = le32_to_cpu(desc->token);
return 0;
}
@@ -6152,14 +6196,12 @@ static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work,
struct smb2_read_req *req, void *data_buf,
size_t length)
{
- struct smb2_buffer_desc_v1 *desc =
- (struct smb2_buffer_desc_v1 *)&req->Buffer[0];
int err;
err = ksmbd_conn_rdma_write(work->conn, data_buf, length,
- le32_to_cpu(desc->token),
- le64_to_cpu(desc->offset),
- le32_to_cpu(desc->length));
+ (struct smb2_buffer_desc_v1 *)
+ ((char *)req + le16_to_cpu(req->ReadChannelInfoOffset)),
+ le16_to_cpu(req->ReadChannelInfoLength));
if (err)
return err;
@@ -6182,6 +6224,8 @@ int smb2_read(struct ksmbd_work *work)
size_t length, mincount;
ssize_t nbytes = 0, remain_bytes = 0;
int err = 0;
+ bool is_rdma_channel = false;
+ unsigned int max_read_size = conn->vals->max_read_size;
WORK_BUFFERS(work, req, rsp);
@@ -6193,6 +6237,11 @@ int smb2_read(struct ksmbd_work *work)
if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE ||
req->Channel == SMB2_CHANNEL_RDMA_V1) {
+ is_rdma_channel = true;
+ max_read_size = get_smbd_max_read_write_size();
+ }
+
+ if (is_rdma_channel == true) {
unsigned int ch_offset = le16_to_cpu(req->ReadChannelInfoOffset);
if (ch_offset < offsetof(struct smb2_read_req, Buffer)) {
@@ -6203,7 +6252,6 @@ int smb2_read(struct ksmbd_work *work)
(struct smb2_buffer_desc_v1 *)
((char *)req + ch_offset),
req->Channel,
- req->ReadChannelInfoOffset,
req->ReadChannelInfoLength);
if (err)
goto out;
@@ -6225,15 +6273,15 @@ int smb2_read(struct ksmbd_work *work)
length = le32_to_cpu(req->Length);
mincount = le32_to_cpu(req->MinimumCount);
- if (length > conn->vals->max_read_size) {
+ if (length > max_read_size) {
ksmbd_debug(SMB, "limiting read size to max size(%u)\n",
- conn->vals->max_read_size);
+ max_read_size);
err = -EINVAL;
goto out;
}
- ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n",
- fp->filp->f_path.dentry, offset, length);
+ ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n",
+ fp->filp, offset, length);
work->aux_payload_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO);
if (!work->aux_payload_buf) {
@@ -6259,8 +6307,7 @@ int smb2_read(struct ksmbd_work *work)
ksmbd_debug(SMB, "nbytes %zu, offset %lld mincount %zu\n",
nbytes, offset, mincount);
- if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE ||
- req->Channel == SMB2_CHANNEL_RDMA_V1) {
+ if (is_rdma_channel == true) {
/* write data to the client using rdma channel */
remain_bytes = smb2_read_rdma_channel(work, req,
work->aux_payload_buf,
@@ -6330,23 +6377,18 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work)
length = le32_to_cpu(req->Length);
id = req->VolatileFileId;
- if (le16_to_cpu(req->DataOffset) ==
- offsetof(struct smb2_write_req, Buffer)) {
- data_buf = (char *)&req->Buffer[0];
- } else {
- if ((u64)le16_to_cpu(req->DataOffset) + length >
- get_rfc1002_len(work->request_buf)) {
- pr_err("invalid write data offset %u, smb_len %u\n",
- le16_to_cpu(req->DataOffset),
- get_rfc1002_len(work->request_buf));
- err = -EINVAL;
- goto out;
- }
-
- data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
- le16_to_cpu(req->DataOffset));
+ if ((u64)le16_to_cpu(req->DataOffset) + length >
+ get_rfc1002_len(work->request_buf)) {
+ pr_err("invalid write data offset %u, smb_len %u\n",
+ le16_to_cpu(req->DataOffset),
+ get_rfc1002_len(work->request_buf));
+ err = -EINVAL;
+ goto out;
}
+ data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
+ le16_to_cpu(req->DataOffset));
+
rpc_resp = ksmbd_rpc_write(work->sess, id, data_buf, length);
if (rpc_resp) {
if (rpc_resp->flags == KSMBD_RPC_ENOTIMPLEMENTED) {
@@ -6386,21 +6428,18 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work,
struct ksmbd_file *fp,
loff_t offset, size_t length, bool sync)
{
- struct smb2_buffer_desc_v1 *desc;
char *data_buf;
int ret;
ssize_t nbytes;
- desc = (struct smb2_buffer_desc_v1 *)&req->Buffer[0];
-
data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO);
if (!data_buf)
return -ENOMEM;
ret = ksmbd_conn_rdma_read(work->conn, data_buf, length,
- le32_to_cpu(desc->token),
- le64_to_cpu(desc->offset),
- le32_to_cpu(desc->length));
+ (struct smb2_buffer_desc_v1 *)
+ ((char *)req + le16_to_cpu(req->WriteChannelInfoOffset)),
+ le16_to_cpu(req->WriteChannelInfoLength));
if (ret < 0) {
kvfree(data_buf);
return ret;
@@ -6429,8 +6468,9 @@ int smb2_write(struct ksmbd_work *work)
size_t length;
ssize_t nbytes;
char *data_buf;
- bool writethrough = false;
+ bool writethrough = false, is_rdma_channel = false;
int err = 0;
+ unsigned int max_write_size = work->conn->vals->max_write_size;
WORK_BUFFERS(work, req, rsp);
@@ -6439,8 +6479,17 @@ int smb2_write(struct ksmbd_work *work)
return smb2_write_pipe(work);
}
+ offset = le64_to_cpu(req->Offset);
+ length = le32_to_cpu(req->Length);
+
if (req->Channel == SMB2_CHANNEL_RDMA_V1 ||
req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) {
+ is_rdma_channel = true;
+ max_write_size = get_smbd_max_read_write_size();
+ length = le32_to_cpu(req->RemainingBytes);
+ }
+
+ if (is_rdma_channel == true) {
unsigned int ch_offset = le16_to_cpu(req->WriteChannelInfoOffset);
if (req->Length != 0 || req->DataOffset != 0 ||
@@ -6452,7 +6501,6 @@ int smb2_write(struct ksmbd_work *work)
(struct smb2_buffer_desc_v1 *)
((char *)req + ch_offset),
req->Channel,
- req->WriteChannelInfoOffset,
req->WriteChannelInfoLength);
if (err)
goto out;
@@ -6476,44 +6524,29 @@ int smb2_write(struct ksmbd_work *work)
goto out;
}
- offset = le64_to_cpu(req->Offset);
- length = le32_to_cpu(req->Length);
-
- if (length > work->conn->vals->max_write_size) {
+ if (length > max_write_size) {
ksmbd_debug(SMB, "limiting write size to max size(%u)\n",
- work->conn->vals->max_write_size);
+ max_write_size);
err = -EINVAL;
goto out;
}
+ ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags));
if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH)
writethrough = true;
- if (req->Channel != SMB2_CHANNEL_RDMA_V1 &&
- req->Channel != SMB2_CHANNEL_RDMA_V1_INVALIDATE) {
- if (le16_to_cpu(req->DataOffset) ==
+ if (is_rdma_channel == false) {
+ if (le16_to_cpu(req->DataOffset) <
offsetof(struct smb2_write_req, Buffer)) {
- data_buf = (char *)&req->Buffer[0];
- } else {
- if ((u64)le16_to_cpu(req->DataOffset) + length >
- get_rfc1002_len(work->request_buf)) {
- pr_err("invalid write data offset %u, smb_len %u\n",
- le16_to_cpu(req->DataOffset),
- get_rfc1002_len(work->request_buf));
- err = -EINVAL;
- goto out;
- }
-
- data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
- le16_to_cpu(req->DataOffset));
+ err = -EINVAL;
+ goto out;
}
- ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags));
- if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH)
- writethrough = true;
+ data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
+ le16_to_cpu(req->DataOffset));
- ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n",
- fp->filp->f_path.dentry, offset, length);
+ ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n",
+ fp->filp, offset, length);
err = ksmbd_vfs_write(work, fp, data_buf, length, &offset,
writethrough, &nbytes);
if (err < 0)
@@ -6522,8 +6555,7 @@ int smb2_write(struct ksmbd_work *work)
/* read data from the client using rdma channel, and
* write the data.
*/
- nbytes = smb2_write_rdma_channel(work, req, fp, offset,
- le32_to_cpu(req->RemainingBytes),
+ nbytes = smb2_write_rdma_channel(work, req, fp, offset, length,
writethrough);
if (nbytes < 0) {
err = (int)nbytes;
@@ -7625,11 +7657,16 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
- if (in_buf_len < sizeof(struct validate_negotiate_info_req))
- return -EINVAL;
+ if (in_buf_len < offsetof(struct validate_negotiate_info_req,
+ Dialects)) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (out_buf_len < sizeof(struct validate_negotiate_info_rsp))
- return -EINVAL;
+ if (out_buf_len < sizeof(struct validate_negotiate_info_rsp)) {
+ ret = -EINVAL;
+ goto out;
+ }
ret = fsctl_validate_negotiate_info(conn,
(struct validate_negotiate_info_req *)&req->Buffer[0],
@@ -7707,7 +7744,7 @@ int smb2_ioctl(struct ksmbd_work *work)
{
struct file_zero_data_information *zero_data;
struct ksmbd_file *fp;
- loff_t off, len;
+ loff_t off, len, bfz;
if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
ksmbd_debug(SMB,
@@ -7724,19 +7761,26 @@ int smb2_ioctl(struct ksmbd_work *work)
zero_data =
(struct file_zero_data_information *)&req->Buffer[0];
- fp = ksmbd_lookup_fd_fast(work, id);
- if (!fp) {
- ret = -ENOENT;
+ off = le64_to_cpu(zero_data->FileOffset);
+ bfz = le64_to_cpu(zero_data->BeyondFinalZero);
+ if (off > bfz) {
+ ret = -EINVAL;
goto out;
}
- off = le64_to_cpu(zero_data->FileOffset);
- len = le64_to_cpu(zero_data->BeyondFinalZero) - off;
+ len = bfz - off;
+ if (len) {
+ fp = ksmbd_lookup_fd_fast(work, id);
+ if (!fp) {
+ ret = -ENOENT;
+ goto out;
+ }
- ret = ksmbd_vfs_zero_data(work, fp, off, len);
- ksmbd_fd_put(work, fp);
- if (ret < 0)
- goto out;
+ ret = ksmbd_vfs_zero_data(work, fp, off, len);
+ ksmbd_fd_put(work, fp);
+ if (ret < 0)
+ goto out;
+ }
break;
}
case FSCTL_QUERY_ALLOCATED_RANGES:
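
The FSCTL_SET_ZERO_DATA hunk above validates the client range before touching the file: BeyondFinalZero must not precede FileOffset, and a zero-length range is a no-op that never needs the fd lookup. The checks, condensed:

	loff_t off = le64_to_cpu(zero_data->FileOffset);
	loff_t bfz = le64_to_cpu(zero_data->BeyondFinalZero);

	if (off > bfz)
		return -EINVAL;		/* inverted range */
	if (bfz - off == 0)
		return 0;		/* nothing to zero; skip fd lookup */
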
@@ -7810,14 +7854,24 @@ int smb2_ioctl(struct ksmbd_work *work)
src_off = le64_to_cpu(dup_ext->SourceFileOffset);
dst_off = le64_to_cpu(dup_ext->TargetFileOffset);
length = le64_to_cpu(dup_ext->ByteCount);
- cloned = vfs_clone_file_range(fp_in->filp, src_off, fp_out->filp,
- dst_off, length, 0);
+ /*
+ * XXX: It is not clear whether FSCTL_DUPLICATE_EXTENTS_TO_FILE
+ * should fall back to vfs_copy_file_range(). That could be
+ * beneficial when re-exporting an nfs/smb mount, but note that
+ * it can result in a partial copy that returns an error status.
+ * If/when FSCTL_DUPLICATE_EXTENTS_TO_FILE_EX is implemented, the
+ * fall back to vfs_copy_file_range() should be avoided when the
+ * DUPLICATE_EXTENTS_DATA_EX_SOURCE_ATOMIC flag is set.
+ */
+ cloned = vfs_clone_file_range(fp_in->filp, src_off,
+ fp_out->filp, dst_off, length, 0);
if (cloned == -EXDEV || cloned == -EOPNOTSUPP) {
ret = -EOPNOTSUPP;
goto dup_ext_out;
} else if (cloned != length) {
cloned = vfs_copy_file_range(fp_in->filp, src_off,
- fp_out->filp, dst_off, length, 0);
+ fp_out->filp, dst_off,
+ length, 0);
if (cloned != length) {
if (cloned < 0)
ret = cloned;
@@ -8346,10 +8400,14 @@ int smb3_check_sign_req(struct ksmbd_work *work)
if (le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) {
signing_key = work->sess->smb3signingkey;
} else {
+ read_lock(&work->sess->chann_lock);
chann = lookup_chann_list(work->sess, conn);
- if (!chann)
+ if (!chann) {
+ read_unlock(&work->sess->chann_lock);
return 0;
+ }
signing_key = chann->smb3signingkey;
+ read_unlock(&work->sess->chann_lock);
}
if (!signing_key) {
@@ -8409,10 +8467,14 @@ void smb3_set_sign_rsp(struct ksmbd_work *work)
le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) {
signing_key = work->sess->smb3signingkey;
} else {
+ read_lock(&work->sess->chann_lock);
chann = lookup_chann_list(work->sess, work->conn);
- if (!chann)
+ if (!chann) {
+ read_unlock(&work->sess->chann_lock);
return;
+ }
signing_key = chann->smb3signingkey;
+ read_unlock(&work->sess->chann_lock);
}
if (!signing_key)
@@ -8530,7 +8592,7 @@ int smb3_encrypt_resp(struct ksmbd_work *work)
buf_size += iov[1].iov_len;
work->resp_hdr_sz = iov[1].iov_len;
- rc = ksmbd_crypt_message(work->conn, iov, rq_nvec, 1);
+ rc = ksmbd_crypt_message(work, iov, rq_nvec, 1);
if (rc)
return rc;
@@ -8549,7 +8611,6 @@ bool smb3_is_transform_hdr(void *buf)
int smb3_decrypt_req(struct ksmbd_work *work)
{
- struct ksmbd_conn *conn = work->conn;
struct ksmbd_session *sess;
char *buf = work->request_buf;
unsigned int pdu_length = get_rfc1002_len(buf);
@@ -8569,7 +8630,7 @@ int smb3_decrypt_req(struct ksmbd_work *work)
return -ECONNABORTED;
}
- sess = ksmbd_session_lookup_all(conn, le64_to_cpu(tr_hdr->SessionId));
+ sess = ksmbd_session_lookup_all(work->conn, le64_to_cpu(tr_hdr->SessionId));
if (!sess) {
pr_err("invalid session id(%llx) in transform header\n",
le64_to_cpu(tr_hdr->SessionId));
@@ -8580,7 +8641,7 @@ int smb3_decrypt_req(struct ksmbd_work *work)
iov[0].iov_len = sizeof(struct smb2_transform_hdr) + 4;
iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr) + 4;
iov[1].iov_len = buf_data_size;
- rc = ksmbd_crypt_message(conn, iov, 2, 0);
+ rc = ksmbd_crypt_message(work, iov, 2, 0);
if (rc)
return rc;
diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h
index af455278d005..092fdd3f8750 100644
--- a/fs/ksmbd/smb2pdu.h
+++ b/fs/ksmbd/smb2pdu.h
@@ -158,7 +158,8 @@ struct create_posix_rsp {
__le32 nlink;
__le32 reparse_tag;
__le32 mode;
- u8 SidBuffer[40];
+ /* SidBuffer contains two sids (Domain sid(28), UNIX group sid(16)) */
+ u8 SidBuffer[44];
} __packed;
struct smb2_buffer_desc_v1 {
@@ -439,7 +440,8 @@ struct smb2_posix_info {
__le32 HardLinks;
__le32 ReparseTag;
__le32 Mode;
- u8 SidBuffer[40];
+ /* SidBuffer contains two sids (UNIX user sid(16), UNIX group sid(16)) */
+ u8 SidBuffer[32];
__le32 name_len;
u8 name[1];
/*
@@ -492,6 +494,7 @@ int smb3_decrypt_req(struct ksmbd_work *work);
int smb3_encrypt_resp(struct ksmbd_work *work);
bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work);
int smb2_set_rsp_credits(struct ksmbd_work *work);
+bool smb3_encryption_negotiated(struct ksmbd_conn *conn);
/* smb2 misc functions */
int ksmbd_smb2_check_message(struct ksmbd_work *work);
diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c
index 9a7e211dbf4f..d96da872d70a 100644
--- a/fs/ksmbd/smb_common.c
+++ b/fs/ksmbd/smb_common.c
@@ -4,6 +4,8 @@
* Copyright (C) 2018 Namjae Jeon <linkinjeon@kernel.org>
*/
+#include <linux/user_namespace.h>
+
#include "smb_common.h"
#include "server.h"
#include "misc.h"
@@ -140,8 +142,10 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work)
hdr = work->request_buf;
if (*(__le32 *)hdr->Protocol == SMB1_PROTO_NUMBER &&
- hdr->Command == SMB_COM_NEGOTIATE)
+ hdr->Command == SMB_COM_NEGOTIATE) {
+ work->conn->outstanding_credits++;
return 0;
+ }
return -EINVAL;
}
@@ -623,8 +627,8 @@ int ksmbd_override_fsids(struct ksmbd_work *work)
if (!cred)
return -ENOMEM;
- cred->fsuid = make_kuid(current_user_ns(), uid);
- cred->fsgid = make_kgid(current_user_ns(), gid);
+ cred->fsuid = make_kuid(&init_user_ns, uid);
+ cred->fsgid = make_kgid(&init_user_ns, gid);
gi = groups_alloc(0);
if (!gi) {
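The switch to &init_user_ns reflects that the uid/gid here arrive from the
ksmbd userspace daemon as global ids; mapping them through current_user_ns()
would translate them twice if the worker thread ever ran in a non-initial
namespace. Sketch of the corrected mapping (helper name hypothetical):

    #include <linux/cred.h>
    #include <linux/errno.h>
    #include <linux/uidgid.h>
    #include <linux/user_namespace.h>

    static int set_override_ids(struct cred *cred, uid_t uid, gid_t gid)
    {
        cred->fsuid = make_kuid(&init_user_ns, uid);
        cred->fsgid = make_kgid(&init_user_ns, gid);
        /* reject ids that do not map into the initial namespace */
        if (!uid_valid(cred->fsuid) || !gid_valid(cred->fsgid))
            return -EINVAL;
        return 0;
    }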
diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h
index e1369b4345a9..318c16fa81da 100644
--- a/fs/ksmbd/smb_common.h
+++ b/fs/ksmbd/smb_common.h
@@ -421,7 +421,7 @@ struct smb_version_ops {
int (*check_sign_req)(struct ksmbd_work *work);
void (*set_sign_rsp)(struct ksmbd_work *work);
int (*generate_signingkey)(struct ksmbd_session *sess, struct ksmbd_conn *conn);
- int (*generate_encryptionkey)(struct ksmbd_session *sess);
+ int (*generate_encryptionkey)(struct ksmbd_conn *conn, struct ksmbd_session *sess);
bool (*is_transform_hdr)(void *buf);
int (*decrypt_req)(struct ksmbd_work *work);
int (*encrypt_resp)(struct ksmbd_work *work);
diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c
index 6ecf55ea1fed..b05ff9b146b5 100644
--- a/fs/ksmbd/smbacl.c
+++ b/fs/ksmbd/smbacl.c
@@ -275,7 +275,8 @@ static int sid_to_id(struct user_namespace *user_ns,
uid_t id;
id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
- uid = mapped_kuid_user(user_ns, &init_user_ns, KUIDT_INIT(id));
+ uid = KUIDT_INIT(id);
+ uid = from_vfsuid(user_ns, &init_user_ns, VFSUIDT_INIT(uid));
if (uid_valid(uid)) {
fattr->cf_uid = uid;
rc = 0;
@@ -285,7 +286,8 @@ static int sid_to_id(struct user_namespace *user_ns,
gid_t id;
id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
- gid = mapped_kgid_user(user_ns, &init_user_ns, KGIDT_INIT(id));
+ gid = KGIDT_INIT(id);
+ gid = from_vfsgid(user_ns, &init_user_ns, VFSGIDT_INIT(gid));
if (gid_valid(gid)) {
fattr->cf_gid = gid;
rc = 0;
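The mapped_k{u,g}id_user() calls give way to the vfsuid/vfsgid helpers; the
intermediate vfsuid_t type keeps "id as seen through the mount idmapping"
distinct from a kernel-internal kuid_t at the type level. The uid direction,
isolated into a sketch (the wrapper name is hypothetical; VFSUIDT_INIT() and
from_vfsuid() are the in-tree helpers used above):

    #include <linux/mnt_idmapping.h>
    #include <linux/types.h>
    #include <linux/uidgid.h>

    static kuid_t sid_rid_to_kuid(struct user_namespace *mnt_userns, u32 rid)
    {
        kuid_t uid = KUIDT_INIT(rid);

        /* undo the mount idmapping relative to init_user_ns */
        return from_vfsuid(mnt_userns, &init_user_ns, VFSUIDT_INIT(uid));
    }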
@@ -690,6 +692,7 @@ posix_default_acl:
static void set_ntacl_dacl(struct user_namespace *user_ns,
struct smb_acl *pndacl,
struct smb_acl *nt_dacl,
+ unsigned int aces_size,
const struct smb_sid *pownersid,
const struct smb_sid *pgrpsid,
struct smb_fattr *fattr)
@@ -703,9 +706,19 @@ static void set_ntacl_dacl(struct user_namespace *user_ns,
if (nt_num_aces) {
ntace = (struct smb_ace *)((char *)nt_dacl + sizeof(struct smb_acl));
for (i = 0; i < nt_num_aces; i++) {
- memcpy((char *)pndace + size, ntace, le16_to_cpu(ntace->size));
- size += le16_to_cpu(ntace->size);
- ntace = (struct smb_ace *)((char *)ntace + le16_to_cpu(ntace->size));
+ unsigned short nt_ace_size;
+
+ if (offsetof(struct smb_ace, access_req) > aces_size)
+ break;
+
+ nt_ace_size = le16_to_cpu(ntace->size);
+ if (nt_ace_size > aces_size)
+ break;
+
+ memcpy((char *)pndace + size, ntace, nt_ace_size);
+ size += nt_ace_size;
+ aces_size -= nt_ace_size;
+ ntace = (struct smb_ace *)((char *)ntace + nt_ace_size);
num_aces++;
}
}
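The rewritten loop never trusts the client-controlled ace->size: each pass
first checks that the fixed ACE header fits in the bytes remaining, then
that the declared size does, before copying and advancing. The same walk as
a standalone userspace sketch with simplified stand-in types:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    struct ace_hdr {
        uint8_t  type;
        uint8_t  flags;
        uint16_t size;          /* le16_to_cpu() in the kernel */
        uint32_t access_req;
    };

    /* copy up to num_aces ACEs from src, never reading past aces_size */
    static size_t copy_aces(uint8_t *dst, const uint8_t *src,
                            size_t aces_size, unsigned int num_aces)
    {
        size_t off = 0, out = 0;
        unsigned int i;

        for (i = 0; i < num_aces; i++) {
            const struct ace_hdr *ace = (const void *)(src + off);
            uint16_t ace_size;

            /* the header itself must fit before ace->size is read */
            if (offsetof(struct ace_hdr, access_req) > aces_size - off)
                break;
            ace_size = ace->size;
            if (ace_size == 0 || ace_size > aces_size - off)
                break;
            memcpy(dst + out, ace, ace_size);
            out += ace_size;
            off += ace_size;
        }
        return out;
    }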
@@ -878,7 +891,7 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd,
/* Convert permission bits from mode to equivalent CIFS ACL */
int build_sec_desc(struct user_namespace *user_ns,
struct smb_ntsd *pntsd, struct smb_ntsd *ppntsd,
- int addition_info, __u32 *secdesclen,
+ int ppntsd_size, int addition_info, __u32 *secdesclen,
struct smb_fattr *fattr)
{
int rc = 0;
@@ -938,15 +951,25 @@ int build_sec_desc(struct user_namespace *user_ns,
if (!ppntsd) {
set_mode_dacl(user_ns, dacl_ptr, fattr);
- } else if (!ppntsd->dacloffset) {
- goto out;
} else {
struct smb_acl *ppdacl_ptr;
+ unsigned int dacl_offset = le32_to_cpu(ppntsd->dacloffset);
+ int ppdacl_size, ntacl_size = ppntsd_size - dacl_offset;
+
+ if (!dacl_offset ||
+ (dacl_offset + sizeof(struct smb_acl) > ppntsd_size))
+ goto out;
+
+ ppdacl_ptr = (struct smb_acl *)((char *)ppntsd + dacl_offset);
+ ppdacl_size = le16_to_cpu(ppdacl_ptr->size);
+ if (ppdacl_size > ntacl_size ||
+ ppdacl_size < sizeof(struct smb_acl))
+ goto out;
- ppdacl_ptr = (struct smb_acl *)((char *)ppntsd +
- le32_to_cpu(ppntsd->dacloffset));
set_ntacl_dacl(user_ns, dacl_ptr, ppdacl_ptr,
- nowner_sid_ptr, ngroup_sid_ptr, fattr);
+ ntacl_size - sizeof(struct smb_acl),
+ nowner_sid_ptr, ngroup_sid_ptr,
+ fattr);
}
pntsd->dacloffset = cpu_to_le32(offset);
offset += le16_to_cpu(dacl_ptr->size);
@@ -970,7 +993,7 @@ static void smb_set_ace(struct smb_ace *ace, const struct smb_sid *sid, u8 type,
}
int smb_inherit_dacl(struct ksmbd_conn *conn,
- struct path *path,
+ const struct path *path,
unsigned int uid, unsigned int gid)
{
const struct smb_sid *psid, *creator = NULL;
@@ -980,24 +1003,31 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
struct smb_sid owner_sid, group_sid;
struct dentry *parent = path->dentry->d_parent;
struct user_namespace *user_ns = mnt_user_ns(path->mnt);
- int inherited_flags = 0, flags = 0, i, ace_cnt = 0, nt_size = 0;
- int rc = 0, num_aces, dacloffset, pntsd_type, acl_len;
+ int inherited_flags = 0, flags = 0, i, ace_cnt = 0, nt_size = 0, pdacl_size;
+ int rc = 0, num_aces, dacloffset, pntsd_type, pntsd_size, acl_len, aces_size;
char *aces_base;
bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode);
- acl_len = ksmbd_vfs_get_sd_xattr(conn, user_ns,
- parent, &parent_pntsd);
- if (acl_len <= 0)
+ pntsd_size = ksmbd_vfs_get_sd_xattr(conn, user_ns,
+ parent, &parent_pntsd);
+ if (pntsd_size <= 0)
return -ENOENT;
dacloffset = le32_to_cpu(parent_pntsd->dacloffset);
- if (!dacloffset) {
+ if (!dacloffset || (dacloffset + sizeof(struct smb_acl) > pntsd_size)) {
rc = -EINVAL;
goto free_parent_pntsd;
}
parent_pdacl = (struct smb_acl *)((char *)parent_pntsd + dacloffset);
+ acl_len = pntsd_size - dacloffset;
num_aces = le32_to_cpu(parent_pdacl->num_aces);
pntsd_type = le16_to_cpu(parent_pntsd->type);
+ pdacl_size = le16_to_cpu(parent_pdacl->size);
+
+ if (pdacl_size > acl_len || pdacl_size < sizeof(struct smb_acl)) {
+ rc = -EINVAL;
+ goto free_parent_pntsd;
+ }
aces_base = kmalloc(sizeof(struct smb_ace) * num_aces * 2, GFP_KERNEL);
if (!aces_base) {
@@ -1008,11 +1038,23 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
aces = (struct smb_ace *)aces_base;
parent_aces = (struct smb_ace *)((char *)parent_pdacl +
sizeof(struct smb_acl));
+ aces_size = acl_len - sizeof(struct smb_acl);
if (pntsd_type & DACL_AUTO_INHERITED)
inherited_flags = INHERITED_ACE;
for (i = 0; i < num_aces; i++) {
+ int pace_size;
+
+ if (offsetof(struct smb_ace, access_req) > aces_size)
+ break;
+
+ pace_size = le16_to_cpu(parent_aces->size);
+ if (pace_size > aces_size)
+ break;
+
+ aces_size -= pace_size;
+
flags = parent_aces->flags;
if (!smb_inherit_flags(flags, is_dir))
goto pass;
@@ -1057,8 +1099,7 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
aces = (struct smb_ace *)((char *)aces + le16_to_cpu(aces->size));
ace_cnt++;
pass:
- parent_aces =
- (struct smb_ace *)((char *)parent_aces + le16_to_cpu(parent_aces->size));
+ parent_aces = (struct smb_ace *)((char *)parent_aces + pace_size);
}
if (nt_size > 0) {
@@ -1146,14 +1187,14 @@ bool smb_inherit_flags(int flags, bool is_dir)
return false;
}
-int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
+int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
__le32 *pdaccess, int uid)
{
struct user_namespace *user_ns = mnt_user_ns(path->mnt);
struct smb_ntsd *pntsd = NULL;
struct smb_acl *pdacl;
struct posix_acl *posix_acls;
- int rc = 0, acl_size;
+ int rc = 0, pntsd_size, acl_size, aces_size, pdacl_size, dacl_offset;
struct smb_sid sid;
int granted = le32_to_cpu(*pdaccess & ~FILE_MAXIMAL_ACCESS_LE);
struct smb_ace *ace;
@@ -1162,37 +1203,33 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
struct smb_ace *others_ace = NULL;
struct posix_acl_entry *pa_entry;
unsigned int sid_type = SIDOWNER;
- char *end_of_acl;
+ unsigned short ace_size;
ksmbd_debug(SMB, "check permission using windows acl\n");
- acl_size = ksmbd_vfs_get_sd_xattr(conn, user_ns,
- path->dentry, &pntsd);
- if (acl_size <= 0 || !pntsd || !pntsd->dacloffset) {
- kfree(pntsd);
- return 0;
- }
+ pntsd_size = ksmbd_vfs_get_sd_xattr(conn, user_ns,
+ path->dentry, &pntsd);
+ if (pntsd_size <= 0 || !pntsd)
+ goto err_out;
+
+ dacl_offset = le32_to_cpu(pntsd->dacloffset);
+ if (!dacl_offset ||
+ (dacl_offset + sizeof(struct smb_acl) > pntsd_size))
+ goto err_out;
pdacl = (struct smb_acl *)((char *)pntsd + le32_to_cpu(pntsd->dacloffset));
- end_of_acl = ((char *)pntsd) + acl_size;
- if (end_of_acl <= (char *)pdacl) {
- kfree(pntsd);
- return 0;
- }
+ acl_size = pntsd_size - dacl_offset;
+ pdacl_size = le16_to_cpu(pdacl->size);
- if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size) ||
- le16_to_cpu(pdacl->size) < sizeof(struct smb_acl)) {
- kfree(pntsd);
- return 0;
- }
+ if (pdacl_size > acl_size || pdacl_size < sizeof(struct smb_acl))
+ goto err_out;
if (!pdacl->num_aces) {
- if (!(le16_to_cpu(pdacl->size) - sizeof(struct smb_acl)) &&
+ if (!(pdacl_size - sizeof(struct smb_acl)) &&
*pdaccess & ~(FILE_READ_CONTROL_LE | FILE_WRITE_DAC_LE)) {
rc = -EACCES;
goto err_out;
}
- kfree(pntsd);
- return 0;
+ goto err_out;
}
if (*pdaccess & FILE_MAXIMAL_ACCESS_LE) {
@@ -1200,11 +1237,16 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
DELETE;
ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl));
+ aces_size = acl_size - sizeof(struct smb_acl);
for (i = 0; i < le32_to_cpu(pdacl->num_aces); i++) {
+ if (offsetof(struct smb_ace, access_req) > aces_size)
+ break;
+ ace_size = le16_to_cpu(ace->size);
+ if (ace_size > aces_size)
+ break;
+ aces_size -= ace_size;
granted |= le32_to_cpu(ace->access_req);
ace = (struct smb_ace *)((char *)ace + le16_to_cpu(ace->size));
- if (end_of_acl < (char *)ace)
- goto err_out;
}
if (!pdacl->num_aces)
@@ -1216,7 +1258,15 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
id_to_sid(uid, sid_type, &sid);
ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl));
+ aces_size = acl_size - sizeof(struct smb_acl);
for (i = 0; i < le32_to_cpu(pdacl->num_aces); i++) {
+ if (offsetof(struct smb_ace, access_req) > aces_size)
+ break;
+ ace_size = le16_to_cpu(ace->size);
+ if (ace_size > aces_size)
+ break;
+ aces_size -= ace_size;
+
if (!compare_sids(&sid, &ace->sid) ||
!compare_sids(&sid_unix_NFS_mode, &ace->sid)) {
found = 1;
@@ -1226,8 +1276,6 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
others_ace = ace;
ace = (struct smb_ace *)((char *)ace + le16_to_cpu(ace->size));
- if (end_of_acl < (char *)ace)
- goto err_out;
}
if (*pdaccess & FILE_MAXIMAL_ACCESS_LE && found) {
@@ -1261,6 +1309,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
if (!access_bits)
access_bits =
SET_MINIMUM_RIGHTS;
+ posix_acl_release(posix_acls);
goto check_access_bits;
}
}
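The one-line posix_acl_release() plugs a reference leak: posix_acls was
obtained earlier in this function with a reference held, and the early jump
to check_access_bits used to skip the drop. The general pairing, sketched
outside ksmbd with the generic VFS helper of this kernel era:

    #include <linux/err.h>
    #include <linux/fs.h>
    #include <linux/posix_acl.h>

    static int inspect_access_acl(struct inode *inode)
    {
        struct posix_acl *acl = get_acl(inode, ACL_TYPE_ACCESS);

        if (IS_ERR_OR_NULL(acl))
            return acl ? PTR_ERR(acl) : 0;

        /* ... walk acl->a_entries ... */

        posix_acl_release(acl);     /* drop the reference on every path */
        return 0;
    }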
@@ -1305,7 +1354,7 @@ err_out:
}
int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
- struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
+ const struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
bool type_check)
{
int rc;
diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h
index 811af3309429..618f2e0236b3 100644
--- a/fs/ksmbd/smbacl.h
+++ b/fs/ksmbd/smbacl.h
@@ -193,7 +193,7 @@ struct posix_acl_state {
int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd,
int acl_len, struct smb_fattr *fattr);
int build_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd,
- struct smb_ntsd *ppntsd, int addition_info,
+ struct smb_ntsd *ppntsd, int ppntsd_size, int addition_info,
__u32 *secdesclen, struct smb_fattr *fattr);
int init_acl_state(struct posix_acl_state *state, int cnt);
void free_acl_state(struct posix_acl_state *state);
@@ -201,12 +201,12 @@ void posix_state_to_acl(struct posix_acl_state *state,
struct posix_acl_entry *pace);
int compare_sids(const struct smb_sid *ctsid, const struct smb_sid *cwsid);
bool smb_inherit_flags(int flags, bool is_dir);
-int smb_inherit_dacl(struct ksmbd_conn *conn, struct path *path,
+int smb_inherit_dacl(struct ksmbd_conn *conn, const struct path *path,
unsigned int uid, unsigned int gid);
-int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
+int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
__le32 *pdaccess, int uid);
int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
- struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
+ const struct path *path, struct smb_ntsd *pntsd, int ntsd_len,
bool type_check);
void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid);
void ksmbd_init_domain(u32 *sub_auth);
@@ -214,25 +214,25 @@ void ksmbd_init_domain(u32 *sub_auth);
static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns,
struct posix_acl_entry *pace)
{
- kuid_t kuid;
+ vfsuid_t vfsuid;
/* If this is an idmapped mount, apply the idmapping. */
- kuid = mapped_kuid_fs(mnt_userns, &init_user_ns, pace->e_uid);
+ vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pace->e_uid);
/* Translate the kuid into a userspace id ksmbd would see. */
- return from_kuid(&init_user_ns, kuid);
+ return from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid));
}
static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns,
struct posix_acl_entry *pace)
{
- kgid_t kgid;
+ vfsgid_t vfsgid;
/* If this is an idmapped mount, apply the idmapping. */
- kgid = mapped_kgid_fs(mnt_userns, &init_user_ns, pace->e_gid);
+ vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pace->e_gid);
/* Translate the kgid into a userspace id ksmbd would see. */
- return from_kgid(&init_user_ns, kgid);
+ return from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid));
}
#endif /* _SMBACL_H */
diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c
index 3ad6881e0f7e..c9aca21637d5 100644
--- a/fs/ksmbd/transport_ipc.c
+++ b/fs/ksmbd/transport_ipc.c
@@ -26,6 +26,7 @@
#include "mgmt/ksmbd_ida.h"
#include "connection.h"
#include "transport_tcp.h"
+#include "transport_rdma.h"
#define IPC_WAIT_TIMEOUT (2 * HZ)
@@ -196,6 +197,7 @@ static struct genl_family ksmbd_genl_family = {
.module = THIS_MODULE,
.ops = ksmbd_genl_ops,
.n_ops = ARRAY_SIZE(ksmbd_genl_ops),
+ .resv_start_op = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE + 1,
};
static void ksmbd_nl_init_fixup(void)
@@ -303,6 +305,8 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)
init_smb2_max_trans_size(req->smb2_max_trans);
if (req->smb2_max_credits)
init_smb2_max_credits(req->smb2_max_credits);
+ if (req->smbd_max_io_size)
+ init_smbd_max_io_size(req->smbd_max_io_size);
ret = ksmbd_set_netbios_name(req->netbios_name);
ret |= ksmbd_set_server_string(req->server_string);
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index e646d79554b8..096eda9ef873 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -5,16 +5,6 @@
*
* Author(s): Long Li <longli@microsoft.com>,
* Hyunchul Lee <hyc.lee@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
*/
#define SUBMOD_NAME "smb_direct"
@@ -42,7 +32,7 @@
/* SMB_DIRECT negotiation timeout in seconds */
#define SMB_DIRECT_NEGOTIATE_TIMEOUT 120
-#define SMB_DIRECT_MAX_SEND_SGES 8
+#define SMB_DIRECT_MAX_SEND_SGES 6
#define SMB_DIRECT_MAX_RECV_SGES 1
/*
@@ -72,17 +62,15 @@ static int smb_direct_receive_credit_max = 255;
static int smb_direct_send_credit_target = 255;
/* The maximum single message size that can be sent to the remote peer */
-static int smb_direct_max_send_size = 8192;
+static int smb_direct_max_send_size = 1364;
/* The maximum fragmented upper-layer payload receive size supported */
static int smb_direct_max_fragmented_recv_size = 1024 * 1024;
/* The maximum single-message size which can be received */
-static int smb_direct_max_receive_size = 8192;
-
-static int smb_direct_max_read_write_size = 524224;
+static int smb_direct_max_receive_size = 1364;
-static int smb_direct_max_outstanding_rw_ops = 8;
+static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE;
static LIST_HEAD(smb_direct_device_list);
static DEFINE_RWLOCK(smb_direct_device_lock);
@@ -147,18 +135,18 @@ struct smb_direct_transport {
atomic_t send_credits;
spinlock_t lock_new_recv_credits;
int new_recv_credits;
- atomic_t rw_avail_ops;
+ int max_rw_credits;
+ int pages_per_rw_credit;
+ atomic_t rw_credits;
wait_queue_head_t wait_send_credits;
- wait_queue_head_t wait_rw_avail_ops;
+ wait_queue_head_t wait_rw_credits;
mempool_t *sendmsg_mempool;
struct kmem_cache *sendmsg_cache;
mempool_t *recvmsg_mempool;
struct kmem_cache *recvmsg_cache;
- wait_queue_head_t wait_send_payload_pending;
- atomic_t send_payload_pending;
wait_queue_head_t wait_send_pending;
atomic_t send_pending;
@@ -208,12 +196,25 @@ struct smb_direct_recvmsg {
struct smb_direct_rdma_rw_msg {
struct smb_direct_transport *t;
struct ib_cqe cqe;
+ int status;
struct completion *completion;
+ struct list_head list;
struct rdma_rw_ctx rw_ctx;
struct sg_table sgt;
struct scatterlist sg_list[];
};
+void init_smbd_max_io_size(unsigned int sz)
+{
+ sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE);
+ smb_direct_max_read_write_size = sz;
+}
+
+unsigned int get_smbd_max_read_write_size(void)
+{
+ return smb_direct_max_read_write_size;
+}
+
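init_smbd_max_io_size() takes the value handed over IPC but refuses to
honor anything outside the supported window; the bounds are the
SMBD_{MIN,MAX}_IOSIZE constants added to transport_rdma.h later in this
patch. A behavior sketch, redefining the constants locally so it stands
alone:

    #include <linux/minmax.h>

    #define SMBD_MIN_IOSIZE (512 * 1024)
    #define SMBD_MAX_IOSIZE (16 * 1024 * 1024)

    static unsigned int clamp_io_size(unsigned int sz)
    {
        /* e.g. a daemon request of 32 MiB comes back as 16 MiB */
        return clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE);
    }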
static inline int get_buf_page_count(void *buf, int size)
{
return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) -
@@ -377,7 +378,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
t->reassembly_queue_length = 0;
init_waitqueue_head(&t->wait_reassembly_queue);
init_waitqueue_head(&t->wait_send_credits);
- init_waitqueue_head(&t->wait_rw_avail_ops);
+ init_waitqueue_head(&t->wait_rw_credits);
spin_lock_init(&t->receive_credit_lock);
spin_lock_init(&t->recvmsg_queue_lock);
@@ -386,8 +387,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
spin_lock_init(&t->empty_recvmsg_queue_lock);
INIT_LIST_HEAD(&t->empty_recvmsg_queue);
- init_waitqueue_head(&t->wait_send_payload_pending);
- atomic_set(&t->send_payload_pending, 0);
init_waitqueue_head(&t->wait_send_pending);
atomic_set(&t->send_pending, 0);
@@ -417,8 +416,6 @@ static void free_transport(struct smb_direct_transport *t)
wake_up_interruptible(&t->wait_send_credits);
ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n");
- wait_event(t->wait_send_payload_pending,
- atomic_read(&t->send_payload_pending) == 0);
wait_event(t->wait_send_pending,
atomic_read(&t->send_pending) == 0);
@@ -569,6 +566,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
}
t->negotiation_requested = true;
t->full_packet_received = true;
+ t->status = SMB_DIRECT_CS_CONNECTED;
enqueue_reassembly(t, recvmsg, 0);
wake_up_interruptible(&t->wait_status);
break;
@@ -873,13 +871,8 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc)
smb_direct_disconnect_rdma_connection(t);
}
- if (sendmsg->num_sge > 1) {
- if (atomic_dec_and_test(&t->send_payload_pending))
- wake_up(&t->wait_send_payload_pending);
- } else {
- if (atomic_dec_and_test(&t->send_pending))
- wake_up(&t->wait_send_pending);
- }
+ if (atomic_dec_and_test(&t->send_pending))
+ wake_up(&t->wait_send_pending);
/* iterate and free the list of messages in reverse. the list's head
* is invalid.
@@ -911,21 +904,12 @@ static int smb_direct_post_send(struct smb_direct_transport *t,
{
int ret;
- if (wr->num_sge > 1)
- atomic_inc(&t->send_payload_pending);
- else
- atomic_inc(&t->send_pending);
-
+ atomic_inc(&t->send_pending);
ret = ib_post_send(t->qp, wr, NULL);
if (ret) {
pr_err("failed to post send: %d\n", ret);
- if (wr->num_sge > 1) {
- if (atomic_dec_and_test(&t->send_payload_pending))
- wake_up(&t->wait_send_payload_pending);
- } else {
- if (atomic_dec_and_test(&t->send_pending))
- wake_up(&t->wait_send_pending);
- }
+ if (atomic_dec_and_test(&t->send_pending))
+ wake_up(&t->wait_send_pending);
smb_direct_disconnect_rdma_connection(t);
}
return ret;
@@ -983,18 +967,19 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t,
}
static int wait_for_credits(struct smb_direct_transport *t,
- wait_queue_head_t *waitq, atomic_t *credits)
+ wait_queue_head_t *waitq, atomic_t *total_credits,
+ int needed)
{
int ret;
do {
- if (atomic_dec_return(credits) >= 0)
+ if (atomic_sub_return(needed, total_credits) >= 0)
return 0;
- atomic_inc(credits);
+ atomic_add(needed, total_credits);
ret = wait_event_interruptible(*waitq,
- atomic_read(credits) > 0 ||
- t->status != SMB_DIRECT_CS_CONNECTED);
+ atomic_read(total_credits) >= needed ||
+ t->status != SMB_DIRECT_CS_CONNECTED);
if (t->status != SMB_DIRECT_CS_CONNECTED)
return -ENOTCONN;
@@ -1015,7 +1000,19 @@ static int wait_for_send_credits(struct smb_direct_transport *t,
return ret;
}
- return wait_for_credits(t, &t->wait_send_credits, &t->send_credits);
+ return wait_for_credits(t, &t->wait_send_credits, &t->send_credits, 1);
+}
+
+static int wait_for_rw_credits(struct smb_direct_transport *t, int credits)
+{
+ return wait_for_credits(t, &t->wait_rw_credits, &t->rw_credits, credits);
+}
+
+static int calc_rw_credits(struct smb_direct_transport *t,
+ char *buf, unsigned int len)
+{
+ return DIV_ROUND_UP(get_buf_page_count(buf, len),
+ t->pages_per_rw_credit);
}
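calc_rw_credits() converts a transfer into a credit count: with
pages_per_rw_credit = 256, a 1 MiB payload spanning 257 pages (one extra
for a misaligned start) costs DIV_ROUND_UP(257, 256) = 2 credits. The wait
itself generalizes the old take-one loop to take-n: reserve atomically,
roll back and sleep if the pool went negative. A userspace analogue of the
reserve/rollback step using C11 atomics:

    #include <stdatomic.h>
    #include <stdbool.h>

    static bool try_take_credits(atomic_int *pool, int needed)
    {
        /* atomic_fetch_sub returns the old value; old - needed is the
         * new balance, mirroring atomic_sub_return() in the kernel */
        if (atomic_fetch_sub(pool, needed) - needed >= 0)
            return true;
        atomic_fetch_add(pool, needed);  /* roll the reservation back */
        return false;                    /* caller sleeps, then retries */
    }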
static int smb_direct_create_header(struct smb_direct_transport *t,
@@ -1086,7 +1083,7 @@ static int get_sg_list(void *buf, int size, struct scatterlist *sg_list, int nen
int offset, len;
int i = 0;
- if (nentries < get_buf_page_count(buf, size))
+ if (size <= 0 || nentries < get_buf_page_count(buf, size))
return -EINVAL;
offset = offset_in_page(buf);
@@ -1118,7 +1115,7 @@ static int get_mapped_sg_list(struct ib_device *device, void *buf, int size,
int npages;
npages = get_sg_list(buf, size, sg_list, nentries);
- if (npages <= 0)
+ if (npages < 0)
return -EINVAL;
return ib_dma_map_sg(device, sg_list, npages, dir);
}
@@ -1313,11 +1310,21 @@ done:
 * that means all the I/Os have gone out and we are good to return
*/
- wait_event(st->wait_send_payload_pending,
- atomic_read(&st->send_payload_pending) == 0);
+ wait_event(st->wait_send_pending,
+ atomic_read(&st->send_pending) == 0);
return ret;
}
+static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t,
+ struct smb_direct_rdma_rw_msg *msg,
+ enum dma_data_direction dir)
+{
+ rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
+ msg->sgt.sgl, msg->sgt.nents, dir);
+ sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+ kfree(msg);
+}
+
static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
enum dma_data_direction dir)
{
@@ -1326,19 +1333,14 @@ static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
struct smb_direct_transport *t = msg->t;
if (wc->status != IB_WC_SUCCESS) {
+ msg->status = -EIO;
pr_err("read/write error. opcode = %d, status = %s(%d)\n",
wc->opcode, ib_wc_status_msg(wc->status), wc->status);
- smb_direct_disconnect_rdma_connection(t);
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ smb_direct_disconnect_rdma_connection(t);
}
- if (atomic_inc_return(&t->rw_avail_ops) > 0)
- wake_up(&t->wait_rw_avail_ops);
-
- rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
- msg->sg_list, msg->sgt.nents, dir);
- sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
complete(msg->completion);
- kfree(msg);
}
static void read_done(struct ib_cq *cq, struct ib_wc *wc)
@@ -1351,94 +1353,141 @@ static void write_done(struct ib_cq *cq, struct ib_wc *wc)
read_write_done(cq, wc, DMA_TO_DEVICE);
}
-static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf,
- int buf_len, u32 remote_key, u64 remote_offset,
- u32 remote_len, bool is_read)
+static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
+ void *buf, int buf_len,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len,
+ bool is_read)
{
- struct smb_direct_rdma_rw_msg *msg;
- int ret;
+ struct smb_direct_rdma_rw_msg *msg, *next_msg;
+ int i, ret;
DECLARE_COMPLETION_ONSTACK(completion);
- struct ib_send_wr *first_wr = NULL;
+ struct ib_send_wr *first_wr;
+ LIST_HEAD(msg_list);
+ char *desc_buf;
+ int credits_needed;
+ unsigned int desc_buf_len;
+ size_t total_length = 0;
+
+ if (t->status != SMB_DIRECT_CS_CONNECTED)
+ return -ENOTCONN;
- ret = wait_for_credits(t, &t->wait_rw_avail_ops, &t->rw_avail_ops);
+ /* calculate needed credits */
+ credits_needed = 0;
+ desc_buf = buf;
+ for (i = 0; i < desc_len / sizeof(*desc); i++) {
+ desc_buf_len = le32_to_cpu(desc[i].length);
+
+ credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len);
+ desc_buf += desc_buf_len;
+ total_length += desc_buf_len;
+ if (desc_buf_len == 0 || total_length > buf_len ||
+ total_length > t->max_rdma_rw_size)
+ return -EINVAL;
+ }
+
+ ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n",
+ is_read ? "read" : "write", buf_len, credits_needed);
+
+ ret = wait_for_rw_credits(t, credits_needed);
if (ret < 0)
return ret;
- /* TODO: mempool */
- msg = kmalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) +
- sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL);
- if (!msg) {
- atomic_inc(&t->rw_avail_ops);
- return -ENOMEM;
- }
+ /* build rdma_rw_ctx for each descriptor */
+ desc_buf = buf;
+ for (i = 0; i < desc_len / sizeof(*desc); i++) {
+ msg = kzalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) +
+ sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto out;
+ }
- msg->sgt.sgl = &msg->sg_list[0];
- ret = sg_alloc_table_chained(&msg->sgt,
- get_buf_page_count(buf, buf_len),
- msg->sg_list, SG_CHUNK_SIZE);
- if (ret) {
- atomic_inc(&t->rw_avail_ops);
- kfree(msg);
- return -ENOMEM;
- }
+ desc_buf_len = le32_to_cpu(desc[i].length);
- ret = get_sg_list(buf, buf_len, msg->sgt.sgl, msg->sgt.orig_nents);
- if (ret <= 0) {
- pr_err("failed to get pages\n");
- goto err;
- }
+ msg->t = t;
+ msg->cqe.done = is_read ? read_done : write_done;
+ msg->completion = &completion;
- ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port,
- msg->sg_list, get_buf_page_count(buf, buf_len),
- 0, remote_offset, remote_key,
- is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
- if (ret < 0) {
- pr_err("failed to init rdma_rw_ctx: %d\n", ret);
- goto err;
+ msg->sgt.sgl = &msg->sg_list[0];
+ ret = sg_alloc_table_chained(&msg->sgt,
+ get_buf_page_count(desc_buf, desc_buf_len),
+ msg->sg_list, SG_CHUNK_SIZE);
+ if (ret) {
+ kfree(msg);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = get_sg_list(desc_buf, desc_buf_len,
+ msg->sgt.sgl, msg->sgt.orig_nents);
+ if (ret < 0) {
+ sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+ kfree(msg);
+ goto out;
+ }
+
+ ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port,
+ msg->sgt.sgl,
+ get_buf_page_count(desc_buf, desc_buf_len),
+ 0,
+ le64_to_cpu(desc[i].offset),
+ le32_to_cpu(desc[i].token),
+ is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ if (ret < 0) {
+ pr_err("failed to init rdma_rw_ctx: %d\n", ret);
+ sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+ kfree(msg);
+ goto out;
+ }
+
+ list_add_tail(&msg->list, &msg_list);
+ desc_buf += desc_buf_len;
}
- msg->t = t;
- msg->cqe.done = is_read ? read_done : write_done;
- msg->completion = &completion;
- first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port,
- &msg->cqe, NULL);
+ /* concatenate work requests of rdma_rw_ctxs */
+ first_wr = NULL;
+ list_for_each_entry_reverse(msg, &msg_list, list) {
+ first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port,
+ &msg->cqe, first_wr);
+ }
ret = ib_post_send(t->qp, first_wr, NULL);
if (ret) {
- pr_err("failed to post send wr: %d\n", ret);
- goto err;
+ pr_err("failed to post send wr for RDMA R/W: %d\n", ret);
+ goto out;
}
+ msg = list_last_entry(&msg_list, struct smb_direct_rdma_rw_msg, list);
wait_for_completion(&completion);
- return 0;
-
-err:
- atomic_inc(&t->rw_avail_ops);
- if (first_wr)
- rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
- msg->sg_list, msg->sgt.nents,
- is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
- sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
- kfree(msg);
+ ret = msg->status;
+out:
+ list_for_each_entry_safe(msg, next_msg, &msg_list, list) {
+ list_del(&msg->list);
+ smb_direct_free_rdma_rw_msg(t, msg,
+ is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ }
+ atomic_add(credits_needed, &t->rw_credits);
+ wake_up(&t->wait_rw_credits);
return ret;
}
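The reverse list walk above is what turns N independent rdma_rw_ctxs into
one posted chain: rdma_rw_ctx_wrs() hands back the first WR of a ctx with
the ctx's last WR pointed at the chain_wr argument, so folding the list
back-to-front leaves first_wr heading ctx0 -> ctx1 -> ... -> NULL, ready
for a single ib_post_send(). A toy rendering of the fold (names
hypothetical):

    #include <stddef.h>

    struct wr { struct wr *next; };

    /* stand-in for rdma_rw_ctx_wrs(): prepend this ctx's WRs to chain */
    static struct wr *ctx_wrs(struct wr *first, struct wr *last,
                              struct wr *chain)
    {
        last->next = chain;
        return first;
    }

    /* fold three ctxs in reverse: c2, then c1->c2, then c0->c1->c2 */
    static struct wr *build_chain(struct wr *f[3], struct wr *l[3])
    {
        struct wr *chain = NULL;
        int i;

        for (i = 2; i >= 0; i--)
            chain = ctx_wrs(f[i], l[i], chain);
        return chain;
    }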
-static int smb_direct_rdma_write(struct ksmbd_transport *t, void *buf,
- unsigned int buflen, u32 remote_key,
- u64 remote_offset, u32 remote_len)
+static int smb_direct_rdma_write(struct ksmbd_transport *t,
+ void *buf, unsigned int buflen,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len)
{
return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen,
- remote_key, remote_offset,
- remote_len, false);
+ desc, desc_len, false);
}
-static int smb_direct_rdma_read(struct ksmbd_transport *t, void *buf,
- unsigned int buflen, u32 remote_key,
- u64 remote_offset, u32 remote_len)
+static int smb_direct_rdma_read(struct ksmbd_transport *t,
+ void *buf, unsigned int buflen,
+ struct smb2_buffer_desc_v1 *desc,
+ unsigned int desc_len)
{
return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen,
- remote_key, remote_offset,
- remote_len, true);
+ desc, desc_len, true);
}
static void smb_direct_disconnect(struct ksmbd_transport *t)
@@ -1478,6 +1527,8 @@ static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
}
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_DISCONNECTED: {
+ ib_drain_qp(t->qp);
+
t->status = SMB_DIRECT_CS_DISCONNECTED;
wake_up_interruptible(&t->wait_status);
wake_up_interruptible(&t->wait_reassembly_queue);
@@ -1638,41 +1689,57 @@ out_err:
return ret;
}
+static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t)
+{
+ return min_t(unsigned int,
+ t->cm_id->device->attrs.max_fast_reg_page_list_len,
+ 256);
+}
+
static int smb_direct_init_params(struct smb_direct_transport *t,
struct ib_qp_cap *cap)
{
struct ib_device *device = t->cm_id->device;
- int max_send_sges, max_pages, max_rw_wrs, max_send_wrs;
+ int max_send_sges, max_rw_wrs, max_send_wrs;
+ unsigned int max_sge_per_wr, wrs_per_credit;
- /* need 2 more sge. because a SMB_DIRECT header will be mapped,
- * and maybe a send buffer could be not page aligned.
+ /* need 3 more SGEs, because an SMB_DIRECT header, an SMB2 header,
+ * and an SMB2 response could each be mapped.
*/
t->max_send_size = smb_direct_max_send_size;
- max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 2;
+ max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 3;
if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) {
pr_err("max_send_size %d is too large\n", t->max_send_size);
return -EINVAL;
}
- /*
- * allow smb_direct_max_outstanding_rw_ops of in-flight RDMA
- * read/writes. HCA guarantees at least max_send_sge of sges for
- * a RDMA read/write work request, and if memory registration is used,
- * we need reg_mr, local_inv wrs for each read/write.
+ /* Calculate the number of work requests for RDMA R/W.
+ * One R/W credit covers as many pages as can be registered
+ * with a single memory region. At least 4 work requests per
+ * credit are needed: MR registration, the RDMA R/W itself,
+ * and local & remote MR invalidation.
*/
t->max_rdma_rw_size = smb_direct_max_read_write_size;
- max_pages = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1;
- max_rw_wrs = DIV_ROUND_UP(max_pages, SMB_DIRECT_MAX_SEND_SGES);
- max_rw_wrs += rdma_rw_mr_factor(device, t->cm_id->port_num,
- max_pages) * 2;
- max_rw_wrs *= smb_direct_max_outstanding_rw_ops;
+ t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t);
+ t->max_rw_credits = DIV_ROUND_UP(t->max_rdma_rw_size,
+ (t->pages_per_rw_credit - 1) *
+ PAGE_SIZE);
+
+ max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge,
+ device->attrs.max_sge_rd);
+ max_sge_per_wr = max_t(unsigned int, max_sge_per_wr,
+ max_send_sges);
+ wrs_per_credit = max_t(unsigned int, 4,
+ DIV_ROUND_UP(t->pages_per_rw_credit,
+ max_sge_per_wr) + 1);
+ max_rw_wrs = t->max_rw_credits * wrs_per_credit;
max_send_wrs = smb_direct_send_credit_target + max_rw_wrs;
if (max_send_wrs > device->attrs.max_cqe ||
max_send_wrs > device->attrs.max_qp_wr) {
- pr_err("consider lowering send_credit_target = %d, or max_outstanding_rw_ops = %d\n",
- smb_direct_send_credit_target,
- smb_direct_max_outstanding_rw_ops);
+ pr_err("consider lowering send_credit_target = %d\n",
+ smb_direct_send_credit_target);
pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
device->attrs.max_cqe, device->attrs.max_qp_wr);
return -EINVAL;
@@ -1687,11 +1754,6 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
return -EINVAL;
}
- if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) {
- pr_err("warning: device max_send_sge = %d too small\n",
- device->attrs.max_send_sge);
- return -EINVAL;
- }
if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) {
pr_err("warning: device max_recv_sge = %d too small\n",
device->attrs.max_recv_sge);
@@ -1707,7 +1769,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
t->send_credit_target = smb_direct_send_credit_target;
atomic_set(&t->send_credits, 0);
- atomic_set(&t->rw_avail_ops, smb_direct_max_outstanding_rw_ops);
+ atomic_set(&t->rw_credits, t->max_rw_credits);
t->max_send_size = smb_direct_max_send_size;
t->max_recv_size = smb_direct_max_receive_size;
@@ -1715,12 +1777,10 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
cap->max_send_wr = max_send_wrs;
cap->max_recv_wr = t->recv_credit_max;
- cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES;
+ cap->max_send_sge = max_sge_per_wr;
cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES;
cap->max_inline_data = 0;
- cap->max_rdma_ctxs =
- rdma_rw_mr_factor(device, t->cm_id->port_num, max_pages) *
- smb_direct_max_outstanding_rw_ops;
+ cap->max_rdma_ctxs = t->max_rw_credits;
return 0;
}
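With the defaults introduced here (8 MiB max R/W size, 4 KiB pages, and
pages_per_rw_credit capped at 256 by smb_direct_get_max_fr_pages()), the
pool comes to DIV_ROUND_UP(8 MiB, 255 * 4 KiB) = 9 credits; the "- 1"
reserves one page per credit for an unaligned first page. A standalone
check of the arithmetic:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        unsigned int page_size = 4096, pages_per_credit = 256;
        unsigned int max_rw = 8u * 1024 * 1024; /* SMBD_DEFAULT_IOSIZE */
        unsigned int credits =
            DIV_ROUND_UP(max_rw, (pages_per_credit - 1) * page_size);

        printf("max_rw_credits = %u\n", credits);   /* prints 9 */
        return 0;
    }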
@@ -1813,7 +1873,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
}
t->send_cq = ib_alloc_cq(t->cm_id->device, t,
- t->send_credit_target, 0, IB_POLL_WORKQUEUE);
+ smb_direct_send_credit_target + cap->max_rdma_ctxs,
+ 0, IB_POLL_WORKQUEUE);
if (IS_ERR(t->send_cq)) {
pr_err("Can't create RDMA send CQ\n");
ret = PTR_ERR(t->send_cq);
@@ -1822,8 +1883,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
}
t->recv_cq = ib_alloc_cq(t->cm_id->device, t,
- cap->max_send_wr + cap->max_rdma_ctxs,
- 0, IB_POLL_WORKQUEUE);
+ t->recv_credit_max, 0, IB_POLL_WORKQUEUE);
if (IS_ERR(t->recv_cq)) {
pr_err("Can't create RDMA recv CQ\n");
ret = PTR_ERR(t->recv_cq);
@@ -1852,17 +1912,12 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1;
if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) {
- int pages_per_mr, mr_count;
-
- pages_per_mr = min_t(int, pages_per_rw,
- t->cm_id->device->attrs.max_fast_reg_page_list_len);
- mr_count = DIV_ROUND_UP(pages_per_rw, pages_per_mr) *
- atomic_read(&t->rw_avail_ops);
- ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, mr_count,
- IB_MR_TYPE_MEM_REG, pages_per_mr, 0);
+ ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs,
+ t->max_rw_credits, IB_MR_TYPE_MEM_REG,
+ t->pages_per_rw_credit, 0);
if (ret) {
pr_err("failed to init mr pool count %d pages %d\n",
- mr_count, pages_per_mr);
+ t->max_rw_credits, t->pages_per_rw_credit);
goto err;
}
}
diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h
index 5567d93a6f96..77aee4e5c9dc 100644
--- a/fs/ksmbd/transport_rdma.h
+++ b/fs/ksmbd/transport_rdma.h
@@ -7,6 +7,10 @@
#ifndef __KSMBD_TRANSPORT_RDMA_H__
#define __KSMBD_TRANSPORT_RDMA_H__
+#define SMBD_DEFAULT_IOSIZE (8 * 1024 * 1024)
+#define SMBD_MIN_IOSIZE (512 * 1024)
+#define SMBD_MAX_IOSIZE (16 * 1024 * 1024)
+
/* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */
struct smb_direct_negotiate_req {
__le16 min_version;
@@ -52,10 +56,14 @@ struct smb_direct_data_transfer {
int ksmbd_rdma_init(void);
void ksmbd_rdma_destroy(void);
bool ksmbd_rdma_capable_netdev(struct net_device *netdev);
+void init_smbd_max_io_size(unsigned int sz);
+unsigned int get_smbd_max_read_write_size(void);
#else
static inline int ksmbd_rdma_init(void) { return 0; }
static inline int ksmbd_rdma_destroy(void) { return 0; }
static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; }
+static inline void init_smbd_max_io_size(unsigned int sz) { }
+static inline unsigned int get_smbd_max_read_write_size(void) { return 0; }
#endif
#endif /* __KSMBD_TRANSPORT_RDMA_H__ */
diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c
index 8fef9de787d3..63d55f543bd2 100644
--- a/fs/ksmbd/transport_tcp.c
+++ b/fs/ksmbd/transport_tcp.c
@@ -230,7 +230,7 @@ static int ksmbd_kthread_fn(void *p)
break;
}
ret = kernel_accept(iface->ksmbd_socket, &client_sk,
- O_NONBLOCK);
+ SOCK_NONBLOCK);
mutex_unlock(&iface->sock_release_lock);
if (ret) {
if (ret == -EAGAIN)
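kernel_accept() interprets its flags argument in the SOCK_* namespace, as
accept4(2) does in userspace; O_NONBLOCK happens to share SOCK_NONBLOCK's
value on most architectures but not on all, so the O_* spelling was wrong
in kind even where it worked. The corrected call in isolation:

    #include <linux/net.h>

    static int accept_nonblocking(struct socket *listener,
                                  struct socket **newsock)
    {
        /* SOCK_NONBLOCK, not O_NONBLOCK: socket-flag namespace */
        return kernel_accept(listener, newsock, SOCK_NONBLOCK);
    }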
@@ -399,7 +399,8 @@ static int create_socket(struct interface *iface)
ret = sock_create(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket);
if (ret) {
- pr_err("Can't create socket for ipv6, try ipv4: %d\n", ret);
+ if (ret != -EAFNOSUPPORT)
+ pr_err("Can't create socket for ipv6, fallback to ipv4: %d\n", ret);
ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP,
&ksmbd_socket);
if (ret) {
diff --git a/fs/ksmbd/unicode.h b/fs/ksmbd/unicode.h
index 5593024230ae..076f6034a789 100644
--- a/fs/ksmbd/unicode.h
+++ b/fs/ksmbd/unicode.h
@@ -24,6 +24,7 @@
#include <asm/byteorder.h>
#include <linux/types.h>
#include <linux/nls.h>
+#include <linux/unicode.h>
#define UNIUPR_NOLOWER /* Example to not expand lower case tables */
@@ -69,7 +70,7 @@ char *smb_strndup_from_utf16(const char *src, const int maxlen,
const struct nls_table *codepage);
int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
const struct nls_table *cp, int mapchars);
-char *ksmbd_extract_sharename(char *treename);
+char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename);
#endif
/*
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 9cebb6ba555b..8de970d6146f 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -377,8 +377,7 @@ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count,
if (work->conn->connection_type) {
if (!(fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) {
- pr_err("no right to read(%pd)\n",
- fp->filp->f_path.dentry);
+ pr_err("no right to read(%pD)\n", fp->filp);
return -EACCES;
}
}
@@ -398,8 +397,7 @@ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count,
nbytes = kernel_read(filp, rbuf, count, pos);
if (nbytes < 0) {
- pr_err("smb read failed for (%s), err = %zd\n",
- fp->filename, nbytes);
+ pr_err("smb read failed, err = %zd\n", nbytes);
return nbytes;
}
@@ -482,15 +480,13 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
char *buf, size_t count, loff_t *pos, bool sync,
ssize_t *written)
{
- struct ksmbd_session *sess = work->sess;
struct file *filp;
loff_t offset = *pos;
int err = 0;
- if (sess->conn->connection_type) {
+ if (work->conn->connection_type) {
if (!(fp->daccess & FILE_WRITE_DATA_LE)) {
- pr_err("no right to write(%pd)\n",
- fp->filp->f_path.dentry);
+ pr_err("no right to write(%pD)\n", fp->filp);
err = -EACCES;
goto out;
}
@@ -529,8 +525,8 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
if (sync) {
err = vfs_fsync_range(filp, offset, offset + *written, 0);
if (err < 0)
- pr_err("fsync failed for filename = %pd, err = %d\n",
- fp->filp->f_path.dentry, err);
+ pr_err("fsync failed for filename = %pD, err = %d\n",
+ fp->filp, err);
}
out:
@@ -545,7 +541,7 @@ out:
*
* Return: 0 on success, otherwise error
*/
-int ksmbd_vfs_getattr(struct path *path, struct kstat *stat)
+int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat)
{
int err;
@@ -875,8 +871,7 @@ int ksmbd_vfs_truncate(struct ksmbd_work *work,
err = vfs_truncate(&filp->f_path, size);
if (err)
- pr_err("truncate failed for filename : %s err %d\n",
- fp->filename, err);
+ pr_err("truncate failed, err %d\n", err);
return err;
}
@@ -965,7 +960,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns,
*/
int ksmbd_vfs_setxattr(struct user_namespace *user_ns,
struct dentry *dentry, const char *attr_name,
- const void *attr_value, size_t attr_size, int flags)
+ void *attr_value, size_t attr_size, int flags)
{
int err;
@@ -1017,7 +1012,9 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
off, len);
- return vfs_fallocate(fp->filp, FALLOC_FL_ZERO_RANGE, off, len);
+ return vfs_fallocate(fp->filp,
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
+ off, len);
}
int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
@@ -1048,7 +1045,7 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
*out_count = 0;
end = start + length;
while (start < end && *out_count < in_count) {
- extent_start = f->f_op->llseek(f, start, SEEK_DATA);
+ extent_start = vfs_llseek(f, start, SEEK_DATA);
if (extent_start < 0) {
if (extent_start != -ENXIO)
ret = (int)extent_start;
@@ -1058,7 +1055,7 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
if (extent_start >= end)
break;
- extent_end = f->f_op->llseek(f, extent_start, SEEK_HOLE);
+ extent_end = vfs_llseek(f, extent_start, SEEK_HOLE);
if (extent_end < 0) {
if (extent_end != -ENXIO)
ret = (int)extent_end;
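vfs_llseek() replaces the raw f->f_op->llseek() calls, going through the
VFS wrapper and its checks instead of trusting the file_operations pointer
directly. The loop itself is the classic SEEK_DATA/SEEK_HOLE extent walk,
which has a direct lseek(2) analogue in userspace:

    #define _GNU_SOURCE             /* SEEK_DATA / SEEK_HOLE */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        off_t pos = 0, data, hole;
        int fd;

        if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
            return 1;
        /* each (data, hole) pair delimits one allocated extent */
        while ((data = lseek(fd, pos, SEEK_DATA)) >= 0) {
            hole = lseek(fd, data, SEEK_HOLE);
            if (hole < 0)
                break;
            printf("extent: %lld..%lld\n", (long long)data,
                   (long long)hole - 1);
            pos = hole;
        }
        close(fd);
        return 0;
    }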
@@ -1106,7 +1103,7 @@ int ksmbd_vfs_unlink(struct user_namespace *user_ns,
return err;
}
-static int __dir_empty(struct dir_context *ctx, const char *name, int namlen,
+static bool __dir_empty(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ksmbd_readdir_data *buf;
@@ -1114,9 +1111,7 @@ static int __dir_empty(struct dir_context *ctx, const char *name, int namlen,
buf = container_of(ctx, struct ksmbd_readdir_data, ctx);
buf->dirent_count++;
- if (buf->dirent_count > 2)
- return -ENOTEMPTY;
- return 0;
+ return buf->dirent_count <= 2;
}
/**
@@ -1143,22 +1138,33 @@ int ksmbd_vfs_empty_dir(struct ksmbd_file *fp)
return err;
}
-static int __caseless_lookup(struct dir_context *ctx, const char *name,
+static bool __caseless_lookup(struct dir_context *ctx, const char *name,
int namlen, loff_t offset, u64 ino,
unsigned int d_type)
{
struct ksmbd_readdir_data *buf;
+ int cmp = -EINVAL;
buf = container_of(ctx, struct ksmbd_readdir_data, ctx);
if (buf->used != namlen)
- return 0;
- if (!strncasecmp((char *)buf->private, name, namlen)) {
+ return true;
+ if (IS_ENABLED(CONFIG_UNICODE) && buf->um) {
+ const struct qstr q_buf = {.name = buf->private,
+ .len = buf->used};
+ const struct qstr q_name = {.name = name,
+ .len = namlen};
+
+ cmp = utf8_strncasecmp(buf->um, &q_buf, &q_name);
+ }
+ if (cmp < 0)
+ cmp = strncasecmp((char *)buf->private, name, namlen);
+ if (!cmp) {
memcpy((char *)buf->private, name, namlen);
buf->dirent_count = 1;
- return -EEXIST;
+ return false;
}
- return 0;
+ return true;
}
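Both conversions track the VFS-wide filldir_t change: dir-context actors
now return bool, true to keep iterating and false to stop, with any error
carried in the caller's context struct instead of the return value. A
minimal actor under the new contract (hypothetical example, not ksmbd
code):

    #include <linux/fs.h>

    struct count_ctx {
        struct dir_context ctx;
        unsigned int seen;
    };

    static bool count_actor(struct dir_context *ctx, const char *name,
                            int namlen, loff_t offset, u64 ino,
                            unsigned int d_type)
    {
        struct count_ctx *c = container_of(ctx, struct count_ctx, ctx);

        /* returning false makes iterate_dir() stop early */
        return ++c->seen < 3;
    }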
/**
@@ -1169,7 +1175,8 @@ static int __caseless_lookup(struct dir_context *ctx, const char *name,
*
* Return: 0 on success, otherwise error
*/
-static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen)
+static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
+ size_t namelen, struct unicode_map *um)
{
int ret;
struct file *dfilp;
@@ -1179,6 +1186,7 @@ static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen)
.private = name,
.used = namelen,
.dirent_count = 0,
+ .um = um,
};
dfilp = dentry_open(dir, flags, current_cred());
@@ -1241,7 +1249,8 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
break;
err = ksmbd_vfs_lookup_in_dir(&parent, filename,
- filename_len);
+ filename_len,
+ work->conn->um);
path_put(&parent);
if (err)
goto out;
@@ -1540,6 +1549,11 @@ int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn,
}
*pntsd = acl.sd_buf;
+ if (acl.sd_size < sizeof(struct smb_ntsd)) {
+ pr_err("sd size is invalid\n");
+ goto out_free;
+ }
+
(*pntsd)->osidoffset = cpu_to_le32(le32_to_cpu((*pntsd)->osidoffset) -
NDR_NTSD_OFFSETOF);
(*pntsd)->gsidoffset = cpu_to_le32(le32_to_cpu((*pntsd)->gsidoffset) -
@@ -1739,11 +1753,11 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
*total_size_written = 0;
if (!(src_fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) {
- pr_err("no right to read(%pd)\n", src_fp->filp->f_path.dentry);
+ pr_err("no right to read(%pD)\n", src_fp->filp);
return -EACCES;
}
if (!(dst_fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) {
- pr_err("no right to write(%pd)\n", dst_fp->filp->f_path.dentry);
+ pr_err("no right to write(%pD)\n", dst_fp->filp);
return -EACCES;
}
@@ -1779,6 +1793,10 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
ret = vfs_copy_file_range(src_fp->filp, src_off,
dst_fp->filp, dst_off, len, 0);
+ if (ret == -EOPNOTSUPP || ret == -EXDEV)
+ ret = generic_copy_file_range(src_fp->filp, src_off,
+ dst_fp->filp, dst_off,
+ len, 0);
if (ret < 0)
return ret;
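The new fallback mirrors what the copy_file_range(2) syscall path does
internally: when the filesystem's optimized copy (reflink or copy offload)
refuses with -EOPNOTSUPP or -EXDEV, redo the transfer through the page
cache with generic_copy_file_range(). The pattern, condensed:

    #include <linux/fs.h>

    static ssize_t copy_with_fallback(struct file *src, loff_t spos,
                                      struct file *dst, loff_t dpos,
                                      size_t len)
    {
        ssize_t ret = vfs_copy_file_range(src, spos, dst, dpos, len, 0);

        /* cross-device or unsupported: fall back to a buffered copy */
        if (ret == -EOPNOTSUPP || ret == -EXDEV)
            ret = generic_copy_file_range(src, spos, dst, dpos, len, 0);
        return ret;
    }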
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index 8c37aaf936ab..593059ca8511 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -12,6 +12,7 @@
#include <linux/namei.h>
#include <uapi/linux/xattr.h>
#include <linux/posix_acl.h>
+#include <linux/unicode.h>
#include "smbacl.h"
#include "xattr.h"
@@ -60,6 +61,7 @@ struct ksmbd_readdir_data {
unsigned int used;
unsigned int dirent_count;
unsigned int file_attr;
+ struct unicode_map *um;
};
/* ksmbd kstat wrapper to get valid create time when reading dir entry */
@@ -85,7 +87,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id);
int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name);
int ksmbd_vfs_link(struct ksmbd_work *work,
const char *oldname, const char *newname);
-int ksmbd_vfs_getattr(struct path *path, struct kstat *stat);
+int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat);
int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
char *newname);
int ksmbd_vfs_truncate(struct ksmbd_work *work,
@@ -109,7 +111,7 @@ ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns,
int attr_name_len);
int ksmbd_vfs_setxattr(struct user_namespace *user_ns,
struct dentry *dentry, const char *attr_name,
- const void *attr_value, size_t attr_size, int flags);
+ void *attr_value, size_t attr_size, int flags);
int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
size_t *xattr_stream_name_size, int s_type);
int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns,
diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c
index 29c1db66bd0f..da9163b00350 100644
--- a/fs/ksmbd/vfs_cache.c
+++ b/fs/ksmbd/vfs_cache.c
@@ -328,7 +328,6 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp)
kfree(smb_lock);
}
- kfree(fp->filename);
if (ksmbd_stream_fd(fp))
kfree(fp->stream.name);
kmem_cache_free(filp_cache, fp);
@@ -497,6 +496,7 @@ struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode)
list_for_each_entry(lfp, &ci->m_fp_list, node) {
if (inode == file_inode(lfp->filp)) {
atomic_dec(&ci->m_count);
+ lfp = ksmbd_fp_get(lfp);
read_unlock(&ci->m_lock);
return lfp;
}
@@ -569,7 +569,7 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp)
atomic_set(&fp->refcount, 1);
fp->filp = filp;
- fp->conn = work->sess->conn;
+ fp->conn = work->conn;
fp->tcon = work->tcon;
fp->volatile_id = KSMBD_NO_FID;
fp->persistent_id = KSMBD_NO_FID;
diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h
index 36239ce31afd..fcb13413fa8d 100644
--- a/fs/ksmbd/vfs_cache.h
+++ b/fs/ksmbd/vfs_cache.h
@@ -62,7 +62,6 @@ struct ksmbd_inode {
struct ksmbd_file {
struct file *filp;
- char *filename;
u64 persistent_id;
u64 volatile_id;
diff --git a/fs/libfs.c b/fs/libfs.c
index e64bdedef168..31b0ddf01c31 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -539,17 +539,17 @@ int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
EXPORT_SYMBOL(simple_setattr);
-static int simple_readpage(struct file *file, struct page *page)
+static int simple_read_folio(struct file *file, struct folio *folio)
{
- clear_highpage(page);
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
+ folio_zero_range(folio, 0, folio_size(folio));
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
}
int simple_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct page *page;
@@ -557,7 +557,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_SHIFT;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
@@ -592,7 +592,7 @@ EXPORT_SYMBOL(simple_write_begin);
* should extend on what's done here with a call to mark_inode_dirty() in the
* case that i_size has changed.
*
- * Use *ONLY* with simple_readpage()
+ * Use *ONLY* with simple_read_folio()
*/
static int simple_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
@@ -628,7 +628,7 @@ static int simple_write_end(struct file *file, struct address_space *mapping,
* Provides ramfs-style behavior: data in the pagecache, but no writeback.
*/
const struct address_space_operations ram_aops = {
- .readpage = simple_readpage,
+ .read_folio = simple_read_folio,
.write_begin = simple_write_begin,
.write_end = simple_write_end,
.dirty_folio = noop_dirty_folio,
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index f802223e71ab..cdc8e12cdac4 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -164,7 +164,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
host->h_addrbuf = nsm->sm_addrbuf;
host->net = ni->net;
host->h_cred = get_cred(ni->cred);
- strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
+ strscpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
out:
return host;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 176b468a61c7..284b019cb652 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -32,6 +32,10 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
if (!nlmsvc_ops)
return nlm_lck_denied_nolocks;
+ if (lock->lock_start > OFFSET_MAX ||
+ (lock->lock_len && ((lock->lock_len - 1) > (OFFSET_MAX - lock->lock_start))))
+ return nlm4_fbig;
+
/* Obtain host handle */
if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
|| (argp->monitor && nsm_monitor(host) < 0))
@@ -50,6 +54,10 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Set up the missing parts of the file_lock structure */
lock->fl.fl_file = file->f_file[mode];
lock->fl.fl_pid = current->tgid;
+ lock->fl.fl_start = (loff_t)lock->lock_start;
+ lock->fl.fl_end = lock->lock_len ?
+ (loff_t)(lock->lock_start + lock->lock_len - 1) :
+ OFFSET_MAX;
lock->fl.fl_lmops = &nlmsvc_lock_operations;
nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
if (!lock->fl.fl_owner) {
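The added NLMv4 check rejects byte ranges that cannot be represented in a
loff_t, and it is phrased to dodge unsigned wraparound: comparing
lock_len - 1 against OFFSET_MAX - lock_start never overflows, whereas
lock_start + lock_len - 1 could. The predicate as a standalone function:

    #include <stdbool.h>
    #include <stdint.h>

    #define OFFSET_MAX INT64_MAX

    static bool nlm_range_ok(uint64_t start, uint64_t len)
    {
        if (start > (uint64_t)OFFSET_MAX)
            return false;
        /* len == 0 means "to end of file", always representable */
        if (len && (len - 1) > (uint64_t)OFFSET_MAX - start)
            return false;
        return true;
    }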
@@ -87,6 +95,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
struct nlm_args *argp = rqstp->rq_argp;
struct nlm_host *host;
struct nlm_file *file;
+ struct nlm_lockowner *test_owner;
__be32 rc = rpc_success;
dprintk("lockd: TEST4 called\n");
@@ -96,6 +105,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
+ test_owner = argp->lock.fl.fl_owner;
/* Now check for conflicting locks */
resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie);
if (resp->status == nlm_drop_reply)
@@ -103,7 +113,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
else
dprintk("lockd: TEST4 status %d\n", ntohl(resp->status));
- nlmsvc_release_lockowner(&argp->lock);
+ nlmsvc_put_lockowner(test_owner);
nlmsvc_release_host(host);
nlm_release_file(file);
return rc;
@@ -511,6 +521,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "NULL",
@@ -520,6 +531,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_testargs,
.pc_encode = nlm4svc_encode_testres,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+2+No+Rg,
.pc_name = "TEST",
@@ -529,6 +541,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_lockargs,
.pc_encode = nlm4svc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "LOCK",
@@ -538,6 +551,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_cancargs,
.pc_encode = nlm4svc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "CANCEL",
@@ -547,6 +561,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_unlockargs,
.pc_encode = nlm4svc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "UNLOCK",
@@ -556,6 +571,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_testargs,
.pc_encode = nlm4svc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "GRANTED",
@@ -565,6 +581,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_testargs,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "TEST_MSG",
@@ -574,6 +591,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_lockargs,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "LOCK_MSG",
@@ -583,6 +601,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_cancargs,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "CANCEL_MSG",
@@ -592,6 +611,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_unlockargs,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "UNLOCK_MSG",
@@ -601,6 +621,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_testargs,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "GRANTED_MSG",
@@ -610,6 +631,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "TEST_RES",
@@ -619,6 +641,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "LOCK_RES",
@@ -628,6 +651,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "CANCEL_RES",
@@ -637,6 +661,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "UNLOCK_RES",
@@ -646,6 +671,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_res,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "GRANTED_RES",
@@ -655,6 +681,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_reboot,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_reboot),
+ .pc_argzero = sizeof(struct nlm_reboot),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "SM_NOTIFY",
@@ -664,6 +691,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = 0,
.pc_name = "UNUSED",
@@ -673,6 +701,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = 0,
.pc_name = "UNUSED",
@@ -682,6 +711,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_void,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = 0,
.pc_name = "UNUSED",
@@ -691,6 +721,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_shareargs,
.pc_encode = nlm4svc_encode_shareres,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+1,
.pc_name = "SHARE",
@@ -700,6 +731,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_shareargs,
.pc_encode = nlm4svc_encode_shareres,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+1,
.pc_name = "UNSHARE",
@@ -709,6 +741,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_lockargs,
.pc_encode = nlm4svc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "NM_LOCK",
@@ -718,6 +751,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_decode = nlm4svc_decode_notify,
.pc_encode = nlm4svc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "FREE_ALL",
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index cb3658ab9b7a..9c1aa75441e1 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -340,7 +340,7 @@ nlmsvc_get_lockowner(struct nlm_lockowner *lockowner)
return lockowner;
}
-static void nlmsvc_put_lockowner(struct nlm_lockowner *lockowner)
+void nlmsvc_put_lockowner(struct nlm_lockowner *lockowner)
{
if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock))
return;
@@ -590,7 +590,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
int error;
int mode;
__be32 ret;
- struct nlm_lockowner *test_owner;
dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
nlmsvc_file_inode(file)->i_sb->s_id,
@@ -604,9 +603,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
- /* If there's a conflicting lock, remember to clean up the test lock */
- test_owner = (struct nlm_lockowner *)lock->fl.fl_owner;
-
mode = lock_to_openmode(&lock->fl);
error = vfs_test_lock(file->f_file[mode], &lock->fl);
if (error) {
@@ -635,10 +631,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
conflock->fl.fl_end = lock->fl.fl_end;
locks_release_private(&lock->fl);
- /* Clean up the test lock */
- lock->fl.fl_owner = NULL;
- nlmsvc_put_lockowner(test_owner);
-
ret = nlm_lck_denied;
out:
return ret;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 4dc1b40a489a..e35c05e27806 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -116,6 +116,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
struct nlm_args *argp = rqstp->rq_argp;
struct nlm_host *host;
struct nlm_file *file;
+ struct nlm_lockowner *test_owner;
__be32 rc = rpc_success;
dprintk("lockd: TEST called\n");
@@ -125,6 +126,8 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
+ test_owner = argp->lock.fl.fl_owner;
+
/* Now check for conflicting locks */
resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie));
if (resp->status == nlm_drop_reply)
@@ -133,7 +136,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
dprintk("lockd: TEST status %d vers %d\n",
ntohl(resp->status), rqstp->rq_vers);
- nlmsvc_release_lockowner(&argp->lock);
+ nlmsvc_put_lockowner(test_owner);
nlmsvc_release_host(host);
nlm_release_file(file);
return rc;
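Taken together with the svclock.c hunk above, ownership of the test lock's lockowner reference moves to the caller: nlmsvc_testlock() no longer drops it, and __nlmsvc_proc_test() must save fl_owner before the call, because nlmsvc_testlock() may overwrite argp->lock.fl with the conflicting lock. The caller-side pattern, distilled from the hunks above:

    /* Save the owner before testlock can clobber the file_lock ... */
    struct nlm_lockowner *test_owner = argp->lock.fl.fl_owner;

    resp->status = cast_status(nlmsvc_testlock(rqstp, file, host,
                               &argp->lock, &resp->lock, &resp->cookie));
    /* ... and drop the reference in the caller on every exit path. */
    nlmsvc_put_lockowner(test_owner);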
@@ -552,6 +555,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "NULL",
@@ -561,6 +565,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_testargs,
.pc_encode = nlmsvc_encode_testres,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+2+No+Rg,
.pc_name = "TEST",
@@ -570,6 +575,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_lockargs,
.pc_encode = nlmsvc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "LOCK",
@@ -579,6 +585,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_cancargs,
.pc_encode = nlmsvc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "CANCEL",
@@ -588,6 +595,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_unlockargs,
.pc_encode = nlmsvc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "UNLOCK",
@@ -597,6 +605,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_testargs,
.pc_encode = nlmsvc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "GRANTED",
@@ -606,6 +615,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_testargs,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "TEST_MSG",
@@ -615,6 +625,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_lockargs,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "LOCK_MSG",
@@ -624,6 +635,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_cancargs,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "CANCEL_MSG",
@@ -633,6 +645,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_unlockargs,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "UNLOCK_MSG",
@@ -642,6 +655,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_testargs,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "GRANTED_MSG",
@@ -651,6 +665,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "TEST_RES",
@@ -660,6 +675,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "LOCK_RES",
@@ -669,6 +685,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "CANCEL_RES",
@@ -678,6 +695,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "UNLOCK_RES",
@@ -687,6 +705,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_res,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_res),
+ .pc_argzero = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "GRANTED_RES",
@@ -696,6 +715,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_reboot,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_reboot),
+ .pc_argzero = sizeof(struct nlm_reboot),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "SM_NOTIFY",
@@ -705,6 +725,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "UNUSED",
@@ -714,6 +735,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "UNUSED",
@@ -723,6 +745,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_void,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_void),
+ .pc_argzero = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
.pc_name = "UNUSED",
@@ -732,6 +755,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_shareargs,
.pc_encode = nlmsvc_encode_shareres,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+1,
.pc_name = "SHARE",
@@ -741,6 +765,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_shareargs,
.pc_encode = nlmsvc_encode_shareres,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+1,
.pc_name = "UNSHARE",
@@ -750,6 +775,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_lockargs,
.pc_encode = nlmsvc_encode_res,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
.pc_name = "NM_LOCK",
@@ -759,6 +785,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_decode = nlmsvc_decode_notify,
.pc_encode = nlmsvc_encode_void,
.pc_argsize = sizeof(struct nlm_args),
+ .pc_argzero = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = 0,
.pc_name = "FREE_ALL",
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 0a22a2faf552..e1c4617de771 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -176,7 +176,7 @@ nlm_delete_file(struct nlm_file *file)
}
}
-static int nlm_unlock_files(struct nlm_file *file)
+static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner)
{
struct file_lock lock;
@@ -184,6 +184,7 @@ static int nlm_unlock_files(struct nlm_file *file)
lock.fl_type = F_UNLCK;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
+ lock.fl_owner = owner;
if (file->f_file[O_RDONLY] &&
vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL))
goto out_err;
@@ -225,7 +226,7 @@ again:
if (match(lockhost, host)) {
spin_unlock(&flctx->flc_lock);
- if (nlm_unlock_files(file))
+ if (nlm_unlock_files(file, fl->fl_owner))
return 1;
goto again;
}
@@ -282,11 +283,10 @@ nlm_file_inuse(struct nlm_file *file)
static void nlm_close_files(struct nlm_file *file)
{
- struct file *f;
-
- for (f = file->f_file[0]; f <= file->f_file[1]; f++)
- if (f)
- nlmsvc_ops->fclose(f);
+ if (file->f_file[O_RDONLY])
+ nlmsvc_ops->fclose(file->f_file[O_RDONLY]);
+ if (file->f_file[O_WRONLY])
+ nlmsvc_ops->fclose(file->f_file[O_WRONLY]);
}
/*
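nlm_unlock_files() now carries the fl_owner_t it is unlocking for, so the synthesized F_UNLCK request releases only that owner's locks rather than every lock on the file; nlm_traverse_locks() passes fl->fl_owner from the lock it matched. A minimal sketch of the resulting request, assuming the same locks_init_lock() setup the helper already performs:

    struct file_lock lock;

    locks_init_lock(&lock);
    lock.fl_type  = F_UNLCK;
    lock.fl_start = 0;
    lock.fl_end   = OFFSET_MAX;
    lock.fl_owner = owner;          /* new: scope the unlock to one owner */

    /* applied, as before, to each open mode of the nlm_file */
    vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL);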
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 856267c0864b..712fdfeb8ef0 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -20,13 +20,6 @@
#include "svcxdr.h"
-static inline loff_t
-s64_to_loff_t(__s64 offset)
-{
- return (loff_t)offset;
-}
-
-
static inline s64
loff_t_to_s64(loff_t offset)
{
@@ -70,8 +63,6 @@ static bool
svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
{
struct file_lock *fl = &lock->fl;
- u64 len, start;
- s64 end;
if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
return false;
@@ -81,20 +72,14 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
return false;
if (xdr_stream_decode_u32(xdr, &lock->svid) < 0)
return false;
- if (xdr_stream_decode_u64(xdr, &start) < 0)
+ if (xdr_stream_decode_u64(xdr, &lock->lock_start) < 0)
return false;
- if (xdr_stream_decode_u64(xdr, &len) < 0)
+ if (xdr_stream_decode_u64(xdr, &lock->lock_len) < 0)
return false;
locks_init_lock(fl);
fl->fl_flags = FL_POSIX;
fl->fl_type = F_RDLCK;
- end = start + len - 1;
- fl->fl_start = s64_to_loff_t(start);
- if (len == 0 || end < 0)
- fl->fl_end = OFFSET_MAX;
- else
- fl->fl_end = s64_to_loff_t(end);
return true;
}
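For NLMv4, svcxdr_decode_lock() now keeps the raw 64-bit offset and length in lock->lock_start and lock->lock_len instead of folding them into fl_start/fl_end at decode time. For reference, the range computation removed here is what some later helper is expected to apply once the arguments can be validated (or rejected for overflow):

    s64 end = start + len - 1;

    fl->fl_start = (loff_t)start;
    if (len == 0 || end < 0)
            fl->fl_end = OFFSET_MAX;        /* lock to end of file */
    else
            fl->fl_end = (loff_t)end;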
diff --git a/fs/locks.c b/fs/locks.c
index 8c6df10cd9ed..607f94a0e789 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -300,6 +300,34 @@ void locks_release_private(struct file_lock *fl)
}
EXPORT_SYMBOL_GPL(locks_release_private);
+/**
+ * locks_owner_has_blockers - Check for blocking lock requests
+ * @flctx: file lock context
+ * @owner: lock owner
+ *
+ * Return values:
+ * %true: @owner has at least one blocker
+ * %false: @owner has no blockers
+ */
+bool locks_owner_has_blockers(struct file_lock_context *flctx,
+ fl_owner_t owner)
+{
+ struct file_lock *fl;
+
+ spin_lock(&flctx->flc_lock);
+ list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+ if (fl->fl_owner != owner)
+ continue;
+ if (!list_empty(&fl->fl_blocked_requests)) {
+ spin_unlock(&flctx->flc_lock);
+ return true;
+ }
+ }
+ spin_unlock(&flctx->flc_lock);
+ return false;
+}
+EXPORT_SYMBOL_GPL(locks_owner_has_blockers);
+
/* Free a lock which is not in use. */
void locks_free_lock(struct file_lock *fl)
{
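locks_owner_has_blockers() gives lock managers a safe way to ask whether any POSIX lock held by @owner still has queued waiters, without reaching into the flc_lock internals themselves. A hedged usage sketch, with flctx and owner assumed to come from the caller:

    /* Defer tearing down per-owner state while anyone is still blocked
     * on one of the owner's locks. */
    if (locks_owner_has_blockers(flctx, owner))
            return false;           /* not safe to release the owner yet */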
@@ -397,21 +425,9 @@ static inline int flock_translate_cmd(int cmd) {
}
/* Fill in a file_lock structure with an appropriate FLOCK lock. */
-static struct file_lock *
-flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl)
+static void flock_make_lock(struct file *filp, struct file_lock *fl, int type)
{
- int type = flock_translate_cmd(cmd);
-
- if (type < 0)
- return ERR_PTR(type);
-
- if (fl == NULL) {
- fl = locks_alloc_lock();
- if (fl == NULL)
- return ERR_PTR(-ENOMEM);
- } else {
- locks_init_lock(fl);
- }
+ locks_init_lock(fl);
fl->fl_file = filp;
fl->fl_owner = filp;
@@ -419,8 +435,6 @@ flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl)
fl->fl_flags = FL_FLOCK;
fl->fl_type = type;
fl->fl_end = OFFSET_MAX;
-
- return fl;
}
static int assign_type(struct file_lock *fl, long type)
@@ -874,6 +888,8 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
struct file_lock *cfl;
struct file_lock_context *ctx;
struct inode *inode = locks_inode(filp);
+ void *owner;
+ void (*func)(void);
ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx || list_empty_careful(&ctx->flc_posix)) {
@@ -881,12 +897,23 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
return;
}
+retry:
spin_lock(&ctx->flc_lock);
list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
- if (posix_locks_conflict(fl, cfl)) {
- locks_copy_conflock(fl, cfl);
- goto out;
+ if (!posix_locks_conflict(fl, cfl))
+ continue;
+ if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable
+ && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) {
+ owner = cfl->fl_lmops->lm_mod_owner;
+ func = cfl->fl_lmops->lm_expire_lock;
+ __module_get(owner);
+ spin_unlock(&ctx->flc_lock);
+ (*func)();
+ module_put(owner);
+ goto retry;
}
+ locks_copy_conflock(fl, cfl);
+ goto out;
}
fl->fl_type = F_UNLCK;
out:
@@ -1060,6 +1087,8 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
int error;
bool added = false;
LIST_HEAD(dispose);
+ void *owner;
+ void (*func)(void);
ctx = locks_get_lock_context(inode, request->fl_type);
if (!ctx)
@@ -1078,6 +1107,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
new_fl2 = locks_alloc_lock();
}
+retry:
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
/*
@@ -1089,6 +1119,17 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
if (!posix_locks_conflict(request, fl))
continue;
+ if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable
+ && (*fl->fl_lmops->lm_lock_expirable)(fl)) {
+ owner = fl->fl_lmops->lm_mod_owner;
+ func = fl->fl_lmops->lm_expire_lock;
+ __module_get(owner);
+ spin_unlock(&ctx->flc_lock);
+ percpu_up_read(&file_rwsem);
+ (*func)();
+ module_put(owner);
+ goto retry;
+ }
if (conflock)
locks_copy_conflock(conflock, fl);
error = -EAGAIN;
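posix_test_lock() and posix_lock_inode() now share a retry protocol: when a conflicting lock's manager reports it expirable, pin the manager's module, drop the spinlock (plus file_rwsem on the setlk path), run lm_expire_lock(), then rescan from the top. A provider fills in three new lock_manager_operations fields; only the field names below come from this patch, the handlers are hypothetical:

    static bool example_lock_expirable(struct file_lock *fl)
    {
            return false;   /* e.g. compare against a per-client expiry */
    }

    static void example_expire_lock(void)
    {
            /* reap whatever example_lock_expirable() reported as stale */
    }

    static const struct lock_manager_operations example_lm_ops = {
            .lm_lock_expirable      = example_lock_expirable,
            .lm_expire_lock         = example_expire_lock,
            .lm_mod_owner           = THIS_MODULE,
    };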
@@ -2042,21 +2083,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait);
*/
SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
- struct fd f = fdget(fd);
- struct file_lock *lock;
- int can_sleep, unlock;
- int error;
-
- error = -EBADF;
- if (!f.file)
- goto out;
-
- can_sleep = !(cmd & LOCK_NB);
- cmd &= ~LOCK_NB;
- unlock = (cmd == LOCK_UN);
-
- if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
- goto out_putf;
+ int can_sleep, error, type;
+ struct file_lock fl;
+ struct fd f;
/*
* LOCK_MAND locks were broken for a long time in that they never
@@ -2068,36 +2097,42 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
*/
if (cmd & LOCK_MAND) {
pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n");
- error = 0;
- goto out_putf;
+ return 0;
}
- lock = flock_make_lock(f.file, cmd, NULL);
- if (IS_ERR(lock)) {
- error = PTR_ERR(lock);
+ type = flock_translate_cmd(cmd & ~LOCK_NB);
+ if (type < 0)
+ return type;
+
+ error = -EBADF;
+ f = fdget(fd);
+ if (!f.file)
+ return error;
+
+ if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ | FMODE_WRITE)))
goto out_putf;
- }
- if (can_sleep)
- lock->fl_flags |= FL_SLEEP;
+ flock_make_lock(f.file, &fl, type);
- error = security_file_lock(f.file, lock->fl_type);
+ error = security_file_lock(f.file, fl.fl_type);
if (error)
- goto out_free;
+ goto out_putf;
+
+ can_sleep = !(cmd & LOCK_NB);
+ if (can_sleep)
+ fl.fl_flags |= FL_SLEEP;
if (f.file->f_op->flock)
error = f.file->f_op->flock(f.file,
- (can_sleep) ? F_SETLKW : F_SETLK,
- lock);
+ (can_sleep) ? F_SETLKW : F_SETLK,
+ &fl);
else
- error = locks_lock_file_wait(f.file, lock);
-
- out_free:
- locks_free_lock(lock);
+ error = locks_lock_file_wait(f.file, &fl);
+ locks_release_private(&fl);
out_putf:
fdput(f);
- out:
+
return error;
}
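The rewritten flock(2) body keeps its file_lock on the stack: flock_make_lock() simply initializes the caller's struct, command translation happens before fdget(), and cleanup shrinks to locks_release_private() since there is no heap object to free. The new lifetime, distilled from the hunk above:

    struct file_lock fl;

    flock_make_lock(f.file, &fl, type);     /* fills the on-stack struct */
    error = locks_lock_file_wait(f.file, &fl);
    locks_release_private(&fl);             /* no locks_free_lock() needed */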
@@ -2559,7 +2594,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
if (list_empty(&flctx->flc_flock))
return;
- flock_make_lock(filp, LOCK_UN, &fl);
+ flock_make_lock(filp, &fl, F_UNLCK);
fl.fl_flags |= FL_CLOSE;
if (filp->f_op->flock)
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 97c54d3a2227..e272ad738faf 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -11,7 +11,7 @@
/*
* Mbcache is a simple key-value store. Keys need not be unique, however
* key-value pairs are expected to be unique (we use this fact in
- * mb_cache_entry_delete()).
+ * mb_cache_entry_delete_or_get()).
*
* Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
* Ext4 also uses it for deduplication of xattr values stored in inodes.
@@ -90,8 +90,14 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
return -ENOMEM;
INIT_LIST_HEAD(&entry->e_list);
- /* One ref for hash, one ref returned */
- atomic_set(&entry->e_refcnt, 1);
+ /*
+ * We create entry with two references. One reference is kept by the
+ * hash table, the other reference is used to protect us from
+ * mb_cache_entry_delete_or_get() until the entry is fully set up. This
+ * avoids nesting of cache->c_list_lock into hash table bit locks which
+ * is problematic for RT.
+ */
+ atomic_set(&entry->e_refcnt, 2);
entry->e_key = key;
entry->e_value = value;
entry->e_reusable = reusable;
@@ -107,24 +113,41 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
}
hlist_bl_add_head(&entry->e_hash_list, head);
hlist_bl_unlock(head);
-
spin_lock(&cache->c_list_lock);
list_add_tail(&entry->e_list, &cache->c_list);
- /* Grab ref for LRU list */
- atomic_inc(&entry->e_refcnt);
cache->c_entry_count++;
spin_unlock(&cache->c_list_lock);
+ mb_cache_entry_put(cache, entry);
return 0;
}
EXPORT_SYMBOL(mb_cache_entry_create);
-void __mb_cache_entry_free(struct mb_cache_entry *entry)
+void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry)
{
+ struct hlist_bl_head *head;
+
+ head = mb_cache_entry_head(cache, entry->e_key);
+ hlist_bl_lock(head);
+ hlist_bl_del(&entry->e_hash_list);
+ hlist_bl_unlock(head);
kmem_cache_free(mb_entry_cache, entry);
}
EXPORT_SYMBOL(__mb_cache_entry_free);
+/*
+ * mb_cache_entry_wait_unused - wait to be the last user of the entry
+ *
+ * @entry - entry to work on
+ *
+ * Wait to be the last user of the entry.
+ */
+void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
+{
+ wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 2);
+}
+EXPORT_SYMBOL(mb_cache_entry_wait_unused);
+
static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
struct mb_cache_entry *entry,
u32 key)
@@ -142,10 +165,9 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
while (node) {
entry = hlist_bl_entry(node, struct mb_cache_entry,
e_hash_list);
- if (entry->e_key == key && entry->e_reusable) {
- atomic_inc(&entry->e_refcnt);
+ if (entry->e_key == key && entry->e_reusable &&
+ atomic_inc_not_zero(&entry->e_refcnt))
goto out;
- }
node = node->next;
}
entry = NULL;
@@ -205,10 +227,9 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
head = mb_cache_entry_head(cache, key);
hlist_bl_lock(head);
hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
- if (entry->e_key == key && entry->e_value == value) {
- atomic_inc(&entry->e_refcnt);
+ if (entry->e_key == key && entry->e_value == value &&
+ atomic_inc_not_zero(&entry->e_refcnt))
goto out;
- }
}
entry = NULL;
out:
@@ -217,42 +238,42 @@ out:
}
EXPORT_SYMBOL(mb_cache_entry_get);
-/* mb_cache_entry_delete - remove a cache entry
+/* mb_cache_entry_delete_or_get - remove a cache entry if it has no users
* @cache - cache we work with
* @key - key
* @value - value
*
- * Remove entry from cache @cache with key @key and value @value.
+ * Remove entry from cache @cache with key @key and value @value. The removal
+ * happens only if the entry is unused. The function returns NULL in case the
+ * entry was successfully removed or there's no such entry in the cache.
+ * Otherwise the function grabs a reference to the entry that could not be
+ * deleted because it still has users, and returns it.
*/
-void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
+struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
+ u32 key, u64 value)
{
- struct hlist_bl_node *node;
- struct hlist_bl_head *head;
struct mb_cache_entry *entry;
- head = mb_cache_entry_head(cache, key);
- hlist_bl_lock(head);
- hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
- if (entry->e_key == key && entry->e_value == value) {
- /* We keep hash list reference to keep entry alive */
- hlist_bl_del_init(&entry->e_hash_list);
- hlist_bl_unlock(head);
- spin_lock(&cache->c_list_lock);
- if (!list_empty(&entry->e_list)) {
- list_del_init(&entry->e_list);
- if (!WARN_ONCE(cache->c_entry_count == 0,
- "mbcache: attempt to decrement c_entry_count past zero"))
- cache->c_entry_count--;
- atomic_dec(&entry->e_refcnt);
- }
- spin_unlock(&cache->c_list_lock);
- mb_cache_entry_put(cache, entry);
- return;
- }
- }
- hlist_bl_unlock(head);
+ entry = mb_cache_entry_get(cache, key, value);
+ if (!entry)
+ return NULL;
+
+ /*
+ * Drop the ref we got from mb_cache_entry_get() and the initial hash
+ * ref if we are the last user
+ */
+ if (atomic_cmpxchg(&entry->e_refcnt, 2, 0) != 2)
+ return entry;
+
+ spin_lock(&cache->c_list_lock);
+ if (!list_empty(&entry->e_list))
+ list_del_init(&entry->e_list);
+ cache->c_entry_count--;
+ spin_unlock(&cache->c_list_lock);
+ __mb_cache_entry_free(cache, entry);
+ return NULL;
}
-EXPORT_SYMBOL(mb_cache_entry_delete);
+EXPORT_SYMBOL(mb_cache_entry_delete_or_get);
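The mbcache refcount convention changes underneath this: the hash table holds one reference, each user one more, and the LRU list none, so an entry dies exactly when e_refcnt reaches zero, and refcnt == 2 after a successful get means "hash ref plus ours, nobody else", which is what the cmpxchg above tests. A hedged sketch of the caller pattern this API is designed for, using mb_cache_entry_put() as seen elsewhere in this diff:

    entry = mb_cache_entry_delete_or_get(cache, key, value);
    if (entry) {
            /* entry still has users: wait until we would be the last
             * one, drop our reference, and let the caller retry. */
            mb_cache_entry_wait_unused(entry);
            mb_cache_entry_put(cache, entry);
    }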
/* mb_cache_entry_touch - cache entry got used
* @cache - cache the entry belongs to
@@ -281,34 +302,24 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
unsigned long nr_to_scan)
{
struct mb_cache_entry *entry;
- struct hlist_bl_head *head;
unsigned long shrunk = 0;
spin_lock(&cache->c_list_lock);
while (nr_to_scan-- && !list_empty(&cache->c_list)) {
entry = list_first_entry(&cache->c_list,
struct mb_cache_entry, e_list);
- if (entry->e_referenced) {
+ /* Drop initial hash reference if there is no user */
+ if (entry->e_referenced ||
+ atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) {
entry->e_referenced = 0;
list_move_tail(&entry->e_list, &cache->c_list);
continue;
}
list_del_init(&entry->e_list);
cache->c_entry_count--;
- /*
- * We keep LRU list reference so that entry doesn't go away
- * from under us.
- */
spin_unlock(&cache->c_list_lock);
- head = mb_cache_entry_head(cache, entry->e_key);
- hlist_bl_lock(head);
- if (!hlist_bl_unhashed(&entry->e_hash_list)) {
- hlist_bl_del_init(&entry->e_hash_list);
- atomic_dec(&entry->e_refcnt);
- }
- hlist_bl_unlock(head);
- if (mb_cache_entry_put(cache, entry))
- shrunk++;
+ __mb_cache_entry_free(cache, entry);
+ shrunk++;
cond_resched();
spin_lock(&cache->c_list_lock);
}
@@ -367,7 +378,7 @@ struct mb_cache *mb_cache_create(int bucket_bits)
cache->c_shrink.count_objects = mb_cache_count;
cache->c_shrink.scan_objects = mb_cache_scan;
cache->c_shrink.seeks = DEFAULT_SEEKS;
- if (register_shrinker(&cache->c_shrink)) {
+ if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) {
kfree(cache->c_hash);
kfree(cache);
goto err_out;
@@ -400,11 +411,6 @@ void mb_cache_destroy(struct mb_cache *cache)
* point.
*/
list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
- if (!hlist_bl_unhashed(&entry->e_hash_list)) {
- hlist_bl_del_init(&entry->e_hash_list);
- atomic_dec(&entry->e_refcnt);
- } else
- WARN_ON(1);
list_del(&entry->e_list);
WARN_ON(atomic_read(&entry->e_refcnt) != 1);
mb_cache_entry_put(cache, entry);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f1a6610e4ee6..da8bdd1712a7 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -402,9 +402,9 @@ static int minix_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, minix_get_block, wbc);
}
-static int minix_readpage(struct file *file, struct page *page)
+static int minix_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page,minix_get_block);
+ return block_read_full_folio(folio, minix_get_block);
}
int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
@@ -423,13 +423,12 @@ static void minix_write_failed(struct address_space *mapping, loff_t to)
}
static int minix_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep,
- minix_get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, minix_get_block);
if (unlikely(ret))
minix_write_failed(mapping, pos + len);
@@ -444,7 +443,7 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
static const struct address_space_operations minix_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = minix_readpage,
+ .read_folio = minix_read_folio,
.writepage = minix_writepage,
.write_begin = minix_write_begin,
.write_end = generic_write_end,
diff --git a/fs/mount.h b/fs/mount.h
index 0b6e08cf8afb..130c07c2f8d2 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -100,7 +100,6 @@ static inline int is_mounted(struct vfsmount *mnt)
extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
extern int __legitimize_mnt(struct vfsmount *, unsigned);
-extern bool legitimize_mnt(struct vfsmount *, unsigned);
static inline bool __path_is_mountpoint(const struct path *path)
{
diff --git a/fs/mpage.c b/fs/mpage.c
index 1fe56f8c495f..0f8ae954a579 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -36,7 +36,7 @@
*
* The mpage code never puts partial pages into a BIO (except for end-of-file).
* If a page does not map to a contiguous run of blocks then it simply falls
- * back to block_read_full_page().
+ * back to block_read_full_folio().
*
* Why is this? If a page's completion depends on a number of different BIOs
* which can complete in any order (or at the same time) then determining the
@@ -68,33 +68,35 @@ static struct bio *mpage_bio_submit(struct bio *bio)
/*
* support function for mpage_readahead. The fs supplied get_block might
* return an up to date buffer. This is used to map that buffer into
- * the page, which allows readpage to avoid triggering a duplicate call
+ * the page, which allows read_folio to avoid triggering a duplicate call
* to get_block.
*
* The idea is to avoid adding buffers to pages that don't already have
* them. So when the buffer is up to date and the page size == block size,
* this marks the page up to date instead of adding new buffers.
*/
-static void
-map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
+static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
+ int page_block)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct buffer_head *page_bh, *head;
int block = 0;
- if (!page_has_buffers(page)) {
+ head = folio_buffers(folio);
+ if (!head) {
/*
* don't make any buffers if there is only one buffer on
- * the page and the page just needs to be set up to date
+ * the folio and the folio just needs to be set up to date
*/
if (inode->i_blkbits == PAGE_SHIFT &&
buffer_uptodate(bh)) {
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
return;
}
- create_empty_buffers(page, i_blocksize(inode), 0);
+ create_empty_buffers(&folio->page, i_blocksize(inode), 0);
+ head = folio_buffers(folio);
}
- head = page_buffers(page);
+
page_bh = head;
do {
if (block == page_block) {
@@ -110,7 +112,7 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
struct mpage_readpage_args {
struct bio *bio;
- struct page *page;
+ struct folio *folio;
unsigned int nr_pages;
bool is_readahead;
sector_t last_block_in_bio;
@@ -130,8 +132,8 @@ struct mpage_readpage_args {
*/
static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
{
- struct page *page = args->page;
- struct inode *inode = page->mapping->host;
+ struct folio *folio = args->folio;
+ struct inode *inode = folio->mapping->host;
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
@@ -145,20 +147,23 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
struct block_device *bdev = NULL;
int length;
int fully_mapped = 1;
- int op = REQ_OP_READ;
+ blk_opf_t opf = REQ_OP_READ;
unsigned nblocks;
unsigned relative_block;
- gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+ gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
+
+ /* MAX_BUF_PER_PAGE, for example */
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
if (args->is_readahead) {
- op |= REQ_RAHEAD;
+ opf |= REQ_RAHEAD;
gfp |= __GFP_NORETRY | __GFP_NOWARN;
}
- if (page_has_buffers(page))
+ if (folio_buffers(folio))
goto confused;
- block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
+ block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + args->nr_pages * blocks_per_page;
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
@@ -191,9 +196,9 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
}
/*
- * Then do more get_blocks calls until we are done with this page.
+ * Then do more get_blocks calls until we are done with this folio.
*/
- map_bh->b_page = page;
+ map_bh->b_page = &folio->page;
while (page_block < blocks_per_page) {
map_bh->b_state = 0;
map_bh->b_size = 0;
@@ -216,12 +221,12 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
/* some filesystems will copy data into the page during
* the get_block call, in which case we don't want to
- * read it again. map_buffer_to_page copies the data
- * we just collected from get_block into the page's buffers
- * so readpage doesn't have to repeat the get_block call
+ * read it again. map_buffer_to_folio copies the data
+ * we just collected from get_block into the folio's buffers
+ * so read_folio doesn't have to repeat the get_block call
*/
if (buffer_uptodate(map_bh)) {
- map_buffer_to_page(page, map_bh, page_block);
+ map_buffer_to_folio(folio, map_bh, page_block);
goto confused;
}
@@ -246,18 +251,18 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
}
if (first_hole != blocks_per_page) {
- zero_user_segment(page, first_hole << blkbits, PAGE_SIZE);
+ folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE);
if (first_hole == 0) {
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
goto out;
}
} else if (fully_mapped) {
- SetPageMappedToDisk(page);
+ folio_set_mappedtodisk(folio);
}
/*
- * This page will go to BIO. Do we need to send this BIO off first?
+ * This folio will go to BIO. Do we need to send this BIO off first?
*/
if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
args->bio = mpage_bio_submit(args->bio);
@@ -266,10 +271,10 @@ alloc_new:
if (args->bio == NULL) {
if (first_hole == blocks_per_page) {
if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9),
- page))
+ &folio->page))
goto out;
}
- args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), op,
+ args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf,
gfp);
if (args->bio == NULL)
goto confused;
@@ -277,7 +282,7 @@ alloc_new:
}
length = first_hole << blkbits;
- if (bio_add_page(args->bio, page, length, 0) < length) {
+ if (!bio_add_folio(args->bio, folio, length, 0)) {
args->bio = mpage_bio_submit(args->bio);
goto alloc_new;
}
@@ -295,10 +300,10 @@ out:
confused:
if (args->bio)
args->bio = mpage_bio_submit(args->bio);
- if (!PageUptodate(page))
- block_read_full_page(page, args->get_block);
+ if (!folio_test_uptodate(folio))
+ block_read_full_folio(folio, args->get_block);
else
- unlock_page(page);
+ folio_unlock(folio);
goto out;
}
@@ -343,18 +348,17 @@ confused:
*/
void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
{
- struct page *page;
+ struct folio *folio;
struct mpage_readpage_args args = {
.get_block = get_block,
.is_readahead = true,
};
- while ((page = readahead_page(rac))) {
- prefetchw(&page->flags);
- args.page = page;
+ while ((folio = readahead_folio(rac))) {
+ prefetchw(&folio->flags);
+ args.folio = folio;
args.nr_pages = readahead_count(rac);
args.bio = do_mpage_readpage(&args);
- put_page(page);
}
if (args.bio)
mpage_bio_submit(args.bio);
@@ -364,10 +368,10 @@ EXPORT_SYMBOL(mpage_readahead);
/*
* This isn't called much at all
*/
-int mpage_readpage(struct page *page, get_block_t get_block)
+int mpage_read_folio(struct folio *folio, get_block_t get_block)
{
struct mpage_readpage_args args = {
- .page = page,
+ .folio = folio,
.nr_pages = 1,
.get_block = get_block,
};
@@ -377,7 +381,7 @@ int mpage_readpage(struct page *page, get_block_t get_block)
mpage_bio_submit(args.bio);
return 0;
}
-EXPORT_SYMBOL(mpage_readpage);
+EXPORT_SYMBOL(mpage_read_folio);
/*
* Writing is not so simple.
@@ -400,7 +404,6 @@ struct mpage_data {
struct bio *bio;
sector_t last_block_in_bio;
get_block_t *get_block;
- unsigned use_writepage;
};
/*
@@ -425,11 +428,11 @@ static void clean_buffers(struct page *page, unsigned first_unmapped)
/*
* we cannot drop the bh if the page is not uptodate or a concurrent
- * readpage would fail to serialize with the bh and it would read from
+ * read_folio would fail to serialize with the bh and it would read from
* disk before we reach the platter.
*/
if (buffer_heads_over_limit && PageUptodate(page))
- try_to_free_buffers(page);
+ try_to_free_buffers(page_folio(page));
}
/*
@@ -510,7 +513,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
/*
* Page has buffers, but they are all unmapped. The page was
* created by pagein or read over a hole which was handled by
- * block_read_full_page(). If this address_space is also
+ * block_read_full_folio(). If this address_space is also
* using mpage_readahead then this can rarely happen.
*/
goto confused;
@@ -620,15 +623,10 @@ confused:
if (bio)
bio = mpage_bio_submit(bio);
- if (mpd->use_writepage) {
- ret = mapping->a_ops->writepage(page, wbc);
- } else {
- ret = -EAGAIN;
- goto out;
- }
/*
* The caller has a ref on the inode, so *mapping is stable
*/
+ ret = block_write_full_page(page, mpd->get_block, wbc);
mapping_set_error(mapping, ret);
out:
mpd->bio = bio;
@@ -640,8 +638,6 @@ out:
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
* @get_block: the filesystem's block mapper function.
- * If this is NULL then use a_ops->writepage. Otherwise, go
- * direct-to-BIO.
*
* This is a library function, which implements the writepages()
* address_space_operation.
@@ -658,42 +654,17 @@ int
mpage_writepages(struct address_space *mapping,
struct writeback_control *wbc, get_block_t get_block)
{
+ struct mpage_data mpd = {
+ .get_block = get_block,
+ };
struct blk_plug plug;
int ret;
blk_start_plug(&plug);
-
- if (!get_block)
- ret = generic_writepages(mapping, wbc);
- else {
- struct mpage_data mpd = {
- .bio = NULL,
- .last_block_in_bio = 0,
- .get_block = get_block,
- .use_writepage = 1,
- };
-
- ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
- if (mpd.bio)
- mpage_bio_submit(mpd.bio);
- }
- blk_finish_plug(&plug);
- return ret;
-}
-EXPORT_SYMBOL(mpage_writepages);
-
-int mpage_writepage(struct page *page, get_block_t get_block,
- struct writeback_control *wbc)
-{
- struct mpage_data mpd = {
- .bio = NULL,
- .last_block_in_bio = 0,
- .get_block = get_block,
- .use_writepage = 0,
- };
- int ret = __mpage_writepage(page, wbc, &mpd);
+ ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
if (mpd.bio)
mpage_bio_submit(mpd.bio);
+ blk_finish_plug(&plug);
return ret;
}
-EXPORT_SYMBOL(mpage_writepage);
+EXPORT_SYMBOL(mpage_writepages);
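With mpage_writepage() gone, mpage_writepages() no longer accepts a NULL get_block: writeback always goes direct-to-BIO via __mpage_writepage(), falling back to block_write_full_page() for the confused case. Callers that used to pass NULL to get generic_writepages() behaviour must now call it themselves. A hedged caller-side sketch; example_get_block is hypothetical:

    static int example_writepages(struct address_space *mapping,
                                  struct writeback_control *wbc)
    {
            return mpage_writepages(mapping, wbc, example_get_block);
    }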
diff --git a/fs/namei.c b/fs/namei.c
index 3f1829b3ab5b..8533087e5dac 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
+#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/security.h>
@@ -566,7 +567,7 @@ struct nameidata {
struct path root;
struct inode *inode; /* path.dentry.d_inode */
unsigned int flags, state;
- unsigned seq, m_seq, r_seq;
+ unsigned seq, next_seq, m_seq, r_seq;
int last_type;
unsigned depth;
int total_link_count;
@@ -664,6 +665,13 @@ static void drop_links(struct nameidata *nd)
}
}
+static void leave_rcu(struct nameidata *nd)
+{
+ nd->flags &= ~LOOKUP_RCU;
+ nd->seq = nd->next_seq = 0;
+ rcu_read_unlock();
+}
+
static void terminate_walk(struct nameidata *nd)
{
drop_links(nd);
@@ -677,8 +685,7 @@ static void terminate_walk(struct nameidata *nd)
nd->state &= ~ND_ROOT_GRABBED;
}
} else {
- nd->flags &= ~LOOKUP_RCU;
- rcu_read_unlock();
+ leave_rcu(nd);
}
nd->depth = 0;
nd->path.mnt = NULL;
@@ -729,13 +736,6 @@ static bool legitimize_links(struct nameidata *nd)
static bool legitimize_root(struct nameidata *nd)
{
- /*
- * For scoped-lookups (where nd->root has been zeroed), we need to
- * restart the whole lookup from scratch -- because set_root() is wrong
- * for these lookups (nd->dfd is the root, not the filesystem root).
- */
- if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED))
- return false;
/* Nothing to do if nd->root is zero or is managed by the VFS user. */
if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
return true;
@@ -771,14 +771,13 @@ static bool try_to_unlazy(struct nameidata *nd)
BUG_ON(!(nd->flags & LOOKUP_RCU));
- nd->flags &= ~LOOKUP_RCU;
if (unlikely(!legitimize_links(nd)))
goto out1;
if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
goto out;
if (unlikely(!legitimize_root(nd)))
goto out;
- rcu_read_unlock();
+ leave_rcu(nd);
BUG_ON(nd->inode != parent->d_inode);
return true;
@@ -786,7 +785,7 @@ out1:
nd->path.mnt = NULL;
nd->path.dentry = NULL;
out:
- rcu_read_unlock();
+ leave_rcu(nd);
return false;
}
@@ -794,24 +793,27 @@ out:
* try_to_unlazy_next - try to switch to ref-walk mode.
* @nd: nameidata pathwalk data
* @dentry: next dentry to step into
- * @seq: seq number to check @dentry against
* Returns: true on success, false on failure
*
- * Similar to to try_to_unlazy(), but here we have the next dentry already
+ * Similar to try_to_unlazy(), but here we have the next dentry already
* picked by rcu-walk and want to legitimize that in addition to the current
* nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context.
* Nothing should touch nameidata between try_to_unlazy_next() failure and
* terminate_walk().
*/
-static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
+static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
{
+ int res;
BUG_ON(!(nd->flags & LOOKUP_RCU));
- nd->flags &= ~LOOKUP_RCU;
if (unlikely(!legitimize_links(nd)))
goto out2;
- if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
- goto out2;
+ res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
+ if (unlikely(res)) {
+ if (res > 0)
+ goto out2;
+ goto out1;
+ }
if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
goto out1;
@@ -824,7 +826,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi
*/
if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
goto out;
- if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
+ if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
goto out_dput;
/*
* Sequence counts matched. Now make sure that the root is
@@ -832,7 +834,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi
*/
if (unlikely(!legitimize_root(nd)))
goto out_dput;
- rcu_read_unlock();
+ leave_rcu(nd);
return true;
out2:
@@ -840,10 +842,10 @@ out2:
out1:
nd->path.dentry = NULL;
out:
- rcu_read_unlock();
+ leave_rcu(nd);
return false;
out_dput:
- rcu_read_unlock();
+ leave_rcu(nd);
dput(dentry);
return false;
}
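The seqcount sampled for the next dentry moves out of local variables and parameter lists into nameidata itself, as nd->next_seq, with leave_rcu() clearing both seq fields on every exit from rcu-walk. That is what lets try_to_unlazy_next(), step_into() and friends drop their seq arguments. The invariant, distilled as a fragment:

    /* Whoever picks the next dentry samples into nd->next_seq ... */
    nd->next_seq = read_seqcount_begin(&dentry->d_seq);

    /* ... and every later validation reads back the same field. */
    if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
            return ERR_PTR(-ECHILD);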
@@ -968,7 +970,7 @@ static int nd_jump_root(struct nameidata *nd)
d = nd->path.dentry;
nd->inode = d->d_inode;
nd->seq = nd->root_seq;
- if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
+ if (read_seqcount_retry(&d->d_seq, nd->seq))
return -ECHILD;
} else {
path_put(&nd->path);
@@ -984,7 +986,7 @@ static int nd_jump_root(struct nameidata *nd)
* Helper to directly jump to a known parsed path from ->get_link,
* caller must have taken a reference to path beforehand.
*/
-int nd_jump_link(struct path *path)
+int nd_jump_link(const struct path *path)
{
int error = -ELOOP;
struct nameidata *nd = current->nameidata;
@@ -1031,7 +1033,7 @@ static struct ctl_table namei_sysctls[] = {
.procname = "protected_symlinks",
.data = &sysctl_protected_symlinks,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
@@ -1040,7 +1042,7 @@ static struct ctl_table namei_sysctls[] = {
.procname = "protected_hardlinks",
.data = &sysctl_protected_hardlinks,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
@@ -1049,7 +1051,7 @@ static struct ctl_table namei_sysctls[] = {
.procname = "protected_fifos",
.data = &sysctl_protected_fifos,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
@@ -1058,7 +1060,7 @@ static struct ctl_table namei_sysctls[] = {
.procname = "protected_regular",
.data = &sysctl_protected_regular,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
@@ -1176,7 +1178,7 @@ static bool safe_hardlink_source(struct user_namespace *mnt_userns,
*
* Returns 0 if successful, -ve on error.
*/
-int may_linkat(struct user_namespace *mnt_userns, struct path *link)
+int may_linkat(struct user_namespace *mnt_userns, const struct path *link)
{
struct inode *inode = link->dentry->d_inode;
@@ -1472,8 +1474,7 @@ EXPORT_SYMBOL(follow_down);
* Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
* we meet a managed dentry that would need blocking.
*/
-static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
- struct inode **inode, unsigned *seqp)
+static bool __follow_mount_rcu(struct nameidata *nd, struct path *path)
{
struct dentry *dentry = path->dentry;
unsigned int flags = dentry->d_flags;
@@ -1502,15 +1503,12 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
path->mnt = &mounted->mnt;
dentry = path->dentry = mounted->mnt.mnt_root;
nd->state |= ND_JUMPED;
- *seqp = read_seqcount_begin(&dentry->d_seq);
- *inode = dentry->d_inode;
- /*
- * We don't need to re-check ->d_seq after this
- * ->d_inode read - there will be an RCU delay
- * between mount hash removal and ->mnt_root
- * becoming unpinned.
- */
+ nd->next_seq = read_seqcount_begin(&dentry->d_seq);
flags = dentry->d_flags;
+ // makes sure that non-RCU pathwalk could reach
+ // this state.
+ if (read_seqretry(&mount_lock, nd->m_seq))
+ return false;
continue;
}
if (read_seqretry(&mount_lock, nd->m_seq))
@@ -1521,8 +1519,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
}
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
- struct path *path, struct inode **inode,
- unsigned int *seqp)
+ struct path *path)
{
bool jumped;
int ret;
@@ -1530,16 +1527,15 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
path->mnt = nd->path.mnt;
path->dentry = dentry;
if (nd->flags & LOOKUP_RCU) {
- unsigned int seq = *seqp;
- if (unlikely(!*inode))
- return -ENOENT;
- if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
+ unsigned int seq = nd->next_seq;
+ if (likely(__follow_mount_rcu(nd, path)))
return 0;
- if (!try_to_unlazy_next(nd, dentry, seq))
- return -ECHILD;
- // *path might've been clobbered by __follow_mount_rcu()
+ // *path and nd->next_seq might've been clobbered
path->mnt = nd->path.mnt;
path->dentry = dentry;
+ nd->next_seq = seq;
+ if (!try_to_unlazy_next(nd, dentry))
+ return -ECHILD;
}
ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
if (jumped) {
@@ -1552,9 +1548,6 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
dput(path->dentry);
if (path->mnt != nd->path.mnt)
mntput(path->mnt);
- } else {
- *inode = d_backing_inode(path->dentry);
- *seqp = 0; /* out of RCU mode, so the value doesn't matter */
}
return ret;
}
@@ -1613,9 +1606,7 @@ static struct dentry *__lookup_hash(const struct qstr *name,
return dentry;
}
-static struct dentry *lookup_fast(struct nameidata *nd,
- struct inode **inode,
- unsigned *seqp)
+static struct dentry *lookup_fast(struct nameidata *nd)
{
struct dentry *dentry, *parent = nd->path.dentry;
int status = 1;
@@ -1626,8 +1617,7 @@ static struct dentry *lookup_fast(struct nameidata *nd,
* going to fall back to non-racy lookup.
*/
if (nd->flags & LOOKUP_RCU) {
- unsigned seq;
- dentry = __d_lookup_rcu(parent, &nd->last, &seq);
+ dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq);
if (unlikely(!dentry)) {
if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
@@ -1635,28 +1625,16 @@ static struct dentry *lookup_fast(struct nameidata *nd,
}
/*
- * This sequence count validates that the inode matches
- * the dentry name information from lookup.
- */
- *inode = d_backing_inode(dentry);
- if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
- return ERR_PTR(-ECHILD);
-
- /*
* This sequence count validates that the parent had no
* changes while we did the lookup of the dentry above.
- *
- * The memory barrier in read_seqcount_begin of child is
- * enough, we can use __read_seqcount_retry here.
*/
- if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
+ if (read_seqcount_retry(&parent->d_seq, nd->seq))
return ERR_PTR(-ECHILD);
- *seqp = seq;
status = d_revalidate(dentry, nd->flags);
if (likely(status > 0))
return dentry;
- if (!try_to_unlazy_next(nd, dentry, seq))
+ if (!try_to_unlazy_next(nd, dentry))
return ERR_PTR(-ECHILD);
if (status == -ECHILD)
/* we'd been told to redo it in non-rcu mode */
@@ -1737,7 +1715,7 @@ static inline int may_lookup(struct user_namespace *mnt_userns,
return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
}
-static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
+static int reserve_stack(struct nameidata *nd, struct path *link)
{
if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
return -ELOOP;
@@ -1752,9 +1730,9 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
if (nd->flags & LOOKUP_RCU) {
// we need to grab link before we do unlazy. And we can't skip
// unlazy even if we fail to grab the link - cleanup needs it
- bool grabbed_link = legitimize_path(nd, link, seq);
+ bool grabbed_link = legitimize_path(nd, link, nd->next_seq);
- if (!try_to_unlazy(nd) != 0 || !grabbed_link)
+ if (!try_to_unlazy(nd) || !grabbed_link)
return -ECHILD;
if (nd_alloc_stack(nd))
@@ -1766,11 +1744,11 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
static const char *pick_link(struct nameidata *nd, struct path *link,
- struct inode *inode, unsigned seq, int flags)
+ struct inode *inode, int flags)
{
struct saved *last;
const char *res;
- int error = reserve_stack(nd, link, seq);
+ int error = reserve_stack(nd, link);
if (unlikely(error)) {
if (!(nd->flags & LOOKUP_RCU))
@@ -1780,7 +1758,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
last = nd->stack + nd->depth++;
last->link = *link;
clear_delayed_call(&last->done);
- last->seq = seq;
+ last->seq = nd->next_seq;
if (flags & WALK_TRAILING) {
error = may_follow_link(nd, inode);
@@ -1842,43 +1820,50 @@ all_done: // pure jump
* to do this check without having to look at inode->i_op,
* so we keep a cache of "no, this doesn't need follow_link"
* for the common case.
+ *
+ * NOTE: dentry must be what nd->next_seq had been sampled from.
*/
static const char *step_into(struct nameidata *nd, int flags,
- struct dentry *dentry, struct inode *inode, unsigned seq)
+ struct dentry *dentry)
{
struct path path;
- int err = handle_mounts(nd, dentry, &path, &inode, &seq);
+ struct inode *inode;
+ int err = handle_mounts(nd, dentry, &path);
if (err < 0)
return ERR_PTR(err);
+ inode = path.dentry->d_inode;
if (likely(!d_is_symlink(path.dentry)) ||
((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
(flags & WALK_NOFOLLOW)) {
/* not a symlink or should not follow */
- if (!(nd->flags & LOOKUP_RCU)) {
+ if (nd->flags & LOOKUP_RCU) {
+ if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
+ return ERR_PTR(-ECHILD);
+ if (unlikely(!inode))
+ return ERR_PTR(-ENOENT);
+ } else {
dput(nd->path.dentry);
if (nd->path.mnt != path.mnt)
mntput(nd->path.mnt);
}
nd->path = path;
nd->inode = inode;
- nd->seq = seq;
+ nd->seq = nd->next_seq;
return NULL;
}
if (nd->flags & LOOKUP_RCU) {
/* make sure that d_is_symlink above matches inode */
- if (read_seqcount_retry(&path.dentry->d_seq, seq))
+ if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
return ERR_PTR(-ECHILD);
} else {
if (path.mnt == nd->path.mnt)
mntget(path.mnt);
}
- return pick_link(nd, &path, inode, seq, flags);
+ return pick_link(nd, &path, inode, flags);
}
-static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
- struct inode **inodep,
- unsigned *seqp)
+static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
{
struct dentry *parent, *old;
@@ -1895,30 +1880,30 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
nd->path = path;
nd->inode = path.dentry->d_inode;
nd->seq = seq;
- if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+ // makes sure that non-RCU pathwalk could reach this state
+ if (read_seqretry(&mount_lock, nd->m_seq))
return ERR_PTR(-ECHILD);
/* we know that mountpoint was pinned */
}
old = nd->path.dentry;
parent = old->d_parent;
- *inodep = parent->d_inode;
- *seqp = read_seqcount_begin(&parent->d_seq);
- if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
+ nd->next_seq = read_seqcount_begin(&parent->d_seq);
+ // makes sure that non-RCU pathwalk could reach this state
+ if (read_seqcount_retry(&old->d_seq, nd->seq))
return ERR_PTR(-ECHILD);
if (unlikely(!path_connected(nd->path.mnt, parent)))
return ERR_PTR(-ECHILD);
return parent;
in_root:
- if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+ if (read_seqretry(&mount_lock, nd->m_seq))
return ERR_PTR(-ECHILD);
if (unlikely(nd->flags & LOOKUP_BENEATH))
return ERR_PTR(-ECHILD);
- return NULL;
+ nd->next_seq = nd->seq;
+ return nd->path.dentry;
}
-static struct dentry *follow_dotdot(struct nameidata *nd,
- struct inode **inodep,
- unsigned *seqp)
+static struct dentry *follow_dotdot(struct nameidata *nd)
{
struct dentry *parent;
@@ -1942,15 +1927,12 @@ static struct dentry *follow_dotdot(struct nameidata *nd,
dput(parent);
return ERR_PTR(-ENOENT);
}
- *seqp = 0;
- *inodep = parent->d_inode;
return parent;
in_root:
if (unlikely(nd->flags & LOOKUP_BENEATH))
return ERR_PTR(-EXDEV);
- dget(nd->path.dentry);
- return NULL;
+ return dget(nd->path.dentry);
}
static const char *handle_dots(struct nameidata *nd, int type)
@@ -1958,8 +1940,6 @@ static const char *handle_dots(struct nameidata *nd, int type)
if (type == LAST_DOTDOT) {
const char *error = NULL;
struct dentry *parent;
- struct inode *inode;
- unsigned seq;
if (!nd->root.mnt) {
error = ERR_PTR(set_root(nd));
@@ -1967,17 +1947,12 @@ static const char *handle_dots(struct nameidata *nd, int type)
return error;
}
if (nd->flags & LOOKUP_RCU)
- parent = follow_dotdot_rcu(nd, &inode, &seq);
+ parent = follow_dotdot_rcu(nd);
else
- parent = follow_dotdot(nd, &inode, &seq);
+ parent = follow_dotdot(nd);
if (IS_ERR(parent))
return ERR_CAST(parent);
- if (unlikely(!parent))
- error = step_into(nd, WALK_NOFOLLOW,
- nd->path.dentry, nd->inode, nd->seq);
- else
- error = step_into(nd, WALK_NOFOLLOW,
- parent, inode, seq);
+ error = step_into(nd, WALK_NOFOLLOW, parent);
if (unlikely(error))
return error;
@@ -1989,9 +1964,9 @@ static const char *handle_dots(struct nameidata *nd, int type)
* some fallback).
*/
smp_rmb();
- if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
+ if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
return ERR_PTR(-EAGAIN);
- if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
+ if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
return ERR_PTR(-EAGAIN);
}
}
@@ -2001,8 +1976,6 @@ static const char *handle_dots(struct nameidata *nd, int type)
static const char *walk_component(struct nameidata *nd, int flags)
{
struct dentry *dentry;
- struct inode *inode;
- unsigned seq;
/*
* "." and ".." are special - ".." especially so because it has
* to be able to know about the current root directory and
@@ -2013,7 +1986,7 @@ static const char *walk_component(struct nameidata *nd, int flags)
put_link(nd);
return handle_dots(nd, nd->last_type);
}
- dentry = lookup_fast(nd, &inode, &seq);
+ dentry = lookup_fast(nd);
if (IS_ERR(dentry))
return ERR_CAST(dentry);
if (unlikely(!dentry)) {
@@ -2023,7 +1996,7 @@ static const char *walk_component(struct nameidata *nd, int flags)
}
if (!(flags & WALK_MORE) && nd->depth)
put_link(nd);
- return step_into(nd, flags, dentry, inode, seq);
+ return step_into(nd, flags, dentry);
}
/*
@@ -2378,6 +2351,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
flags &= ~LOOKUP_RCU;
if (flags & LOOKUP_RCU)
rcu_read_lock();
+ else
+ nd->seq = nd->next_seq = 0;
nd->flags = flags;
nd->state |= ND_JUMPED;
@@ -2479,8 +2454,8 @@ static int handle_lookup_down(struct nameidata *nd)
{
if (!(nd->flags & LOOKUP_RCU))
dget(nd->path.dentry);
- return PTR_ERR(step_into(nd, WALK_NOFOLLOW,
- nd->path.dentry, nd->inode, nd->seq));
+ nd->next_seq = nd->seq;
+ return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry));
}
/* Returns 0 and nd will be valid on success; returns an error otherwise. */
@@ -2768,7 +2743,8 @@ struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name,
EXPORT_SYMBOL(lookup_one);
/**
- * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * lookup_one_unlocked - filesystem helper to lookup single pathname component
+ * @mnt_userns: idmapping of the mount the lookup is performed from
* @name: pathname component to lookup
* @base: base directory to lookup from
* @len: maximum length of @name to be interpreted
@@ -2779,14 +2755,15 @@ EXPORT_SYMBOL(lookup_one);
* Unlike lookup_one_len, it should be called without the parent
* i_mutex held, and will take the i_mutex itself if necessary.
*/
-struct dentry *lookup_one_len_unlocked(const char *name,
- struct dentry *base, int len)
+struct dentry *lookup_one_unlocked(struct user_namespace *mnt_userns,
+ const char *name, struct dentry *base,
+ int len)
{
struct qstr this;
int err;
struct dentry *ret;
- err = lookup_one_common(&init_user_ns, name, base, len, &this);
+ err = lookup_one_common(mnt_userns, name, base, len, &this);
if (err)
return ERR_PTR(err);
@@ -2795,6 +2772,59 @@ struct dentry *lookup_one_len_unlocked(const char *name,
ret = lookup_slow(&this, base, 0);
return ret;
}
+EXPORT_SYMBOL(lookup_one_unlocked);
+
+/**
+ * lookup_one_positive_unlocked - filesystem helper to lookup single
+ * pathname component
+ * @mnt_userns: idmapping of the mount the lookup is performed from
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length of @name to be interpreted
+ *
+ * This helper will yield ERR_PTR(-ENOENT) on negatives; it returns either a
+ * known-positive dentry or an ERR_PTR(). This is what most users want.
+ *
+ * Note that pinned negative with unlocked parent _can_ become positive at any
+ * time, so callers of lookup_one_unlocked() need to be very careful; pinned
+ * positives have ->d_inode stable, so this one avoids such problems.
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * The helper should be called without i_mutex held.
+ */
+struct dentry *lookup_one_positive_unlocked(struct user_namespace *mnt_userns,
+ const char *name,
+ struct dentry *base, int len)
+{
+ struct dentry *ret = lookup_one_unlocked(mnt_userns, name, base, len);
+
+ if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
+ dput(ret);
+ ret = ERR_PTR(-ENOENT);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(lookup_one_positive_unlocked);
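A hedged usage sketch for the new helper, not taken from any caller in this patch; sb_root and the component name are illustrative:

	struct dentry *child;

	child = lookup_one_positive_unlocked(&init_user_ns, "state",
					     sb_root, strlen("state"));
	if (IS_ERR(child))
		return PTR_ERR(child);	/* negatives surface as -ENOENT */
	/* child is pinned and positive: ->d_inode is stable here */
	dput(child);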
+
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length of @name to be interpreted
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+ struct dentry *base, int len)
+{
+ return lookup_one_unlocked(&init_user_ns, name, base, len);
+}
EXPORT_SYMBOL(lookup_one_len_unlocked);
/*
@@ -2808,12 +2838,7 @@ EXPORT_SYMBOL(lookup_one_len_unlocked);
struct dentry *lookup_positive_unlocked(const char *name,
struct dentry *base, int len)
{
- struct dentry *ret = lookup_one_len_unlocked(name, base, len);
- if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
- dput(ret);
- ret = ERR_PTR(-ENOENT);
- }
- return ret;
+ return lookup_one_positive_unlocked(&init_user_ns, name, base, len);
}
EXPORT_SYMBOL(lookup_positive_unlocked);
@@ -2999,6 +3024,65 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
EXPORT_SYMBOL(unlock_rename);
/**
+ * mode_strip_umask - handle vfs umask stripping
+ * @dir: parent directory of the new inode
+ * @mode: mode of the new inode to be created in @dir
+ *
+ * Umask stripping depends on whether or not the filesystem supports POSIX
+ * ACLs. If the filesystem doesn't support them, umask stripping is done
+ * directly here. If the filesystem does support POSIX ACLs, umask stripping
+ * is deferred until the filesystem calls posix_acl_create().
+ *
+ * Returns: mode
+ */
+static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode)
+{
+ if (!IS_POSIXACL(dir))
+ mode &= ~current_umask();
+ return mode;
+}
+
+/**
+ * vfs_prepare_mode - prepare the mode to be used for a new inode
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @dir: parent directory of the new inode
+ * @mode: mode of the new inode
+ * @mask_perms: allowed permission by the vfs
+ * @type: type of file to be created
+ *
+ * This helper consolidates and enforces vfs restrictions on the @mode of a new
+ * object to be created.
+ *
+ * Umask stripping depends on whether the filesystem supports POSIX ACLs (see
+ * the kernel documentation for mode_strip_umask()). Moving umask stripping
+ * after setgid stripping allows the same ordering for both non-POSIX ACL and
+ * POSIX ACL supporting filesystems.
+ *
+ * Note that it's currently valid for @type to be 0 if a directory is created.
+ * Filesystems raise that flag individually and we need to check whether each
+ * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
+ * non-zero type.
+ *
+ * Returns: mode to be passed to the filesystem
+ */
+static inline umode_t vfs_prepare_mode(struct user_namespace *mnt_userns,
+ const struct inode *dir, umode_t mode,
+ umode_t mask_perms, umode_t type)
+{
+ mode = mode_strip_sgid(mnt_userns, dir, mode);
+ mode = mode_strip_umask(dir, mode);
+
+ /*
+ * Apply the vfs mandated allowed permission mask and set the type of
+ * file to be created before we call into the filesystem.
+ */
+ mode &= (mask_perms & ~S_IFMT);
+ mode |= (type & S_IFMT);
+
+ return mode;
+}
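A worked example of the helper's arithmetic (illustrative values, not from the patch): creating a regular file with open(..., O_CREAT, 0666) under umask 022 on a filesystem without POSIX ACL support:

	/*
	 * mode = mode_strip_sgid(mnt_userns, dir, 0666);  no S_ISGID in play
	 * mode = mode_strip_umask(dir, 0666);             0666 & ~0022 = 0644
	 * mode &= (S_IALLUGO & ~S_IFMT);                   0644
	 * mode |= (S_IFREG & S_IFMT);                      S_IFREG | 0644
	 */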
+
+/**
* vfs_create - create new file
* @mnt_userns: user namespace of the mount the inode was found from
* @dir: inode of @dentry
@@ -3023,8 +3107,8 @@ int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
if (!dir->i_op->create)
return -EACCES; /* shouldn't it be ENOSYS? */
- mode &= S_IALLUGO;
- mode |= S_IFREG;
+
+ mode = vfs_prepare_mode(mnt_userns, dir, mode, S_IALLUGO, S_IFREG);
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
@@ -3287,8 +3371,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
if (open_flag & O_CREAT) {
if (open_flag & O_EXCL)
open_flag &= ~O_TRUNC;
- if (!IS_POSIXACL(dir->d_inode))
- mode &= ~current_umask();
+ mode = vfs_prepare_mode(mnt_userns, dir->d_inode, mode, mode, mode);
if (likely(got_write))
create_error = may_o_create(mnt_userns, &nd->path,
dentry, mode);
@@ -3349,8 +3432,6 @@ static const char *open_last_lookups(struct nameidata *nd,
struct dentry *dir = nd->path.dentry;
int open_flag = op->open_flag;
bool got_write = false;
- unsigned seq;
- struct inode *inode;
struct dentry *dentry;
const char *res;
@@ -3366,7 +3447,7 @@ static const char *open_last_lookups(struct nameidata *nd,
if (nd->last.name[nd->last.len])
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
/* we _can_ be in RCU mode here */
- dentry = lookup_fast(nd, &inode, &seq);
+ dentry = lookup_fast(nd);
if (IS_ERR(dentry))
return ERR_CAST(dentry);
if (likely(dentry))
@@ -3420,7 +3501,7 @@ static const char *open_last_lookups(struct nameidata *nd,
finish_lookup:
if (nd->depth)
put_link(nd);
- res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
+ res = step_into(nd, WALK_TRAILING, dentry);
if (unlikely(res))
nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
return res;
@@ -3521,6 +3602,7 @@ struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
child = d_alloc(dentry, &slash_name);
if (unlikely(!child))
goto out_err;
+ mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode);
error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
if (error)
goto out_err;
@@ -3673,18 +3755,14 @@ static struct dentry *filename_create(int dfd, struct filename *name,
{
struct dentry *dentry = ERR_PTR(-EEXIST);
struct qstr last;
+ bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
+ unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
+ unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL;
int type;
int err2;
int error;
- bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
-
- /*
- * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
- * other flags passed in are ignored!
- */
- lookup_flags &= LOOKUP_REVAL;
- error = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
+ error = filename_parentat(dfd, name, reval_flag, path, &last, &type);
if (error)
return ERR_PTR(error);
@@ -3698,11 +3776,13 @@ static struct dentry *filename_create(int dfd, struct filename *name,
/* don't fail immediately if it's r/o, at least try to report other errors */
err2 = mnt_want_write(path->mnt);
/*
- * Do the final lookup.
+ * Do the final lookup. Suppress 'create' if there is a trailing
+ * '/', and a directory wasn't requested.
*/
- lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
+ if (last.name[last.len] && !want_dir)
+ create_flags = 0;
inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
- dentry = __lookup_hash(&last, path->dentry, lookup_flags);
+ dentry = __lookup_hash(&last, path->dentry, reval_flag | create_flags);
if (IS_ERR(dentry))
goto unlock;
@@ -3716,7 +3796,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
* all is fine. Let's be bastards - you had / on the end, you've
* been asking for (non-existent) directory. -ENOENT for you.
*/
- if (unlikely(!is_dir && last.name[last.len])) {
+ if (unlikely(!create_flags)) {
error = -ENOENT;
goto fail;
}
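A hedged userspace illustration of the trailing-'/' rule encoded above (self-contained; the file names are arbitrary):

	#include <sys/types.h>
	#include <sys/stat.h>
	#include <errno.h>
	#include <stdio.h>

	int main(void)
	{
		if (mkdir("d/", 0755) == 0)	/* directory requested: '/' is fine */
			puts("mkdir d/ ok");
		/* non-directory create with a trailing '/': create suppressed */
		if (mknod("f/", S_IFREG | 0644, 0) < 0 && errno == ENOENT)
			puts("mknod f/ -> ENOENT");
		return 0;
	}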
@@ -3800,6 +3880,7 @@ int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (!dir->i_op->mknod)
return -EPERM;
+ mode = vfs_prepare_mode(mnt_userns, dir, mode, mode, mode);
error = devcgroup_inode_mknod(mode, dev);
if (error)
return error;
@@ -3850,9 +3931,8 @@ retry:
if (IS_ERR(dentry))
goto out1;
- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
- error = security_path_mknod(&path, dentry, mode, dev);
+ error = security_path_mknod(&path, dentry,
+ mode_strip_umask(path.dentry->d_inode, mode), dev);
if (error)
goto out2;
@@ -3922,7 +4002,7 @@ int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
if (!dir->i_op->mkdir)
return -EPERM;
- mode &= (S_IRWXUGO|S_ISVTX);
+ mode = vfs_prepare_mode(mnt_userns, dir, mode, S_IRWXUGO | S_ISVTX, 0);
error = security_inode_mkdir(dir, dentry, mode);
if (error)
return error;
@@ -3950,9 +4030,8 @@ retry:
if (IS_ERR(dentry))
goto out_putname;
- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
- error = security_path_mkdir(&path, dentry, mode);
+ error = security_path_mkdir(&path, dentry,
+ mode_strip_umask(path.dentry->d_inode, mode));
if (!error) {
struct user_namespace *mnt_userns;
mnt_userns = mnt_user_ns(path.mnt);
@@ -5003,28 +5082,28 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
}
EXPORT_SYMBOL(page_readlink);
-/*
- * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
- */
-int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
+int page_symlink(struct inode *inode, const char *symname, int len)
{
struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+ bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
struct page *page;
void *fsdata;
int err;
- unsigned int flags = 0;
- if (nofs)
- flags |= AOP_FLAG_NOFS;
+ unsigned int flags;
retry:
- err = pagecache_write_begin(NULL, mapping, 0, len-1,
- flags, &page, &fsdata);
+ if (nofs)
+ flags = memalloc_nofs_save();
+ err = aops->write_begin(NULL, mapping, 0, len-1, &page, &fsdata);
+ if (nofs)
+ memalloc_nofs_restore(flags);
if (err)
goto fail;
memcpy(page_address(page), symname, len-1);
- err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
+ err = aops->write_end(NULL, mapping, 0, len-1, len-1,
page, fsdata);
if (err < 0)
goto fail;
@@ -5036,13 +5115,6 @@ retry:
fail:
return err;
}
-EXPORT_SYMBOL(__page_symlink);
-
-int page_symlink(struct inode *inode, const char *symname, int len)
-{
- return __page_symlink(inode, symname, len,
- !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
-}
EXPORT_SYMBOL(page_symlink);
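The nofs branch above now uses the scoped-allocation API instead of AOP_FLAG_NOFS. A minimal sketch of that pattern, under the assumption that everything between save and restore must avoid filesystem reclaim recursion:

	unsigned int flags;

	flags = memalloc_nofs_save();
	/* allocations here implicitly lose __GFP_FS, as if GFP_NOFS */
	memalloc_nofs_restore(flags);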
const struct inode_operations page_symlink_inode_operations = {
diff --git a/fs/namespace.c b/fs/namespace.c
index a0a36bfa3aa0..df137ba19d37 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -648,7 +648,7 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
}
/* call under rcu_read_lock */
-bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
int res = __legitimize_mnt(bastard, seq);
if (likely(!res))
@@ -1760,7 +1760,7 @@ out_unlock:
/*
* Is the caller allowed to modify his namespace?
*/
-static inline bool may_mount(void)
+bool may_mount(void)
{
return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}
@@ -4026,8 +4026,9 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
const struct mount *mnt)
{
- return !(kattr->attr_set & MNT_READONLY) ||
- (mnt->mnt.mnt_flags & MNT_READONLY);
+ return (!(kattr->attr_set & MNT_READONLY) ||
+ (mnt->mnt.mnt_flags & MNT_READONLY)) &&
+ !kattr->mnt_userns;
}
static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
@@ -4058,10 +4059,22 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
if (err) {
struct mount *p;
- for (p = mnt; p != m; p = next_mnt(p, mnt)) {
+ /*
+ * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
+ * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
+ * mounts and needs to take care to include the first mount.
+ */
+ for (p = mnt; p; p = next_mnt(p, mnt)) {
/* If we had to hold writers unblock them. */
if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
mnt_unhold_writers(p);
+
+ /*
+ * We're done once the first mount we changed got
+ * MNT_WRITE_HOLD unset.
+ */
+ if (p == m)
+ break;
}
}
return err;
@@ -4225,6 +4238,13 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
err = -EPERM;
goto out_fput;
}
+
+ /* We're not controlling the target namespace. */
+ if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto out_fput;
+ }
+
kattr->mnt_userns = get_user_ns(mnt_userns);
out_fput:
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 281a88a5b8dc..0ce535852151 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -155,7 +155,7 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq,
void netfs_readahead(struct readahead_control *ractl)
{
struct netfs_io_request *rreq;
- struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host);
+ struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
int ret;
_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
@@ -198,25 +198,24 @@ cleanup_free:
EXPORT_SYMBOL(netfs_readahead);
/**
- * netfs_readpage - Helper to manage a readpage request
+ * netfs_read_folio - Helper to manage a read_folio request
* @file: The file to read from
- * @subpage: A subpage of the folio to read
+ * @folio: The folio to read
*
- * Fulfil a readpage request by drawing data from the cache if possible, or the
- * netfs if not. Space beyond the EOF is zero-filled. Multiple I/O requests
- * from different sources will get munged together.
+ * Fulfil a read_folio request by drawing data from the cache if
+ * possible, or the netfs if not. Space beyond the EOF is zero-filled.
+ * Multiple I/O requests from different sources will get munged together.
*
* The calling netfs must initialise a netfs context contiguous to the vfs
* inode before calling this.
*
* This is usable whether or not caching is enabled.
*/
-int netfs_readpage(struct file *file, struct page *subpage)
+int netfs_read_folio(struct file *file, struct folio *folio)
{
- struct folio *folio = page_folio(subpage);
struct address_space *mapping = folio_file_mapping(folio);
struct netfs_io_request *rreq;
- struct netfs_i_context *ctx = netfs_i_context(mapping->host);
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
int ret;
_enter("%lx", folio_index(folio));
@@ -245,7 +244,7 @@ alloc_error:
folio_unlock(folio);
return ret;
}
-EXPORT_SYMBOL(netfs_readpage);
+EXPORT_SYMBOL(netfs_read_folio);
/*
* Prepare a folio for writing without reading first
@@ -298,11 +297,11 @@ zero_out:
/**
* netfs_write_begin - Helper to prepare for writing
+ * @ctx: The netfs context
* @file: The file to read from
* @mapping: The mapping to read from
* @pos: File position at which the write will begin
* @len: The length of the write (may extend beyond the end of the folio chosen)
- * @aop_flags: AOP_* flags
* @_folio: Where to put the resultant folio
* @_fsdata: Place for the netfs to store a cookie
*
@@ -320,31 +319,29 @@ zero_out:
* conflicting writes once the folio is grabbed and locked. It is passed a
* pointer to the fsdata cookie that gets returned to the VM to be passed to
* write_end. It is permitted to sleep. It should return 0 if the request
- * should go ahead; unlock the folio and return -EAGAIN to cause the folio to
- * be regot; or return an error.
+ * should go ahead or it may return an error. It may also unlock and put the
+ * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0
+ * will cause the folio to be re-got and the process to be retried.
*
* The calling netfs must initialise a netfs context contiguous to the vfs
* inode before calling this.
*
* This is usable whether or not caching is enabled.
*/
-int netfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int aop_flags,
- struct folio **_folio, void **_fsdata)
+int netfs_write_begin(struct netfs_inode *ctx,
+ struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned int len, struct folio **_folio,
+ void **_fsdata)
{
struct netfs_io_request *rreq;
- struct netfs_i_context *ctx = netfs_i_context(file_inode(file ));
struct folio *folio;
- unsigned int fgp_flags;
+ unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
pgoff_t index = pos >> PAGE_SHIFT;
int ret;
DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
retry:
- fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
- if (aop_flags & AOP_FLAG_NOFS)
- fgp_flags |= FGP_NOFS;
folio = __filemap_get_folio(mapping, index, fgp_flags,
mapping_gfp_mask(mapping));
if (!folio)
@@ -352,13 +349,13 @@ retry:
if (ctx->ops->check_write_begin) {
/* Allow the netfs (eg. ceph) to flush conflicts. */
- ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata);
+ ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
if (ret < 0) {
trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
- if (ret == -EAGAIN)
- goto retry;
goto error;
}
+ if (!folio)
+ goto retry;
}
if (folio_test_uptodate(folio))
@@ -420,8 +417,10 @@ have_folio_no_wait:
error_put:
netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
error:
- folio_unlock(folio);
- folio_put(folio);
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
_leave(" = %d", ret);
return ret;
}
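A hedged sketch of a ->check_write_begin() under the new contract (illustrative, not modelled on any in-tree netfs; example_folio_has_conflict() is hypothetical):

	static int example_check_write_begin(struct file *file, loff_t pos,
					     unsigned int len,
					     struct folio **foliop, void **fsdata)
	{
		if (example_folio_has_conflict(*foliop)) {	/* hypothetical */
			folio_unlock(*foliop);
			folio_put(*foliop);
			*foliop = NULL;	/* 0 + NULL folio: caller re-gets and retries */
		}
		return 0;
	}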
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index b7b0e3d18d9e..43fac1b14e40 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -91,7 +91,7 @@ static inline void netfs_stat_d(atomic_t *stat)
/*
* Miscellaneous functions.
*/
-static inline bool netfs_is_cache_enabled(struct netfs_i_context *ctx)
+static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
{
#if IS_ENABLED(CONFIG_FSCACHE)
struct fscache_cookie *cookie = ctx->cache;
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index e86107b30ba4..e17cdf53f6a7 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -18,7 +18,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
{
static atomic_t debug_ids;
struct inode *inode = file ? file_inode(file) : mapping->host;
- struct netfs_i_context *ctx = netfs_i_context(inode);
+ struct netfs_inode *ctx = netfs_inode(inode);
struct netfs_io_request *rreq;
int ret;
@@ -75,10 +75,10 @@ static void netfs_free_request(struct work_struct *work)
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
- netfs_clear_subrequests(rreq, false);
- if (rreq->netfs_priv)
- rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+ netfs_clear_subrequests(rreq, false);
+ if (rreq->netfs_ops->free_request)
+ rreq->netfs_ops->free_request(rreq);
if (rreq->cache_resources.ops)
rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
kfree(rreq);
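A hedged sketch of the new ->free_request() hook (illustrative, not an in-tree implementation; assumes netfs_priv was kmalloc'd by the filesystem). Unlike the old ->cleanup(), it receives the whole request rather than just the mapping/private pair:

	static void example_free_request(struct netfs_io_request *rreq)
	{
		kfree(rreq->netfs_priv);	/* whole-request cleanup */
	}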
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 47a53b3362b6..14a72224b657 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -4,10 +4,6 @@ config NFS_FS
depends on INET && FILE_LOCKING && MULTIUSER
select LOCKD
select SUNRPC
- select CRYPTO
- select CRYPTO_HASH
- select XXHASH
- select CRYPTO_XXHASH
select NFS_ACL_SUPPORT if NFS_V3_ACL
help
Choose Y here if you want to access files residing on other
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 79a8b451791f..943aeea1eb16 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -121,7 +121,7 @@ static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map)
}
static struct bio *
-do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
+do_add_page_to_bio(struct bio *bio, int npg, enum req_op op, sector_t isect,
struct page *page, struct pnfs_block_dev_map *map,
struct pnfs_block_extent *be, bio_end_io_t end_io,
struct parallel_io *par, unsigned int offset, int *len)
@@ -131,7 +131,7 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
u64 disk_addr, end;
dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
- npg, rw, (unsigned long long)isect, offset, *len);
+ npg, (__force u32)op, (unsigned long long)isect, offset, *len);
/* translate to device offset */
isect += be->be_v_offset;
@@ -154,7 +154,7 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
retry:
if (!bio) {
- bio = bio_alloc(map->bdev, bio_max_segs(npg), rw, GFP_NOIO);
+ bio = bio_alloc(map->bdev, bio_max_segs(npg), op, GFP_NOIO);
bio->bi_iter.bi_sector = disk_addr >> SECTOR_SHIFT;
bio->bi_end_io = end_io;
bio->bi_private = par;
@@ -291,7 +291,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
} else {
bio = do_add_page_to_bio(bio,
header->page_array.npages - i,
- READ,
+ REQ_OP_READ,
isect, pages[i], &map, &be,
bl_end_io_read, par,
pg_offset, &pg_len);
@@ -420,9 +420,8 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
pg_len = PAGE_SIZE;
bio = do_add_page_to_bio(bio, header->page_array.npages - i,
- WRITE, isect, pages[i], &map, &be,
- bl_end_io_write, par,
- 0, &pg_len);
+ REQ_OP_WRITE, isect, pages[i], &map,
+ &be, bl_end_io_write, par, 0, &pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 5e56da748b2a..fea5f8821da5 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -301,18 +301,14 @@ bl_validate_designator(struct pnfs_block_volume *v)
}
}
-/*
- * Try to open the udev path for the WWN. At least on Debian the udev
- * by-id path will always point to the dm-multipath device if one exists.
- */
static struct block_device *
-bl_open_udev_path(struct pnfs_block_volume *v)
+bl_open_path(struct pnfs_block_volume *v, const char *prefix)
{
struct block_device *bdev;
const char *devname;
- devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN",
- v->scsi.designator_len, v->scsi.designator);
+ devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN",
+ prefix, v->scsi.designator_len, v->scsi.designator);
if (!devname)
return ERR_PTR(-ENOMEM);
@@ -326,28 +322,6 @@ bl_open_udev_path(struct pnfs_block_volume *v)
return bdev;
}
-/*
- * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the
- * wwn- links will only point to the first discovered SCSI device there.
- */
-static struct block_device *
-bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v)
-{
- struct block_device *bdev;
- const char *devname;
-
- devname = kasprintf(GFP_KERNEL,
- "/dev/disk/by-id/dm-uuid-mpath-%d%*phN",
- v->scsi.designator_type,
- v->scsi.designator_len, v->scsi.designator);
- if (!devname)
- return ERR_PTR(-ENOMEM);
-
- bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
- kfree(devname);
- return bdev;
-}
-
static int
bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
@@ -360,9 +334,15 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
if (!bl_validate_designator(v))
return -EINVAL;
- bdev = bl_open_dm_mpath_udev_path(v);
+ /*
+ * Try to open the RH/Fedora specific dm-mpath udev path first, as the
+ * wwn- links will only point to the first discovered SCSI device there.
+ * On other distributions like Debian, the default SCSI by-id path will
+ * point to the dm-multipath device if one exists.
+ */
+ bdev = bl_open_path(v, "dm-uuid-mpath-0x");
if (IS_ERR(bdev))
- bdev = bl_open_udev_path(v);
+ bdev = bl_open_path(v, "wwn-0x");
if (IS_ERR(bdev))
return PTR_ERR(bdev);
d->bdev = bdev;
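A worked example of the consolidated path construction (illustrative designator): for an 8-byte NAA WWN 50 00 c5 00 a1 b2 c3 d4, bl_open_path() now probes, in order:

	/dev/disk/by-id/dm-uuid-mpath-0x5000c500a1b2c3d4
	/dev/disk/by-id/wwn-0x5000c500a1b2c3d4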
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c8520284dda7..c1eda73254e1 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -288,6 +288,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
rv = NFS4_OK;
break;
case -ENOENT:
+ set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
/* Embrace your forgetfulness! */
rv = NFS4ERR_NOMATCHING_LAYOUT;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 8dcb08e1a885..d0cccddb7d08 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -1065,6 +1065,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = {
.pc_func = nfs4_callback_compound,
.pc_encode = nfs4_encode_void,
.pc_argsize = 256,
+ .pc_argzero = 256,
.pc_ressize = 256,
.pc_xdrressize = NFS4_CALLBACK_BUFSIZE,
.pc_name = "COMPOUND",
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e828504cc396..da8da5cdbbc1 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -708,9 +708,9 @@ static int nfs_init_server(struct nfs_server *server,
}
if (ctx->rsize)
- server->rsize = nfs_block_size(ctx->rsize, NULL);
+ server->rsize = nfs_io_size(ctx->rsize, clp->cl_proto);
if (ctx->wsize)
- server->wsize = nfs_block_size(ctx->wsize, NULL);
+ server->wsize = nfs_io_size(ctx->wsize, clp->cl_proto);
server->acregmin = ctx->acregmin * HZ;
server->acregmax = ctx->acregmax * HZ;
@@ -755,18 +755,19 @@ error:
static void nfs_server_set_fsinfo(struct nfs_server *server,
struct nfs_fsinfo *fsinfo)
{
+ struct nfs_client *clp = server->nfs_client;
unsigned long max_rpc_payload, raw_max_rpc_payload;
/* Work out a lot of parameters */
if (server->rsize == 0)
- server->rsize = nfs_block_size(fsinfo->rtpref, NULL);
+ server->rsize = nfs_io_size(fsinfo->rtpref, clp->cl_proto);
if (server->wsize == 0)
- server->wsize = nfs_block_size(fsinfo->wtpref, NULL);
+ server->wsize = nfs_io_size(fsinfo->wtpref, clp->cl_proto);
if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax)
- server->rsize = nfs_block_size(fsinfo->rtmax, NULL);
+ server->rsize = nfs_io_size(fsinfo->rtmax, clp->cl_proto);
if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax)
- server->wsize = nfs_block_size(fsinfo->wtmax, NULL);
+ server->wsize = nfs_io_size(fsinfo->wtmax, clp->cl_proto);
raw_max_rpc_payload = rpc_max_payload(server->client);
max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index bac4cf1a308e..58036f657126 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -39,7 +39,7 @@
#include <linux/sched.h>
#include <linux/kmemleak.h>
#include <linux/xattr.h>
-#include <linux/xxhash.h>
+#include <linux/hash.h>
#include "delegation.h"
#include "iostat.h"
@@ -55,7 +55,7 @@ static int nfs_closedir(struct inode *, struct file *);
static int nfs_readdir(struct file *, struct dir_context *);
static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
static loff_t nfs_llseek_dir(struct file *, loff_t, int);
-static void nfs_readdir_clear_array(struct page*);
+static void nfs_readdir_free_folio(struct folio *);
const struct file_operations nfs_dir_operations = {
.llseek = nfs_llseek_dir,
@@ -67,7 +67,7 @@ const struct file_operations nfs_dir_operations = {
};
const struct address_space_operations nfs_dir_aops = {
- .freepage = nfs_readdir_clear_array,
+ .free_folio = nfs_readdir_free_folio,
};
#define NFS_INIT_DTSIZE PAGE_SIZE
@@ -228,6 +228,11 @@ static void nfs_readdir_clear_array(struct page *page)
kunmap_atomic(array);
}
+static void nfs_readdir_free_folio(struct folio *folio)
+{
+ nfs_readdir_clear_array(&folio->page);
+}
+
static void nfs_readdir_page_reinit_array(struct page *page, u64 last_cookie,
u64 change_attr)
{
@@ -350,10 +355,7 @@ out:
* of directory cookies. Content is addressed by the value of the
* cookie index of the first readdir entry in a page.
*
- * The xxhash algorithm is chosen because it is fast, and is supposed
- * to result in a decent flat distribution of hashes.
- *
- * We then select only the first 18 bits to avoid issues with excessive
+ * We select only the first 18 bits to avoid issues with excessive
* memory use for the page cache XArray. 18 bits should allow the caching
* of 262144 pages of sequences of readdir entries. Since each page holds
* 127 readdir entries for a typical 64-bit system, that works out to a
@@ -363,7 +365,7 @@ static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie)
{
if (cookie == 0)
return 0;
- return xxhash(&cookie, sizeof(cookie), 0) & NFS_READDIR_COOKIE_MASK;
+ return hash_64(cookie, 18);
}
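For reference, hash_64(cookie, 18) folds the 64-bit cookie into an 18-bit page index (0..262143) with a golden-ratio multiply. A hedged userspace equivalent of the kernel helper (constant from include/linux/hash.h):

	#include <stdint.h>

	static inline uint64_t hash_64_sketch(uint64_t val, unsigned int bits)
	{
		/* GOLDEN_RATIO_64 */
		return (val * 0x61C8864680B583EBULL) >> (64 - bits);
	}
	/* hash_64_sketch(cookie, 18) < (1 << 18) for any cookie */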
static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie,
@@ -1082,7 +1084,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
struct nfs_cache_array *array;
unsigned int i;
- array = kmap(desc->page);
+ array = kmap_local_page(desc->page);
for (i = desc->cache_entry_index; i < array->size; i++) {
struct nfs_cache_array_entry *ent;
@@ -1108,7 +1110,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
if (array->page_is_eof)
desc->eof = !desc->eob;
- kunmap(desc->page);
+ kunmap_local(array);
dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n",
(unsigned long long)desc->dir_cookie);
}
@@ -1737,6 +1739,10 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
goto out_bad;
}
+ if ((flags & LOOKUP_RENAME_TARGET) && d_count(dentry) < 2 &&
+ nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
+ goto out_bad;
+
if (nfs_verifier_is_delegated(dentry))
return nfs_lookup_revalidate_delegated(dir, dentry, inode);
@@ -1776,6 +1782,8 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
int ret;
if (flags & LOOKUP_RCU) {
+ if (dentry->d_fsdata == NFS_FSDATA_BLOCKED)
+ return -ECHILD;
parent = READ_ONCE(dentry->d_parent);
dir = d_inode_rcu(parent);
if (!dir)
@@ -1784,6 +1792,9 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
if (parent != READ_ONCE(dentry->d_parent))
return -ECHILD;
} else {
+ /* Wait for unlink to complete */
+ wait_var_event(&dentry->d_fsdata,
+ dentry->d_fsdata != NFS_FSDATA_BLOCKED);
parent = dget_parent(dentry);
ret = reval(d_inode(parent), dentry, flags);
dput(parent);
@@ -1991,16 +2002,6 @@ const struct dentry_operations nfs4_dentry_operations = {
};
EXPORT_SYMBOL_GPL(nfs4_dentry_operations);
-static fmode_t flags_to_mode(int flags)
-{
- fmode_t res = (__force fmode_t)flags & FMODE_EXEC;
- if ((flags & O_ACCMODE) != O_WRONLY)
- res |= FMODE_READ;
- if ((flags & O_ACCMODE) != O_RDONLY)
- res |= FMODE_WRITE;
- return res;
-}
-
static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp)
{
return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp);
@@ -2021,7 +2022,7 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
err = finish_open(file, dentry, do_open);
if (err)
goto out;
- if (S_ISREG(file->f_path.dentry->d_inode->i_mode))
+ if (S_ISREG(file_inode(file)->i_mode))
nfs_file_set_open_context(file, ctx);
else
err = -EOPENSTALE;
@@ -2132,6 +2133,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
}
goto out;
}
+ file->f_mode |= FMODE_CAN_ODIRECT;
err = nfs_finish_open(ctx, ctx->dentry, file, open_flags);
trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
@@ -2380,7 +2382,8 @@ static void nfs_dentry_remove_handle_error(struct inode *dir,
{
switch (error) {
case -ENOENT:
- d_delete(dentry);
+ if (d_really_is_positive(dentry))
+ d_delete(dentry);
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
break;
case 0:
@@ -2461,7 +2464,6 @@ out:
int nfs_unlink(struct inode *dir, struct dentry *dentry)
{
int error;
- int need_rehash = 0;
dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id,
dir->i_ino, dentry);
@@ -2476,15 +2478,27 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
error = nfs_sillyrename(dir, dentry);
goto out;
}
- if (!d_unhashed(dentry)) {
- __d_drop(dentry);
- need_rehash = 1;
+ /* We must prevent any concurrent open until the unlink
+ * completes. ->d_revalidate will wait for ->d_fsdata
+ * to clear. We set it here to ensure no lookup succeeds until
+ * the unlink is complete on the server.
+ */
+ error = -ETXTBSY;
+ if (WARN_ON(dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
+ WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED)) {
+ spin_unlock(&dentry->d_lock);
+ goto out;
}
+ if (dentry->d_fsdata)
+ /* old devname */
+ kfree(dentry->d_fsdata);
+ dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+
spin_unlock(&dentry->d_lock);
error = nfs_safe_remove(dentry);
nfs_dentry_remove_handle_error(dir, dentry, error);
- if (need_rehash)
- d_rehash(dentry);
+ dentry->d_fsdata = NULL;
+ wake_up_var(&dentry->d_fsdata);
out:
trace_nfs_unlink_exit(dir, dentry, error);
return error;
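The blocking handshake introduced here reduces to a wait_var_event()/wake_up_var() pairing on the address of ->d_fsdata. A hedged restatement of the two sides:

	/* unlink/rename side */
	dentry->d_fsdata = NFS_FSDATA_BLOCKED;
	/* ... REMOVE/RENAME proceeds on the server ... */
	dentry->d_fsdata = NULL;
	wake_up_var(&dentry->d_fsdata);

	/* __nfs_lookup_revalidate(), non-RCU side */
	wait_var_event(&dentry->d_fsdata,
		       dentry->d_fsdata != NFS_FSDATA_BLOCKED);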
@@ -2591,6 +2605,15 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
}
EXPORT_SYMBOL_GPL(nfs_link);
+static void
+nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ struct dentry *new_dentry = data->new_dentry;
+
+ new_dentry->d_fsdata = NULL;
+ wake_up_var(&new_dentry->d_fsdata);
+}
+
/*
* RENAME
* FIXME: Some nfsds, like the Linux user space nfsd, may generate a
@@ -2621,8 +2644,9 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
{
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
- struct dentry *dentry = NULL, *rehash = NULL;
+ struct dentry *dentry = NULL;
struct rpc_task *task;
+ bool must_unblock = false;
int error = -EBUSY;
if (flags)
@@ -2640,18 +2664,27 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
* the new target.
*/
if (new_inode && !S_ISDIR(new_inode->i_mode)) {
- /*
- * To prevent any new references to the target during the
- * rename, we unhash the dentry in advance.
+ /* We must prevent any concurrent open until the unlink
+ * completes. ->d_revalidate will wait for ->d_fsdata
+ * to clear. We set it here to ensure no lookup succeeds until
+ * the unlink is complete on the server.
*/
- if (!d_unhashed(new_dentry)) {
- d_drop(new_dentry);
- rehash = new_dentry;
+ error = -ETXTBSY;
+ if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
+ WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED))
+ goto out;
+ if (new_dentry->d_fsdata) {
+ /* old devname */
+ kfree(new_dentry->d_fsdata);
+ new_dentry->d_fsdata = NULL;
}
+ spin_lock(&new_dentry->d_lock);
if (d_count(new_dentry) > 2) {
int err;
+ spin_unlock(&new_dentry->d_lock);
+
/* copy the target dentry's name */
dentry = d_alloc(new_dentry->d_parent,
&new_dentry->d_name);
@@ -2664,14 +2697,19 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
goto out;
new_dentry = dentry;
- rehash = NULL;
new_inode = NULL;
+ } else {
+ new_dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+ must_unblock = true;
+ spin_unlock(&new_dentry->d_lock);
}
+
}
if (S_ISREG(old_inode->i_mode))
nfs_sync_inode(old_inode);
- task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
+ task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
+ must_unblock ? nfs_unblock_rename : NULL);
if (IS_ERR(task)) {
error = PTR_ERR(task);
goto out;
@@ -2695,8 +2733,6 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
spin_unlock(&old_inode->i_lock);
}
out:
- if (rehash)
- d_rehash(rehash);
trace_nfs_rename_exit(old_dir, old_dentry,
new_dir, new_dentry, error);
if (!error) {
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 11c566d8769f..1707f46b1335 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -60,44 +60,12 @@
#include "iostat.h"
#include "pnfs.h"
#include "fscache.h"
+#include "nfstrace.h"
#define NFSDBG_FACILITY NFSDBG_VFS
static struct kmem_cache *nfs_direct_cachep;
-struct nfs_direct_req {
- struct kref kref; /* release manager */
-
- /* I/O parameters */
- struct nfs_open_context *ctx; /* file open context info */
- struct nfs_lock_context *l_ctx; /* Lock context info */
- struct kiocb * iocb; /* controlling i/o request */
- struct inode * inode; /* target file of i/o */
-
- /* completion state */
- atomic_t io_count; /* i/os we're waiting for */
- spinlock_t lock; /* protect completion state */
-
- loff_t io_start; /* Start offset for I/O */
- ssize_t count, /* bytes actually processed */
- max_count, /* max expected count */
- bytes_left, /* bytes left to be sent */
- error; /* any reported error */
- struct completion completion; /* wait for i/o completion */
-
- /* commit state */
- struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */
- struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */
- struct work_struct work;
- int flags;
- /* for write */
-#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */
-#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
- /* for read */
-#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */
-#define NFS_ODIRECT_DONE INT_MAX /* write verification failed */
-};
-
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
@@ -153,28 +121,25 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq,
}
/**
- * nfs_direct_IO - NFS address space operation for direct I/O
+ * nfs_swap_rw - NFS address space operation for swap I/O
* @iocb: target I/O control block
* @iter: I/O buffer
*
- * The presence of this routine in the address space ops vector means
- * the NFS client supports direct I/O. However, for most direct IO, we
- * shunt off direct read and write requests before the VFS gets them,
- * so this method is only ever called for swap.
+ * Perform IO to the swap-file. This is much like direct IO.
*/
-ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
- struct inode *inode = iocb->ki_filp->f_mapping->host;
-
- /* we only support swap file calling nfs_direct_IO */
- if (!IS_SWAPFILE(inode))
- return 0;
+ ssize_t ret;
VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
if (iov_iter_rw(iter) == READ)
- return nfs_file_direct_read(iocb, iter, true);
- return nfs_file_direct_write(iocb, iter, true);
+ ret = nfs_file_direct_read(iocb, iter, true);
+ else
+ ret = nfs_file_direct_write(iocb, iter, true);
+ if (ret < 0)
+ return ret;
+ return 0;
}
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
@@ -367,13 +332,12 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
size_t pgbase;
unsigned npages, i;
- result = iov_iter_get_pages_alloc(iter, &pagevec,
+ result = iov_iter_get_pages_alloc2(iter, &pagevec,
rsize, &pgbase);
if (result < 0)
break;
bytes = result;
- iov_iter_advance(iter, bytes);
npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
for (i = 0; i < npages; i++) {
struct nfs_page *req;
@@ -481,7 +445,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
- if (iter_is_iovec(iter))
+ if (user_backed_iter(iter))
dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
if (!swap)
@@ -598,14 +562,17 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
struct nfs_page *req;
int status = data->task.tk_status;
+ trace_nfs_direct_commit_complete(dreq);
+
if (status < 0) {
/* Errors in commit are fatal */
dreq->error = status;
dreq->max_count = 0;
dreq->count = 0;
dreq->flags = NFS_ODIRECT_DONE;
- } else if (dreq->flags == NFS_ODIRECT_DONE)
+ } else {
status = dreq->error;
+ }
nfs_init_cinfo_from_dreq(&cinfo, dreq);
@@ -634,6 +601,8 @@ static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
{
struct nfs_direct_req *dreq = cinfo->dreq;
+ trace_nfs_direct_resched_write(dreq);
+
spin_lock(&dreq->lock);
if (dreq->flags != NFS_ODIRECT_DONE)
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
@@ -698,6 +667,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
{
+ trace_nfs_direct_write_complete(dreq);
queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */
}
@@ -708,6 +678,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
int flags = NFS_ODIRECT_DONE;
+ trace_nfs_direct_write_completion(dreq);
+
nfs_init_cinfo_from_dreq(&cinfo, dreq);
spin_lock(&dreq->lock);
@@ -717,7 +689,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
}
nfs_direct_count_bytes(dreq, hdr);
- if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) {
+ if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) {
if (!dreq->flags)
dreq->flags = NFS_ODIRECT_DO_COMMIT;
flags = dreq->flags;
@@ -762,6 +734,8 @@ static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
{
struct nfs_direct_req *dreq = hdr->dreq;
+ trace_nfs_direct_write_reschedule_io(dreq);
+
spin_lock(&dreq->lock);
if (dreq->error == 0) {
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
@@ -802,6 +776,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
size_t requested_bytes = 0;
size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
+ trace_nfs_direct_write_schedule_iovec(dreq);
+
nfs_pageio_init_write(&desc, inode, ioflags, false,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
@@ -815,13 +791,12 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
size_t pgbase;
unsigned npages, i;
- result = iov_iter_get_pages_alloc(iter, &pagevec,
+ result = iov_iter_get_pages_alloc2(iter, &pagevec,
wsize, &pgbase);
if (result < 0)
break;
bytes = result;
- iov_iter_advance(iter, bytes);
npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
for (i = 0; i < npages; i++) {
struct nfs_page *req;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 150b7fa8f0a7..e032fe201a36 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -69,6 +69,8 @@ nfs_file_open(struct inode *inode, struct file *filp)
return res;
res = nfs_open(inode, filp);
+ if (res == 0)
+ filp->f_mode |= FMODE_CAN_ODIRECT;
return res;
}
@@ -204,22 +206,25 @@ static int
nfs_file_fsync_commit(struct file *file, int datasync)
{
struct inode *inode = file_inode(file);
- int ret;
+ int ret, ret2;
dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
ret = nfs_commit_inode(inode, FLUSH_SYNC);
- if (ret < 0)
- return ret;
- return file_check_and_advance_wb_err(file);
+ ret2 = file_check_and_advance_wb_err(file);
+ if (ret2 < 0)
+ return ret2;
+ return ret;
}
int
nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct nfs_open_context *ctx = nfs_file_open_context(file);
struct inode *inode = file_inode(file);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ long save_nredirtied = atomic_long_read(&nfsi->redirtied_pages);
+ long nredirtied;
int ret;
trace_nfs_fsync_enter(inode);
@@ -234,15 +239,10 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = pnfs_sync_inode(inode, !!datasync);
if (ret != 0)
break;
- if (!test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags))
+ nredirtied = atomic_long_read(&nfsi->redirtied_pages);
+ if (nredirtied == save_nredirtied)
break;
- /*
- * If nfs_file_fsync_commit detected a server reboot, then
- * resend all dirty pages that might have been covered by
- * the NFS_CONTEXT_RESEND_WRITES flag
- */
- start = 0;
- end = LLONG_MAX;
+ save_nredirtied = nredirtied;
}
trace_nfs_fsync_exit(inode, ret);
@@ -313,7 +313,7 @@ static bool nfs_want_read_modify_write(struct file *file, struct page *page,
* increment the page use counts until he is done with the page.
*/
static int nfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
@@ -325,7 +325,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
file, mapping->host->i_ino, len, (long long) pos);
start:
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -337,7 +337,7 @@ start:
} else if (!once_thru &&
nfs_want_read_modify_write(file, page, pos, len)) {
once_thru = 1;
- ret = nfs_readpage(file, page);
+ ret = nfs_read_folio(file, page_folio(page));
put_page(page);
if (!ret)
goto start;
@@ -385,11 +385,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
return status;
NFS_I(mapping->host)->write_io += copied;
- if (nfs_ctx_key_to_expire(ctx, mapping->host)) {
- status = nfs_wb_all(mapping->host);
- if (status < 0)
- return status;
- }
+ if (nfs_ctx_key_to_expire(ctx, mapping->host))
+ nfs_wb_all(mapping->host);
return copied;
}
@@ -415,34 +412,31 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset,
}
/*
- * Attempt to release the private state associated with a page
- * - Called if either PG_private or PG_fscache is set on the page
- * - Caller holds page lock
- * - Return true (may release page) or false (may not)
+ * Attempt to release the private state associated with a folio
+ * - Called if either private or fscache flags are set on the folio
+ * - Caller holds folio lock
+ * - Return true (may release folio) or false (may not)
*/
-static int nfs_release_page(struct page *page, gfp_t gfp)
+static bool nfs_release_folio(struct folio *folio, gfp_t gfp)
{
- dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
+ dfprintk(PAGECACHE, "NFS: release_folio(%p)\n", folio);
- /* If PagePrivate() is set, then the page is not freeable */
- if (PagePrivate(page))
- return 0;
- return nfs_fscache_release_page(page, gfp);
+ /* If the private flag is set, then the folio is not freeable */
+ if (folio_test_private(folio))
+ return false;
+ return nfs_fscache_release_folio(folio, gfp);
}
-static void nfs_check_dirty_writeback(struct page *page,
+static void nfs_check_dirty_writeback(struct folio *folio,
bool *dirty, bool *writeback)
{
struct nfs_inode *nfsi;
- struct address_space *mapping = page_file_mapping(page);
-
- if (!mapping || PageSwapCache(page))
- return;
+ struct address_space *mapping = folio->mapping;
/*
- * Check if an unstable page is currently being committed and
- * if so, have the VM treat it as if the page is under writeback
- * so it will not block due to pages that will shortly be freeable.
+ * Check if an unstable folio is currently being committed and
+ * if so, have the VM treat it as if the folio is under writeback
+ * so it will not block due to folios that will shortly be freeable.
*/
nfsi = NFS_I(mapping->host);
if (atomic_read(&nfsi->commit_info.rpcs_out)) {
@@ -451,11 +445,11 @@ static void nfs_check_dirty_writeback(struct page *page,
}
/*
- * If PagePrivate() is set, then the page is not freeable and as the
- * inode is not being committed, it's not going to be cleaned in the
- * near future so treat it as dirty
+ * If the private flag is set, then the folio is not freeable
+ * and as the inode is not being committed, it's not going to
+ * be cleaned in the near future so treat it as dirty
*/
- if (PagePrivate(page))
+ if (folio_test_private(folio))
*dirty = true;
}
@@ -483,6 +477,7 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
{
unsigned long blocks;
long long isize;
+ int ret;
struct inode *inode = file_inode(file);
struct rpc_clnt *clnt = NFS_CLIENT(inode);
struct nfs_client *cl = NFS_SERVER(inode)->nfs_client;
@@ -496,13 +491,22 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
return -EINVAL;
}
- *span = sis->pages;
+ ret = rpc_clnt_swap_activate(clnt);
+ if (ret)
+ return ret;
+ ret = add_swap_extent(sis, 0, sis->max, 0);
+ if (ret < 0) {
+ rpc_clnt_swap_deactivate(clnt);
+ return ret;
+ }
+ *span = sis->pages;
if (cl->rpc_ops->enable_swap)
cl->rpc_ops->enable_swap(inode);
- return rpc_clnt_swap_activate(clnt);
+ sis->flags |= SWP_FS_OPS;
+ return ret;
}
static void nfs_swap_deactivate(struct file *file)
@@ -517,7 +521,7 @@ static void nfs_swap_deactivate(struct file *file)
}
const struct address_space_operations nfs_file_aops = {
- .readpage = nfs_readpage,
+ .read_folio = nfs_read_folio,
.readahead = nfs_readahead,
.dirty_folio = filemap_dirty_folio,
.writepage = nfs_writepage,
@@ -525,16 +529,14 @@ const struct address_space_operations nfs_file_aops = {
.write_begin = nfs_write_begin,
.write_end = nfs_write_end,
.invalidate_folio = nfs_invalidate_folio,
- .releasepage = nfs_release_page,
- .direct_IO = nfs_direct_IO,
-#ifdef CONFIG_MIGRATION
- .migratepage = nfs_migrate_page,
-#endif
+ .release_folio = nfs_release_folio,
+ .migrate_folio = nfs_migrate_folio,
.launder_folio = nfs_launder_folio,
.is_dirty_writeback = nfs_check_dirty_writeback,
.error_remove_page = generic_error_remove_page,
.swap_activate = nfs_swap_activate,
.swap_deactivate = nfs_swap_deactivate,
+ .swap_rw = nfs_swap_rw,
};
/*
@@ -597,18 +599,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
.page_mkwrite = nfs_vm_page_mkwrite,
};
-static int nfs_need_check_write(struct file *filp, struct inode *inode,
- int error)
-{
- struct nfs_open_context *ctx;
-
- ctx = nfs_file_open_context(filp);
- if (nfs_error_is_fatal_on_server(error) ||
- nfs_ctx_key_to_expire(ctx, inode))
- return 1;
- return 0;
-}
-
ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
@@ -636,7 +626,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_APPEND || iocb->ki_pos > i_size_read(inode)) {
result = nfs_revalidate_file_size(inode, file);
if (result)
- goto out;
+ return result;
}
nfs_clear_invalid_mapping(file->f_mapping);
@@ -655,6 +645,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
written = result;
iocb->ki_pos += written;
+ nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
if (mntflags & NFS_MOUNT_WRITE_EAGER) {
result = filemap_fdatawrite_range(file->f_mapping,
@@ -667,22 +658,25 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
result = filemap_fdatawait_range(file->f_mapping,
iocb->ki_pos - written,
iocb->ki_pos - 1);
- if (result < 0)
- goto out;
}
result = generic_write_sync(iocb, written);
if (result < 0)
- goto out;
+ return result;
+out:
/* Return error values */
error = filemap_check_wb_err(file->f_mapping, since);
- if (nfs_need_check_write(file, inode, error)) {
- int err = nfs_wb_all(inode);
- if (err < 0)
- result = err;
+ switch (error) {
+ default:
+ break;
+ case -EDQUOT:
+ case -EFBIG:
+ case -ENOSPC:
+ nfs_wb_all(inode);
+ error = file_check_and_advance_wb_err(file);
+ if (error < 0)
+ result = error;
}
- nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
-out:
return result;
out_swapfile:
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 76deddab0a8f..ad34a33b0737 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -181,6 +181,8 @@ static int filelayout_async_handle_error(struct rpc_task *task,
case -EIO:
case -ETIMEDOUT:
case -EPIPE:
+ case -EPROTO:
+ case -ENODEV:
dprintk("%s DS connection error %d\n", __func__,
task->tk_status);
nfs4_mark_deviceid_unavailable(devid);
@@ -839,7 +841,12 @@ fl_pnfs_update_layout(struct inode *ino,
lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode,
gfp_flags);
- if (IS_ERR_OR_NULL(lseg))
+ if (IS_ERR(lseg)) {
+ /* Fall back to MDS on recoverable errors */
+ if (!nfs_error_is_fatal_on_server(PTR_ERR(lseg)))
+ lseg = NULL;
+ goto out;
+ } else if (!lseg)
goto out;
lo = NFS_I(ino)->layout;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 604be402ae13..7d285561e59f 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1131,6 +1131,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
case -EIO:
case -ETIMEDOUT:
case -EPIPE:
+ case -EPROTO:
+ case -ENODEV:
dprintk("%s DS connection error %d\n", __func__,
task->tk_status);
nfs4_delete_deviceid(devid->ld, devid->nfs_client,
@@ -1236,6 +1238,8 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
case -ENOBUFS:
case -EPIPE:
case -EPERM:
+ case -EPROTO:
+ case -ENODEV:
*op_status = status = NFS4ERR_NXIO;
break;
case -EACCES:
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index bfa7202ca7be..e028f5a0ef5f 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -113,8 +113,10 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
goto out_err_drain_dsaddrs;
ds_versions[i].version = be32_to_cpup(p++);
ds_versions[i].minor_version = be32_to_cpup(p++);
- ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL);
- ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL);
+ ds_versions[i].rsize = nfs_io_size(be32_to_cpup(p++),
+ server->nfs_client->cl_proto);
+ ds_versions[i].wsize = nfs_io_size(be32_to_cpup(p++),
+ server->nfs_client->cl_proto);
ds_versions[i].tightly_coupled = be32_to_cpup(p);
if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index e2d59bb5e6bb..4da701fd1424 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -21,6 +21,8 @@
#include "nfs.h"
#include "internal.h"
+#include "nfstrace.h"
+
#define NFSDBG_FACILITY NFSDBG_MOUNT
#if IS_ENABLED(CONFIG_NFS_V3)
@@ -284,7 +286,6 @@ static int nfs_verify_server_address(struct sockaddr *addr)
}
}
- dfprintk(MOUNT, "NFS: Invalid IP address specified\n");
return 0;
}
@@ -378,7 +379,7 @@ static int nfs_parse_security_flavors(struct fs_context *fc,
char *string = param->string, *p;
int ret;
- dfprintk(MOUNT, "NFS: parsing %s=%s option\n", param->key, param->string);
+ trace_nfs_mount_assign(param->key, string);
while ((p = strsep(&string, ":")) != NULL) {
if (!*p)
@@ -480,11 +481,11 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
unsigned int len;
int ret, opt;
- dfprintk(MOUNT, "NFS: parsing nfs mount option '%s'\n", param->key);
+ trace_nfs_mount_option(param);
opt = fs_parse(fc, nfs_fs_parameters, param, &result);
if (opt < 0)
- return ctx->sloppy ? 1 : opt;
+ return (opt == -ENOPARAM && ctx->sloppy) ? 1 : opt;
if (fc->security)
ctx->has_sec_mnt_opts = 1;
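
The sloppy-mount change narrows what gets forgiven: fs_parse() reports an option it does not recognize at all as -ENOPARAM, and only that case may be skipped; a known option carrying a malformed value must still fail the mount. A small sketch of the decision (ENOPARAM here is the kernel-internal value from include/linux/errno.h):

    /* Sketch: only unknown options (-ENOPARAM) are ignored on sloppy
     * mounts; invalid values for known options still abort. */
    #include <errno.h>
    #include <stdio.h>

    #define ENOPARAM 519 /* kernel-internal errno, include/linux/errno.h */

    static int handle_parse_result(int opt, int sloppy)
    {
        if (opt < 0)
            return (opt == -ENOPARAM && sloppy) ? 1 : opt;
        return 0; /* option handled */
    }

    int main(void)
    {
        printf("%d\n", handle_parse_result(-ENOPARAM, 1)); /* 1: skip */
        printf("%d\n", handle_parse_result(-EINVAL, 1));   /* -22: fail */
        printf("%d\n", handle_parse_result(-ENOPARAM, 0)); /* -519: fail */
        return 0;
    }
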
@@ -517,7 +518,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
if (result.negated)
ctx->flags &= ~NFS_MOUNT_SOFTREVAL;
else
- ctx->flags &= NFS_MOUNT_SOFTREVAL;
+ ctx->flags |= NFS_MOUNT_SOFTREVAL;
break;
case Opt_posix:
if (result.negated)
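
The softreval hunk is a one-character fix: "&=" cleared every other mount flag and could never set NFS_MOUNT_SOFTREVAL unless it was already set, where "|=" was intended. Illustrated below with made-up flag values:

    /* Sketch of the softreval fix; flag values are made up. */
    #include <stdio.h>

    #define NFS_MOUNT_TCP       0x1
    #define NFS_MOUNT_SOFTREVAL 0x2

    int main(void)
    {
        unsigned int flags = NFS_MOUNT_TCP;

        flags &= NFS_MOUNT_SOFTREVAL;  /* old: flags == 0, TCP bit lost */
        printf("buggy: %#x\n", flags);

        flags = NFS_MOUNT_TCP;
        flags |= NFS_MOUNT_SOFTREVAL;  /* fixed: both bits set */
        printf("fixed: %#x\n", flags);
        return 0;
    }
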
@@ -683,6 +684,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
return ret;
break;
case Opt_vers:
+ trace_nfs_mount_assign(param->key, param->string);
ret = nfs_parse_version_string(fc, param->string);
if (ret < 0)
return ret;
@@ -694,6 +696,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_proto:
+ trace_nfs_mount_assign(param->key, param->string);
protofamily = AF_INET;
switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) {
case Opt_xprt_udp6:
@@ -729,6 +732,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_mountproto:
+ trace_nfs_mount_assign(param->key, param->string);
mountfamily = AF_INET;
switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) {
case Opt_xprt_udp6:
@@ -751,6 +755,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_addr:
+ trace_nfs_mount_assign(param->key, param->string);
len = rpc_pton(fc->net_ns, param->string, param->size,
&ctx->nfs_server.address,
sizeof(ctx->nfs_server._address));
@@ -759,16 +764,19 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
ctx->nfs_server.addrlen = len;
break;
case Opt_clientaddr:
+ trace_nfs_mount_assign(param->key, param->string);
kfree(ctx->client_address);
ctx->client_address = param->string;
param->string = NULL;
break;
case Opt_mounthost:
+ trace_nfs_mount_assign(param->key, param->string);
kfree(ctx->mount_server.hostname);
ctx->mount_server.hostname = param->string;
param->string = NULL;
break;
case Opt_mountaddr:
+ trace_nfs_mount_assign(param->key, param->string);
len = rpc_pton(fc->net_ns, param->string, param->size,
&ctx->mount_server.address,
sizeof(ctx->mount_server._address));
@@ -846,7 +854,6 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
*/
case Opt_sloppy:
ctx->sloppy = true;
- dfprintk(MOUNT, "NFS: relaxing parsing rules\n");
break;
}
@@ -879,10 +886,8 @@ static int nfs_parse_source(struct fs_context *fc,
size_t len;
const char *end;
- if (unlikely(!dev_name || !*dev_name)) {
- dfprintk(MOUNT, "NFS: device name not specified\n");
+ if (unlikely(!dev_name || !*dev_name))
return -EINVAL;
- }
/* Is the host name protected with square brackets? */
if (*dev_name == '[') {
@@ -922,7 +927,7 @@ static int nfs_parse_source(struct fs_context *fc,
if (!ctx->nfs_server.export_path)
goto out_nomem;
- dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", ctx->nfs_server.export_path);
+ trace_nfs_mount_path(ctx->nfs_server.export_path);
return 0;
out_bad_devname:
@@ -1116,7 +1121,6 @@ out_no_sec:
return nfs_invalf(fc, "NFS: nfs_mount_data version supports only AUTH_SYS");
out_nomem:
- dfprintk(MOUNT, "NFS: not enough memory to handle mount options");
return -ENOMEM;
out_no_address:
@@ -1248,7 +1252,7 @@ static int nfs4_parse_monolithic(struct fs_context *fc,
if (IS_ERR(c))
return PTR_ERR(c);
ctx->nfs_server.export_path = c;
- dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c);
+ trace_nfs_mount_path(c);
c = strndup_user(data->client_addr.data, 16);
if (IS_ERR(c))
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index f73c09a9cf0a..e861d7bae305 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -231,11 +231,10 @@ void nfs_fscache_release_file(struct inode *inode, struct file *filp)
{
struct nfs_fscache_inode_auxdata auxdata;
struct fscache_cookie *cookie = nfs_i_fscache(inode);
+ loff_t i_size = i_size_read(inode);
- if (fscache_cookie_valid(cookie)) {
- nfs_fscache_update_auxdata(&auxdata, inode);
- fscache_unuse_cookie(cookie, &auxdata, NULL);
- }
+ nfs_fscache_update_auxdata(&auxdata, inode);
+ fscache_unuse_cookie(cookie, &auxdata, &i_size);
}
/*
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 4e980cc04779..2a37af880978 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -48,14 +48,14 @@ extern void nfs_fscache_release_file(struct inode *, struct file *);
extern int __nfs_fscache_read_page(struct inode *, struct page *);
extern void __nfs_fscache_write_page(struct inode *, struct page *);
-static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
{
- if (PageFsCache(page)) {
+ if (folio_test_fscache(folio)) {
if (current_is_kswapd() || !(gfp & __GFP_FS))
return false;
- wait_on_page_fscache(page);
- fscache_note_page_release(nfs_i_fscache(page->mapping->host));
- nfs_inc_fscache_stats(page->mapping->host,
+ folio_wait_fscache(folio);
+ fscache_note_page_release(nfs_i_fscache(folio->mapping->host));
+ nfs_inc_fscache_stats(folio->mapping->host,
NFSIOS_FSCACHE_PAGES_UNCACHED);
}
return true;
@@ -129,9 +129,9 @@ static inline void nfs_fscache_open_file(struct inode *inode,
struct file *filp) {}
static inline void nfs_fscache_release_file(struct inode *inode, struct file *file) {}
-static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
{
- return 1; /* True: may release page */
+ return true; /* may release folio */
}
static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
{
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7eb3b08d702f..bea7c005119c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -426,6 +426,7 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh)
static void nfs_inode_init_regular(struct nfs_inode *nfsi)
{
atomic_long_set(&nfsi->nrequests, 0);
+ atomic_long_set(&nfsi->redirtied_pages, 0);
INIT_LIST_HEAD(&nfsi->commit_info.list);
atomic_long_set(&nfsi->commit_info.ncommit, 0);
atomic_set(&nfsi->commit_info.rpcs_out, 0);
@@ -1180,7 +1181,6 @@ int nfs_open(struct inode *inode, struct file *filp)
nfs_fscache_open_file(inode, filp);
return 0;
}
-EXPORT_SYMBOL_GPL(nfs_open);
/*
* This function is called whenever some part of NFS notices that
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 57b0497105c8..898dd95bc7a7 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -42,6 +42,16 @@ static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry)
return true;
}
+static inline fmode_t flags_to_mode(int flags)
+{
+ fmode_t res = (__force fmode_t)flags & FMODE_EXEC;
+ if ((flags & O_ACCMODE) != O_WRONLY)
+ res |= FMODE_READ;
+ if ((flags & O_ACCMODE) != O_RDONLY)
+ res |= FMODE_WRITE;
+ return res;
+}
+
/*
* Note: RFC 1813 doesn't limit the number of auth flavors that
* a server can return, so make something up.
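
The subtlety in flags_to_mode() is the O_ACCMODE encoding: testing "!= O_WRONLY" and "!= O_RDONLY" means the otherwise-invalid open(path, 3) is granted both FMODE_READ and FMODE_WRITE, which the nfs4_file_open() change further down relies on. A userspace sketch of the mapping, with stand-in FMODE_* values and the FMODE_EXEC carry-over omitted:

    /* Minimal userspace sketch of the flags_to_mode() mapping.
     * FMODE_* values are stand-ins for the kernel's fmode_t bits. */
    #include <fcntl.h>
    #include <stdio.h>

    #define FMODE_READ  0x1
    #define FMODE_WRITE 0x2

    static int flags_to_mode(int flags)
    {
        int res = 0;

        if ((flags & O_ACCMODE) != O_WRONLY)
            res |= FMODE_READ;
        if ((flags & O_ACCMODE) != O_RDONLY)
            res |= FMODE_WRITE;
        return res;
    }

    int main(void)
    {
        printf("O_RDONLY -> %d\n", flags_to_mode(O_RDONLY)); /* 1 */
        printf("O_WRONLY -> %d\n", flags_to_mode(O_WRONLY)); /* 2 */
        printf("O_RDWR   -> %d\n", flags_to_mode(O_RDWR));   /* 3 */
        /* open(path, 3): neither pure read nor pure write -> both */
        printf("3        -> %d\n", flags_to_mode(3));        /* 3 */
        return 0;
    }
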
@@ -568,8 +578,10 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
#endif
#ifdef CONFIG_MIGRATION
-extern int nfs_migrate_page(struct address_space *,
- struct page *, struct page *, enum migrate_mode);
+int nfs_migrate_folio(struct address_space *, struct folio *dst,
+ struct folio *src, enum migrate_mode);
+#else
+#define nfs_migrate_folio NULL
#endif
static inline int
@@ -594,6 +606,31 @@ static inline gfp_t nfs_io_gfp_mask(void)
return GFP_KERNEL;
}
+/*
+ * Special version of should_remove_suid() that ignores capabilities.
+ */
+static inline int nfs_should_remove_suid(const struct inode *inode)
+{
+ umode_t mode = inode->i_mode;
+ int kill = 0;
+
+ /* suid always must be killed */
+ if (unlikely(mode & S_ISUID))
+ kill = ATTR_KILL_SUID;
+
+ /*
+ * sgid without any exec bits is just a mandatory locking mark; leave
+ * it alone. If some exec bits are set, it's a real sgid; kill it.
+ */
+ if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+ kill |= ATTR_KILL_SGID;
+
+ if (unlikely(kill && S_ISREG(mode)))
+ return kill;
+
+ return 0;
+}
+
/* unlink.c */
extern struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
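
Unlike the VFS's should_remove_suid(), this variant deliberately skips the capability check, presumably because it predicts what the server will do to the mode bits rather than gating a privileged local operation. A userspace sketch of the mode tests, with stand-in ATTR_KILL_* values:

    /* Userspace sketch of the nfs_should_remove_suid() mode checks.
     * ATTR_KILL_* values are stand-ins for the <linux/fs.h> flags. */
    #include <stdio.h>
    #include <sys/stat.h>

    #define ATTR_KILL_SUID 0x1
    #define ATTR_KILL_SGID 0x2

    static int should_remove_suid(mode_t mode)
    {
        int kill = 0;

        if (mode & S_ISUID)
            kill = ATTR_KILL_SUID;
        if ((mode & S_ISGID) && (mode & S_IXGRP))
            kill |= ATTR_KILL_SGID;
        if (kill && S_ISREG(mode))
            return kill;
        return 0;
    }

    int main(void)
    {
        printf("%d\n", should_remove_suid(S_IFREG | S_ISUID | 0755)); /* 1 */
        printf("%d\n", should_remove_suid(S_IFREG | S_ISGID | 0750)); /* 2: group exec */
        printf("%d\n", should_remove_suid(S_IFREG | S_ISGID | 0640)); /* 0: locking mark */
        printf("%d\n", should_remove_suid(S_IFDIR | S_ISUID | 0755)); /* 0: not regular */
        return 0;
    }
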
@@ -695,6 +732,24 @@ unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
}
/*
+ * Compute and set NFS server rsize / wsize
+ */
+static inline
+unsigned long nfs_io_size(unsigned long iosize, enum xprt_transports proto)
+{
+ if (iosize < NFS_MIN_FILE_IO_SIZE)
+ iosize = NFS_DEF_FILE_IO_SIZE;
+ else if (iosize >= NFS_MAX_FILE_IO_SIZE)
+ iosize = NFS_MAX_FILE_IO_SIZE;
+ else
+ iosize = iosize & PAGE_MASK;
+
+ if (proto == XPRT_TRANSPORT_UDP)
+ return nfs_block_bits(iosize, NULL);
+ return iosize;
+}
+
+/*
* Determine the maximum file size for a superblock
*/
static inline
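
nfs_io_size() clamps the advertised I/O size into [NFS_MIN_FILE_IO_SIZE, NFS_MAX_FILE_IO_SIZE], page-aligns it, and for UDP additionally rounds down to a power of two via nfs_block_bits(). A userspace sketch of the arithmetic, assuming 4 KiB pages and the usual 1 MiB NFS_MAX_FILE_IO_SIZE:

    /* Userspace sketch of the nfs_io_size() clamping, assuming 4 KiB
     * pages; pow2_floor() stands in for nfs_block_bits(). */
    #include <stdio.h>

    #define PAGE_MASK            (~4095UL)
    #define NFS_MIN_FILE_IO_SIZE 1024UL
    #define NFS_DEF_FILE_IO_SIZE 4096UL
    #define NFS_MAX_FILE_IO_SIZE 1048576UL

    static unsigned long pow2_floor(unsigned long v)
    {
        unsigned long p = 1;

        while (p <= v / 2)
            p <<= 1;
        return p;
    }

    static unsigned long nfs_io_size(unsigned long iosize, int udp)
    {
        if (iosize < NFS_MIN_FILE_IO_SIZE)
            iosize = NFS_DEF_FILE_IO_SIZE;
        else if (iosize >= NFS_MAX_FILE_IO_SIZE)
            iosize = NFS_MAX_FILE_IO_SIZE;
        else
            iosize &= PAGE_MASK;

        return udp ? pow2_floor(iosize) : iosize; /* UDP keeps a power of two */
    }

    int main(void)
    {
        printf("%lu\n", nfs_io_size(70000, 0)); /* 69632: page-aligned, TCP */
        printf("%lu\n", nfs_io_size(70000, 1)); /* 65536: power of two, UDP */
        printf("%lu\n", nfs_io_size(512, 0));   /* 4096: below min -> default */
        return 0;
    }
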
@@ -831,6 +886,7 @@ static inline bool nfs_error_is_fatal_on_server(int err)
case 0:
case -ERESTARTSYS:
case -EINTR:
+ case -ENOMEM:
return false;
}
return nfs_error_is_fatal(err);
@@ -848,3 +904,36 @@ static inline void nfs_set_port(struct sockaddr *sap, int *port,
rpc_set_port(sap, *port);
}
+
+struct nfs_direct_req {
+ struct kref kref; /* release manager */
+
+ /* I/O parameters */
+ struct nfs_open_context *ctx; /* file open context info */
+ struct nfs_lock_context *l_ctx; /* Lock context info */
+ struct kiocb * iocb; /* controlling i/o request */
+ struct inode * inode; /* target file of i/o */
+
+ /* completion state */
+ atomic_t io_count; /* i/os we're waiting for */
+ spinlock_t lock; /* protect completion state */
+
+ loff_t io_start; /* Start offset for I/O */
+ ssize_t count, /* bytes actually processed */
+ max_count, /* max expected count */
+ bytes_left, /* bytes left to be sent */
+ error; /* any reported error */
+ struct completion completion; /* wait for i/o completion */
+
+ /* commit state */
+ struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */
+ struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */
+ struct work_struct work;
+ int flags;
+ /* for write */
+#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */
+#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
+ /* for read */
+#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */
+#define NFS_ODIRECT_DONE INT_MAX /* marks the end of the direct write */
+};
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 5601e47360c2..b49359afac88 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -108,7 +108,6 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
- __set_bit(NFS_CS_NOPING, &cl_init.init_flags);
__set_bit(NFS_CS_DS, &cl_init.init_flags);
/* Use the MDS nfs_client cl_ipaddr. */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 068c45b3bc1a..d37e4a5401b1 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -78,10 +78,15 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
status = nfs4_call_sync(server->client, server, msg,
&args.seq_args, &res.seq_res, 0);
- if (status == 0)
+ if (status == 0) {
+ if (nfs_should_remove_suid(inode)) {
+ spin_lock(&inode->i_lock);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
+ spin_unlock(&inode->i_lock);
+ }
status = nfs_post_op_update_inode_force_wcc(inode,
res.falloc_fattr);
-
+ }
if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE])
trace_nfs4_fallocate(inode, &args, status);
else
@@ -336,7 +341,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
return status;
}
}
- status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping,
+ status = nfs_filemap_write_and_wait_range(src->f_mapping,
pos_src, pos_src + (loff_t)count - 1);
if (status)
return status;
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index ad3405c64b9e..a9bf09fdf2c3 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -997,7 +997,7 @@ int __init nfs4_xattr_cache_init(void)
nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache",
sizeof(struct nfs4_xattr_cache), 0,
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
nfs4_xattr_cache_init_once);
if (nfs4_xattr_cache_cachep == NULL)
return -ENOMEM;
@@ -1017,15 +1017,16 @@ int __init nfs4_xattr_cache_init(void)
if (ret)
goto out2;
- ret = register_shrinker(&nfs4_xattr_cache_shrinker);
+ ret = register_shrinker(&nfs4_xattr_cache_shrinker, "nfs-xattr_cache");
if (ret)
goto out1;
- ret = register_shrinker(&nfs4_xattr_entry_shrinker);
+ ret = register_shrinker(&nfs4_xattr_entry_shrinker, "nfs-xattr_entry");
if (ret)
goto out;
- ret = register_shrinker(&nfs4_xattr_large_entry_shrinker);
+ ret = register_shrinker(&nfs4_xattr_large_entry_shrinker,
+ "nfs-xattr_large_entry");
if (!ret)
return 0;
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 271e5f92ed01..b56f05113d36 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1025,82 +1025,95 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re
return decode_op_hdr(xdr, OP_DEALLOCATE);
}
-static int decode_read_plus_data(struct xdr_stream *xdr,
- struct nfs_pgio_args *args,
- struct nfs_pgio_res *res)
-{
- uint32_t count, recvd;
+struct read_plus_segment {
+ enum data_content4 type;
uint64_t offset;
- __be32 *p;
-
- p = xdr_inline_decode(xdr, 8 + 4);
- if (!p)
- return 1;
+ union {
+ struct {
+ uint64_t length;
+ } hole;
+
+ struct {
+ uint32_t length;
+ unsigned int from;
+ } data;
+ };
+};
- p = xdr_decode_hyper(p, &offset);
- count = be32_to_cpup(p);
- recvd = xdr_align_data(xdr, res->count, xdr_align_size(count));
- if (recvd > count)
- recvd = count;
- if (res->count + recvd > args->count) {
- if (args->count > res->count)
- res->count += args->count - res->count;
- return 1;
- }
- res->count += recvd;
- if (count > recvd)
- return 1;
- return 0;
+static inline uint64_t read_plus_segment_length(struct read_plus_segment *seg)
+{
+ return seg->type == NFS4_CONTENT_DATA ? seg->data.length : seg->hole.length;
}
-static int decode_read_plus_hole(struct xdr_stream *xdr,
- struct nfs_pgio_args *args,
- struct nfs_pgio_res *res, uint32_t *eof)
+static int decode_read_plus_segment(struct xdr_stream *xdr,
+ struct read_plus_segment *seg)
{
- uint64_t offset, length, recvd;
__be32 *p;
- p = xdr_inline_decode(xdr, 8 + 8);
+ p = xdr_inline_decode(xdr, 4);
if (!p)
- return 1;
-
- p = xdr_decode_hyper(p, &offset);
- p = xdr_decode_hyper(p, &length);
- if (offset != args->offset + res->count) {
- /* Server returned an out-of-sequence extent */
- if (offset > args->offset + res->count ||
- offset + length < args->offset + res->count) {
- dprintk("NFS: server returned out of sequence extent: "
- "offset/size = %llu/%llu != expected %llu\n",
- (unsigned long long)offset,
- (unsigned long long)length,
- (unsigned long long)(args->offset +
- res->count));
- return 1;
- }
- length -= args->offset + res->count - offset;
- }
- if (length + res->count > args->count) {
- *eof = 0;
- if (unlikely(res->count >= args->count))
- return 1;
- length = args->count - res->count;
- }
- recvd = xdr_expand_hole(xdr, res->count, length);
- res->count += recvd;
+ return -EIO;
+ seg->type = be32_to_cpup(p++);
+
+ p = xdr_inline_decode(xdr, seg->type == NFS4_CONTENT_DATA ? 12 : 16);
+ if (!p)
+ return -EIO;
+ p = xdr_decode_hyper(p, &seg->offset);
+
+ if (seg->type == NFS4_CONTENT_DATA) {
+ struct xdr_buf buf;
+ uint32_t len = be32_to_cpup(p);
+
+ seg->data.length = len;
+ seg->data.from = xdr_stream_pos(xdr);
- if (recvd < length)
- return 1;
+ if (!xdr_stream_subsegment(xdr, &buf, xdr_align_size(len)))
+ return -EIO;
+ } else if (seg->type == NFS4_CONTENT_HOLE) {
+ xdr_decode_hyper(p, &seg->hole.length);
+ } else
+ return -EINVAL;
return 0;
}
+static int process_read_plus_segment(struct xdr_stream *xdr,
+ struct nfs_pgio_args *args,
+ struct nfs_pgio_res *res,
+ struct read_plus_segment *seg)
+{
+ unsigned long offset = seg->offset;
+ unsigned long length = read_plus_segment_length(seg);
+ unsigned int bufpos;
+
+ if (offset + length < args->offset)
+ return 0;
+ else if (offset > args->offset + args->count) {
+ res->eof = 0;
+ return 0;
+ } else if (offset < args->offset) {
+ length -= (args->offset - offset);
+ offset = args->offset;
+ } else if (offset + length > args->offset + args->count) {
+ length = (args->offset + args->count) - offset;
+ res->eof = 0;
+ }
+
+ bufpos = xdr->buf->head[0].iov_len + (offset - args->offset);
+ if (seg->type == NFS4_CONTENT_HOLE)
+ return xdr_stream_zero(xdr, bufpos, length);
+ else
+ return xdr_stream_move_subsegment(xdr, seg->data.from, bufpos, length);
+}
+
static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
{
struct nfs_pgio_header *hdr =
container_of(res, struct nfs_pgio_header, res);
struct nfs_pgio_args *args = &hdr->args;
- uint32_t eof, segments, type;
+ uint32_t segments;
+ struct read_plus_segment *segs;
int status, i;
+ char scratch_buf[16];
__be32 *p;
status = decode_op_hdr(xdr, OP_READ_PLUS);
@@ -1112,38 +1125,31 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
return -EIO;
res->count = 0;
- eof = be32_to_cpup(p++);
+ res->eof = be32_to_cpup(p++);
segments = be32_to_cpup(p++);
if (segments == 0)
- goto out;
-
- for (i = 0; i < segments; i++) {
- p = xdr_inline_decode(xdr, 4);
- if (!p)
- goto early_out;
+ return status;
- type = be32_to_cpup(p++);
- if (type == NFS4_CONTENT_DATA)
- status = decode_read_plus_data(xdr, args, res);
- else if (type == NFS4_CONTENT_HOLE)
- status = decode_read_plus_hole(xdr, args, res, &eof);
- else
- return -EINVAL;
+ segs = kmalloc_array(segments, sizeof(*segs), GFP_KERNEL);
+ if (!segs)
+ return -ENOMEM;
+ xdr_set_scratch_buffer(xdr, &scratch_buf, sizeof(scratch_buf));
+ status = -EIO;
+ for (i = 0; i < segments; i++) {
+ status = decode_read_plus_segment(xdr, &segs[i]);
if (status < 0)
- return status;
- if (status > 0)
- goto early_out;
+ goto out;
}
+ xdr_set_pagelen(xdr, xdr_align_size(args->count));
+ for (i = segments; i > 0; i--)
+ res->count += process_read_plus_segment(xdr, args, res, &segs[i-1]);
+ status = 0;
+
out:
- res->eof = eof;
- return 0;
-early_out:
- if (unlikely(!i))
- return -EIO;
- res->eof = 0;
- return 0;
+ kfree(segs);
+ return status;
}
static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
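
The reworked decode_read_plus() is two-pass: every segment header is decoded up front, then the segments are placed back-to-front (for (i = segments; i > 0; i--)), so moving a segment's data toward the start of the buffer cannot overwrite segments that have not been placed yet. The per-segment window clamping in process_read_plus_segment() is easy to model in userspace; clamp_segment() below is an illustrative restatement, not a kernel function:

    /* Sketch of the window-clamping: clip each READ_PLUS segment to the
     * byte range the client requested, clearing eof when data is cut. */
    #include <stdio.h>

    struct req { unsigned long offset, count; };

    static unsigned long clamp_segment(const struct req *args,
                                       unsigned long offset,
                                       unsigned long length, int *eof)
    {
        if (offset + length < args->offset)
            return 0;              /* entirely before the window */
        if (offset > args->offset + args->count) {
            *eof = 0;              /* entirely after it */
            return 0;
        }
        if (offset < args->offset) {   /* clip the front */
            length -= args->offset - offset;
            offset = args->offset;
        }
        if (offset + length > args->offset + args->count) {
            length = args->offset + args->count - offset;
            *eof = 0;              /* clipped the tail */
        }
        return length;
    }

    int main(void)
    {
        struct req args = { .offset = 4096, .count = 8192 };
        int eof = 1;

        /* a segment that starts 1 KiB early and overruns by 3 KiB */
        printf("%lu\n", clamp_segment(&args, 3072, 12288, &eof)); /* 8192 */
        printf("eof=%d\n", eof);                                  /* 0 */
        return 0;
    }
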
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 47a6cf892c95..3c5678aec006 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -1161,9 +1161,9 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc)
return error;
if (ctx->rsize)
- server->rsize = nfs_block_size(ctx->rsize, NULL);
+ server->rsize = nfs_io_size(ctx->rsize, server->nfs_client->cl_proto);
if (ctx->wsize)
- server->wsize = nfs_block_size(ctx->wsize, NULL);
+ server->wsize = nfs_io_size(ctx->wsize, server->nfs_client->cl_proto);
server->acregmin = ctx->acregmin * HZ;
server->acregmax = ctx->acregmax * HZ;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index d258933cf8c8..9eb181287879 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -32,6 +32,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
struct dentry *parent = NULL;
struct inode *dir;
unsigned openflags = filp->f_flags;
+ fmode_t f_mode;
struct iattr attr;
int err;
@@ -50,8 +51,9 @@ nfs4_file_open(struct inode *inode, struct file *filp)
if (err)
return err;
+ f_mode = filp->f_mode;
if ((openflags & O_ACCMODE) == 3)
- return nfs_open(inode, filp);
+ f_mode |= flags_to_mode(openflags);
/* We can't create new files here */
openflags &= ~(O_CREAT|O_EXCL);
@@ -59,7 +61,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
parent = dget_parent(dentry);
dir = d_inode(parent);
- ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp);
+ ctx = alloc_nfs_open_context(file_dentry(filp), f_mode, filp);
err = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
@@ -91,6 +93,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
nfs_file_set_open_context(filp, ctx);
nfs_fscache_open_file(inode, filp);
err = 0;
+ filp->f_mode |= FMODE_CAN_ODIRECT;
out_put_ctx:
put_nfs_open_context(ctx);
@@ -326,7 +329,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
char *read_name = NULL;
int len, status = 0;
- server = NFS_SERVER(ss_mnt->mnt_root->d_inode);
+ server = NFS_SB(ss_mnt->mnt_sb);
if (!fattr)
return ERR_PTR(-ENOMEM);
@@ -337,6 +340,11 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
goto out;
}
+ if (!S_ISREG(fattr->mode)) {
+ res = ERR_PTR(-EBADF);
+ goto out;
+ }
+
res = ERR_PTR(-ENOMEM);
len = strlen(SSC_READ_NAME_BODY) + 16;
read_name = kzalloc(len, GFP_KERNEL);
@@ -344,7 +352,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
goto out;
snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++);
- r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, fattr);
+ r_ino = nfs_fhget(ss_mnt->mnt_sb, src_fh, fattr);
if (IS_ERR(r_ino)) {
res = ERR_CAST(r_ino);
goto out_free_name;
@@ -354,6 +362,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
r_ino->i_fop);
if (IS_ERR(filep)) {
res = ERR_CAST(filep);
+ iput(r_ino);
goto out_free_name;
}
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index f331866dd418..ec6afd3c4bca 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -561,22 +561,20 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap,
return true;
}
-static void
-nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)
+static void nfs_idmap_complete_pipe_upcall(struct idmap_legacy_upcalldata *data,
+ int ret)
{
- struct key *authkey = idmap->idmap_upcall_data->authkey;
-
- kfree(idmap->idmap_upcall_data);
- idmap->idmap_upcall_data = NULL;
- complete_request_key(authkey, ret);
- key_put(authkey);
+ complete_request_key(data->authkey, ret);
+ key_put(data->authkey);
+ kfree(data);
}
-static void
-nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret)
+static void nfs_idmap_abort_pipe_upcall(struct idmap *idmap,
+ struct idmap_legacy_upcalldata *data,
+ int ret)
{
- if (idmap->idmap_upcall_data != NULL)
- nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
+ if (cmpxchg(&idmap->idmap_upcall_data, data, NULL) == data)
+ nfs_idmap_complete_pipe_upcall(data, ret);
}
static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
@@ -613,7 +611,7 @@ static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
if (ret < 0)
- nfs_idmap_abort_pipe_upcall(idmap, ret);
+ nfs_idmap_abort_pipe_upcall(idmap, data, ret);
return ret;
out2:
@@ -669,6 +667,7 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
struct request_key_auth *rka;
struct rpc_inode *rpci = RPC_I(file_inode(filp));
struct idmap *idmap = (struct idmap *)rpci->private;
+ struct idmap_legacy_upcalldata *data;
struct key *authkey;
struct idmap_msg im;
size_t namelen_in;
@@ -678,10 +677,11 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
* will have been woken up and someone else may now have used
* idmap_key_cons - so after this point we may no longer touch it.
*/
- if (idmap->idmap_upcall_data == NULL)
+ data = xchg(&idmap->idmap_upcall_data, NULL);
+ if (data == NULL)
goto out_noupcall;
- authkey = idmap->idmap_upcall_data->authkey;
+ authkey = data->authkey;
rka = get_request_key_auth(authkey);
if (mlen != sizeof(im)) {
@@ -703,18 +703,17 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) {
ret = -EINVAL;
goto out;
-}
+ }
- ret = nfs_idmap_read_and_verify_message(&im,
- &idmap->idmap_upcall_data->idmap_msg,
- rka->target_key, authkey);
+ ret = nfs_idmap_read_and_verify_message(&im, &data->idmap_msg,
+ rka->target_key, authkey);
if (ret >= 0) {
key_set_timeout(rka->target_key, nfs_idmap_cache_timeout);
ret = mlen;
}
out:
- nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
+ nfs_idmap_complete_pipe_upcall(data, ret);
out_noupcall:
return ret;
}
@@ -728,7 +727,7 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
struct idmap *idmap = data->idmap;
if (msg->errno)
- nfs_idmap_abort_pipe_upcall(idmap, msg->errno);
+ nfs_idmap_abort_pipe_upcall(idmap, data, msg->errno);
}
static void
@@ -736,8 +735,11 @@ idmap_release_pipe(struct inode *inode)
{
struct rpc_inode *rpci = RPC_I(inode);
struct idmap *idmap = (struct idmap *)rpci->private;
+ struct idmap_legacy_upcalldata *data;
- nfs_idmap_abort_pipe_upcall(idmap, -EPIPE);
+ data = xchg(&idmap->idmap_upcall_data, NULL);
+ if (data)
+ nfs_idmap_complete_pipe_upcall(data, -EPIPE);
}
int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid)
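
The idmap rework replaces "check the pointer, then free" with an atomic ownership transfer: whichever of the downcall, abort, or pipe-release paths succeeds in swapping idmap_upcall_data to NULL completes and frees the request; the losers see NULL (or a failed compare) and back off, so the authkey can never be completed or freed twice. A compilable C11-atomics sketch of the pattern (the kernel uses xchg()/cmpxchg()):

    /* Sketch of the atomic ownership handoff in the idmap fix. */
    #include <errno.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct upcall_data { int id; };

    static _Atomic(struct upcall_data *) idmap_upcall_data;

    static void complete_upcall(struct upcall_data *data, int ret)
    {
        printf("upcall %d completed with %d\n", data->id, ret);
        free(data);
    }

    static void abort_upcall(struct upcall_data *data, int ret)
    {
        /* only the owner completes; the pointer is compared, never
         * dereferenced, so losing the race is harmless */
        struct upcall_data *expected = data;

        if (atomic_compare_exchange_strong(&idmap_upcall_data,
                                           &expected, NULL))
            complete_upcall(data, ret);
    }

    static void downcall(int ret)
    {
        struct upcall_data *data = atomic_exchange(&idmap_upcall_data, NULL);

        if (data)
            complete_upcall(data, ret);
    }

    int main(void)
    {
        struct upcall_data *data = malloc(sizeof(*data));

        data->id = 1;
        atomic_store(&idmap_upcall_data, data);

        downcall(0);                /* claims and frees the data */
        abort_upcall(data, -EPIPE); /* loses the race: no double free */
        return 0;
    }
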
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 3680c8da510c..f2dbf904c598 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -417,6 +417,9 @@ static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client)
fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
if (!fs_locations)
goto out_free;
+ fs_locations->fattr = nfs_alloc_fattr();
+ if (!fs_locations->fattr)
+ goto out_free_2;
/* Get locations */
dentry = ctx->clone_data.dentry;
@@ -427,14 +430,16 @@ static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client)
err = nfs4_proc_fs_locations(client, d_inode(parent), &dentry->d_name, fs_locations, page);
dput(parent);
if (err != 0)
- goto out_free_2;
+ goto out_free_3;
err = -ENOENT;
if (fs_locations->nlocations <= 0 ||
fs_locations->fs_path.ncomponents <= 0)
- goto out_free_2;
+ goto out_free_3;
err = nfs_follow_referral(fc, fs_locations);
+out_free_3:
+ kfree(fs_locations->fattr);
out_free_2:
kfree(fs_locations);
out_free:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e3f5b380cefe..3ed14a2a84a4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -363,6 +363,14 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
kunmap_atomic(start);
}
+static void nfs4_fattr_set_prechange(struct nfs_fattr *fattr, u64 version)
+{
+ if (!(fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) {
+ fattr->pre_change_attr = version;
+ fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
+ }
+}
+
static void nfs4_test_and_free_stateid(struct nfs_server *server,
nfs4_stateid *stateid,
const struct cred *cred)
@@ -776,10 +784,9 @@ static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot,
if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0)
slot->seq_nr_highest_sent = seqnr;
}
-static void nfs4_slot_sequence_acked(struct nfs4_slot *slot,
- u32 seqnr)
+static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, u32 seqnr)
{
- slot->seq_nr_highest_sent = seqnr;
+ nfs4_slot_sequence_record_sent(slot, seqnr);
slot->seq_nr_last_acked = seqnr;
}
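
Routing the acked path through nfs4_slot_sequence_record_sent() matters because of the (s32)(a - b) test it uses: a direct assignment could move seq_nr_highest_sent backwards when a late reply arrives. That comparison is wraparound-safe serial-number arithmetic, sketched here:

    /* Sketch of the wraparound-safe "newer than" test shared by the
     * sent and acked paths. */
    #include <stdint.h>
    #include <stdio.h>

    static void record_sent(uint32_t *highest_sent, uint32_t seqnr)
    {
        if ((int32_t)(seqnr - *highest_sent) > 0)
            *highest_sent = seqnr;
    }

    int main(void)
    {
        uint32_t highest = UINT32_MAX;  /* about to wrap */

        record_sent(&highest, 0);       /* 0 counts as newer */
        printf("%u\n", highest);        /* 0 */

        record_sent(&highest, UINT32_MAX - 10); /* stale: ignored */
        printf("%u\n", highest);        /* still 0 */
        return 0;
    }
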
@@ -846,7 +853,6 @@ static int nfs41_sequence_process(struct rpc_task *task,
__func__,
slot->slot_nr,
slot->seq_nr);
- nfs4_slot_sequence_acked(slot, slot->seq_nr);
goto out_retry;
case -NFS4ERR_RETRY_UNCACHED_REP:
case -NFS4ERR_SEQ_FALSE_RETRY:
@@ -1154,7 +1160,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
{
unsigned short task_flags = 0;
- if (server->nfs_client->cl_minorversion)
+ if (server->caps & NFS_CAP_MOVEABLE)
task_flags = RPC_TASK_MOVEABLE;
return nfs4_do_call_sync(clnt, server, msg, args, res, task_flags);
}
@@ -2560,7 +2566,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data,
};
int status;
- if (server->nfs_client->cl_minorversion)
+ if (nfs_server_capable(dir, NFS_CAP_MOVEABLE))
task_setup_data.flags |= RPC_TASK_MOVEABLE;
kref_get(&data->kref);
@@ -3090,8 +3096,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
}
out:
- if (!opendata->cancelled)
+ if (!opendata->cancelled) {
+ if (opendata->lgp) {
+ nfs4_lgopen_release(opendata->lgp);
+ opendata->lgp = NULL;
+ }
nfs4_sequence_free_slot(&opendata->o_res.seq_res);
+ }
return ret;
}
@@ -3725,7 +3736,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
};
int status = -ENOMEM;
- if (server->nfs_client->cl_minorversion)
+ if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE))
task_setup_data.flags |= RPC_TASK_MOVEABLE;
nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP,
@@ -4000,22 +4011,29 @@ static int _nfs4_discover_trunking(struct nfs_server *server,
}
page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
- if (page == NULL || locations == NULL)
- goto out;
+ if (!locations)
+ goto out_free;
+ locations->fattr = nfs_alloc_fattr();
+ if (!locations->fattr)
+ goto out_free_2;
status = nfs4_proc_get_locations(server, fhandle, locations, page,
cred);
if (status)
- goto out;
+ goto out_free_3;
for (i = 0; i < locations->nlocations; i++)
test_fs_location_for_trunking(&locations->locations[i], clp,
server);
-out:
- if (page)
- __free_page(page);
+out_free_3:
+ kfree(locations->fattr);
+out_free_2:
kfree(locations);
+out_free:
+ __free_page(page);
return status;
}
@@ -4235,6 +4253,8 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir,
if (locations == NULL)
goto out;
+ locations->fattr = fattr;
+
status = nfs4_proc_fs_locations(client, dir, name, locations, page);
if (status != 0)
goto out;
@@ -4244,17 +4264,14 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir,
* referral. Cause us to drop into the exception handler, which
* will kick off migration recovery.
*/
- if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
+ if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &fattr->fsid)) {
dprintk("%s: server did not return a different fsid for"
" a referral at %s\n", __func__, name->name);
status = -NFS4ERR_MOVED;
goto out;
}
/* Fixup attributes for the nfs_lookup() call to nfs_fhget() */
- nfs_fixup_referral_attributes(&locations->fattr);
-
- /* replace the lookup nfs_fattr with the locations nfs_fattr */
- memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr));
+ nfs_fixup_referral_attributes(fattr);
memset(fhandle, 0, sizeof(struct nfs_fh));
out:
if (page)
@@ -4396,7 +4413,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
};
unsigned short task_flags = 0;
- if (server->nfs_client->cl_minorversion)
+ if (nfs_server_capable(dir, NFS_CAP_MOVEABLE))
task_flags = RPC_TASK_MOVEABLE;
/* Is this is an attribute revalidation, subject to softreval? */
@@ -5760,9 +5777,17 @@ static int nfs4_proc_renew(struct nfs_client *clp, const struct cred *cred)
return 0;
}
-static inline int nfs4_server_supports_acls(struct nfs_server *server)
+static bool nfs4_server_supports_acls(const struct nfs_server *server,
+ enum nfs4_acl_type type)
{
- return server->caps & NFS_CAP_ACLS;
+ switch (type) {
+ default:
+ return server->attr_bitmask[0] & FATTR4_WORD0_ACL;
+ case NFS4ACL_DACL:
+ return server->attr_bitmask[1] & FATTR4_WORD1_DACL;
+ case NFS4ACL_SACL:
+ return server->attr_bitmask[1] & FATTR4_WORD1_SACL;
+ }
}
/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that
@@ -5801,6 +5826,7 @@ unwind:
}
struct nfs4_cached_acl {
+ enum nfs4_acl_type type;
int cached;
size_t len;
char data[];
@@ -5821,7 +5847,8 @@ static void nfs4_zap_acl_attr(struct inode *inode)
nfs4_set_cached_acl(inode, NULL);
}
-static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_t buflen)
+static ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf,
+ size_t buflen, enum nfs4_acl_type type)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs4_cached_acl *acl;
@@ -5831,6 +5858,8 @@ static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_
acl = nfsi->nfs4_acl;
if (acl == NULL)
goto out;
+ if (acl->type != type)
+ goto out;
if (buf == NULL) /* user is just asking for length */
goto out_len;
if (acl->cached == 0)
@@ -5846,7 +5875,9 @@ out:
return ret;
}
-static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size_t pgbase, size_t acl_len)
+static void nfs4_write_cached_acl(struct inode *inode, struct page **pages,
+ size_t pgbase, size_t acl_len,
+ enum nfs4_acl_type type)
{
struct nfs4_cached_acl *acl;
size_t buflen = sizeof(*acl) + acl_len;
@@ -5863,6 +5894,7 @@ static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size
goto out;
acl->cached = 0;
}
+ acl->type = type;
acl->len = acl_len;
out:
nfs4_set_cached_acl(inode, acl);
@@ -5878,14 +5910,17 @@ out:
* length. The next getxattr call will then produce another round trip to
* the server, this time with the input buf of the required size.
*/
-static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
+static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf,
+ size_t buflen, enum nfs4_acl_type type)
{
struct page **pages;
struct nfs_getaclargs args = {
.fh = NFS_FH(inode),
+ .acl_type = type,
.acl_len = buflen,
};
struct nfs_getaclres res = {
+ .acl_type = type,
.acl_len = buflen,
};
struct rpc_message msg = {
@@ -5935,7 +5970,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
ret = -ERANGE;
goto out_free;
}
- nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len);
+ nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len,
+ type);
if (buf) {
if (res.acl_len > buflen) {
ret = -ERANGE;
@@ -5955,14 +5991,15 @@ out_free:
return ret;
}
-static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
+static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf,
+ size_t buflen, enum nfs4_acl_type type)
{
struct nfs4_exception exception = {
.interruptible = true,
};
ssize_t ret;
do {
- ret = __nfs4_get_acl_uncached(inode, buf, buflen);
+ ret = __nfs4_get_acl_uncached(inode, buf, buflen, type);
trace_nfs4_get_acl(inode, ret);
if (ret >= 0)
break;
@@ -5971,34 +6008,37 @@ static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bufl
return ret;
}
-static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
+static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen,
+ enum nfs4_acl_type type)
{
struct nfs_server *server = NFS_SERVER(inode);
int ret;
- if (!nfs4_server_supports_acls(server))
+ if (!nfs4_server_supports_acls(server, type))
return -EOPNOTSUPP;
ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
if (ret < 0)
return ret;
if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
nfs_zap_acl_cache(inode);
- ret = nfs4_read_cached_acl(inode, buf, buflen);
+ ret = nfs4_read_cached_acl(inode, buf, buflen, type);
if (ret != -ENOENT)
/* -ENOENT is returned if there is no ACL or if there is an ACL
* but no cached acl data, just the acl length */
return ret;
- return nfs4_get_acl_uncached(inode, buf, buflen);
+ return nfs4_get_acl_uncached(inode, buf, buflen, type);
}
-static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen)
+static int __nfs4_proc_set_acl(struct inode *inode, const void *buf,
+ size_t buflen, enum nfs4_acl_type type)
{
struct nfs_server *server = NFS_SERVER(inode);
struct page *pages[NFS4ACL_MAXPAGES];
struct nfs_setaclargs arg = {
- .fh = NFS_FH(inode),
- .acl_pages = pages,
- .acl_len = buflen,
+ .fh = NFS_FH(inode),
+ .acl_type = type,
+ .acl_len = buflen,
+ .acl_pages = pages,
};
struct nfs_setaclres res;
struct rpc_message msg = {
@@ -6012,7 +6052,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
/* You can't remove system.nfs4_acl: */
if (buflen == 0)
return -EINVAL;
- if (!nfs4_server_supports_acls(server))
+ if (!nfs4_server_supports_acls(server, type))
return -EOPNOTSUPP;
if (npages > ARRAY_SIZE(pages))
return -ERANGE;
@@ -6043,12 +6083,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
return ret;
}
-static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen)
+static int nfs4_proc_set_acl(struct inode *inode, const void *buf,
+ size_t buflen, enum nfs4_acl_type type)
{
struct nfs4_exception exception = { };
int err;
do {
- err = __nfs4_proc_set_acl(inode, buf, buflen);
+ err = __nfs4_proc_set_acl(inode, buf, buflen, type);
trace_nfs4_set_acl(inode, err);
if (err == -NFS4ERR_BADOWNER || err == -NFS4ERR_BADNAME) {
/*
@@ -6553,7 +6594,9 @@ static void nfs4_delegreturn_release(void *calldata)
pnfs_roc_release(&data->lr.arg, &data->lr.res,
data->res.lr_ret);
if (inode) {
- nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
+ nfs4_fattr_set_prechange(&data->fattr,
+ inode_peek_iversion_raw(inode));
+ nfs_refresh_inode(inode, &data->fattr);
nfs_iput_and_deactive(inode);
}
kfree(calldata);
@@ -6602,10 +6645,13 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
.rpc_client = server->client,
.rpc_message = &msg,
.callback_ops = &nfs4_delegreturn_ops,
- .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
};
int status = 0;
+ if (nfs_server_capable(inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (data == NULL)
return -ENOMEM;
@@ -6919,10 +6965,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
.workqueue = nfsiod_workqueue,
.flags = RPC_TASK_ASYNC,
};
- struct nfs_client *client =
- NFS_SERVER(lsp->ls_state->inode)->nfs_client;
- if (client->cl_minorversion)
+ if (nfs_server_capable(lsp->ls_state->inode, NFS_CAP_MOVEABLE))
task_setup_data.flags |= RPC_TASK_MOVEABLE;
nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client,
@@ -7193,9 +7237,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
.flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int ret;
- struct nfs_client *client = NFS_SERVER(state->inode)->nfs_client;
- if (client->cl_minorversion)
+ if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE))
task_setup_data.flags |= RPC_TASK_MOVEABLE;
data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
@@ -7645,21 +7688,70 @@ static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
const char *key, const void *buf,
size_t buflen, int flags)
{
- return nfs4_proc_set_acl(inode, buf, buflen);
+ return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_ACL);
}
static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
struct dentry *unused, struct inode *inode,
const char *key, void *buf, size_t buflen)
{
- return nfs4_proc_get_acl(inode, buf, buflen);
+ return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_ACL);
}
static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
{
- return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry)));
+ return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_ACL);
+}
+
+#if defined(CONFIG_NFS_V4_1)
+#define XATTR_NAME_NFSV4_DACL "system.nfs4_dacl"
+
+static int nfs4_xattr_set_nfs4_dacl(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
+{
+ return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_DACL);
+}
+
+static int nfs4_xattr_get_nfs4_dacl(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
+{
+ return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_DACL);
+}
+
+static bool nfs4_xattr_list_nfs4_dacl(struct dentry *dentry)
+{
+ return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_DACL);
+}
+
+#define XATTR_NAME_NFSV4_SACL "system.nfs4_sacl"
+
+static int nfs4_xattr_set_nfs4_sacl(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
+ struct dentry *unused, struct inode *inode,
+ const char *key, const void *buf,
+ size_t buflen, int flags)
+{
+ return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_SACL);
+}
+
+static int nfs4_xattr_get_nfs4_sacl(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *key, void *buf, size_t buflen)
+{
+ return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_SACL);
}
+static bool nfs4_xattr_list_nfs4_sacl(struct dentry *dentry)
+{
+ return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_SACL);
+}
+
+#endif
+
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
@@ -7892,7 +7984,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
else
bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
- nfs_fattr_init(&fs_locations->fattr);
+ nfs_fattr_init(fs_locations->fattr);
fs_locations->server = server;
fs_locations->nlocations = 0;
status = nfs4_call_sync(client, server, &msg, &args.seq_args, &res.seq_res, 0);
@@ -7957,7 +8049,7 @@ static int _nfs40_proc_get_locations(struct nfs_server *server,
unsigned long now = jiffies;
int status;
- nfs_fattr_init(&locations->fattr);
+ nfs_fattr_init(locations->fattr);
locations->server = server;
locations->nlocations = 0;
@@ -8022,7 +8114,7 @@ static int _nfs41_proc_get_locations(struct nfs_server *server,
};
int status;
- nfs_fattr_init(&locations->fattr);
+ nfs_fattr_init(locations->fattr);
locations->server = server;
locations->nlocations = 0;
@@ -8831,6 +8923,9 @@ void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
if (status == 0)
rpc_clnt_xprt_switch_add_xprt(clnt, xprt);
+ else if (rpc_clnt_xprt_switch_has_addr(clnt,
+ (struct sockaddr *)&xprt->addr))
+ rpc_clnt_xprt_switch_remove_xprt(clnt, xprt);
rpc_put_task(task);
}
@@ -9155,6 +9250,13 @@ int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred)
int status;
unsigned *ptr;
struct nfs4_session *session = clp->cl_session;
+ struct nfs4_add_xprt_data xprtdata = {
+ .clp = clp,
+ };
+ struct rpc_add_xprt_test rpcdata = {
+ .add_xprt_test = clp->cl_mvops->session_trunk,
+ .data = &xprtdata,
+ };
dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
@@ -9171,6 +9273,7 @@ int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred)
ptr = (unsigned *)&session->sess_id.data[0];
dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__,
clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]);
+ rpc_clnt_probe_trunked_xprts(clp->cl_rpcclient, &rpcdata);
out:
return status;
}
@@ -9200,6 +9303,7 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
if (status)
dprintk("NFS: Got error %d from the server on DESTROY_SESSION. "
"Session has been destroyed regardless...\n", status);
+ rpc_clnt_manage_trunked_xprts(session->clp->cl_rpcclient);
return status;
}
@@ -9384,6 +9488,9 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
rpc_delay(task, NFS4_POLL_RETRY_MAX);
fallthrough;
case -NFS4ERR_RETRY_UNCACHED_REP:
+ case -EACCES:
+ dprintk("%s: failed to reclaim complete error %d for server %s, retrying\n",
+ __func__, task->tk_status, clp->cl_hostname);
return -EAGAIN;
case -NFS4ERR_BADSESSION:
case -NFS4ERR_DEADSESSION:
@@ -9615,6 +9722,8 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0);
task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task))
+ return ERR_CAST(task);
status = rpc_wait_for_completion_task(task);
if (status != 0)
@@ -10379,7 +10488,8 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
| NFS_CAP_POSIX_LOCK
| NFS_CAP_STATEID_NFSV41
| NFS_CAP_ATOMIC_OPEN_V1
- | NFS_CAP_LGOPEN,
+ | NFS_CAP_LGOPEN
+ | NFS_CAP_MOVEABLE,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
@@ -10414,7 +10524,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_LAYOUTSTATS
| NFS_CAP_CLONE
| NFS_CAP_LAYOUTERROR
- | NFS_CAP_READ_PLUS,
+ | NFS_CAP_READ_PLUS
+ | NFS_CAP_MOVEABLE,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
@@ -10575,6 +10686,22 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
.set = nfs4_xattr_set_nfs4_acl,
};
+#if defined(CONFIG_NFS_V4_1)
+static const struct xattr_handler nfs4_xattr_nfs4_dacl_handler = {
+ .name = XATTR_NAME_NFSV4_DACL,
+ .list = nfs4_xattr_list_nfs4_dacl,
+ .get = nfs4_xattr_get_nfs4_dacl,
+ .set = nfs4_xattr_set_nfs4_dacl,
+};
+
+static const struct xattr_handler nfs4_xattr_nfs4_sacl_handler = {
+ .name = XATTR_NAME_NFSV4_SACL,
+ .list = nfs4_xattr_list_nfs4_sacl,
+ .get = nfs4_xattr_get_nfs4_sacl,
+ .set = nfs4_xattr_set_nfs4_sacl,
+};
+#endif
+
#ifdef CONFIG_NFS_V4_2
static const struct xattr_handler nfs4_xattr_nfs4_user_handler = {
.prefix = XATTR_USER_PREFIX,
@@ -10585,6 +10712,10 @@ static const struct xattr_handler nfs4_xattr_nfs4_user_handler = {
const struct xattr_handler *nfs4_xattr_handlers[] = {
&nfs4_xattr_nfs4_acl_handler,
+#if defined(CONFIG_NFS_V4_1)
+ &nfs4_xattr_nfs4_dacl_handler,
+ &nfs4_xattr_nfs4_sacl_handler,
+#endif
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
&nfs4_xattr_nfs4_label_handler,
#endif
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 9e1c987c81e7..9bab3e9c702a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1602,7 +1602,8 @@ static inline void nfs42_complete_copies(struct nfs4_state_owner *sp,
#endif /* CONFIG_NFS_V4_2 */
static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state,
- const struct nfs4_state_recovery_ops *ops)
+ const struct nfs4_state_recovery_ops *ops,
+ int *lost_locks)
{
struct nfs4_lock_state *lock;
int status;
@@ -1620,7 +1621,7 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st
list_for_each_entry(lock, &state->lock_states, ls_locks) {
trace_nfs4_state_lock_reclaim(state, lock);
if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
- pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__);
+ *lost_locks += 1;
}
spin_unlock(&state->state_lock);
}
@@ -1630,7 +1631,9 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st
return status;
}
-static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops)
+static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp,
+ const struct nfs4_state_recovery_ops *ops,
+ int *lost_locks)
{
struct nfs4_state *state;
unsigned int loop = 0;
@@ -1666,7 +1669,7 @@ restart:
#endif /* CONFIG_NFS_V4_2 */
refcount_inc(&state->count);
spin_unlock(&sp->so_lock);
- status = __nfs4_reclaim_open_state(sp, state, ops);
+ status = __nfs4_reclaim_open_state(sp, state, ops, lost_locks);
switch (status) {
default:
@@ -1909,6 +1912,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov
struct rb_node *pos;
LIST_HEAD(freeme);
int status = 0;
+ int lost_locks = 0;
restart:
rcu_read_lock();
@@ -1928,8 +1932,11 @@ restart:
spin_unlock(&clp->cl_lock);
rcu_read_unlock();
- status = nfs4_reclaim_open_state(sp, ops);
+ status = nfs4_reclaim_open_state(sp, ops, &lost_locks);
if (status < 0) {
+ if (lost_locks)
+ pr_warn("NFS: %s: lost %d locks\n",
+ clp->cl_hostname, lost_locks);
set_bit(ops->owner_flag_bit, &sp->so_flags);
nfs4_put_state_owner(sp);
status = nfs4_recovery_handle_error(clp, status);
@@ -1943,6 +1950,9 @@ restart:
}
rcu_read_unlock();
nfs4_free_state_owners(&freeme);
+ if (lost_locks)
+ pr_warn("NFS: %s: lost %d locks\n",
+ clp->cl_hostname, lost_locks);
return 0;
}
@@ -2106,6 +2116,11 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
dprintk("<-- %s: no memory\n", __func__);
goto out;
}
+ locations->fattr = nfs_alloc_fattr();
+ if (locations->fattr == NULL) {
+ dprintk("<-- %s: no memory\n", __func__);
+ goto out;
+ }
inode = d_inode(server->super->s_root);
result = nfs4_proc_get_locations(server, NFS_FH(inode), locations,
@@ -2120,7 +2135,7 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
if (!locations->nlocations)
goto out;
- if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) {
+ if (!(locations->fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)) {
dprintk("<-- %s: No fs_locations data, migration skipped\n",
__func__);
goto out;
@@ -2145,6 +2160,8 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred
out:
if (page != NULL)
__free_page(page);
+ if (locations != NULL)
+ kfree(locations->fattr);
kfree(locations);
if (result) {
pr_err("NFS: migration recovery failed (server %s)\n",
@@ -2736,5 +2753,6 @@ again:
goto again;
nfs_put_client(clp);
+ module_put_and_kthread_exit(0);
return 0;
}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 86a5f6516928..acfe5f4bda48 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1680,19 +1680,35 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr);
}
-static void
-encode_setacl(struct xdr_stream *xdr, const struct nfs_setaclargs *arg,
- struct compound_hdr *hdr)
+static void nfs4_acltype_to_bitmap(enum nfs4_acl_type type, __u32 bitmap[2])
{
- __be32 *p;
+ switch (type) {
+ default:
+ bitmap[0] = FATTR4_WORD0_ACL;
+ bitmap[1] = 0;
+ break;
+ case NFS4ACL_DACL:
+ bitmap[0] = 0;
+ bitmap[1] = FATTR4_WORD1_DACL;
+ break;
+ case NFS4ACL_SACL:
+ bitmap[0] = 0;
+ bitmap[1] = FATTR4_WORD1_SACL;
+ }
+}
+
+static void encode_setacl(struct xdr_stream *xdr,
+ const struct nfs_setaclargs *arg,
+ struct compound_hdr *hdr)
+{
+ __u32 bitmap[2];
+
+ nfs4_acltype_to_bitmap(arg->acl_type, bitmap);
encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr);
encode_nfs4_stateid(xdr, &zero_stateid);
- p = reserve_space(xdr, 2*4);
- *p++ = cpu_to_be32(1);
- *p = cpu_to_be32(FATTR4_WORD0_ACL);
- p = reserve_space(xdr, 4);
- *p = cpu_to_be32(arg->acl_len);
+ xdr_encode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap));
+ encode_uint32(xdr, arg->acl_len);
xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len);
}
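
nfs4_acltype_to_bitmap() keeps the attribute-bit selection in one place for both the SETATTR and GETATTR encoders. The bit positions below follow the NFSv4.1 attribute numbers (acl is attribute 12, dacl 58, sacl 59), but treat the constants as illustrative stand-ins for the nfs4.h definitions:

    /* Quick check of the acl-type to attribute-bitmap mapping. */
    #include <stdio.h>

    enum nfs4_acl_type { NFS4ACL_ACL, NFS4ACL_DACL, NFS4ACL_SACL };

    #define FATTR4_WORD0_ACL  (1U << 12) /* attribute 12 */
    #define FATTR4_WORD1_DACL (1U << 26) /* attribute 58 */
    #define FATTR4_WORD1_SACL (1U << 27) /* attribute 59 */

    static void acltype_to_bitmap(enum nfs4_acl_type type,
                                  unsigned int bitmap[2])
    {
        switch (type) {
        default:
            bitmap[0] = FATTR4_WORD0_ACL;
            bitmap[1] = 0;
            break;
        case NFS4ACL_DACL:
            bitmap[0] = 0;
            bitmap[1] = FATTR4_WORD1_DACL;
            break;
        case NFS4ACL_SACL:
            bitmap[0] = 0;
            bitmap[1] = FATTR4_WORD1_SACL;
        }
    }

    int main(void)
    {
        unsigned int bm[2];

        acltype_to_bitmap(NFS4ACL_SACL, bm);
        printf("word0=%#x word1=%#x\n", bm[0], bm[1]);
        return 0;
    }
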
@@ -2587,11 +2603,11 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
};
- const __u32 nfs4_acl_bitmap[1] = {
- [0] = FATTR4_WORD0_ACL,
- };
+ __u32 nfs4_acl_bitmap[2];
uint32_t replen;
+ nfs4_acltype_to_bitmap(args->acl_type, nfs4_acl_bitmap);
+
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fh, &hdr);
@@ -5386,7 +5402,7 @@ decode_restorefh(struct xdr_stream *xdr)
}
static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
- struct nfs_getaclres *res)
+ struct nfs_getaclres *res, enum nfs4_acl_type type)
{
unsigned int savep;
uint32_t attrlen,
@@ -5404,26 +5420,39 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
goto out;
- if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
- return -EIO;
- if (likely(bitmap[0] & FATTR4_WORD0_ACL)) {
-
- /* The bitmap (xdr len + bitmaps) and the attr xdr len words
- * are stored with the acl data to handle the problem of
- * variable length bitmaps.*/
- res->acl_data_offset = xdr_page_pos(xdr);
- res->acl_len = attrlen;
-
- /* Check for receive buffer overflow */
- if (res->acl_len > xdr_stream_remaining(xdr) ||
- res->acl_len + res->acl_data_offset > xdr->buf->page_len) {
- res->acl_flags |= NFS4_ACL_TRUNC;
- dprintk("NFS: acl reply: attrlen %u > page_len %zu\n",
- attrlen, xdr_stream_remaining(xdr));
- }
- } else
- status = -EOPNOTSUPP;
+ switch (type) {
+ default:
+ if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
+ return -EIO;
+ if (!(bitmap[0] & FATTR4_WORD0_ACL))
+ return -EOPNOTSUPP;
+ break;
+ case NFS4ACL_DACL:
+ if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_DACL - 1U)))
+ return -EIO;
+ if (!(bitmap[1] & FATTR4_WORD1_DACL))
+ return -EOPNOTSUPP;
+ break;
+ case NFS4ACL_SACL:
+ if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_SACL - 1U)))
+ return -EIO;
+ if (!(bitmap[1] & FATTR4_WORD1_SACL))
+ return -EOPNOTSUPP;
+ }
+ /* The bitmap (xdr len + bitmaps) and the attr xdr len words
+ * are stored with the acl data to handle the problem of
+ * variable-length bitmaps. */
+ res->acl_data_offset = xdr_page_pos(xdr);
+ res->acl_len = attrlen;
+
+ /* Check for receive buffer overflow */
+ if (res->acl_len > xdr_stream_remaining(xdr) ||
+ res->acl_len + res->acl_data_offset > xdr->buf->page_len) {
+ res->acl_flags |= NFS4_ACL_TRUNC;
+ dprintk("NFS: acl reply: attrlen %u > page_len %zu\n",
+ attrlen, xdr_stream_remaining(xdr));
+ }
out:
return status;
}
@@ -6486,7 +6515,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_putfh(xdr);
if (status)
goto out;
- status = decode_getacl(xdr, rqstp, res);
+ status = decode_getacl(xdr, rqstp, res, res->acl_type);
out:
return status;
@@ -7051,7 +7080,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
if (res->migration) {
xdr_enter_page(xdr, PAGE_SIZE);
status = decode_getfattr_generic(xdr,
- &res->fs_locations->fattr,
+ res->fs_locations->fattr,
NULL, res->fs_locations,
res->fs_locations->server);
if (status)
@@ -7064,7 +7093,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
goto out;
xdr_enter_page(xdr, PAGE_SIZE);
status = decode_getfattr_generic(xdr,
- &res->fs_locations->fattr,
+ res->fs_locations->fattr,
NULL, res->fs_locations,
res->fs_locations->server);
}
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 012bd7339862..8c6cc58679ff 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -1137,7 +1137,7 @@ TRACE_EVENT(nfs_readpage_done,
__field(u32, arg_count)
__field(u32, res_count)
__field(bool, eof)
- __field(int, status)
+ __field(int, error)
),
TP_fast_assign(
@@ -1146,7 +1146,7 @@ TRACE_EVENT(nfs_readpage_done,
const struct nfs_fh *fh = hdr->args.fh ?
hdr->args.fh : &nfsi->fh;
- __entry->status = task->tk_status;
+ __entry->error = task->tk_status;
__entry->offset = hdr->args.offset;
__entry->arg_count = hdr->args.count;
__entry->res_count = hdr->res.count;
@@ -1157,14 +1157,13 @@ TRACE_EVENT(nfs_readpage_done,
),
TP_printk(
- "fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%u res=%u status=%d%s",
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u%s", __entry->error,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset, __entry->arg_count,
- __entry->res_count, __entry->status,
- __entry->eof ? " eof" : ""
+ __entry->res_count, __entry->eof ? " eof" : ""
)
);
@@ -1184,7 +1183,7 @@ TRACE_EVENT(nfs_readpage_short,
__field(u32, arg_count)
__field(u32, res_count)
__field(bool, eof)
- __field(int, status)
+ __field(int, error)
),
TP_fast_assign(
@@ -1193,7 +1192,7 @@ TRACE_EVENT(nfs_readpage_short,
const struct nfs_fh *fh = hdr->args.fh ?
hdr->args.fh : &nfsi->fh;
- __entry->status = task->tk_status;
+ __entry->error = task->tk_status;
__entry->offset = hdr->args.offset;
__entry->arg_count = hdr->args.count;
__entry->res_count = hdr->res.count;
@@ -1204,14 +1203,13 @@ TRACE_EVENT(nfs_readpage_short,
),
TP_printk(
- "fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%u res=%u status=%d%s",
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u%s", __entry->error,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset, __entry->arg_count,
- __entry->res_count, __entry->status,
- __entry->eof ? " eof" : ""
+ __entry->res_count, __entry->eof ? " eof" : ""
)
);
@@ -1323,7 +1321,7 @@ TRACE_EVENT(nfs_pgio_error,
__field(u32, arg_count)
__field(u32, res_count)
__field(loff_t, pos)
- __field(int, status)
+ __field(int, error)
),
TP_fast_assign(
@@ -1332,7 +1330,7 @@ TRACE_EVENT(nfs_pgio_error,
const struct nfs_fh *fh = hdr->args.fh ?
hdr->args.fh : &nfsi->fh;
- __entry->status = error;
+ __entry->error = error;
__entry->offset = hdr->args.offset;
__entry->arg_count = hdr->args.count;
__entry->res_count = hdr->res.count;
@@ -1341,12 +1339,12 @@ TRACE_EVENT(nfs_pgio_error,
__entry->fhandle = nfs_fhandle_hash(fh);
),
- TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%u res=%u pos=%llu status=%d",
+ TP_printk("error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u pos=%llu", __entry->error,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid, __entry->fhandle,
(long long)__entry->offset, __entry->arg_count, __entry->res_count,
- __entry->pos, __entry->status
+ __entry->pos
)
);
@@ -1406,7 +1404,7 @@ TRACE_EVENT(nfs_writeback_done,
__field(loff_t, offset)
__field(u32, arg_count)
__field(u32, res_count)
- __field(int, status)
+ __field(int, error)
__field(unsigned long, stable)
__array(char, verifier, NFS4_VERIFIER_SIZE)
),
@@ -1418,7 +1416,7 @@ TRACE_EVENT(nfs_writeback_done,
hdr->args.fh : &nfsi->fh;
const struct nfs_writeverf *verf = hdr->res.verf;
- __entry->status = task->tk_status;
+ __entry->error = task->tk_status;
__entry->offset = hdr->args.offset;
__entry->arg_count = hdr->args.count;
__entry->res_count = hdr->res.count;
@@ -1432,14 +1430,14 @@ TRACE_EVENT(nfs_writeback_done,
),
TP_printk(
- "fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%u res=%u status=%d stable=%s "
- "verifier=%s",
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u res=%u stable=%s "
+ "verifier=%s", __entry->error,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset, __entry->arg_count,
- __entry->res_count, __entry->status,
+ __entry->res_count,
show_nfs_stable_how(__entry->stable),
show_nfs4_verifier(__entry->verifier)
)
@@ -1447,44 +1445,50 @@ TRACE_EVENT(nfs_writeback_done,
DECLARE_EVENT_CLASS(nfs_page_error_class,
TP_PROTO(
+ const struct inode *inode,
const struct nfs_page *req,
int error
),
- TP_ARGS(req, error),
+ TP_ARGS(inode, req, error),
TP_STRUCT__entry(
- __field(const void *, req)
- __field(pgoff_t, index)
- __field(unsigned int, offset)
- __field(unsigned int, pgbase)
- __field(unsigned int, bytes)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(loff_t, offset)
+ __field(unsigned int, count)
__field(int, error)
),
TP_fast_assign(
- __entry->req = req;
- __entry->index = req->wb_index;
- __entry->offset = req->wb_offset;
- __entry->pgbase = req->wb_pgbase;
- __entry->bytes = req->wb_bytes;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+ __entry->offset = req_offset(req);
+ __entry->count = req->wb_bytes;
__entry->error = error;
),
TP_printk(
- "req=%p index=%lu offset=%u pgbase=%u bytes=%u error=%d",
- __entry->req, __entry->index, __entry->offset,
- __entry->pgbase, __entry->bytes, __entry->error
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%u", __entry->error,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->offset,
+ __entry->count
)
);
#define DEFINE_NFS_PAGEERR_EVENT(name) \
DEFINE_EVENT(nfs_page_error_class, name, \
TP_PROTO( \
+ const struct inode *inode, \
const struct nfs_page *req, \
int error \
), \
- TP_ARGS(req, error))
+ TP_ARGS(inode, req, error))
DEFINE_NFS_PAGEERR_EVENT(nfs_write_error);
DEFINE_NFS_PAGEERR_EVENT(nfs_comp_error);
@@ -1541,7 +1545,7 @@ TRACE_EVENT(nfs_commit_done,
__field(u32, fhandle)
__field(u64, fileid)
__field(loff_t, offset)
- __field(int, status)
+ __field(int, error)
__field(unsigned long, stable)
__array(char, verifier, NFS4_VERIFIER_SIZE)
),
@@ -1553,7 +1557,7 @@ TRACE_EVENT(nfs_commit_done,
data->args.fh : &nfsi->fh;
const struct nfs_writeverf *verf = data->res.verf;
- __entry->status = task->tk_status;
+ __entry->error = task->tk_status;
__entry->offset = data->args.offset;
__entry->stable = verf->committed;
memcpy(__entry->verifier,
@@ -1565,17 +1569,83 @@ TRACE_EVENT(nfs_commit_done,
),
TP_printk(
- "fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld status=%d stable=%s verifier=%s",
+ "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld stable=%s verifier=%s", __entry->error,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
- (long long)__entry->offset, __entry->status,
+ (long long)__entry->offset,
show_nfs_stable_how(__entry->stable),
show_nfs4_verifier(__entry->verifier)
)
);
+#define nfs_show_direct_req_flags(v) \
+ __print_flags(v, "|", \
+ { NFS_ODIRECT_DO_COMMIT, "DO_COMMIT" }, \
+ { NFS_ODIRECT_RESCHED_WRITES, "RESCHED_WRITES" }, \
+ { NFS_ODIRECT_SHOULD_DIRTY, "SHOULD_DIRTY" }, \
+ { NFS_ODIRECT_DONE, "DONE" } )
+
+DECLARE_EVENT_CLASS(nfs_direct_req_class,
+ TP_PROTO(
+ const struct nfs_direct_req *dreq
+ ),
+
+ TP_ARGS(dreq),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, offset)
+ __field(ssize_t, count)
+ __field(ssize_t, bytes_left)
+ __field(ssize_t, error)
+ __field(int, flags)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = dreq->inode;
+ const struct nfs_inode *nfsi = NFS_I(inode);
+ const struct nfs_fh *fh = &nfsi->fh;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = nfsi->fileid;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->offset = dreq->io_start;
+ __entry->count = dreq->count;
+ __entry->bytes_left = dreq->bytes_left;
+ __entry->error = dreq->error;
+ __entry->flags = dreq->flags;
+ ),
+
+ TP_printk(
+ "error=%zd fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "offset=%lld count=%zd bytes_left=%zd flags=%s",
+ __entry->error, MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle, __entry->offset,
+ __entry->count, __entry->bytes_left,
+ nfs_show_direct_req_flags(__entry->flags)
+ )
+);
+
+#define DEFINE_NFS_DIRECT_REQ_EVENT(name) \
+ DEFINE_EVENT(nfs_direct_req_class, name, \
+ TP_PROTO( \
+ const struct nfs_direct_req *dreq \
+ ), \
+ TP_ARGS(dreq))
+
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_commit_complete);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_resched_write);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_complete);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec);
+DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io);
+
TRACE_EVENT(nfs_fh_to_dentry,
TP_PROTO(
const struct super_block *sb,
@@ -1609,6 +1679,65 @@ TRACE_EVENT(nfs_fh_to_dentry,
)
);
+TRACE_EVENT(nfs_mount_assign,
+ TP_PROTO(
+ const char *option,
+ const char *value
+ ),
+
+ TP_ARGS(option, value),
+
+ TP_STRUCT__entry(
+ __string(option, option)
+ __string(value, value)
+ ),
+
+ TP_fast_assign(
+ __assign_str(option, option);
+ __assign_str(value, value);
+ ),
+
+ TP_printk("option %s=%s",
+ __get_str(option), __get_str(value)
+ )
+);
+
+TRACE_EVENT(nfs_mount_option,
+ TP_PROTO(
+ const struct fs_parameter *param
+ ),
+
+ TP_ARGS(param),
+
+ TP_STRUCT__entry(
+ __string(option, param->key)
+ ),
+
+ TP_fast_assign(
+ __assign_str(option, param->key);
+ ),
+
+ TP_printk("option %s", __get_str(option))
+);
+
+TRACE_EVENT(nfs_mount_path,
+ TP_PROTO(
+ const char *path
+ ),
+
+ TP_ARGS(path),
+
+ TP_STRUCT__entry(
+ __string(path, path)
+ ),
+
+ TP_fast_assign(
+ __assign_str(path, path);
+ ),
+
+ TP_printk("path='%s'", __get_str(path))
+);
+
DECLARE_EVENT_CLASS(nfs_xdr_event,
TP_PROTO(
const struct xdr_stream *xdr,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 9157dd19b8b4..317cedfa52bf 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -767,6 +767,9 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
.flags = RPC_TASK_ASYNC | flags,
};
+ if (nfs_server_capable(hdr->inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
dprintk("NFS: initiated pgio call "
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 856c962273c7..2613b7e36eb9 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -469,6 +469,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
pnfs_clear_lseg_state(lseg, lseg_list);
pnfs_clear_layoutreturn_info(lo);
pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
+ set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
!test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
pnfs_clear_layoutreturn_waitbit(lo);
@@ -1917,8 +1918,9 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)
{
- if (atomic_dec_and_test(&lo->plh_outstanding))
- wake_up_var(&lo->plh_outstanding);
+ if (atomic_dec_and_test(&lo->plh_outstanding) &&
+ test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags))
+ wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN);
}
static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo)
@@ -2000,6 +2002,7 @@ lookup_again:
lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
if (lo == NULL) {
spin_unlock(&ino->i_lock);
+ lseg = ERR_PTR(-ENOMEM);
trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_NOMEM);
goto out;
@@ -2024,11 +2027,11 @@ lookup_again:
* If the layout segment list is empty, but there are outstanding
* layoutget calls, then they might be subject to a layoutrecall.
*/
- if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) &&
+ if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
atomic_read(&lo->plh_outstanding) != 0) {
spin_unlock(&ino->i_lock);
- lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
- !atomic_read(&lo->plh_outstanding)));
+ lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN,
+ TASK_KILLABLE));
if (IS_ERR(lseg))
goto out_put_layout_hdr;
pnfs_put_layout_hdr(lo);
@@ -2128,6 +2131,7 @@ lookup_again:
lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags);
if (!lgp) {
+ lseg = ERR_PTR(-ENOMEM);
trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL,
PNFS_UPDATE_LAYOUT_NOMEM);
nfs_layoutget_end(lo);
@@ -2150,6 +2154,12 @@ lookup_again:
case -ERECALLCONFLICT:
case -EAGAIN:
break;
+ case -ENODATA:
+ /* The server returned NFS4ERR_LAYOUTUNAVAILABLE */
+ pnfs_layout_set_fail_bit(
+ lo, pnfs_iomode_to_fail_bit(iomode));
+ lseg = NULL;
+ goto out_put_layout_hdr;
default:
if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
@@ -2405,7 +2415,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out_forget;
}
- if (!pnfs_layout_is_valid(lo) && !pnfs_is_first_layoutget(lo))
+ if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
+ !pnfs_is_first_layoutget(lo))
goto out_forget;
if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
@@ -2806,7 +2817,6 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
/* Resend all requests through the MDS */
nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
hdr->completion_ops);
- set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
return nfs_pageio_resend(&pgio, hdr);
}
EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
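The NFS_LAYOUT_DRAIN changes above replace a wait_var_event on a counter with the wait_on_bit()/wake_up_bit() idiom: waiters sleep on a flag bit, and the last outstanding layoutget clears the bit and issues the wakeup. A self-contained sketch of that pairing (names are illustrative):

	#include <linux/atomic.h>
	#include <linux/bitops.h>
	#include <linux/sched.h>
	#include <linux/wait_bit.h>

	#define EXAMPLE_DRAIN	0	/* bit number, standing in for NFS_LAYOUT_DRAIN */

	struct example_hdr {
		unsigned long flags;
		atomic_t outstanding;
	};

	/* Returns 0 once the bit clears, or -EINTR on a fatal signal. */
	static int example_wait_for_drain(struct example_hdr *hdr)
	{
		return wait_on_bit(&hdr->flags, EXAMPLE_DRAIN, TASK_KILLABLE);
	}

	/* The last completer clears the bit and wakes every waiter. */
	static void example_op_end(struct example_hdr *hdr)
	{
		if (atomic_dec_and_test(&hdr->outstanding) &&
		    test_and_clear_bit(EXAMPLE_DRAIN, &hdr->flags))
			wake_up_bit(&hdr->flags, EXAMPLE_DRAIN);
	}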
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 07f11489e4e9..f331f067691b 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -105,6 +105,7 @@ enum {
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */
NFS_LAYOUT_HASHED, /* The layout visible */
+ NFS_LAYOUT_DRAIN,
};
enum layoutdriver_policy_flags {
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 5e7657374bc3..8ae2c8d1219d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -120,12 +120,8 @@ static void nfs_readpage_release(struct nfs_page *req, int error)
if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
SetPageError(page);
if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
- struct address_space *mapping = page_file_mapping(page);
-
if (PageUptodate(page))
nfs_fscache_write_page(inode, page);
- else if (!PageError(page) && !PagePrivate(page))
- generic_error_remove_page(mapping, page);
unlock_page(page);
}
nfs_release_request(req);
@@ -333,8 +329,9 @@ out:
* - The error flag is set for this page. This happens only when a
* previous async read operation failed.
*/
-int nfs_readpage(struct file *file, struct page *page)
+int nfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct nfs_readdesc desc;
struct inode *inode = page_file_mapping(page)->host;
int ret;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 6ab5eeb000dc..ee66ffdb985e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -149,7 +149,7 @@ int __init register_nfs_fs(void)
ret = nfs_register_sysctl();
if (ret < 0)
goto error_2;
- ret = register_shrinker(&acl_shrinker);
+ ret = register_shrinker(&acl_shrinker, "nfs-acl");
if (ret < 0)
goto error_3;
#ifdef CONFIG_NFS_V4_2
@@ -1051,22 +1051,31 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
if (ctx->bsize)
sb->s_blocksize = nfs_block_size(ctx->bsize, &sb->s_blocksize_bits);
- if (server->nfs_client->rpc_ops->version != 2) {
- /* The VFS shouldn't apply the umask to mode bits. We will do
- * so ourselves when necessary.
+ switch (server->nfs_client->rpc_ops->version) {
+ case 2:
+ sb->s_time_gran = 1000;
+ sb->s_time_min = 0;
+ sb->s_time_max = U32_MAX;
+ break;
+ case 3:
+ /*
+ * The VFS shouldn't apply the umask to mode bits.
+ * We will do so ourselves when necessary.
*/
sb->s_flags |= SB_POSIXACL;
sb->s_time_gran = 1;
- sb->s_export_op = &nfs_export_ops;
- } else
- sb->s_time_gran = 1000;
-
- if (server->nfs_client->rpc_ops->version != 4) {
sb->s_time_min = 0;
sb->s_time_max = U32_MAX;
- } else {
+ sb->s_export_op = &nfs_export_ops;
+ break;
+ case 4:
+ sb->s_flags |= SB_POSIXACL;
+ sb->s_time_gran = 1;
sb->s_time_min = S64_MIN;
sb->s_time_max = S64_MAX;
+ if (server->caps & NFS_CAP_ATOMIC_OPEN_V1)
+ sb->s_export_op = &nfs_export_ops;
+ break;
}
sb->s_magic = NFS_SUPER_MAGIC;
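A note on why the bounds in the new switch differ: the NFSv2 wire format carries timestamps as seconds plus microseconds, hence the 1000 ns granularity; NFSv3 uses unsigned 32-bit seconds with a nanoseconds field, giving 1 ns granularity over [0, U32_MAX]; NFSv4 widens the seconds field to a signed 64-bit integer, so the full S64 range applies.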
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 25ba299fdac2..0e27a2e4e68b 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -26,21 +26,21 @@
* and straight-forward than readdir caching.
*/
-static int nfs_symlink_filler(void *data, struct page *page)
+static int nfs_symlink_filler(struct file *file, struct folio *folio)
{
- struct inode *inode = data;
+ struct inode *inode = folio->mapping->host;
int error;
- error = NFS_PROTO(inode)->readlink(inode, page, 0, PAGE_SIZE);
+ error = NFS_PROTO(inode)->readlink(inode, &folio->page, 0, PAGE_SIZE);
if (error < 0)
goto error;
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
error:
- SetPageError(page);
- unlock_page(page);
+ folio_set_error(folio);
+ folio_unlock(folio);
return -EIO;
}
@@ -67,7 +67,7 @@ static const char *nfs_get_link(struct dentry *dentry,
if (err)
return err;
page = read_cache_page(&inode->i_data, 0, nfs_symlink_filler,
- inode);
+ NULL);
if (IS_ERR(page))
return ERR_CAST(page);
}
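With the filler conversion above, read_cache_page() hands the filler a (possibly NULL) struct file plus the folio itself, and the inode is recovered from the folio's mapping rather than from an opaque data cookie. A minimal sketch of the new contract (the filler body is illustrative):

	#include <linux/pagemap.h>

	static int example_filler(struct file *file, struct folio *folio)
	{
		struct inode *inode = folio->mapping->host;

		/* ... fill the folio from @inode here ... */

		folio_mark_uptodate(folio);
		folio_unlock(folio);
		return 0;
	}

	static struct page *example_read_first_page(struct inode *inode)
	{
		/* The final argument is passed through as the filler's @file. */
		return read_cache_page(&inode->i_data, 0, example_filler, NULL);
	}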
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 5fa11e1aca4c..9697cd5d2561 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -102,6 +102,10 @@ static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data)
};
struct rpc_task *task;
struct inode *dir = d_inode(data->dentry->d_parent);
+
+ if (nfs_server_capable(inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
nfs_sb_active(dir->i_sb);
data->args.fh = NFS_FH(dir);
nfs_fattr_init(data->res.dir_attr);
@@ -344,9 +348,14 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
.flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
+ if (nfs_server_capable(old_dir, NFS_CAP_MOVEABLE) &&
+ nfs_server_capable(new_dir, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (data == NULL)
return ERR_PTR(-ENOMEM);
+ task_setup_data.task = &data->task;
task_setup_data.callback_data = data;
data->cred = get_current_cred();
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f00d45cf80ef..f41d24b54fd1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -592,7 +592,8 @@ nfs_lock_and_join_requests(struct page *page)
static void nfs_write_error(struct nfs_page *req, int error)
{
- trace_nfs_write_error(req, error);
+ trace_nfs_write_error(page_file_mapping(req->wb_page)->host, req,
+ error);
nfs_mapping_set_error(req->wb_page, error);
nfs_inode_remove_request(req);
nfs_end_page_writeback(req);
@@ -603,8 +604,9 @@ static void nfs_write_error(struct nfs_page *req, int error)
* Find an associated nfs write request, and prepare to flush it out
* May return an error if the user signalled nfs_wait_on_request().
*/
-static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
- struct page *page)
+static int nfs_page_async_flush(struct page *page,
+ struct writeback_control *wbc,
+ struct nfs_pageio_descriptor *pgio)
{
struct nfs_page *req;
int ret = 0;
@@ -630,11 +632,11 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
/*
* Remove the problematic req upon fatal errors on the server
*/
- if (nfs_error_is_fatal(ret)) {
- if (nfs_error_is_fatal_on_server(ret))
- goto out_launder;
- } else
- ret = -EAGAIN;
+ if (nfs_error_is_fatal_on_server(ret))
+ goto out_launder;
+ if (wbc->sync_mode == WB_SYNC_NONE)
+ ret = AOP_WRITEPAGE_ACTIVATE;
+ redirty_page_for_writepage(wbc, page);
nfs_redirty_request(req);
pgio->pg_error = 0;
} else
@@ -650,15 +652,8 @@ out_launder:
static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
struct nfs_pageio_descriptor *pgio)
{
- int ret;
-
nfs_pageio_cond_complete(pgio, page_index(page));
- ret = nfs_page_async_flush(pgio, page);
- if (ret == -EAGAIN) {
- redirty_page_for_writepage(wbc, page);
- ret = AOP_WRITEPAGE_ACTIVATE;
- }
- return ret;
+ return nfs_page_async_flush(page, wbc, pgio);
}
/*
@@ -681,11 +676,7 @@ static int nfs_writepage_locked(struct page *page,
err = nfs_do_writepage(page, wbc, &pgio);
pgio.pg_error = 0;
nfs_pageio_complete(&pgio);
- if (err < 0)
- return err;
- if (nfs_error_is_fatal(pgio.pg_error))
- return pgio.pg_error;
- return 0;
+ return err;
}
int nfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -737,19 +728,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
priority = wb_priority(wbc);
}
- nfs_pageio_init_write(&pgio, inode, priority, false,
- &nfs_async_write_completion_ops);
- pgio.pg_io_completion = ioc;
- err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
- pgio.pg_error = 0;
- nfs_pageio_complete(&pgio);
+ do {
+ nfs_pageio_init_write(&pgio, inode, priority, false,
+ &nfs_async_write_completion_ops);
+ pgio.pg_io_completion = ioc;
+ err = write_cache_pages(mapping, wbc, nfs_writepages_callback,
+ &pgio);
+ pgio.pg_error = 0;
+ nfs_pageio_complete(&pgio);
+ } while (err < 0 && !nfs_error_is_fatal(err));
nfs_io_completion_put(ioc);
if (err < 0)
goto out_err;
- err = pgio.pg_error;
- if (nfs_error_is_fatal(err))
- goto out_err;
return 0;
out_err:
return err;
@@ -1010,7 +1001,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
nfs_list_remove_request(req);
if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
(hdr->good_bytes < bytes)) {
- trace_nfs_comp_error(req, hdr->error);
+ trace_nfs_comp_error(hdr->inode, req, hdr->error);
nfs_mapping_set_error(req->wb_page, hdr->error);
goto remove_req;
}
@@ -1429,10 +1420,12 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
*/
static void nfs_redirty_request(struct nfs_page *req)
{
+ struct nfs_inode *nfsi = NFS_I(page_file_mapping(req->wb_page)->host);
+
/* Bump the transmission count */
req->wb_nio++;
nfs_mark_request_dirty(req);
- set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
+ atomic_long_inc(&nfsi->redirtied_pages);
nfs_end_page_writeback(req);
nfs_release_request(req);
}
@@ -1444,7 +1437,7 @@ static void nfs_async_write_error(struct list_head *head, int error)
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
- if (nfs_error_is_fatal(error))
+ if (nfs_error_is_fatal_on_server(error))
nfs_write_error(req, error);
else
nfs_redirty_request(req);
@@ -1454,8 +1447,6 @@ static void nfs_async_write_error(struct list_head *head, int error)
static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
{
nfs_async_write_error(&hdr->pages, 0);
- filemap_fdatawrite_range(hdr->inode->i_mapping, hdr->args.offset,
- hdr->args.offset + hdr->args.count - 1);
}
static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
@@ -1505,31 +1496,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
}
-/*
- * Special version of should_remove_suid() that ignores capabilities.
- */
-static int nfs_should_remove_suid(const struct inode *inode)
-{
- umode_t mode = inode->i_mode;
- int kill = 0;
-
- /* suid always must be killed */
- if (unlikely(mode & S_ISUID))
- kill = ATTR_KILL_SUID;
-
- /*
- * sgid without any exec bits is just a mandatory locking mark; leave
- * it alone. If some exec bits are set, it's a real sgid; kill it.
- */
- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
- kill |= ATTR_KILL_SGID;
-
- if (unlikely(kill && S_ISREG(mode)))
- return kill;
-
- return 0;
-}
-
static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr,
struct nfs_fattr *fattr)
{
@@ -1586,25 +1552,37 @@ static int nfs_writeback_done(struct rpc_task *task,
nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
trace_nfs_writeback_done(task, hdr);
- if (hdr->res.verf->committed < hdr->args.stable &&
- task->tk_status >= 0) {
- /* We tried a write call, but the server did not
- * commit data to stable storage even though we
- * requested it.
- * Note: There is a known bug in Tru64 < 5.0 in which
- * the server reports NFS_DATA_SYNC, but performs
- * NFS_FILE_SYNC. We therefore implement this checking
- * as a dprintk() in order to avoid filling syslog.
- */
- static unsigned long complain;
+ if (task->tk_status >= 0) {
+ enum nfs3_stable_how committed = hdr->res.verf->committed;
- /* Note this will print the MDS for a DS write */
- if (time_before(complain, jiffies)) {
- dprintk("NFS: faulty NFS server %s:"
- " (committed = %d) != (stable = %d)\n",
- NFS_SERVER(inode)->nfs_client->cl_hostname,
- hdr->res.verf->committed, hdr->args.stable);
- complain = jiffies + 300 * HZ;
+ if (committed == NFS_UNSTABLE) {
+ /*
+ * We have some uncommitted data on the server at
+ * this point, so ensure that we keep track of that
+ * fact irrespective of what later writes do.
+ */
+ set_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags);
+ }
+
+ if (committed < hdr->args.stable) {
+ /* We tried a write call, but the server did not
+ * commit data to stable storage even though we
+ * requested it.
+ * Note: There is a known bug in Tru64 < 5.0 in which
+ * the server reports NFS_DATA_SYNC, but performs
+ * NFS_FILE_SYNC. We therefore implement this checking
+ * as a dprintk() in order to avoid filling syslog.
+ */
+ static unsigned long complain;
+
+ /* Note this will print the MDS for a DS write */
+ if (time_before(complain, jiffies)) {
+ dprintk("NFS: faulty NFS server %s:"
+ " (committed = %d) != (stable = %d)\n",
+ NFS_SERVER(inode)->nfs_client->cl_hostname,
+ committed, hdr->args.stable);
+ complain = jiffies + 300 * HZ;
+ }
}
}
@@ -1719,6 +1697,10 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
.flags = RPC_TASK_ASYNC | flags,
.priority = priority,
};
+
+ if (nfs_server_capable(data->inode, NFS_CAP_MOVEABLE))
+ task_setup_data.flags |= RPC_TASK_MOVEABLE;
+
/* Set up the initial task struct. */
nfs_ops->commit_setup(data, &msg, &task_setup_data.rpc_client);
trace_nfs_initiate_commit(data);
@@ -1878,7 +1860,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
(long long)req_offset(req));
if (status < 0) {
if (req->wb_page) {
- trace_nfs_commit_error(req, status);
+ trace_nfs_commit_error(data->inode, req,
+ status);
nfs_mapping_set_error(req->wb_page, status);
nfs_inode_remove_request(req);
}
@@ -1898,7 +1881,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* We have a mismatch. Write the page again */
dprintk_cont(" mismatch\n");
nfs_mark_request_dirty(req);
- set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
+ atomic_long_inc(&NFS_I(data->inode)->redirtied_pages);
next:
nfs_unlock_and_release_request(req);
/* Latency breaker */
@@ -2125,27 +2108,27 @@ out_error:
}
#ifdef CONFIG_MIGRATION
-int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
- struct page *page, enum migrate_mode mode)
+int nfs_migrate_folio(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode)
{
/*
- * If PagePrivate is set, then the page is currently associated with
+ * If the private flag is set, the folio is currently associated with
* an in-progress read or write request. Don't try to migrate it.
*
* FIXME: we could do this in principle, but we'll need a way to ensure
* that we can safely release the inode reference while holding
- * the page lock.
+ * the folio lock.
*/
- if (PagePrivate(page))
+ if (folio_test_private(src))
return -EBUSY;
- if (PageFsCache(page)) {
+ if (folio_test_fscache(src)) {
if (mode == MIGRATE_ASYNC)
return -EBUSY;
- wait_on_page_fscache(page);
+ folio_wait_fscache(src);
}
- return migrate_page(mapping, newpage, page, mode);
+ return migrate_folio(mapping, dst, src, mode);
}
#endif
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index ba14d2f4b64f..4b7324458a94 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -38,6 +38,8 @@
struct nfs4_acl;
struct svc_fh;
struct svc_rqst;
+struct nfsd_attrs;
+enum nfs_ftype4;
int nfs4_acl_bytes(int entries);
int nfs4_acl_get_whotype(char *, u32);
@@ -45,7 +47,7 @@ __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who);
int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
struct nfs4_acl **acl);
-__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct nfs4_acl *acl);
+__be32 nfsd4_acl_to_attr(enum nfs_ftype4 type, struct nfs4_acl *acl,
+ struct nfsd_attrs *attr);
#endif /* LINUX_NFS4_ACL_H */
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 65c331f75e9c..f21259ead64b 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -84,6 +84,6 @@ int nfsd_reply_cache_init(struct nfsd_net *);
void nfsd_reply_cache_shutdown(struct nfsd_net *);
int nfsd_cache_lookup(struct svc_rqst *);
void nfsd_cache_update(struct svc_rqst *, int, __be32 *);
-int nfsd_reply_cache_stats_open(struct inode *, struct file *);
+int nfsd_reply_cache_stats_show(struct seq_file *m, void *v);
#endif /* NFSCACHE_H */
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index c08882f5867b..d5c57360b418 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -13,6 +13,7 @@
#include <linux/fsnotify_backend.h>
#include <linux/fsnotify.h>
#include <linux/seq_file.h>
+#include <linux/rhashtable.h>
#include "vfs.h"
#include "nfsd.h"
@@ -21,28 +22,19 @@
#include "filecache.h"
#include "trace.h"
-#define NFSDDBG_FACILITY NFSDDBG_FH
-
-/* FIXME: dynamically size this for the machine somehow? */
-#define NFSD_FILE_HASH_BITS 12
-#define NFSD_FILE_HASH_SIZE (1 << NFSD_FILE_HASH_BITS)
#define NFSD_LAUNDRETTE_DELAY (2 * HZ)
-#define NFSD_FILE_SHUTDOWN (1)
-#define NFSD_FILE_LRU_THRESHOLD (4096UL)
-#define NFSD_FILE_LRU_LIMIT (NFSD_FILE_LRU_THRESHOLD << 2)
+#define NFSD_FILE_CACHE_UP (0)
/* We only care about NFSD_MAY_READ/WRITE for this cache */
#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE)
-struct nfsd_fcache_bucket {
- struct hlist_head nfb_head;
- spinlock_t nfb_lock;
- unsigned int nfb_count;
- unsigned int nfb_maxcount;
-};
-
static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_releases);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_pages_flushed);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions);
struct nfsd_fcache_disposal {
struct work_struct work;
@@ -54,21 +46,146 @@ static struct workqueue_struct *nfsd_filecache_wq __read_mostly;
static struct kmem_cache *nfsd_file_slab;
static struct kmem_cache *nfsd_file_mark_slab;
-static struct nfsd_fcache_bucket *nfsd_file_hashtbl;
static struct list_lru nfsd_file_lru;
-static long nfsd_file_lru_flags;
+static unsigned long nfsd_file_flags;
static struct fsnotify_group *nfsd_file_fsnotify_group;
-static atomic_long_t nfsd_filecache_count;
static struct delayed_work nfsd_filecache_laundrette;
+static struct rhashtable nfsd_file_rhash_tbl
+ ____cacheline_aligned_in_smp;
+
+enum nfsd_file_lookup_type {
+ NFSD_FILE_KEY_INODE,
+ NFSD_FILE_KEY_FULL,
+};
+
+struct nfsd_file_lookup_key {
+ struct inode *inode;
+ struct net *net;
+ const struct cred *cred;
+ unsigned char need;
+ enum nfsd_file_lookup_type type;
+};
+
+/*
+ * The returned hash value is based solely on the address of an in-core
+ * inode, a pointer to a slab-allocated object. The entropy in such a
+ * pointer is concentrated in its middle bits.
+ */
+static u32 nfsd_file_inode_hash(const struct inode *inode, u32 seed)
+{
+ unsigned long ptr = (unsigned long)inode;
+ u32 k;
+
+ k = ptr >> L1_CACHE_SHIFT;
+ k &= 0x00ffffff;
+ return jhash2(&k, 1, seed);
+}
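A userspace analogue of the fold above, runnable standalone (this assumes 64-byte cachelines, i.e. L1_CACHE_SHIFT == 6; a real table would mix the result through jhash2() with a per-boot seed):

	#include <stdint.h>
	#include <stdio.h>

	static uint32_t fold_ptr(const void *obj)
	{
		uintptr_t ptr = (uintptr_t)obj;

		/* Discard the low bits, which are zero for cacheline-aligned
		 * slab objects, and keep 24 of the variable middle bits. */
		return (uint32_t)((ptr >> 6) & 0x00ffffff);
	}

	int main(void)
	{
		int on_stack;

		printf("folded key: %#x\n", fold_ptr(&on_stack));
		return 0;
	}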
-static void nfsd_file_gc(void);
+/**
+ * nfsd_file_key_hashfn - Compute the hash value of a lookup key
+ * @data: key on which to compute the hash value
+ * @len: rhash table's key_len parameter (unused)
+ * @seed: rhash table's random seed of the day
+ *
+ * Return value:
+ * Computed 32-bit hash value
+ */
+static u32 nfsd_file_key_hashfn(const void *data, u32 len, u32 seed)
+{
+ const struct nfsd_file_lookup_key *key = data;
+
+ return nfsd_file_inode_hash(key->inode, seed);
+}
+
+/**
+ * nfsd_file_obj_hashfn - Compute the hash value of an nfsd_file
+ * @data: object on which to compute the hash value
+ * @len: rhash table's key_len parameter (unused)
+ * @seed: rhash table's random seed of the day
+ *
+ * Return value:
+ * Computed 32-bit hash value
+ */
+static u32 nfsd_file_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+ const struct nfsd_file *nf = data;
+
+ return nfsd_file_inode_hash(nf->nf_inode, seed);
+}
+
+static bool
+nfsd_match_cred(const struct cred *c1, const struct cred *c2)
+{
+ int i;
+
+ if (!uid_eq(c1->fsuid, c2->fsuid))
+ return false;
+ if (!gid_eq(c1->fsgid, c2->fsgid))
+ return false;
+ if (c1->group_info == NULL || c2->group_info == NULL)
+ return c1->group_info == c2->group_info;
+ if (c1->group_info->ngroups != c2->group_info->ngroups)
+ return false;
+ for (i = 0; i < c1->group_info->ngroups; i++) {
+ if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i]))
+ return false;
+ }
+ return true;
+}
+
+/**
+ * nfsd_file_obj_cmpfn - Match a cache item against search criteria
+ * @arg: search criteria
+ * @ptr: cache item to check
+ *
+ * Return values:
+ * %0 - Item matches search criteria
+ * %1 - Item does not match search criteria
+ */
+static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct nfsd_file_lookup_key *key = arg->key;
+ const struct nfsd_file *nf = ptr;
+
+ switch (key->type) {
+ case NFSD_FILE_KEY_INODE:
+ if (nf->nf_inode != key->inode)
+ return 1;
+ break;
+ case NFSD_FILE_KEY_FULL:
+ if (nf->nf_inode != key->inode)
+ return 1;
+ if (nf->nf_may != key->need)
+ return 1;
+ if (nf->nf_net != key->net)
+ return 1;
+ if (!nfsd_match_cred(nf->nf_cred, key->cred))
+ return 1;
+ if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
+ return 1;
+ break;
+ }
+ return 0;
+}
+
+static const struct rhashtable_params nfsd_file_rhash_params = {
+ .key_len = sizeof_field(struct nfsd_file, nf_inode),
+ .key_offset = offsetof(struct nfsd_file, nf_inode),
+ .head_offset = offsetof(struct nfsd_file, nf_rhash),
+ .hashfn = nfsd_file_key_hashfn,
+ .obj_hashfn = nfsd_file_obj_hashfn,
+ .obj_cmpfn = nfsd_file_obj_cmpfn,
+ /* Reduce resizing churn on light workloads */
+ .min_size = 512, /* buckets */
+ .automatic_shrinking = true,
+};
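Because obj_cmpfn bridges the key and object types, a lookup can be driven by the small nfsd_file_lookup_key rather than a fully populated nfsd_file. A sketch of such a caller (illustrative; no cred reference is taken for this transient lookup):

	static struct nfsd_file *example_lookup(struct inode *inode,
						struct net *net,
						unsigned char need)
	{
		struct nfsd_file_lookup_key key = {
			.type	= NFSD_FILE_KEY_FULL,
			.inode	= inode,
			.net	= net,
			.cred	= current_cred(),
			.need	= need,
		};

		/* rhashtable_lookup_fast() takes the RCU read lock itself and
		 * returns NULL when no entry matches all compare criteria. */
		return rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key,
					      nfsd_file_rhash_params);
	}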
static void
nfsd_file_schedule_laundrette(void)
{
- long count = atomic_long_read(&nfsd_filecache_count);
-
- if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags))
+ if ((atomic_read(&nfsd_file_rhash_tbl.nelems) == 0) ||
+ test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0)
return;
queue_delayed_work(system_wq, &nfsd_filecache_laundrette,
@@ -111,22 +228,21 @@ nfsd_file_mark_put(struct nfsd_file_mark *nfm)
}
static struct nfsd_file_mark *
-nfsd_file_mark_find_or_create(struct nfsd_file *nf)
+nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode)
{
int err;
struct fsnotify_mark *mark;
struct nfsd_file_mark *nfm = NULL, *new;
- struct inode *inode = nf->nf_inode;
do {
- mutex_lock(&nfsd_file_fsnotify_group->mark_mutex);
+ fsnotify_group_lock(nfsd_file_fsnotify_group);
mark = fsnotify_find_mark(&inode->i_fsnotify_marks,
- nfsd_file_fsnotify_group);
+ nfsd_file_fsnotify_group);
if (mark) {
nfm = nfsd_file_mark_get(container_of(mark,
struct nfsd_file_mark,
nfm_mark));
- mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+ fsnotify_group_unlock(nfsd_file_fsnotify_group);
if (nfm) {
fsnotify_put_mark(mark);
break;
@@ -134,8 +250,9 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf)
/* Avoid soft lockup race with nfsd_file_mark_put() */
fsnotify_destroy_mark(mark, nfsd_file_fsnotify_group);
fsnotify_put_mark(mark);
- } else
- mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+ } else {
+ fsnotify_group_unlock(nfsd_file_fsnotify_group);
+ }
/* allocate a new nfm */
new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL);
@@ -166,31 +283,25 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf)
}
static struct nfsd_file *
-nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
- struct net *net)
+nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
{
struct nfsd_file *nf;
nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL);
if (nf) {
- INIT_HLIST_NODE(&nf->nf_node);
INIT_LIST_HEAD(&nf->nf_lru);
+ nf->nf_birthtime = ktime_get();
nf->nf_file = NULL;
nf->nf_cred = get_current_cred();
- nf->nf_net = net;
+ nf->nf_net = key->net;
nf->nf_flags = 0;
- nf->nf_inode = inode;
- nf->nf_hashval = hashval;
- refcount_set(&nf->nf_ref, 1);
- nf->nf_may = may & NFSD_FILE_MAY_MASK;
- if (may & NFSD_MAY_NOT_BREAK_LEASE) {
- if (may & NFSD_MAY_WRITE)
- __set_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags);
- if (may & NFSD_MAY_READ)
- __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
- }
+ __set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
+ __set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+ nf->nf_inode = key->inode;
+ /* nf_ref is pre-incremented for hash table */
+ refcount_set(&nf->nf_ref, 2);
+ nf->nf_may = key->need;
nf->nf_mark = NULL;
- trace_nfsd_file_alloc(nf);
}
return nf;
}
@@ -198,8 +309,12 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
static bool
nfsd_file_free(struct nfsd_file *nf)
{
+ s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
bool flush = false;
+ this_cpu_inc(nfsd_file_releases);
+ this_cpu_add(nfsd_file_total_age, age);
+
trace_nfsd_file_put_final(nf);
if (nf->nf_mark)
nfsd_file_mark_put(nf->nf_mark);
@@ -209,6 +324,14 @@ nfsd_file_free(struct nfsd_file *nf)
fput(nf->nf_file);
flush = true;
}
+
+ /*
+ * If this item is still linked via nf_lru, that's a bug.
+ * WARN and leak it to preserve system stability.
+ */
+ if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
+ return flush;
+
call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
return flush;
}
@@ -237,26 +360,46 @@ nfsd_file_check_write_error(struct nfsd_file *nf)
}
static void
-nfsd_file_do_unhash(struct nfsd_file *nf)
+nfsd_file_flush(struct nfsd_file *nf)
+{
+ struct file *file = nf->nf_file;
+
+ if (!file || !(file->f_mode & FMODE_WRITE))
+ return;
+ this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
+ if (vfs_fsync(file, 1) != 0)
+ nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
+}
+
+static void nfsd_file_lru_add(struct nfsd_file *nf)
+{
+ set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
+ if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
+ trace_nfsd_file_lru_add(nf);
+}
+
+static void nfsd_file_lru_remove(struct nfsd_file *nf)
{
- lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+ if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
+ trace_nfsd_file_lru_del(nf);
+}
+static void
+nfsd_file_hash_remove(struct nfsd_file *nf)
+{
trace_nfsd_file_unhash(nf);
if (nfsd_file_check_write_error(nf))
nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
- --nfsd_file_hashtbl[nf->nf_hashval].nfb_count;
- hlist_del_rcu(&nf->nf_node);
- atomic_long_dec(&nfsd_filecache_count);
+ rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash,
+ nfsd_file_rhash_params);
}
static bool
nfsd_file_unhash(struct nfsd_file *nf)
{
if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- nfsd_file_do_unhash(nf);
- if (!list_empty(&nf->nf_lru))
- list_lru_del(&nfsd_file_lru, &nf->nf_lru);
+ nfsd_file_hash_remove(nf);
return true;
}
return false;
@@ -266,17 +409,16 @@ nfsd_file_unhash(struct nfsd_file *nf)
* Return true if the file was unhashed.
*/
static bool
-nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose)
+nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
{
- lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
-
- trace_nfsd_file_unhash_and_release_locked(nf);
+ trace_nfsd_file_unhash_and_dispose(nf);
if (!nfsd_file_unhash(nf))
return false;
/* keep final reference for nfsd_file_lru_dispose */
if (refcount_dec_not_one(&nf->nf_ref))
return true;
+ nfsd_file_lru_remove(nf);
list_add(&nf->nf_lru, dispose);
return true;
}
@@ -288,6 +430,7 @@ nfsd_file_put_noref(struct nfsd_file *nf)
if (refcount_dec_and_test(&nf->nf_ref)) {
WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
+ nfsd_file_lru_remove(nf);
nfsd_file_free(nf);
}
}
@@ -295,21 +438,35 @@ nfsd_file_put_noref(struct nfsd_file *nf)
void
nfsd_file_put(struct nfsd_file *nf)
{
- bool is_hashed;
+ might_sleep();
- set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (refcount_read(&nf->nf_ref) > 2 || !nf->nf_file) {
+ nfsd_file_lru_add(nf);
+ if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
+ nfsd_file_flush(nf);
+ nfsd_file_put_noref(nf);
+ } else if (nf->nf_file) {
nfsd_file_put_noref(nf);
- return;
- }
-
- filemap_flush(nf->nf_file->f_mapping);
- is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0;
- nfsd_file_put_noref(nf);
- if (is_hashed)
nfsd_file_schedule_laundrette();
- if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT)
- nfsd_file_gc();
+ } else
+ nfsd_file_put_noref(nf);
+}
+
+/**
+ * nfsd_file_close - Close an nfsd_file
+ * @nf: nfsd_file to close
+ *
+ * If this is the final reference for @nf, free it immediately.
+ * This reflects an on-the-wire CLOSE or DELEGRETURN into the
+ * VFS and exported filesystem.
+ */
+void nfsd_file_close(struct nfsd_file *nf)
+{
+ nfsd_file_put(nf);
+ if (refcount_dec_if_one(&nf->nf_ref)) {
+ nfsd_file_unhash(nf);
+ nfsd_file_lru_remove(nf);
+ nfsd_file_free(nf);
+ }
}
struct nfsd_file *
@@ -327,7 +484,8 @@ nfsd_file_dispose_list(struct list_head *dispose)
while(!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- list_del(&nf->nf_lru);
+ list_del_init(&nf->nf_lru);
+ nfsd_file_flush(nf);
nfsd_file_put_noref(nf);
}
}
@@ -340,7 +498,8 @@ nfsd_file_dispose_list_sync(struct list_head *dispose)
while(!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- list_del(&nf->nf_lru);
+ list_del_init(&nf->nf_lru);
+ nfsd_file_flush(nf);
if (!refcount_dec_and_test(&nf->nf_ref))
continue;
if (nfsd_file_free(nf))
@@ -396,8 +555,19 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
}
}
-/*
+/**
+ * nfsd_file_lru_cb - Examine an entry on the LRU list
+ * @item: LRU entry to examine
+ * @lru: controlling LRU
+ * @lock: LRU list lock (unused)
+ * @arg: dispose list
+ *
* Note this can deadlock with nfsd_file_cache_purge.
+ *
+ * Return values:
+ * %LRU_REMOVED: @item was removed from the LRU
+ * %LRU_ROTATE: @item is to be moved to the LRU tail
+ * %LRU_SKIP: @item cannot be evicted
*/
static enum lru_status
nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
@@ -418,55 +588,65 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
* counter. Here we check the counter and then test and clear the flag.
* That order is deliberate to ensure that we can do this locklessly.
*/
- if (refcount_read(&nf->nf_ref) > 1)
- goto out_skip;
+ if (refcount_read(&nf->nf_ref) > 1) {
+ list_lru_isolate(lru, &nf->nf_lru);
+ trace_nfsd_file_gc_in_use(nf);
+ return LRU_REMOVED;
+ }
/*
* Don't throw out files that are still undergoing I/O or
* that have uncleared errors pending.
*/
- if (nfsd_file_check_writeback(nf))
- goto out_skip;
+ if (nfsd_file_check_writeback(nf)) {
+ trace_nfsd_file_gc_writeback(nf);
+ return LRU_SKIP;
+ }
- if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags))
- goto out_skip;
+ if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
+ trace_nfsd_file_gc_referenced(nf);
+ return LRU_ROTATE;
+ }
- if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags))
- goto out_skip;
+ if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ trace_nfsd_file_gc_hashed(nf);
+ return LRU_SKIP;
+ }
list_lru_isolate_move(lru, &nf->nf_lru, head);
+ this_cpu_inc(nfsd_file_evictions);
+ trace_nfsd_file_gc_disposed(nf);
return LRU_REMOVED;
-out_skip:
- return LRU_SKIP;
}
-static unsigned long
-nfsd_file_lru_walk_list(struct shrink_control *sc)
+/*
+ * Unhash items on @dispose immediately, then queue them on the
+ * disposal workqueue to finish releasing them in the background.
+ *
+ * cel: Note that between the time list_lru_shrink_walk runs and
+ * now, these items are in the hash table but marked unhashed.
+ * Why release these outside of lru_cb? There's no lock ordering
+ * problem since lru_cb currently takes no lock.
+ */
+static void nfsd_file_gc_dispose_list(struct list_head *dispose)
{
- LIST_HEAD(head);
struct nfsd_file *nf;
- unsigned long ret;
- if (sc)
- ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
- nfsd_file_lru_cb, &head);
- else
- ret = list_lru_walk(&nfsd_file_lru,
- nfsd_file_lru_cb,
- &head, LONG_MAX);
- list_for_each_entry(nf, &head, nf_lru) {
- spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
- nfsd_file_do_unhash(nf);
- spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
- }
- nfsd_file_dispose_list_delayed(&head);
- return ret;
+ list_for_each_entry(nf, dispose, nf_lru)
+ nfsd_file_hash_remove(nf);
+ nfsd_file_dispose_list_delayed(dispose);
}
static void
nfsd_file_gc(void)
{
- nfsd_file_lru_walk_list(NULL);
+ LIST_HEAD(dispose);
+ unsigned long ret;
+
+ ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
+ &dispose, list_lru_count(&nfsd_file_lru));
+ trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
+ nfsd_file_gc_dispose_list(&dispose);
}
static void
@@ -485,7 +665,14 @@ nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc)
static unsigned long
nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
{
- return nfsd_file_lru_walk_list(sc);
+ LIST_HEAD(dispose);
+ unsigned long ret;
+
+ ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
+ nfsd_file_lru_cb, &dispose);
+ trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
+ nfsd_file_gc_dispose_list(&dispose);
+ return ret;
}
static struct shrinker nfsd_file_shrinker = {
@@ -494,39 +681,47 @@ static struct shrinker nfsd_file_shrinker = {
.seeks = 1,
};
-static void
-__nfsd_file_close_inode(struct inode *inode, unsigned int hashval,
- struct list_head *dispose)
+/*
+ * Find all cache items across all net namespaces that match @inode and
+ * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
+ */
+static unsigned int
+__nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
{
- struct nfsd_file *nf;
- struct hlist_node *tmp;
+ struct nfsd_file_lookup_key key = {
+ .type = NFSD_FILE_KEY_INODE,
+ .inode = inode,
+ };
+ unsigned int count = 0;
+ struct nfsd_file *nf;
- spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
- hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) {
- if (inode == nf->nf_inode)
- nfsd_file_unhash_and_release_locked(nf, dispose);
- }
- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ rcu_read_lock();
+ do {
+ nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
+ nfsd_file_rhash_params);
+ if (!nf)
+ break;
+ nfsd_file_unhash_and_dispose(nf, dispose);
+ count++;
+ } while (1);
+ rcu_read_unlock();
+ return count;
}
/**
* nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Walk the whole hash bucket, looking for any files that correspond to "inode".
- * If any do, then unhash them and put the hashtable reference to them and
- * destroy any that had their last reference put. Also ensure that any of the
- * fputs also have their final __fput done as well.
+ * Unhash and put, then flush and fput all cache items associated with @inode.
*/
void
nfsd_file_close_inode_sync(struct inode *inode)
{
- unsigned int hashval = (unsigned int)hash_long(inode->i_ino,
- NFSD_FILE_HASH_BITS);
LIST_HEAD(dispose);
+ unsigned int count;
- __nfsd_file_close_inode(inode, hashval, &dispose);
- trace_nfsd_file_close_inode_sync(inode, hashval, !list_empty(&dispose));
+ count = __nfsd_file_close_inode(inode, &dispose);
+ trace_nfsd_file_close_inode_sync(inode, count);
nfsd_file_dispose_list_sync(&dispose);
}
@@ -534,19 +729,16 @@ nfsd_file_close_inode_sync(struct inode *inode)
* nfsd_file_close_inode - attempt a delayed close of a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Walk the whole hash bucket, looking for any files that correspond to "inode".
- * If any do, then unhash them and put the hashtable reference to them and
- * destroy any that had their last reference put.
+ * Unhash and put all cache items associated with @inode.
*/
static void
nfsd_file_close_inode(struct inode *inode)
{
- unsigned int hashval = (unsigned int)hash_long(inode->i_ino,
- NFSD_FILE_HASH_BITS);
LIST_HEAD(dispose);
+ unsigned int count;
- __nfsd_file_close_inode(inode, hashval, &dispose);
- trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose));
+ count = __nfsd_file_close_inode(inode, &dispose);
+ trace_nfsd_file_close_inode(inode, count);
nfsd_file_dispose_list_delayed(&dispose);
}
@@ -621,25 +813,21 @@ static const struct fsnotify_ops nfsd_file_fsnotify_ops = {
int
nfsd_file_cache_init(void)
{
- int ret = -ENOMEM;
- unsigned int i;
+ int ret;
- clear_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
-
- if (nfsd_file_hashtbl)
+ lockdep_assert_held(&nfsd_mutex);
+ if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1)
return 0;
+ ret = rhashtable_init(&nfsd_file_rhash_tbl, &nfsd_file_rhash_params);
+ if (ret)
+ return ret;
+
+ ret = -ENOMEM;
nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0);
if (!nfsd_filecache_wq)
goto out;
- nfsd_file_hashtbl = kvcalloc(NFSD_FILE_HASH_SIZE,
- sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
- if (!nfsd_file_hashtbl) {
- pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n");
- goto out_err;
- }
-
nfsd_file_slab = kmem_cache_create("nfsd_file",
sizeof(struct nfsd_file), 0, 0, NULL);
if (!nfsd_file_slab) {
@@ -661,7 +849,7 @@ nfsd_file_cache_init(void)
goto out_err;
}
- ret = register_shrinker(&nfsd_file_shrinker);
+ ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache");
if (ret) {
pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret);
goto out_lru;
@@ -673,7 +861,8 @@ nfsd_file_cache_init(void)
goto out_shrinker;
}
- nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops);
+ nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops,
+ FSNOTIFY_GROUP_NOFS);
if (IS_ERR(nfsd_file_fsnotify_group)) {
pr_err("nfsd: unable to create fsnotify group: %ld\n",
PTR_ERR(nfsd_file_fsnotify_group));
@@ -682,11 +871,6 @@ nfsd_file_cache_init(void)
goto out_notifier;
}
- for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
- INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head);
- spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock);
- }
-
INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_gc_worker);
out:
return ret;
@@ -701,46 +885,47 @@ out_err:
nfsd_file_slab = NULL;
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kvfree(nfsd_file_hashtbl);
- nfsd_file_hashtbl = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
+ rhashtable_destroy(&nfsd_file_rhash_tbl);
goto out;
}
/*
* Note this can deadlock with nfsd_file_lru_cb.
*/
-void
-nfsd_file_cache_purge(struct net *net)
+static void
+__nfsd_file_cache_purge(struct net *net)
{
- unsigned int i;
- struct nfsd_file *nf;
- struct hlist_node *next;
+ struct rhashtable_iter iter;
+ struct nfsd_file *nf;
LIST_HEAD(dispose);
bool del;
- if (!nfsd_file_hashtbl)
- return;
-
- for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
- struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i];
+ rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter);
+ do {
+ rhashtable_walk_start(&iter);
- spin_lock(&nfb->nfb_lock);
- hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) {
+ nf = rhashtable_walk_next(&iter);
+ while (!IS_ERR_OR_NULL(nf)) {
if (net && nf->nf_net != net)
continue;
- del = nfsd_file_unhash_and_release_locked(nf, &dispose);
+ del = nfsd_file_unhash_and_dispose(nf, &dispose);
/*
* Deadlock detected! Something marked this entry as
* unhashed, but hasn't removed it from the hash list.
*/
WARN_ON_ONCE(!del);
+
+ nf = rhashtable_walk_next(&iter);
}
- spin_unlock(&nfb->nfb_lock);
- nfsd_file_dispose_list(&dispose);
- }
+
+ rhashtable_walk_stop(&iter);
+ } while (nf == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&iter);
+
+ nfsd_file_dispose_list(&dispose);
}
static struct nfsd_fcache_disposal *
@@ -783,6 +968,19 @@ nfsd_file_cache_start_net(struct net *net)
return nn->fcache_disposal ? 0 : -ENOMEM;
}
+/**
+ * nfsd_file_cache_purge - Remove all cache items associated with @net
+ * @net: target net namespace
+ *
+ */
+void
+nfsd_file_cache_purge(struct net *net)
+{
+ lockdep_assert_held(&nfsd_mutex);
+ if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1)
+ __nfsd_file_cache_purge(net);
+}
+
void
nfsd_file_cache_shutdown_net(struct net *net)
{
@@ -793,7 +991,11 @@ nfsd_file_cache_shutdown_net(struct net *net)
void
nfsd_file_cache_shutdown(void)
{
- set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
+ int i;
+
+ lockdep_assert_held(&nfsd_mutex);
+ if (test_and_clear_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0)
+ return;
lease_unregister_notifier(&nfsd_file_lease_notifier);
unregister_shrinker(&nfsd_file_shrinker);
@@ -802,7 +1004,7 @@ nfsd_file_cache_shutdown(void)
* calling nfsd_file_cache_purge
*/
cancel_delayed_work_sync(&nfsd_filecache_laundrette);
- nfsd_file_cache_purge(NULL);
+ __nfsd_file_cache_purge(NULL);
list_lru_destroy(&nfsd_file_lru);
rcu_barrier();
fsnotify_put_group(nfsd_file_fsnotify_group);
@@ -812,124 +1014,96 @@ nfsd_file_cache_shutdown(void)
fsnotify_wait_marks_destroyed();
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kvfree(nfsd_file_hashtbl);
- nfsd_file_hashtbl = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
-}
-
-static bool
-nfsd_match_cred(const struct cred *c1, const struct cred *c2)
-{
- int i;
-
- if (!uid_eq(c1->fsuid, c2->fsuid))
- return false;
- if (!gid_eq(c1->fsgid, c2->fsgid))
- return false;
- if (c1->group_info == NULL || c2->group_info == NULL)
- return c1->group_info == c2->group_info;
- if (c1->group_info->ngroups != c2->group_info->ngroups)
- return false;
- for (i = 0; i < c1->group_info->ngroups; i++) {
- if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i]))
- return false;
+ rhashtable_destroy(&nfsd_file_rhash_tbl);
+
+ for_each_possible_cpu(i) {
+ per_cpu(nfsd_file_cache_hits, i) = 0;
+ per_cpu(nfsd_file_acquisitions, i) = 0;
+ per_cpu(nfsd_file_releases, i) = 0;
+ per_cpu(nfsd_file_total_age, i) = 0;
+ per_cpu(nfsd_file_pages_flushed, i) = 0;
+ per_cpu(nfsd_file_evictions, i) = 0;
}
- return true;
-}
-
-static struct nfsd_file *
-nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
- unsigned int hashval, struct net *net)
-{
- struct nfsd_file *nf;
- unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
-
- hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
- nf_node, lockdep_is_held(&nfsd_file_hashtbl[hashval].nfb_lock)) {
- if (nf->nf_may != need)
- continue;
- if (nf->nf_inode != inode)
- continue;
- if (nf->nf_net != net)
- continue;
- if (!nfsd_match_cred(nf->nf_cred, current_cred()))
- continue;
- if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
- continue;
- if (nfsd_file_get(nf) != NULL)
- return nf;
- }
- return NULL;
}
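The per-CPU counters introduced above keep the hot paths to a local this_cpu_inc(); any reader wanting a total walks all possible CPUs, as the (truncated) stats_show function at the end of this diff does. A condensed sketch:

	static unsigned long example_total_releases(void)
	{
		unsigned long sum = 0;
		int i;

		for_each_possible_cpu(i)
			sum += per_cpu(nfsd_file_releases, i);
		return sum;
	}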
/**
- * nfsd_file_is_cached - are there any cached open files for this fh?
- * @inode: inode of the file to check
+ * nfsd_file_is_cached - are there any cached open files for this inode?
+ * @inode: inode to check
*
- * Scan the hashtable for open files that match this fh. Returns true if there
- * are any, and false if not.
+ * The lookup matches inodes in all net namespaces and is atomic wrt
+ * nfsd_file_acquire().
+ *
+ * Return values:
+ * %true: filecache contains at least one file matching this inode
+ * %false: filecache contains no files matching this inode
*/
bool
nfsd_file_is_cached(struct inode *inode)
{
- bool ret = false;
- struct nfsd_file *nf;
- unsigned int hashval;
-
- hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
- nf_node) {
- if (inode == nf->nf_inode) {
- ret = true;
- break;
- }
- }
- rcu_read_unlock();
- trace_nfsd_file_is_cached(inode, hashval, (int)ret);
+ struct nfsd_file_lookup_key key = {
+ .type = NFSD_FILE_KEY_INODE,
+ .inode = inode,
+ };
+ bool ret = false;
+
+ if (rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key,
+ nfsd_file_rhash_params) != NULL)
+ ret = true;
+ trace_nfsd_file_is_cached(inode, (int)ret);
return ret;
}
-__be32
-nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
- unsigned int may_flags, struct nfsd_file **pnf)
+static __be32
+nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf, bool open)
{
- __be32 status;
- struct net *net = SVC_NET(rqstp);
+ struct nfsd_file_lookup_key key = {
+ .type = NFSD_FILE_KEY_FULL,
+ .need = may_flags & NFSD_FILE_MAY_MASK,
+ .net = SVC_NET(rqstp),
+ };
struct nfsd_file *nf, *new;
- struct inode *inode;
- unsigned int hashval;
bool retry = true;
+ __be32 status;
- /* FIXME: skip this if fh_dentry is already set? */
status = fh_verify(rqstp, fhp, S_IFREG,
may_flags|NFSD_MAY_OWNER_OVERRIDE);
if (status != nfs_ok)
return status;
+ key.inode = d_inode(fhp->fh_dentry);
+ key.cred = get_current_cred();
- inode = d_inode(fhp->fh_dentry);
- hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
retry:
- rcu_read_lock();
- nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
- rcu_read_unlock();
+ /* Avoid allocation if the item is already in cache */
+ nf = rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key,
+ nfsd_file_rhash_params);
+ if (nf)
+ nf = nfsd_file_get(nf);
if (nf)
goto wait_for_construction;
- new = nfsd_file_alloc(inode, may_flags, hashval, net);
+ new = nfsd_file_alloc(&key, may_flags);
if (!new) {
- trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags,
- NULL, nfserr_jukebox);
- return nfserr_jukebox;
+ status = nfserr_jukebox;
+ goto out_status;
}
- spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
- nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
- if (nf == NULL)
+ nf = rhashtable_lookup_get_insert_key(&nfsd_file_rhash_tbl,
+ &key, &new->nf_rhash,
+ nfsd_file_rhash_params);
+ if (!nf) {
+ nf = new;
goto open_file;
- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+ }
+ if (IS_ERR(nf))
+ goto insert_err;
+ nf = nfsd_file_get(nf);
+ if (nf == NULL) {
+ nf = new;
+ goto open_file;
+ }
nfsd_file_slab_free(&new->nf_rcu);
wait_for_construction:
@@ -937,6 +1111,7 @@ wait_for_construction:
/* Did construction of this file fail? */
if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf);
if (!retry) {
status = nfserr_jukebox;
goto out;
@@ -946,70 +1121,90 @@ wait_for_construction:
goto retry;
}
+ nfsd_file_lru_remove(nf);
this_cpu_inc(nfsd_file_cache_hits);
- if (!(may_flags & NFSD_MAY_NOT_BREAK_LEASE)) {
- bool write = (may_flags & NFSD_MAY_WRITE);
-
- if (test_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags) ||
- (test_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags) && write)) {
- status = nfserrno(nfsd_open_break_lease(
- file_inode(nf->nf_file), may_flags));
- if (status == nfs_ok) {
- clear_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
- if (write)
- clear_bit(NFSD_FILE_BREAK_WRITE,
- &nf->nf_flags);
- }
- }
- }
+ status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
out:
if (status == nfs_ok) {
+ if (open)
+ this_cpu_inc(nfsd_file_acquisitions);
*pnf = nf;
} else {
nfsd_file_put(nf);
nf = NULL;
}
- trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, nf, status);
+out_status:
+ put_cred(key.cred);
+ if (open)
+ trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status);
return status;
+
open_file:
- nf = new;
- /* Take reference for the hashtable */
- refcount_inc(&nf->nf_ref);
- __set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
- __set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
- list_lru_add(&nfsd_file_lru, &nf->nf_lru);
- hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head);
- ++nfsd_file_hashtbl[hashval].nfb_count;
- nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount,
- nfsd_file_hashtbl[hashval].nfb_count);
- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
- if (atomic_long_inc_return(&nfsd_filecache_count) >= NFSD_FILE_LRU_THRESHOLD)
- nfsd_file_gc();
-
- nf->nf_mark = nfsd_file_mark_find_or_create(nf);
- if (nf->nf_mark)
- status = nfsd_open_verified(rqstp, fhp, S_IFREG,
- may_flags, &nf->nf_file);
- else
+ trace_nfsd_file_alloc(nf);
+ nf->nf_mark = nfsd_file_mark_find_or_create(nf, key.inode);
+ if (nf->nf_mark) {
+ if (open) {
+ status = nfsd_open_verified(rqstp, fhp, may_flags,
+ &nf->nf_file);
+ trace_nfsd_file_open(nf, status);
+ } else
+ status = nfs_ok;
+ } else
status = nfserr_jukebox;
/*
* If construction failed, or we raced with a call to unlink()
* then unhash.
*/
- if (status != nfs_ok || inode->i_nlink == 0) {
- bool do_free;
- spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
- do_free = nfsd_file_unhash(nf);
- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
- if (do_free)
+ if (status != nfs_ok || key.inode->i_nlink == 0)
+ if (nfsd_file_unhash(nf))
nfsd_file_put_noref(nf);
- }
clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
smp_mb__after_atomic();
wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
goto out;
+
+insert_err:
+ nfsd_file_slab_free(&new->nf_rcu);
+ trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, PTR_ERR(nf));
+ nf = NULL;
+ status = nfserr_jukebox;
+ goto out_status;
+}
+
+/**
+ * nfsd_file_acquire - Get a struct nfsd_file with an open file
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the file to be opened
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
+ * network byte order is returned.
+ */
+__be32
+nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true);
+}
+
+/**
+ * nfsd_file_create - Get a struct nfsd_file, do not open
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the file just created
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
+ * network byte order is returned.
+ */
+__be32
+nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false);
}
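
A note on the pattern above: the open-coded hash table with per-bucket
spinlocks is gone, and the race between two threads acquiring the same
file is resolved by rhashtable_lookup_get_insert_key(), which atomically
either inserts the caller's new entry or returns the entry that got there
first. A minimal sketch of that idiom, assuming hypothetical
my_obj/my_params names (the rhashtable calls are the real API):

    #include <linux/rhashtable.h>
    #include <linux/refcount.h>

    struct my_obj {
            struct rhash_head       node;
            u32                     key;
            refcount_t              ref;
    };

    static const struct rhashtable_params my_params = {
            .key_len        = sizeof(u32),
            .key_offset     = offsetof(struct my_obj, key),
            .head_offset    = offsetof(struct my_obj, node),
    };

    /*
     * Insert @new unless an entry with the same key already exists.
     * Three outcomes: NULL (@new now owns the slot), an ERR_PTR
     * (e.g. -ENOMEM while resizing), or the pre-existing entry, for
     * which the caller takes a reference and frees @new.
     */
    static struct my_obj *lookup_or_insert(struct rhashtable *tbl,
                                           struct my_obj *new)
    {
            struct my_obj *obj;

            obj = rhashtable_lookup_get_insert_key(tbl, &new->key,
                                                   &new->node, my_params);
            if (obj == NULL)
                    return new;             /* @new was inserted */
            if (IS_ERR(obj))
                    return obj;
            refcount_inc(&obj->ref);        /* lost the race: reuse winner */
            return obj;
    }

The filecache version has one extra wrinkle the sketch omits:
nfsd_file_get() can fail on an entry whose last reference is being
dropped, in which case the new entry is used after all.
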
/*
@@ -1017,35 +1212,50 @@ open_file:
* scraping this file for info should test the labels to ensure they're
* getting the correct field.
*/
-static int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
+int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
{
- unsigned int i, count = 0, longest = 0;
- unsigned long hits = 0;
+ unsigned long releases = 0, pages_flushed = 0, evictions = 0;
+ unsigned long hits = 0, acquisitions = 0;
+ unsigned int i, count = 0, buckets = 0;
+ unsigned long lru = 0, total_age = 0;
- /*
- * No need for spinlocks here since we're not terribly interested in
- * accuracy. We do take the nfsd_mutex simply to ensure that we
- * don't end up racing with server shutdown
- */
+ /* Serialize with server shutdown */
mutex_lock(&nfsd_mutex);
- if (nfsd_file_hashtbl) {
- for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
- count += nfsd_file_hashtbl[i].nfb_count;
- longest = max(longest, nfsd_file_hashtbl[i].nfb_count);
- }
+ if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) {
+ struct bucket_table *tbl;
+ struct rhashtable *ht;
+
+ lru = list_lru_count(&nfsd_file_lru);
+
+ rcu_read_lock();
+ ht = &nfsd_file_rhash_tbl;
+ count = atomic_read(&ht->nelems);
+ tbl = rht_dereference_rcu(ht->tbl, ht);
+ buckets = tbl->size;
+ rcu_read_unlock();
}
mutex_unlock(&nfsd_mutex);
- for_each_possible_cpu(i)
+ for_each_possible_cpu(i) {
hits += per_cpu(nfsd_file_cache_hits, i);
+ acquisitions += per_cpu(nfsd_file_acquisitions, i);
+ releases += per_cpu(nfsd_file_releases, i);
+ total_age += per_cpu(nfsd_file_total_age, i);
+ evictions += per_cpu(nfsd_file_evictions, i);
+ pages_flushed += per_cpu(nfsd_file_pages_flushed, i);
+ }
seq_printf(m, "total entries: %u\n", count);
- seq_printf(m, "longest chain: %u\n", longest);
+ seq_printf(m, "hash buckets: %u\n", buckets);
+ seq_printf(m, "lru entries: %lu\n", lru);
seq_printf(m, "cache hits: %lu\n", hits);
+ seq_printf(m, "acquisitions: %lu\n", acquisitions);
+ seq_printf(m, "releases: %lu\n", releases);
+ seq_printf(m, "evictions: %lu\n", evictions);
+ if (releases)
+ seq_printf(m, "mean age (ms): %ld\n", total_age / releases);
+ else
+ seq_printf(m, "mean age (ms): -\n");
+ seq_printf(m, "pages flushed: %lu\n", pages_flushed);
return 0;
}
-
-int nfsd_file_cache_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, nfsd_file_cache_stats_show, NULL);
-}
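
The rewritten stats handler sums plain per-CPU counters instead of taking
a lock: each counter is only ever incremented on its own CPU, and a
reader that walks all possible CPUs gets a total that is accurate enough
for a debug file. The idiom in isolation (counter name is hypothetical):

    #include <linux/percpu.h>

    static DEFINE_PER_CPU(unsigned long, my_hits);

    /* hot path: no lock, no atomic, just a per-CPU increment */
    static inline void my_count_hit(void)
    {
            this_cpu_inc(my_hits);
    }

    /* slow path, e.g. a seq_file show routine: fold the CPUs together */
    static unsigned long my_hits_total(void)
    {
            unsigned long sum = 0;
            int cpu;

            for_each_possible_cpu(cpu)
                    sum += per_cpu(my_hits, cpu);
            return sum;
    }
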
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 435ceab27897..357832bac736 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -29,7 +29,7 @@ struct nfsd_file_mark {
* never be dereferenced, only used for comparison.
*/
struct nfsd_file {
- struct hlist_node nf_node;
+ struct rhash_head nf_rhash;
struct list_head nf_lru;
struct rcu_head nf_rcu;
struct file *nf_file;
@@ -37,15 +37,13 @@ struct nfsd_file {
struct net *nf_net;
#define NFSD_FILE_HASHED (0)
#define NFSD_FILE_PENDING (1)
-#define NFSD_FILE_BREAK_READ (2)
-#define NFSD_FILE_BREAK_WRITE (3)
-#define NFSD_FILE_REFERENCED (4)
+#define NFSD_FILE_REFERENCED (2)
unsigned long nf_flags;
- struct inode *nf_inode;
- unsigned int nf_hashval;
+ struct inode *nf_inode; /* don't deref */
refcount_t nf_ref;
unsigned char nf_may;
struct nfsd_file_mark *nf_mark;
+ ktime_t nf_birthtime;
};
int nfsd_file_cache_init(void);
@@ -54,10 +52,13 @@ void nfsd_file_cache_shutdown(void);
int nfsd_file_cache_start_net(struct net *net);
void nfsd_file_cache_shutdown_net(struct net *net);
void nfsd_file_put(struct nfsd_file *nf);
+void nfsd_file_close(struct nfsd_file *nf);
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
void nfsd_file_close_inode_sync(struct inode *inode);
bool nfsd_file_is_cached(struct inode *inode);
__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **nfp);
-int nfsd_file_cache_stats_open(struct inode *, struct file *);
+__be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **nfp);
+int nfsd_file_cache_stats_show(struct seq_file *m, void *v);
#endif /* _FS_NFSD_FILECACHE_H */
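
The NFSD_FILE_PENDING flag kept in nf_flags is one half of a wait/wake
handshake: the thread that inserts an entry holds PENDING while it opens
the file, and any other thread that finds the entry sleeps on that bit
(the wait_for_construction path above). A minimal sketch of the bit-wait
idiom, with a hypothetical MY_PENDING bit:

    #include <linux/sched.h>
    #include <linux/wait_bit.h>

    #define MY_PENDING      1

    /* constructor side: publish results, then release the waiters */
    static void my_construction_done(unsigned long *flags)
    {
            clear_bit_unlock(MY_PENDING, flags);
            smp_mb__after_atomic();         /* order the wake vs. the test */
            wake_up_bit(flags, MY_PENDING);
    }

    /* consumer side: block until construction finishes */
    static int my_wait_for_construction(unsigned long *flags)
    {
            /* returns 0 for TASK_UNINTERRUPTIBLE sleeps */
            return wait_on_bit(flags, MY_PENDING, TASK_UNINTERRUPTIBLE);
    }
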
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 1b1a962a1804..8c854ba3285b 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -189,6 +189,13 @@ struct nfsd_net {
struct nfsd_fcache_disposal *fcache_disposal;
siphash_key_t siphash_key;
+
+ atomic_t nfs4_client_count;
+ int nfs4_max_clients;
+
+ atomic_t nfsd_courtesy_clients;
+ struct shrinker nfsd_client_shrinker;
+ struct delayed_work nfsd_shrinker_work;
};
/* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 367551bddfc6..13e6e6897f6c 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -111,7 +111,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
if (error)
goto out_errno;
- fh_lock(fh);
+ inode_lock(inode);
error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
argp->acl_access);
@@ -122,7 +122,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
if (error)
goto out_drop_lock;
- fh_unlock(fh);
+ inode_unlock(inode);
fh_drop_write(fh);
@@ -136,7 +136,7 @@ out:
return rpc_success;
out_drop_lock:
- fh_unlock(fh);
+ inode_unlock(inode);
fh_drop_write(fh);
out_errno:
resp->status = nfserrno(error);
@@ -249,34 +249,34 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
int w;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
if (dentry == NULL || d_really_is_negative(dentry))
- return 1;
+ return true;
inode = d_inode(dentry);
if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->mask) < 0)
- return 0;
+ return false;
rqstp->rq_res.page_len = w = nfsacl_size(
(resp->mask & NFS_ACL) ? resp->acl_access : NULL,
(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
while (w > 0) {
if (!*(rqstp->rq_next_page++))
- return 1;
+ return true;
w -= PAGE_SIZE;
}
if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access,
resp->mask & NFS_ACL, 0))
- return 0;
+ return false;
if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default,
resp->mask & NFS_DFACL, NFS_ACL_DEFAULT))
- return 0;
+ return false;
- return 1;
+ return true;
}
/* ACCESS */
@@ -286,17 +286,17 @@ nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
struct nfsd3_accessres *resp = rqstp->rq_resp;
if (!svcxdr_encode_stat(xdr, resp->status))
- return 0;
+ return false;
switch (resp->status) {
case nfs_ok:
if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
- return 0;
+ return false;
if (xdr_stream_encode_u32(xdr, resp->access) < 0)
- return 0;
+ return false;
break;
}
- return 1;
+ return true;
}
/*
@@ -331,6 +331,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
.pc_decode = nfssvc_decode_voidarg,
.pc_encode = nfssvc_encode_voidres,
.pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_argzero = sizeof(struct nfsd_voidargs),
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST,
@@ -342,6 +343,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
.pc_encode = nfsaclsvc_encode_getaclres,
.pc_release = nfsaclsvc_release_getacl,
.pc_argsize = sizeof(struct nfsd3_getaclargs),
+ .pc_argzero = sizeof(struct nfsd3_getaclargs),
.pc_ressize = sizeof(struct nfsd3_getaclres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+1+2*(1+ACL),
@@ -353,6 +355,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
.pc_encode = nfssvc_encode_attrstatres,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd3_setaclargs),
+ .pc_argzero = sizeof(struct nfsd3_setaclargs),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
@@ -364,6 +367,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
.pc_encode = nfssvc_encode_attrstatres,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_argzero = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
@@ -375,6 +379,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
.pc_encode = nfsaclsvc_encode_accessres,
.pc_release = nfsaclsvc_release_access,
.pc_argsize = sizeof(struct nfsd3_accessargs),
+ .pc_argzero = sizeof(struct nfsd3_accessargs),
.pc_ressize = sizeof(struct nfsd3_accessres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT+1,
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 35b2ebda14da..2fb9ee356455 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -101,7 +101,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp)
if (error)
goto out_errno;
- fh_lock(fh);
+ inode_lock(inode);
error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
argp->acl_access);
@@ -111,7 +111,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp)
argp->acl_default);
out_drop_lock:
- fh_unlock(fh);
+ inode_unlock(inode);
fh_drop_write(fh);
out_errno:
resp->status = nfserrno(error);
@@ -252,6 +252,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
.pc_decode = nfssvc_decode_voidarg,
.pc_encode = nfssvc_encode_voidres,
.pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_argzero = sizeof(struct nfsd_voidargs),
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST,
@@ -263,6 +264,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
.pc_encode = nfs3svc_encode_getaclres,
.pc_release = nfs3svc_release_getacl,
.pc_argsize = sizeof(struct nfsd3_getaclargs),
+ .pc_argzero = sizeof(struct nfsd3_getaclargs),
.pc_ressize = sizeof(struct nfsd3_getaclres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+1+2*(1+ACL),
@@ -274,6 +276,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
.pc_encode = nfs3svc_encode_setaclres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_setaclargs),
+ .pc_argzero = sizeof(struct nfsd3_setaclargs),
.pc_ressize = sizeof(struct nfsd3_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT,
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 936eebd4c56d..923d9a80df92 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/ext2_fs.h>
#include <linux/magic.h>
+#include <linux/namei.h>
#include "cache.h"
#include "xdr3.h"
@@ -66,12 +67,15 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp)
{
struct nfsd3_sattrargs *argp = rqstp->rq_argp;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &argp->attrs,
+ };
dprintk("nfsd: SETATTR(3) %s\n",
SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- resp->status = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,
+ resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs,
argp->check_guard, argp->guardtime);
return rpc_success;
}
@@ -146,7 +150,6 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
{
struct nfsd3_readargs *argp = rqstp->rq_argp;
struct nfsd3_readres *resp = rqstp->rq_resp;
- u32 max_blocksize = svc_max_payload(rqstp);
unsigned int len;
int v;
@@ -155,7 +158,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
(unsigned long) argp->count,
(unsigned long long) argp->offset);
- argp->count = min_t(u32, argp->count, max_blocksize);
+ argp->count = min_t(u32, argp->count, svc_max_payload(rqstp));
+ argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen);
if (argp->offset > (u64)OFFSET_MAX)
argp->offset = (u64)OFFSET_MAX;
if (argp->offset + argp->count > (u64)OFFSET_MAX)
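
The two min_t() lines above bound the READ count twice, once by the
transport's maximum payload and once by the space actually left in this
reply's buffer, and the OFFSET_MAX checks keep offset + count from
running past the largest representable file offset. The same arithmetic
as a standalone userspace function (names are illustrative):

    #include <stdint.h>

    #define OFFSET_MAX INT64_MAX

    static uint32_t clamp_read(uint64_t *offset, uint32_t count,
                               uint32_t max_payload, uint32_t res_buflen)
    {
            count = count < max_payload ? count : max_payload;
            count = count < res_buflen ? count : res_buflen;
            if (*offset > (uint64_t)OFFSET_MAX)
                    *offset = (uint64_t)OFFSET_MAX;
            if (*offset + count > (uint64_t)OFFSET_MAX)
                    count = (uint64_t)OFFSET_MAX - *offset;
            return count;
    }
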
@@ -220,17 +224,137 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
}
/*
- * With NFSv3, CREATE processing is a lot easier than with NFSv2.
- * At least in theory; we'll see how it fares in practice when the
- * first reports about SunOS compatibility problems start to pour in...
+ * Implement NFSv3's unchecked, guarded, and exclusive CREATE
+ * semantics for regular files. Except for the created file,
+ * this operation is stateless on the server.
+ *
+ * Upon return, caller must release @fhp and @resfhp.
*/
static __be32
+nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct svc_fh *resfhp, struct nfsd3_createargs *argp)
+{
+ struct iattr *iap = &argp->attrs;
+ struct dentry *parent, *child;
+ struct nfsd_attrs attrs = {
+ .na_iattr = iap,
+ };
+ __u32 v_mtime, v_atime;
+ struct inode *inode;
+ __be32 status;
+ int host_err;
+
+ if (isdotent(argp->name, argp->len))
+ return nfserr_exist;
+ if (!(iap->ia_valid & ATTR_MODE))
+ iap->ia_mode = 0;
+
+ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+ if (status != nfs_ok)
+ return status;
+
+ parent = fhp->fh_dentry;
+ inode = d_inode(parent);
+
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ return nfserrno(host_err);
+
+ inode_lock_nested(inode, I_MUTEX_PARENT);
+
+ child = lookup_one_len(argp->name, parent, argp->len);
+ if (IS_ERR(child)) {
+ status = nfserrno(PTR_ERR(child));
+ goto out;
+ }
+
+ if (d_really_is_negative(child)) {
+ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+ if (status != nfs_ok)
+ goto out;
+ }
+
+ status = fh_compose(resfhp, fhp->fh_export, child, fhp);
+ if (status != nfs_ok)
+ goto out;
+
+ v_mtime = 0;
+ v_atime = 0;
+ if (argp->createmode == NFS3_CREATE_EXCLUSIVE) {
+ u32 *verifier = (u32 *)argp->verf;
+
+ /*
+ * Solaris 7 gets confused (bugid 4218508) if these have
+ * the high bit set, as do xfs filesystems without the
+ * "bigtime" feature. So just clear the high bits.
+ */
+ v_mtime = verifier[0] & 0x7fffffff;
+ v_atime = verifier[1] & 0x7fffffff;
+ }
+
+ if (d_really_is_positive(child)) {
+ status = nfs_ok;
+
+ switch (argp->createmode) {
+ case NFS3_CREATE_UNCHECKED:
+ if (!d_is_reg(child))
+ break;
+ iap->ia_valid &= ATTR_SIZE;
+ goto set_attr;
+ case NFS3_CREATE_GUARDED:
+ status = nfserr_exist;
+ break;
+ case NFS3_CREATE_EXCLUSIVE:
+ if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
+ d_inode(child)->i_atime.tv_sec == v_atime &&
+ d_inode(child)->i_size == 0) {
+ break;
+ }
+ status = nfserr_exist;
+ }
+ goto out;
+ }
+
+ if (!IS_POSIXACL(inode))
+ iap->ia_mode &= ~current_umask();
+
+ fh_fill_pre_attrs(fhp);
+ host_err = vfs_create(&init_user_ns, inode, child, iap->ia_mode, true);
+ if (host_err < 0) {
+ status = nfserrno(host_err);
+ goto out;
+ }
+ fh_fill_post_attrs(fhp);
+
+ /* A newly created file already has a file size of zero. */
+ if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
+ iap->ia_valid &= ~ATTR_SIZE;
+ if (argp->createmode == NFS3_CREATE_EXCLUSIVE) {
+ iap->ia_valid = ATTR_MTIME | ATTR_ATIME |
+ ATTR_MTIME_SET | ATTR_ATIME_SET;
+ iap->ia_mtime.tv_sec = v_mtime;
+ iap->ia_atime.tv_sec = v_atime;
+ iap->ia_mtime.tv_nsec = 0;
+ iap->ia_atime.tv_nsec = 0;
+ }
+
+set_attr:
+ status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs);
+
+out:
+ inode_unlock(inode);
+ if (child && !IS_ERR(child))
+ dput(child);
+ fh_drop_write(fhp);
+ return status;
+}
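
Exclusive CREATE carries no attributes on the wire, so the 8-byte client
verifier is stashed in the new file's atime/mtime seconds fields and
compared when a retransmitted CREATE arrives. The masking the comment
above describes, as a compilable userspace illustration:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void verf_to_times(const unsigned char verf[8],
                              uint32_t *v_mtime, uint32_t *v_atime)
    {
            uint32_t w[2];

            memcpy(w, verf, sizeof(w));     /* two host-order words */
            *v_mtime = w[0] & 0x7fffffff;   /* clear the high bit */
            *v_atime = w[1] & 0x7fffffff;
    }

    int main(void)
    {
            unsigned char verf[8] = { 0xde, 0xad, 0xbe, 0xef,
                                      0x01, 0x02, 0x03, 0x04 };
            uint32_t m, a;

            verf_to_times(verf, &m, &a);
            printf("mtime=%u atime=%u\n", (unsigned)m, (unsigned)a);
            return 0;
    }
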
+
+static __be32
nfsd3_proc_create(struct svc_rqst *rqstp)
{
struct nfsd3_createargs *argp = rqstp->rq_argp;
struct nfsd3_diropres *resp = rqstp->rq_resp;
- svc_fh *dirfhp, *newfhp = NULL;
- struct iattr *attr;
+ svc_fh *dirfhp, *newfhp;
dprintk("nfsd: CREATE(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -239,21 +363,8 @@ nfsd3_proc_create(struct svc_rqst *rqstp)
dirfhp = fh_copy(&resp->dirfh, &argp->fh);
newfhp = fh_init(&resp->fh, NFS3_FHSIZE);
- attr = &argp->attrs;
-
- /* Unfudge the mode bits */
- attr->ia_mode &= ~S_IFMT;
- if (!(attr->ia_valid & ATTR_MODE)) {
- attr->ia_valid |= ATTR_MODE;
- attr->ia_mode = S_IFREG;
- } else {
- attr->ia_mode = (attr->ia_mode & ~S_IFMT) | S_IFREG;
- }
- /* Now create the file and set attributes */
- resp->status = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
- attr, newfhp, argp->createmode,
- (u32 *)argp->verf, NULL, NULL);
+ resp->status = nfsd3_create_file(rqstp, dirfhp, newfhp, argp);
return rpc_success;
}
@@ -265,6 +376,9 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp)
{
struct nfsd3_createargs *argp = rqstp->rq_argp;
struct nfsd3_diropres *resp = rqstp->rq_resp;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &argp->attrs,
+ };
dprintk("nfsd: MKDIR(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -275,8 +389,7 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp)
fh_copy(&resp->dirfh, &argp->fh);
fh_init(&resp->fh, NFS3_FHSIZE);
resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
- &argp->attrs, S_IFDIR, 0, &resp->fh);
- fh_unlock(&resp->dirfh);
+ &attrs, S_IFDIR, 0, &resp->fh);
return rpc_success;
}
@@ -285,6 +398,9 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp)
{
struct nfsd3_symlinkargs *argp = rqstp->rq_argp;
struct nfsd3_diropres *resp = rqstp->rq_resp;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &argp->attrs,
+ };
if (argp->tlen == 0) {
resp->status = nfserr_inval;
@@ -311,7 +427,7 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp)
fh_copy(&resp->dirfh, &argp->ffh);
fh_init(&resp->fh, NFS3_FHSIZE);
resp->status = nfsd_symlink(rqstp, &resp->dirfh, argp->fname,
- argp->flen, argp->tname, &resp->fh);
+ argp->flen, argp->tname, &attrs, &resp->fh);
kfree(argp->tname);
out:
return rpc_success;
@@ -325,6 +441,9 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
{
struct nfsd3_mknodargs *argp = rqstp->rq_argp;
struct nfsd3_diropres *resp = rqstp->rq_resp;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &argp->attrs,
+ };
int type;
dev_t rdev = 0;
@@ -350,8 +469,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
type = nfs3_ftypes[argp->ftype];
resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
- &argp->attrs, type, rdev, &resp->fh);
- fh_unlock(&resp->dirfh);
+ &attrs, type, rdev, &resp->fh);
out:
return rpc_success;
}
@@ -374,7 +492,6 @@ nfsd3_proc_remove(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR,
argp->name, argp->len);
- fh_unlock(&resp->fh);
return rpc_success;
}
@@ -395,7 +512,6 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_unlink(rqstp, &resp->fh, S_IFDIR,
argp->name, argp->len);
- fh_unlock(&resp->fh);
return rpc_success;
}
@@ -447,25 +563,18 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp,
{
struct xdr_buf *buf = &resp->dirlist;
struct xdr_stream *xdr = &resp->xdr;
-
- count = clamp(count, (u32)(XDR_UNIT * 2), svc_max_payload(rqstp));
+ unsigned int sendbuf = min_t(unsigned int, rqstp->rq_res.buflen,
+ svc_max_payload(rqstp));
memset(buf, 0, sizeof(*buf));
/* Reserve room for the NULL ptr & eof flag (-2 words) */
- buf->buflen = count - XDR_UNIT * 2;
+ buf->buflen = clamp(count, (u32)(XDR_UNIT * 2), sendbuf);
+ buf->buflen -= XDR_UNIT * 2;
buf->pages = rqstp->rq_next_page;
rqstp->rq_next_page += (buf->buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
- /* This is xdr_init_encode(), but it assumes that
- * the head kvec has already been consumed. */
- xdr_set_scratch_buffer(xdr, NULL, 0);
- xdr->buf = buf;
- xdr->page_ptr = buf->pages;
- xdr->iov = NULL;
- xdr->p = page_address(*buf->pages);
- xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE);
- xdr->rqst = NULL;
+ xdr_init_encode_pages(xdr, buf, buf->pages, NULL);
}
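
nfsd3_init_dirlist_pages() now clamps the client's count to what the
reply buffer can actually hold, then reserves two XDR words for the
trailing NULL entry pointer and the eof flag before handing the page
array to xdr_init_encode_pages(). The reservation arithmetic on its own
(illustrative; assumes sendbuf >= 2 * XDR_UNIT, as the reply buffer
always is):

    #include <stdint.h>

    #define XDR_UNIT 4u

    static uint32_t dirlist_buflen(uint32_t count, uint32_t res_buflen,
                                   uint32_t max_payload)
    {
            uint32_t sendbuf = res_buflen < max_payload ?
                               res_buflen : max_payload;

            if (count < 2 * XDR_UNIT)       /* clamp(): lower bound */
                    count = 2 * XDR_UNIT;
            if (count > sendbuf)            /* clamp(): upper bound */
                    count = sendbuf;
            return count - 2 * XDR_UNIT;    /* NULL ptr + eof flag */
    }
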
/*
@@ -692,6 +801,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_decode = nfssvc_decode_voidarg,
.pc_encode = nfssvc_encode_voidres,
.pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_argzero = sizeof(struct nfsd_voidargs),
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST,
@@ -703,6 +813,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_getattrres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_argzero = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd3_attrstatres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
@@ -714,6 +825,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_wccstatres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_sattrargs),
+ .pc_argzero = sizeof(struct nfsd3_sattrargs),
.pc_ressize = sizeof(struct nfsd3_wccstatres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC,
@@ -725,6 +837,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_lookupres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_diropargs),
+ .pc_argzero = sizeof(struct nfsd3_diropargs),
.pc_ressize = sizeof(struct nfsd3_diropres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+FH+pAT+pAT,
@@ -736,6 +849,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_accessres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_accessargs),
+ .pc_argzero = sizeof(struct nfsd3_accessargs),
.pc_ressize = sizeof(struct nfsd3_accessres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+1,
@@ -747,6 +861,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_readlinkres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_argzero = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd3_readlinkres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4,
@@ -758,6 +873,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_readres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_readargs),
+ .pc_argzero = sizeof(struct nfsd3_readargs),
.pc_ressize = sizeof(struct nfsd3_readres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4,
@@ -769,6 +885,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_writeres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_writeargs),
+ .pc_argzero = sizeof(struct nfsd3_writeargs),
.pc_ressize = sizeof(struct nfsd3_writeres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC+4,
@@ -780,6 +897,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_createres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_createargs),
+ .pc_argzero = sizeof(struct nfsd3_createargs),
.pc_ressize = sizeof(struct nfsd3_createres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+(1+FH+pAT)+WC,
@@ -791,6 +909,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_createres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_mkdirargs),
+ .pc_argzero = sizeof(struct nfsd3_mkdirargs),
.pc_ressize = sizeof(struct nfsd3_createres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+(1+FH+pAT)+WC,
@@ -802,6 +921,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_createres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_symlinkargs),
+ .pc_argzero = sizeof(struct nfsd3_symlinkargs),
.pc_ressize = sizeof(struct nfsd3_createres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+(1+FH+pAT)+WC,
@@ -813,6 +933,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_createres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_mknodargs),
+ .pc_argzero = sizeof(struct nfsd3_mknodargs),
.pc_ressize = sizeof(struct nfsd3_createres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+(1+FH+pAT)+WC,
@@ -824,6 +945,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_wccstatres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_diropargs),
+ .pc_argzero = sizeof(struct nfsd3_diropargs),
.pc_ressize = sizeof(struct nfsd3_wccstatres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC,
@@ -835,6 +957,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_wccstatres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_diropargs),
+ .pc_argzero = sizeof(struct nfsd3_diropargs),
.pc_ressize = sizeof(struct nfsd3_wccstatres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC,
@@ -846,6 +969,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_renameres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_renameargs),
+ .pc_argzero = sizeof(struct nfsd3_renameargs),
.pc_ressize = sizeof(struct nfsd3_renameres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC+WC,
@@ -857,6 +981,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_linkres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_linkargs),
+ .pc_argzero = sizeof(struct nfsd3_linkargs),
.pc_ressize = sizeof(struct nfsd3_linkres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+pAT+WC,
@@ -868,6 +993,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_readdirres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_readdirargs),
+ .pc_argzero = sizeof(struct nfsd3_readdirargs),
.pc_ressize = sizeof(struct nfsd3_readdirres),
.pc_cachetype = RC_NOCACHE,
.pc_name = "READDIR",
@@ -878,6 +1004,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_readdirres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_readdirplusargs),
+ .pc_argzero = sizeof(struct nfsd3_readdirplusargs),
.pc_ressize = sizeof(struct nfsd3_readdirres),
.pc_cachetype = RC_NOCACHE,
.pc_name = "READDIRPLUS",
@@ -887,6 +1014,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_decode = nfs3svc_decode_fhandleargs,
.pc_encode = nfs3svc_encode_fsstatres,
.pc_argsize = sizeof(struct nfsd3_fhandleargs),
+ .pc_argzero = sizeof(struct nfsd3_fhandleargs),
.pc_ressize = sizeof(struct nfsd3_fsstatres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+2*6+1,
@@ -897,6 +1025,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_decode = nfs3svc_decode_fhandleargs,
.pc_encode = nfs3svc_encode_fsinfores,
.pc_argsize = sizeof(struct nfsd3_fhandleargs),
+ .pc_argzero = sizeof(struct nfsd3_fhandleargs),
.pc_ressize = sizeof(struct nfsd3_fsinfores),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+12,
@@ -907,6 +1036,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_decode = nfs3svc_decode_fhandleargs,
.pc_encode = nfs3svc_encode_pathconfres,
.pc_argsize = sizeof(struct nfsd3_fhandleargs),
+ .pc_argzero = sizeof(struct nfsd3_fhandleargs),
.pc_ressize = sizeof(struct nfsd3_pathconfres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+6,
@@ -918,6 +1048,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_encode = nfs3svc_encode_commitres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd3_commitargs),
+ .pc_argzero = sizeof(struct nfsd3_commitargs),
.pc_ressize = sizeof(struct nfsd3_commitres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+WC+2,
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 0293b8d65f10..3308dd671ef0 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -571,10 +571,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
args->count = max_blocksize;
args->len = max_blocksize;
}
- if (!xdr_stream_subsegment(xdr, &args->payload, args->count))
- return false;
- return true;
+ return xdr_stream_subsegment(xdr, &args->payload, args->count);
}
bool
@@ -616,8 +614,6 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
struct nfsd3_symlinkargs *args = rqstp->rq_argp;
struct kvec *head = rqstp->rq_arg.head;
- struct kvec *tail = rqstp->rq_arg.tail;
- size_t remaining;
if (!svcxdr_decode_diropargs3(xdr, &args->ffh, &args->fname, &args->flen))
return false;
@@ -626,16 +622,10 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (xdr_stream_decode_u32(xdr, &args->tlen) < 0)
return false;
- /* request sanity */
- remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len;
- remaining -= xdr_stream_pos(xdr);
- if (remaining < xdr_align_size(args->tlen))
- return false;
-
- args->first.iov_base = xdr->p;
+ /* symlink_data */
args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
-
- return true;
+ args->first.iov_base = xdr_inline_decode(xdr, args->tlen);
+ return args->first.iov_base != NULL;
}
bool
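
The symlinkargs change drops the hand-rolled "request sanity" length
arithmetic in favour of xdr_inline_decode(), which returns a pointer only
when the claimed length (rounded up to XDR alignment) still fits in the
received buffer. What that bounds check amounts to, as a small userspace
sketch:

    #include <stddef.h>

    struct xdr_cursor {
            unsigned char *p;
            unsigned char *end;
    };

    static void *inline_decode(struct xdr_cursor *x, size_t nbytes)
    {
            size_t aligned = (nbytes + 3) & ~(size_t)3; /* XDR pads to 4 */
            unsigned char *ret = x->p;

            if (aligned > (size_t)(x->end - x->p))
                    return NULL;    /* length exceeds received data */
            x->p += aligned;
            return ret;
    }
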
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index eaa3a0cf38f1..bb8e2f6d7d03 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -751,58 +751,26 @@ out_estate:
return ret;
}
-__be32
-nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct nfs4_acl *acl)
+__be32 nfsd4_acl_to_attr(enum nfs_ftype4 type, struct nfs4_acl *acl,
+ struct nfsd_attrs *attr)
{
- __be32 error;
int host_error;
- struct dentry *dentry;
- struct inode *inode;
- struct posix_acl *pacl = NULL, *dpacl = NULL;
unsigned int flags = 0;
- /* Get inode */
- error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
- if (error)
- return error;
-
- dentry = fhp->fh_dentry;
- inode = d_inode(dentry);
+ if (!acl)
+ return nfs_ok;
- if (S_ISDIR(inode->i_mode))
+ if (type == NF4DIR)
flags = NFS4_ACL_DIR;
- host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
+ host_error = nfs4_acl_nfsv4_to_posix(acl, &attr->na_pacl,
+ &attr->na_dpacl, flags);
if (host_error == -EINVAL)
return nfserr_attrnotsupp;
- if (host_error < 0)
- goto out_nfserr;
-
- fh_lock(fhp);
-
- host_error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, pacl);
- if (host_error < 0)
- goto out_drop_lock;
-
- if (S_ISDIR(inode->i_mode)) {
- host_error = set_posix_acl(&init_user_ns, inode,
- ACL_TYPE_DEFAULT, dpacl);
- }
-
-out_drop_lock:
- fh_unlock(fhp);
-
- posix_acl_release(pacl);
- posix_acl_release(dpacl);
-out_nfserr:
- if (host_error == -EOPNOTSUPP)
- return nfserr_attrnotsupp;
else
return nfserrno(host_error);
}
-
static short
ace2type(struct nfs4_ace *ace)
{
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 11f8715d92d6..f0e69edf5f0f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -679,7 +679,7 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp,
* case NFS4_OK:
* write_response4 coa_resok4;
* default:
- * length4 coa_bytes_copied;
+ * length4 coa_bytes_copied;
* };
* struct CB_OFFLOAD4args {
* nfs_fh4 coa_fh;
@@ -688,21 +688,22 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp,
* };
*/
static void encode_offload_info4(struct xdr_stream *xdr,
- __be32 nfserr,
- const struct nfsd4_copy *cp)
+ const struct nfsd4_cb_offload *cbo)
{
__be32 *p;
p = xdr_reserve_space(xdr, 4);
- *p++ = nfserr;
- if (!nfserr) {
+ *p = cbo->co_nfserr;
+ switch (cbo->co_nfserr) {
+ case nfs_ok:
p = xdr_reserve_space(xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
p = xdr_encode_empty_array(p);
- p = xdr_encode_hyper(p, cp->cp_res.wr_bytes_written);
- *p++ = cpu_to_be32(cp->cp_res.wr_stable_how);
- p = xdr_encode_opaque_fixed(p, cp->cp_res.wr_verifier.data,
+ p = xdr_encode_hyper(p, cbo->co_res.wr_bytes_written);
+ *p++ = cpu_to_be32(cbo->co_res.wr_stable_how);
+ p = xdr_encode_opaque_fixed(p, cbo->co_res.wr_verifier.data,
NFS4_VERIFIER_SIZE);
- } else {
+ break;
+ default:
p = xdr_reserve_space(xdr, 8);
/* We always return success if bytes were written */
p = xdr_encode_hyper(p, 0);
@@ -710,18 +711,16 @@ static void encode_offload_info4(struct xdr_stream *xdr,
}
static void encode_cb_offload4args(struct xdr_stream *xdr,
- __be32 nfserr,
- const struct knfsd_fh *fh,
- const struct nfsd4_copy *cp,
+ const struct nfsd4_cb_offload *cbo,
struct nfs4_cb_compound_hdr *hdr)
{
__be32 *p;
p = xdr_reserve_space(xdr, 4);
- *p++ = cpu_to_be32(OP_CB_OFFLOAD);
- encode_nfs_fh4(xdr, fh);
- encode_stateid4(xdr, &cp->cp_res.cb_stateid);
- encode_offload_info4(xdr, nfserr, cp);
+ *p = cpu_to_be32(OP_CB_OFFLOAD);
+ encode_nfs_fh4(xdr, &cbo->co_fh);
+ encode_stateid4(xdr, &cbo->co_res.cb_stateid);
+ encode_offload_info4(xdr, cbo);
hdr->nops++;
}
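
encode_offload_info4() uses the reserve-then-fill XDR style:
xdr_reserve_space() returns a pointer into the send buffer and helpers
such as xdr_encode_hyper() store big-endian words through it. What the
hyper encoding amounts to, shown standalone:

    #include <stdint.h>
    #include <arpa/inet.h>

    /* split a 64-bit value into two big-endian XDR words */
    static uint32_t *encode_hyper(uint32_t *p, uint64_t val)
    {
            *p++ = htonl((uint32_t)(val >> 32));
            *p++ = htonl((uint32_t)(val & 0xffffffff));
            return p;
    }
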
@@ -731,8 +730,8 @@ static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req,
const void *data)
{
const struct nfsd4_callback *cb = data;
- const struct nfsd4_copy *cp =
- container_of(cb, struct nfsd4_copy, cp_cb);
+ const struct nfsd4_cb_offload *cbo =
+ container_of(cb, struct nfsd4_cb_offload, co_cb);
struct nfs4_cb_compound_hdr hdr = {
.ident = 0,
.minorversion = cb->cb_clp->cl_minorversion,
@@ -740,7 +739,7 @@ static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req,
encode_cb_compound4args(xdr, &hdr);
encode_cb_sequence4args(xdr, cb, &hdr);
- encode_cb_offload4args(xdr, cp->nfserr, &cp->fh, cp, &hdr);
+ encode_cb_offload4args(xdr, cbo, &hdr);
encode_cb_nops(&hdr);
}
@@ -1372,11 +1371,21 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
cb->cb_holds_slot = false;
}
-void nfsd4_run_cb(struct nfsd4_callback *cb)
+/**
+ * nfsd4_run_cb - queue up a callback job to run
+ * @cb: callback to queue
+ *
+ * Kick off a callback to do its thing. Returns false if it was already
+ * on a queue, true otherwise.
+ */
+bool nfsd4_run_cb(struct nfsd4_callback *cb)
{
struct nfs4_client *clp = cb->cb_clp;
+ bool queued;
nfsd41_cb_inflight_begin(clp);
- if (!nfsd4_queue_cb(cb))
+ queued = nfsd4_queue_cb(cb);
+ if (!queued)
nfsd41_cb_inflight_end(clp);
+ return queued;
}
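
Reporting whether the callback was actually queued lets a caller balance
reference counts: if it pinned an object for the work item and the
queueing turned out to be a no-op, it can drop that pin on the spot. A
hypothetical caller, sketch only (my_obj and its fields are made up):

    static void my_kick_cb(struct my_obj *obj)
    {
            refcount_inc(&obj->ref);         /* pin for the work item */
            if (!nfsd4_run_cb(&obj->cb))
                    refcount_dec(&obj->ref); /* already queued: undo */
    }
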
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index f92161ce1f97..e70a1a2999b7 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -82,8 +82,8 @@ ent_init(struct cache_head *cnew, struct cache_head *citm)
new->id = itm->id;
new->type = itm->type;
- strlcpy(new->name, itm->name, sizeof(new->name));
- strlcpy(new->authname, itm->authname, sizeof(new->authname));
+ strscpy(new->name, itm->name, sizeof(new->name));
+ strscpy(new->authname, itm->authname, sizeof(new->authname));
}
static void
@@ -548,7 +548,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
return nfserr_badowner;
memcpy(key.name, name, namelen);
key.name[namelen] = '\0';
- strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
+ strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
ret = idmap_lookup(rqstp, nametoid_lookup, &key, nn->nametoid_cache, &item);
if (ret == -ENOENT)
return nfserr_badowner;
@@ -584,7 +584,7 @@ static __be32 idmap_id_to_name(struct xdr_stream *xdr,
int ret;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
- strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
+ strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item);
if (ret == -ENOENT)
return encode_ascii_id(xdr, id);
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 2c05692a9abf..3564d1c6f610 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -658,7 +658,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
ktime_t now, cutoff;
const struct nfsd4_layout_ops *ops;
-
+ trace_nfsd_cb_layout_done(&ls->ls_stid.sc_stateid, task);
switch (task->tk_status) {
case 0:
case -NFS4ERR_DELAY:
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b207c76a873f..8beb2bc4c328 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -37,6 +37,8 @@
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/kthread.h>
+#include <linux/namei.h>
+
#include <linux/sunrpc/addr.h>
#include <linux/nfs_ssc.h>
@@ -62,36 +64,6 @@ MODULE_PARM_DESC(nfsd4_ssc_umount_timeout,
"idle msecs before unmount export from source server");
#endif
-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
-#include <linux/security.h>
-
-static inline void
-nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
-{
- struct inode *inode = d_inode(resfh->fh_dentry);
- int status;
-
- inode_lock(inode);
- status = security_inode_setsecctx(resfh->fh_dentry,
- label->data, label->len);
- inode_unlock(inode);
-
- if (status)
- /*
- * XXX: We should really fail the whole open, but we may
- * already have created a new file, so it may be too
- * late. For now this seems the least of evils:
- */
- bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
-
- return;
-}
-#else
-static inline void
-nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
-{ }
-#endif
-
#define NFSDDBG_FACILITY NFSDDBG_PROC
static u32 nfsd_attrmask[] = {
@@ -156,26 +128,6 @@ is_create_with_attrs(struct nfsd4_open *open)
|| open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1);
}
-/*
- * if error occurs when setting the acl, just clear the acl bit
- * in the returned attr bitmap.
- */
-static void
-do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct nfs4_acl *acl, u32 *bmval)
-{
- __be32 status;
-
- status = nfsd4_set_nfs4_acl(rqstp, fhp, acl);
- if (status)
- /*
- * We should probably fail the whole open at this point,
- * but we've already created the file, so it's too late;
- * So this seems the least of evils:
- */
- bmval[0] &= ~FATTR4_WORD0_ACL;
-}
-
static inline void
fh_dup2(struct svc_fh *dst, struct svc_fh *src)
{
@@ -189,7 +141,6 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src)
static __be32
do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode)
{
- __be32 status;
if (open->op_truncate &&
!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
@@ -204,9 +155,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
if (open->op_share_deny & NFS4_SHARE_DENY_READ)
accmode |= NFSD_MAY_WRITE;
- status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
-
- return status;
+ return fh_verify(rqstp, current_fh, S_IFREG, accmode);
}
static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
@@ -235,6 +184,202 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate
&resfh->fh_handle);
}
+static inline bool nfsd4_create_is_exclusive(int createmode)
+{
+ return createmode == NFS4_CREATE_EXCLUSIVE ||
+ createmode == NFS4_CREATE_EXCLUSIVE4_1;
+}
+
+static __be32
+nfsd4_vfs_create(struct svc_fh *fhp, struct dentry *child,
+ struct nfsd4_open *open)
+{
+ struct file *filp;
+ struct path path;
+ int oflags;
+
+ oflags = O_CREAT | O_LARGEFILE;
+ switch (open->op_share_access & NFS4_SHARE_ACCESS_BOTH) {
+ case NFS4_SHARE_ACCESS_WRITE:
+ oflags |= O_WRONLY;
+ break;
+ case NFS4_SHARE_ACCESS_BOTH:
+ oflags |= O_RDWR;
+ break;
+ default:
+ oflags |= O_RDONLY;
+ }
+
+ path.mnt = fhp->fh_export->ex_path.mnt;
+ path.dentry = child;
+ filp = dentry_create(&path, oflags, open->op_iattr.ia_mode,
+ current_cred());
+ if (IS_ERR(filp))
+ return nfserrno(PTR_ERR(filp));
+
+ open->op_filp = filp;
+ return nfs_ok;
+}
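
nfsd4_vfs_create() derives the kernel open flags directly from the OPEN
share-access bits so the file can be instantiated once, at create time.
The mapping on its own, with the constants spelled out per the protocol
(values shown for illustration):

    #define _GNU_SOURCE
    #include <fcntl.h>

    #define NFS4_SHARE_ACCESS_READ  0x0001
    #define NFS4_SHARE_ACCESS_WRITE 0x0002
    #define NFS4_SHARE_ACCESS_BOTH  0x0003

    static int share_access_to_oflags(unsigned int share_access)
    {
            int oflags = O_CREAT | O_LARGEFILE;

            switch (share_access & NFS4_SHARE_ACCESS_BOTH) {
            case NFS4_SHARE_ACCESS_WRITE:
                    return oflags | O_WRONLY;
            case NFS4_SHARE_ACCESS_BOTH:
                    return oflags | O_RDWR;
            default:                /* READ, or no access bits set */
                    return oflags | O_RDONLY;
            }
    }
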
+
+/*
+ * Implement NFSv4's unchecked, guarded, and exclusive create
+ * semantics for regular files. Open state for this new file is
+ * subsequently fabricated in nfsd4_process_open2().
+ *
+ * Upon return, caller must release @fhp and @resfhp.
+ */
+static __be32
+nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct svc_fh *resfhp, struct nfsd4_open *open)
+{
+ struct iattr *iap = &open->op_iattr;
+ struct nfsd_attrs attrs = {
+ .na_iattr = iap,
+ .na_seclabel = &open->op_label,
+ };
+ struct dentry *parent, *child;
+ __u32 v_mtime, v_atime;
+ struct inode *inode;
+ __be32 status;
+ int host_err;
+
+ if (isdotent(open->op_fname, open->op_fnamelen))
+ return nfserr_exist;
+ if (!(iap->ia_valid & ATTR_MODE))
+ iap->ia_mode = 0;
+
+ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+ if (status != nfs_ok)
+ return status;
+ parent = fhp->fh_dentry;
+ inode = d_inode(parent);
+
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ return nfserrno(host_err);
+
+ if (is_create_with_attrs(open))
+ nfsd4_acl_to_attr(NF4REG, open->op_acl, &attrs);
+
+ inode_lock_nested(inode, I_MUTEX_PARENT);
+
+ child = lookup_one_len(open->op_fname, parent, open->op_fnamelen);
+ if (IS_ERR(child)) {
+ status = nfserrno(PTR_ERR(child));
+ goto out;
+ }
+
+ if (d_really_is_negative(child)) {
+ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+ if (status != nfs_ok)
+ goto out;
+ }
+
+ status = fh_compose(resfhp, fhp->fh_export, child, fhp);
+ if (status != nfs_ok)
+ goto out;
+
+ v_mtime = 0;
+ v_atime = 0;
+ if (nfsd4_create_is_exclusive(open->op_createmode)) {
+ u32 *verifier = (u32 *)open->op_verf.data;
+
+ /*
+ * Solaris 7 gets confused (bugid 4218508) if these have
+ * the high bit set, as do xfs filesystems without the
+ * "bigtime" feature. So just clear the high bits. If this
+ * is ever changed to use different attrs for storing the
+ * verifier, then do_open_lookup() will also need to be
+ * fixed accordingly.
+ */
+ v_mtime = verifier[0] & 0x7fffffff;
+ v_atime = verifier[1] & 0x7fffffff;
+ }
+
+ if (d_really_is_positive(child)) {
+ status = nfs_ok;
+
+ /* NFSv4 protocol requires change attributes even though
+ * no change happened.
+ */
+ fh_fill_both_attrs(fhp);
+
+ switch (open->op_createmode) {
+ case NFS4_CREATE_UNCHECKED:
+ if (!d_is_reg(child))
+ break;
+
+ /*
+ * In NFSv4, we don't want to truncate the file
+ * now. This would be wrong if the OPEN fails for
+ * some other reason. Furthermore, if the size is
+ * nonzero, we should ignore it according to spec!
+ */
+ open->op_truncate = (iap->ia_valid & ATTR_SIZE) &&
+ !iap->ia_size;
+ break;
+ case NFS4_CREATE_GUARDED:
+ status = nfserr_exist;
+ break;
+ case NFS4_CREATE_EXCLUSIVE:
+ if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
+ d_inode(child)->i_atime.tv_sec == v_atime &&
+ d_inode(child)->i_size == 0) {
+ open->op_created = true;
+ break; /* subtle */
+ }
+ status = nfserr_exist;
+ break;
+ case NFS4_CREATE_EXCLUSIVE4_1:
+ if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
+ d_inode(child)->i_atime.tv_sec == v_atime &&
+ d_inode(child)->i_size == 0) {
+ open->op_created = true;
+ goto set_attr; /* subtle */
+ }
+ status = nfserr_exist;
+ }
+ goto out;
+ }
+
+ if (!IS_POSIXACL(inode))
+ iap->ia_mode &= ~current_umask();
+
+ fh_fill_pre_attrs(fhp);
+ status = nfsd4_vfs_create(fhp, child, open);
+ if (status != nfs_ok)
+ goto out;
+ open->op_created = true;
+ fh_fill_post_attrs(fhp);
+
+ /* A newly created file already has a file size of zero. */
+ if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
+ iap->ia_valid &= ~ATTR_SIZE;
+ if (nfsd4_create_is_exclusive(open->op_createmode)) {
+ iap->ia_valid = ATTR_MTIME | ATTR_ATIME |
+ ATTR_MTIME_SET|ATTR_ATIME_SET;
+ iap->ia_mtime.tv_sec = v_mtime;
+ iap->ia_atime.tv_sec = v_atime;
+ iap->ia_mtime.tv_nsec = 0;
+ iap->ia_atime.tv_nsec = 0;
+ }
+
+set_attr:
+ status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs);
+
+ if (attrs.na_labelerr)
+ open->op_bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ if (attrs.na_aclerr)
+ open->op_bmval[0] &= ~FATTR4_WORD0_ACL;
+out:
+ inode_unlock(inode);
+ nfsd_attrs_free(&attrs);
+ if (child && !IS_ERR(child))
+ dput(child);
+ fh_drop_write(fhp);
+ return status;
+}
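
When the target name already exists, the switch above makes the four
NFSv4 create modes diverge: UNCHECKED reuses the file (deferring any
truncation to op_truncate), GUARDED fails, EXCLUSIVE succeeds only when
the stored verifier matches, and EXCLUSIVE4_1 additionally jumps to
set_attr to re-apply the requested attributes. The decision table as a
compact helper (illustrative; errno values stand in for nfsstat codes):

    #include <errno.h>
    #include <stdbool.h>

    enum createmode4 { UNCHECKED4, GUARDED4, EXCLUSIVE4, EXCLUSIVE4_1 };

    static int existing_target(enum createmode4 mode, bool verf_matches)
    {
            switch (mode) {
            case UNCHECKED4:
                    return 0;               /* reuse the existing file */
            case GUARDED4:
                    return -EEXIST;
            case EXCLUSIVE4:
            case EXCLUSIVE4_1:
                    /* a matching verifier means this is a replay */
                    return verf_matches ? 0 : -EEXIST;
            }
            return -EINVAL;
    }
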
+
static __be32
do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh)
{
@@ -264,47 +409,33 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
* yes | yes | GUARDED4 | GUARDED4
*/
- /*
- * Note: create modes (UNCHECKED,GUARDED...) are the same
- * in NFSv4 as in v3 except EXCLUSIVE4_1.
- */
current->fs->umask = open->op_umask;
- status = do_nfsd_create(rqstp, current_fh, open->op_fname,
- open->op_fnamelen, &open->op_iattr,
- *resfh, open->op_createmode,
- (u32 *)open->op_verf.data,
- &open->op_truncate, &open->op_created);
+ status = nfsd4_create_file(rqstp, current_fh, *resfh, open);
current->fs->umask = 0;
- if (!status && open->op_label.len)
- nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
-
/*
* Following rfc 3530 14.2.16, and rfc 5661 18.16.4
* use the returned bitmask to indicate which attributes
* we used to store the verifier:
*/
- if (nfsd_create_is_exclusive(open->op_createmode) && status == 0)
+ if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0)
open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
FATTR4_WORD1_TIME_MODIFY);
- } else
- /*
- * Note this may exit with the parent still locked.
- * We will hold the lock until nfsd4_open's final
- * lookup, to prevent renames or unlinks until we've had
- * a chance to an acquire a delegation if appropriate.
- */
+ } else {
status = nfsd_lookup(rqstp, current_fh,
open->op_fname, open->op_fnamelen, *resfh);
+ if (!status)
+ /* NFSv4 protocol requires change attributes even though
+ * no change happened.
+ */
+ fh_fill_both_attrs(current_fh);
+ }
if (status)
goto out;
status = nfsd_check_obj_isreg(*resfh);
if (status)
goto out;
- if (is_create_with_attrs(open) && open->op_acl != NULL)
- do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval);
-
nfsd4_set_open_owner_reply_cache(cstate, open, *resfh);
accmode = NFSD_MAY_NOP;
if (open->op_created ||
@@ -320,7 +451,6 @@ static __be32
do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
{
struct svc_fh *current_fh = &cstate->current_fh;
- __be32 status;
int accmode = 0;
/* We don't know the target directory, and therefore can not
@@ -345,9 +475,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, str
if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH)
accmode = NFSD_MAY_OWNER_OVERRIDE;
- status = do_open_permission(rqstp, current_fh, open, accmode);
-
- return status;
+ return do_open_permission(rqstp, current_fh, open, accmode);
}
static void
@@ -375,6 +503,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
(int)open->op_fnamelen, open->op_fname,
open->op_openowner);
+ open->op_filp = NULL;
+ open->op_rqstp = rqstp;
+
/* This check required by spec. */
if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
return nfserr_inval;
@@ -427,50 +558,46 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
switch (open->op_claim_type) {
- case NFS4_OPEN_CLAIM_DELEGATE_CUR:
- case NFS4_OPEN_CLAIM_NULL:
- status = do_open_lookup(rqstp, cstate, open, &resfh);
- if (status)
- goto out;
- break;
- case NFS4_OPEN_CLAIM_PREVIOUS:
- status = nfs4_check_open_reclaim(cstate->clp);
- if (status)
- goto out;
- open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
- reclaim = true;
- fallthrough;
- case NFS4_OPEN_CLAIM_FH:
- case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
- status = do_open_fhandle(rqstp, cstate, open);
- if (status)
- goto out;
- resfh = &cstate->current_fh;
- break;
- case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
- case NFS4_OPEN_CLAIM_DELEGATE_PREV:
- dprintk("NFSD: unsupported OPEN claim type %d\n",
- open->op_claim_type);
- status = nfserr_notsupp;
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_NULL:
+ status = do_open_lookup(rqstp, cstate, open, &resfh);
+ if (status)
+ goto out;
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ status = nfs4_check_open_reclaim(cstate->clp);
+ if (status)
goto out;
- default:
- dprintk("NFSD: Invalid OPEN claim type %d\n",
- open->op_claim_type);
- status = nfserr_inval;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+ reclaim = true;
+ fallthrough;
+ case NFS4_OPEN_CLAIM_FH:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ status = do_open_fhandle(rqstp, cstate, open);
+ if (status)
goto out;
+ resfh = &cstate->current_fh;
+ break;
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ status = nfserr_notsupp;
+ goto out;
+ default:
+ status = nfserr_inval;
+ goto out;
}
- /*
- * nfsd4_process_open2() does the actual opening of the file. If
- * successful, it (1) truncates the file if open->op_truncate was
- * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
- */
+
status = nfsd4_process_open2(rqstp, resfh, open);
- WARN(status && open->op_created,
- "nfsd4_process_open2 failed to open newly-created file! status=%u\n",
- be32_to_cpu(status));
+ if (status && open->op_created)
+ pr_warn("nfsd4_process_open2 failed to open newly-created file: status=%u\n",
+ be32_to_cpu(status));
if (reclaim && !status)
nn->somebody_reclaimed = true;
out:
+ if (open->op_filp) {
+ fput(open->op_filp);
+ open->op_filp = NULL;
+ }
if (resfh && resfh != &cstate->current_fh) {
fh_dup2(&cstate->current_fh, resfh);
fh_put(resfh);
@@ -535,11 +662,9 @@ static __be32
nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
- __be32 status;
-
fh_put(&cstate->current_fh);
- status = exp_pseudoroot(rqstp, &cstate->current_fh);
- return status;
+
+ return exp_pseudoroot(rqstp, &cstate->current_fh);
}
static __be32
@@ -617,6 +742,10 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_create *create = &u->create;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &create->cr_iattr,
+ .na_seclabel = &create->cr_label,
+ };
struct svc_fh resfh;
__be32 status;
dev_t rdev;
@@ -632,12 +761,13 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
return status;
+ status = nfsd4_acl_to_attr(create->cr_type, create->cr_acl, &attrs);
current->fs->umask = create->cr_umask;
switch (create->cr_type) {
case NF4LNK:
status = nfsd_symlink(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
- create->cr_data, &resfh);
+ create->cr_data, &attrs, &resfh);
break;
case NF4BLK:
@@ -648,7 +778,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out_umask;
status = nfsd_create(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
- &create->cr_iattr, S_IFBLK, rdev, &resfh);
+ &attrs, S_IFBLK, rdev, &resfh);
break;
case NF4CHR:
@@ -659,26 +789,26 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out_umask;
status = nfsd_create(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
- &create->cr_iattr,S_IFCHR, rdev, &resfh);
+ &attrs, S_IFCHR, rdev, &resfh);
break;
case NF4SOCK:
status = nfsd_create(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
- &create->cr_iattr, S_IFSOCK, 0, &resfh);
+ &attrs, S_IFSOCK, 0, &resfh);
break;
case NF4FIFO:
status = nfsd_create(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
- &create->cr_iattr, S_IFIFO, 0, &resfh);
+ &attrs, S_IFIFO, 0, &resfh);
break;
case NF4DIR:
create->cr_iattr.ia_valid &= ~ATTR_SIZE;
status = nfsd_create(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
- &create->cr_iattr, S_IFDIR, 0, &resfh);
+ &attrs, S_IFDIR, 0, &resfh);
break;
default:
@@ -688,20 +818,17 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
- if (create->cr_label.len)
- nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval);
-
- if (create->cr_acl != NULL)
- do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
- create->cr_bmval);
-
- fh_unlock(&cstate->current_fh);
+ if (attrs.na_labelerr)
+ create->cr_bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ if (attrs.na_aclerr)
+ create->cr_bmval[0] &= ~FATTR4_WORD0_ACL;
set_change_info(&create->cr_cinfo, &cstate->current_fh);
fh_dup2(&cstate->current_fh, &resfh);
out:
fh_put(&resfh);
out_umask:
current->fs->umask = 0;
+ nfsd_attrs_free(&attrs);
return status;
}
@@ -801,7 +928,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* the client wants us to do more in this compound:
*/
if (!nfsd4_last_compound_op(rqstp))
- clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
+ __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
/* check stateid */
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
@@ -874,10 +1001,8 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_grace;
status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
remove->rm_name, remove->rm_namelen);
- if (!status) {
- fh_unlock(&cstate->current_fh);
+ if (!status)
set_change_info(&remove->rm_cinfo, &cstate->current_fh);
- }
return status;
}
@@ -917,7 +1042,6 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&exp, &dentry);
if (err)
return err;
- fh_unlock(&cstate->current_fh);
if (d_really_is_negative(dentry)) {
exp_put(exp);
err = nfserr_noent;
@@ -972,6 +1096,11 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_setattr *setattr = &u->setattr;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &setattr->sa_iattr,
+ .na_seclabel = &setattr->sa_label,
+ };
+ struct inode *inode;
__be32 status = nfs_ok;
int err;
@@ -994,19 +1123,18 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto out;
- if (setattr->sa_acl != NULL)
- status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
- setattr->sa_acl);
- if (status)
- goto out;
- if (setattr->sa_label.len)
- status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh,
- &setattr->sa_label);
+ inode = cstate->current_fh.fh_dentry->d_inode;
+ status = nfsd4_acl_to_attr(S_ISDIR(inode->i_mode) ? NF4DIR : NF4REG,
+ setattr->sa_acl, &attrs);
+
if (status)
goto out;
- status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
+ status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs,
0, (time64_t)0);
+ if (!status)
+ status = nfserrno(attrs.na_labelerr);
out:
+ nfsd_attrs_free(&attrs);
fh_drop_write(&cstate->current_fh);
return status;
}
@@ -1116,30 +1244,17 @@ out:
return status;
}
-void nfs4_put_copy(struct nfsd4_copy *copy)
+static void nfs4_put_copy(struct nfsd4_copy *copy)
{
if (!refcount_dec_and_test(&copy->refcount))
return;
+ kfree(copy->cp_src);
kfree(copy);
}
-static bool
-check_and_set_stop_copy(struct nfsd4_copy *copy)
-{
- bool value;
-
- spin_lock(&copy->cp_clp->async_lock);
- value = copy->stopped;
- if (!copy->stopped)
- copy->stopped = true;
- spin_unlock(&copy->cp_clp->async_lock);
- return value;
-}
-
static void nfsd4_stop_copy(struct nfsd4_copy *copy)
{
- /* only 1 thread should stop the copy */
- if (!check_and_set_stop_copy(copy))
+ if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags))
kthread_stop(copy->copy_task);
nfs4_put_copy(copy);
}
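
test_and_set_bit() collapses the old spinlock-protected check-and-set
into one atomic read-modify-write: it sets the bit and returns its
previous value, so exactly one caller observes it clear and performs
the one-shot kthread_stop(). A generic sketch of the idiom (do_shutdown
is hypothetical):

	static unsigned long flags;

	static void stop_once(void)
	{
		/* Atomically set bit 0 and test its old value; no lock
		 * is needed around this read-modify-write. */
		if (!test_and_set_bit(0, &flags))
			do_shutdown();	/* runs exactly once across callers */
	}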
@@ -1220,7 +1335,7 @@ try_again:
return 0;
}
if (work) {
- strncpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr));
+ strscpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1);
refcount_set(&work->nsui_refcnt, 2);
work->nsui_busy = true;
list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list);
@@ -1380,7 +1495,7 @@ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
if (status)
goto out;
- status = nfsd4_interssc_connect(&copy->cp_src, rqstp, mount);
+ status = nfsd4_interssc_connect(copy->cp_src, rqstp, mount);
if (status)
goto out;
@@ -1398,7 +1513,7 @@ out:
}
static void
-nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
+nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp,
struct nfsd_file *dst)
{
bool found = false;
@@ -1407,9 +1522,9 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
struct nfsd4_ssc_umount_item *ni = NULL;
struct nfsd_net *nn = net_generic(dst->nf_net, nfsd_net_id);
- nfs42_ssc_close(src->nf_file);
+ nfs42_ssc_close(filp);
nfsd_file_put(dst);
- fput(src->nf_file);
+ fput(filp);
if (!nn) {
mntput(ss_mnt);
@@ -1452,7 +1567,7 @@ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
}
static void
-nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
+nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp,
struct nfsd_file *dst)
{
}
@@ -1489,14 +1604,19 @@ nfsd4_cleanup_intra_ssc(struct nfsd_file *src, struct nfsd_file *dst)
static void nfsd4_cb_offload_release(struct nfsd4_callback *cb)
{
- struct nfsd4_copy *copy = container_of(cb, struct nfsd4_copy, cp_cb);
+ struct nfsd4_cb_offload *cbo =
+ container_of(cb, struct nfsd4_cb_offload, co_cb);
- nfs4_put_copy(copy);
+ kfree(cbo);
}
static int nfsd4_cb_offload_done(struct nfsd4_callback *cb,
struct rpc_task *task)
{
+ struct nfsd4_cb_offload *cbo =
+ container_of(cb, struct nfsd4_cb_offload, co_cb);
+
+ trace_nfsd_cb_offload_done(&cbo->co_res.cb_stateid, task);
return 1;
}
@@ -1508,15 +1628,16 @@ static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = {
static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)
{
copy->cp_res.wr_stable_how =
- copy->committed ? NFS_FILE_SYNC : NFS_UNSTABLE;
- copy->cp_synchronous = sync;
+ test_bit(NFSD4_COPY_F_COMMITTED, &copy->cp_flags) ?
+ NFS_FILE_SYNC : NFS_UNSTABLE;
+ nfsd4_copy_set_sync(copy, sync);
gen_boot_verifier(&copy->cp_res.wr_verifier, copy->cp_clp->net);
}
-static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
+static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy,
+ struct file *dst,
+ struct file *src)
{
- struct file *dst = copy->nf_dst->nf_file;
- struct file *src = copy->nf_src->nf_file;
errseq_t since;
ssize_t bytes_copied = 0;
u64 bytes_total = copy->cp_count;
@@ -1538,26 +1659,29 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
copy->cp_res.wr_bytes_written += bytes_copied;
src_pos += bytes_copied;
dst_pos += bytes_copied;
- } while (bytes_total > 0 && !copy->cp_synchronous);
+ } while (bytes_total > 0 && nfsd4_copy_is_async(copy));
/* for a non-zero asynchronous copy do a commit of data */
- if (!copy->cp_synchronous && copy->cp_res.wr_bytes_written > 0) {
+ if (nfsd4_copy_is_async(copy) && copy->cp_res.wr_bytes_written > 0) {
since = READ_ONCE(dst->f_wb_err);
status = vfs_fsync_range(dst, copy->cp_dst_pos,
copy->cp_res.wr_bytes_written, 0);
if (!status)
status = filemap_check_wb_err(dst->f_mapping, since);
if (!status)
- copy->committed = true;
+ set_bit(NFSD4_COPY_F_COMMITTED, &copy->cp_flags);
}
return bytes_copied;
}
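
The since/f_wb_err pair above is the standard errseq_t pattern: sample
the file's writeback-error cursor before syncing, then ask whether any
new error was recorded after that point, so an error already consumed
by another opener is not reported twice. The shape of the idiom:

	errseq_t since = READ_ONCE(file->f_wb_err);	/* sample cursor */
	int err;

	err = vfs_fsync_range(file, pos, count, 0);	/* flush the range */
	if (!err)	/* any writeback error recorded since the sample? */
		err = filemap_check_wb_err(file->f_mapping, since);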
-static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync)
+static __be32 nfsd4_do_copy(struct nfsd4_copy *copy,
+ struct file *src, struct file *dst,
+ bool sync)
{
__be32 status;
ssize_t bytes;
- bytes = _nfsd_copy_file_range(copy);
+ bytes = _nfsd_copy_file_range(copy, dst, src);
+
/* for async copy, we ignore the error, client can always retry
* to get the error
*/
@@ -1567,13 +1691,6 @@ static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync)
nfsd4_init_copy_res(copy, sync);
status = nfs_ok;
}
-
- if (!copy->cp_intra) /* Inter server SSC */
- nfsd4_cleanup_inter_ssc(copy->ss_mnt, copy->nf_src,
- copy->nf_dst);
- else
- nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst);
-
return status;
}
@@ -1582,17 +1699,17 @@ static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst)
dst->cp_src_pos = src->cp_src_pos;
dst->cp_dst_pos = src->cp_dst_pos;
dst->cp_count = src->cp_count;
- dst->cp_synchronous = src->cp_synchronous;
+ dst->cp_flags = src->cp_flags;
memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res));
memcpy(&dst->fh, &src->fh, sizeof(src->fh));
dst->cp_clp = src->cp_clp;
dst->nf_dst = nfsd_file_get(src->nf_dst);
- dst->cp_intra = src->cp_intra;
- if (src->cp_intra) /* for inter, file_src doesn't exist yet */
+ /* for inter, nf_src doesn't exist yet */
+ if (!nfsd4_ssc_is_inter(src))
dst->nf_src = nfsd_file_get(src->nf_src);
memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid));
- memcpy(&dst->cp_src, &src->cp_src, sizeof(struct nl4_server));
+ memcpy(dst->cp_src, src->cp_src, sizeof(struct nl4_server));
memcpy(&dst->stateid, &src->stateid, sizeof(src->stateid));
memcpy(&dst->c_fh, &src->c_fh, sizeof(src->c_fh));
dst->ss_mnt = src->ss_mnt;
@@ -1602,7 +1719,7 @@ static void cleanup_async_copy(struct nfsd4_copy *copy)
{
nfs4_free_copy_state(copy);
nfsd_file_put(copy->nf_dst);
- if (copy->cp_intra)
+ if (!nfsd4_ssc_is_inter(copy))
nfsd_file_put(copy->nf_src);
spin_lock(&copy->cp_clp->async_lock);
list_del(&copy->copies);
@@ -1610,45 +1727,64 @@ static void cleanup_async_copy(struct nfsd4_copy *copy)
nfs4_put_copy(copy);
}
+static void nfsd4_send_cb_offload(struct nfsd4_copy *copy, __be32 nfserr)
+{
+ struct nfsd4_cb_offload *cbo;
+
+ cbo = kzalloc(sizeof(*cbo), GFP_KERNEL);
+ if (!cbo)
+ return;
+
+ memcpy(&cbo->co_res, &copy->cp_res, sizeof(copy->cp_res));
+ memcpy(&cbo->co_fh, &copy->fh, sizeof(copy->fh));
+ cbo->co_nfserr = nfserr;
+
+ nfsd4_init_cb(&cbo->co_cb, copy->cp_clp, &nfsd4_cb_offload_ops,
+ NFSPROC4_CLNT_CB_OFFLOAD);
+ trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid,
+ &cbo->co_fh, copy->cp_count, nfserr);
+ nfsd4_run_cb(&cbo->co_cb);
+}
+
+/**
+ * nfsd4_do_async_copy - kthread function for background server-side COPY
+ * @data: arguments for COPY operation
+ *
+ * Return values:
+ * %0: Copy operation is done.
+ */
static int nfsd4_do_async_copy(void *data)
{
struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
- struct nfsd4_copy *cb_copy;
+ __be32 nfserr;
- if (!copy->cp_intra) { /* Inter server SSC */
- copy->nf_src = kzalloc(sizeof(struct nfsd_file), GFP_KERNEL);
- if (!copy->nf_src) {
- copy->nfserr = nfserr_serverfault;
- nfsd4_interssc_disconnect(copy->ss_mnt);
- goto do_callback;
- }
- copy->nf_src->nf_file = nfs42_ssc_open(copy->ss_mnt, &copy->c_fh,
- &copy->stateid);
- if (IS_ERR(copy->nf_src->nf_file)) {
- copy->nfserr = nfserr_offload_denied;
+ if (nfsd4_ssc_is_inter(copy)) {
+ struct file *filp;
+
+ filp = nfs42_ssc_open(copy->ss_mnt, &copy->c_fh,
+ &copy->stateid);
+ if (IS_ERR(filp)) {
+ switch (PTR_ERR(filp)) {
+ case -EBADF:
+ nfserr = nfserr_wrong_type;
+ break;
+ default:
+ nfserr = nfserr_offload_denied;
+ }
nfsd4_interssc_disconnect(copy->ss_mnt);
goto do_callback;
}
+ nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file,
+ false);
+ nfsd4_cleanup_inter_ssc(copy->ss_mnt, filp, copy->nf_dst);
+ } else {
+ nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file,
+ copy->nf_dst->nf_file, false);
+ nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst);
}
- copy->nfserr = nfsd4_do_copy(copy, 0);
do_callback:
- cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
- if (!cb_copy)
- goto out;
- refcount_set(&cb_copy->refcount, 1);
- memcpy(&cb_copy->cp_res, &copy->cp_res, sizeof(copy->cp_res));
- cb_copy->cp_clp = copy->cp_clp;
- cb_copy->nfserr = copy->nfserr;
- memcpy(&cb_copy->fh, &copy->fh, sizeof(copy->fh));
- nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp,
- &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD);
- trace_nfsd_cb_offload(copy->cp_clp, &copy->cp_res.cb_stateid,
- &copy->fh, copy->cp_count, copy->nfserr);
- nfsd4_run_cb(&cb_copy->cp_cb);
-out:
- if (!copy->cp_intra)
- kfree(copy->nf_src);
+ nfsd4_send_cb_offload(copy, nfserr);
cleanup_async_copy(copy);
return 0;
}
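
For context, the async copy follows the usual kthread lifecycle:
kthread_create() returns a task that is not yet running,
wake_up_process() starts it, and nfsd4_stop_copy() above cancels it
with kthread_stop(), which blocks until the thread function returns.
A condensed sketch (name format and error handling illustrative):

	struct task_struct *task;

	task = kthread_create(nfsd4_do_async_copy, copy,
			      "copy thread: %p", copy);
	if (IS_ERR(task))
		return PTR_ERR(task);	/* thread never started */
	copy->copy_task = task;
	wake_up_process(task);		/* begins nfsd4_do_async_copy() */
	/* ... later, at most one caller stops it: */
	if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags))
		kthread_stop(copy->copy_task);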
@@ -1661,8 +1797,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfsd4_copy *async_copy = NULL;
- if (!copy->cp_intra) { /* Inter server SSC */
- if (!inter_copy_offload_enable || copy->cp_synchronous) {
+ if (nfsd4_ssc_is_inter(copy)) {
+ if (!inter_copy_offload_enable || nfsd4_copy_is_sync(copy)) {
status = nfserr_notsupp;
goto out;
}
@@ -1679,17 +1815,20 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
copy->cp_clp = cstate->clp;
memcpy(&copy->fh, &cstate->current_fh.fh_handle,
sizeof(struct knfsd_fh));
- if (!copy->cp_synchronous) {
+ if (nfsd4_copy_is_async(copy)) {
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
status = nfserrno(-ENOMEM);
async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
if (!async_copy)
goto out_err;
+ async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL);
+ if (!async_copy->cp_src)
+ goto out_err;
if (!nfs4_init_copy_state(nn, copy))
goto out_err;
refcount_set(&async_copy->refcount, 1);
- memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid.stid,
+ memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid.cs_stid,
sizeof(copy->cp_res.cb_stateid));
dup_copy_fields(copy, async_copy);
async_copy->copy_task = kthread_create(nfsd4_do_async_copy,
@@ -1703,7 +1842,9 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
wake_up_process(async_copy->copy_task);
status = nfs_ok;
} else {
- status = nfsd4_do_copy(copy, 1);
+ status = nfsd4_do_copy(copy, copy->nf_src->nf_file,
+ copy->nf_dst->nf_file, true);
+ nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst);
}
out:
return status;
@@ -1711,7 +1852,7 @@ out_err:
if (async_copy)
cleanup_async_copy(async_copy);
status = nfserrno(-ENOMEM);
- if (!copy->cp_intra)
+ if (nfsd4_ssc_is_inter(copy))
nfsd4_interssc_disconnect(copy->ss_mnt);
goto out;
}
@@ -1723,7 +1864,7 @@ find_async_copy(struct nfs4_client *clp, stateid_t *stateid)
spin_lock(&clp->async_lock);
list_for_each_entry(copy, &clp->async_copies, copies) {
- if (memcmp(&copy->cp_stateid.stid, stateid, NFS4_STATEID_SIZE))
+ if (memcmp(&copy->cp_stateid.cs_stid, stateid, NFS4_STATEID_SIZE))
continue;
refcount_inc(&copy->refcount);
spin_unlock(&clp->async_lock);
@@ -1777,16 +1918,16 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
cps = nfs4_alloc_init_cpntf_state(nn, stid);
if (!cps)
goto out;
- memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.stid, sizeof(stateid_t));
+ memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.cs_stid, sizeof(stateid_t));
memcpy(&cps->cp_p_stateid, &stid->sc_stateid, sizeof(stateid_t));
memcpy(&cps->cp_p_clid, &clp->cl_clientid, sizeof(clientid_t));
/* For now, only return one server address in cpn_src, the
* address used by the client to connect to this server.
*/
- cn->cpn_src.nl4_type = NL4_NETADDR;
+ cn->cpn_src->nl4_type = NL4_NETADDR;
status = nfsd4_set_netaddr((struct sockaddr *)&rqstp->rq_daddr,
- &cn->cpn_src.u.nl4_addr);
+ &cn->cpn_src->u.nl4_addr);
WARN_ON_ONCE(status);
if (status) {
nfs4_put_cpntf_state(nn, cps);
@@ -2440,7 +2581,7 @@ check_if_stalefh_allowed(struct nfsd4_compoundargs *args)
return;
}
putfh = (struct nfsd4_putfh *)&saved_op->u;
- if (!copy->cp_intra)
+ if (nfsd4_ssc_is_inter(copy))
putfh->no_verify = true;
}
}
@@ -2481,11 +2622,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
cstate->minorversion = args->minorversion;
fh_init(current_fh, NFS4_FHSIZE);
fh_init(save_fh, NFS4_FHSIZE);
+
/*
* Don't use the deferral mechanism for NFSv4; compounds make it
* too hard to avoid non-idempotency problems.
*/
- clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+ __clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
/*
* According to RFC3010, this takes precedence over all other errors.
@@ -2493,9 +2635,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
status = nfserr_minor_vers_mismatch;
if (nfsd_minorversion(nn, args->minorversion, NFSD_TEST) <= 0)
goto out;
- status = nfserr_resource;
- if (args->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
- goto out;
status = nfs41_check_op_ordering(args);
if (status) {
@@ -2508,10 +2647,20 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
rqstp->rq_lease_breaker = (void **)&cstate->clp;
- trace_nfsd_compound(rqstp, args->opcnt);
+ trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt);
while (!status && resp->opcnt < args->opcnt) {
op = &args->ops[resp->opcnt++];
+ if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) {
+ /* If there are still more operations to process,
+ * stop here and report NFS4ERR_RESOURCE. */
+ if (cstate->minorversion == 0 &&
+ args->client_opcnt > resp->opcnt) {
+ op->status = nfserr_resource;
+ goto encode_op;
+ }
+ }
+
/*
* The XDR decode routines may have pre-set op->status;
* for example, if there is a miscellaneous XDR error
@@ -2541,7 +2690,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
if (op->opdesc->op_flags & OP_MODIFIES_SOMETHING) {
/*
* Don't execute this op if we couldn't encode a
- * succesful reply:
+ * successful reply:
*/
u32 plen = op->opdesc->op_rsize_bop(rqstp, op);
/*
@@ -2587,8 +2736,8 @@ encode_op:
status = op->status;
}
- trace_nfsd_compound_status(args->opcnt, resp->opcnt, status,
- nfsd4_op_name(op->opnum));
+ trace_nfsd_compound_status(args->client_opcnt, resp->opcnt,
+ status, nfsd4_op_name(op->opnum));
nfsd4_cstate_clear_replay(cstate);
nfsd4_increment_op_stats(op->opnum);
@@ -2600,7 +2749,7 @@ encode_op:
out:
cstate->status = status;
/* Reset deferral mechanism for RPC deferrals */
- set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+ __set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
return rpc_success;
}
@@ -2622,28 +2771,49 @@ out:
#define op_encode_channel_attrs_maxsz (6 + 1 + 1)
-static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+/*
+ * The _rsize() helpers are invoked by the NFSv4 COMPOUND decoder, which
+ * is called before sunrpc sets rq_res.buflen. Thus we have to compute
+ * the maximum payload size here, based on transport limits and the size
+ * of the remaining space in the rq_pages array.
+ */
+static u32 nfsd4_max_payload(const struct svc_rqst *rqstp)
+{
+ u32 buflen;
+
+ buflen = (rqstp->rq_page_end - rqstp->rq_next_page) * PAGE_SIZE;
+ buflen -= rqstp->rq_auth_slack;
+ buflen -= rqstp->rq_res.head[0].iov_len;
+ return min_t(u32, buflen, svc_max_payload(rqstp));
+}
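+
+A worked example of nfsd4_max_payload() with 4 KiB pages: if 64 reply
+pages remain between rq_next_page and rq_page_end, rq_auth_slack is 412
+bytes, and the reply's head iovec already holds 100 bytes, then
+buflen = 64 * 4096 - 412 - 100 = 261632 bytes, and the helper returns
+min(261632, svc_max_payload(rqstp)), so the transport maximum (often
+1 MiB on TCP) applies only when enough pages remain. The figures are
+illustrative.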
+
+static u32 nfsd4_only_status_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size) * sizeof(__be32);
}
-static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_status_stateid_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
}
-static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_access_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
/* ac_supported, ac_resp_access */
return (op_encode_hdr_size + 2)* sizeof(__be32);
}
-static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_commit_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
}
-static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_create_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz
+ nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
@@ -2654,17 +2824,17 @@ static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op
* the op prematurely if the estimate is too large. We may turn off splice
* reads unnecessarily.
*/
-static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_getattr_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
- u32 *bmap = op->u.getattr.ga_bmval;
+ const u32 *bmap = op->u.getattr.ga_bmval;
u32 bmap0 = bmap[0], bmap1 = bmap[1], bmap2 = bmap[2];
u32 ret = 0;
if (bmap0 & FATTR4_WORD0_ACL)
- return svc_max_payload(rqstp);
+ return nfsd4_max_payload(rqstp);
if (bmap0 & FATTR4_WORD0_FS_LOCATIONS)
- return svc_max_payload(rqstp);
+ return nfsd4_max_payload(rqstp);
if (bmap1 & FATTR4_WORD1_OWNER) {
ret += IDMAP_NAMESZ + 4;
@@ -2692,24 +2862,28 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
return ret;
}
-static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_getfh_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE;
}
-static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_link_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz)
* sizeof(__be32);
}
-static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_lock_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_lock_denied_maxsz)
* sizeof(__be32);
}
-static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_open_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_stateid_maxsz
+ op_encode_change_info_maxsz + 1
@@ -2717,20 +2891,18 @@ static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+ op_encode_delegation_maxsz) * sizeof(__be32);
}
-static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_read_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
- u32 maxcount = 0, rlen = 0;
-
- maxcount = svc_max_payload(rqstp);
- rlen = min(op->u.read.rd_length, maxcount);
+ u32 rlen = min(op->u.read.rd_length, nfsd4_max_payload(rqstp));
return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
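
Throughout these estimators XDR_QUADLEN(n) rounds a byte count up to
4-byte XDR quads, i.e. (n + 3) >> 2, and the total is converted back to
bytes with * sizeof(__be32). For example, a 10-byte opaque costs
XDR_QUADLEN(10) = 3 quads = 12 bytes on the wire; the "2" above covers
the eof and count words that precede the READ payload.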
-static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_read_plus_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
- u32 maxcount = svc_max_payload(rqstp);
- u32 rlen = min(op->u.read.rd_length, maxcount);
+ u32 rlen = min(op->u.read.rd_length, nfsd4_max_payload(rqstp));
/*
* If we detect that the file changed during hole encoding, then we
* recover by encoding the remaining reply as data. This means we need
@@ -2741,70 +2913,77 @@ static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op
return (op_encode_hdr_size + 2 + seg_len + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
-static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_readdir_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
- u32 maxcount = 0, rlen = 0;
-
- maxcount = svc_max_payload(rqstp);
- rlen = min(op->u.readdir.rd_maxcount, maxcount);
+ u32 rlen = min(op->u.readdir.rd_maxcount, nfsd4_max_payload(rqstp));
return (op_encode_hdr_size + op_encode_verifier_maxsz +
XDR_QUADLEN(rlen)) * sizeof(__be32);
}
-static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_readlink_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE;
}
-static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_remove_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz)
* sizeof(__be32);
}
-static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_rename_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz
+ op_encode_change_info_maxsz) * sizeof(__be32);
}
-static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_sequence_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32);
}
-static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_test_stateid_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids)
* sizeof(__be32);
}
-static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_setattr_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
}
-static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_secinfo_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR *
(4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32);
}
-static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_setclientid_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) *
sizeof(__be32);
}
-static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_write_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32);
}
-static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_exchange_id_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
1 + 1 + /* eir_flags, spr_how */\
@@ -2818,14 +2997,16 @@ static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_o
0 /* ignored eir_server_impl_id contents */) * sizeof(__be32);
}
-static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_bind_conn_to_session_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + \
XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\
2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32);
}
-static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_create_session_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + \
XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\
@@ -2834,7 +3015,8 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
op_encode_channel_attrs_maxsz) * sizeof(__be32);
}
-static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_copy_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size +
1 /* wr_callback */ +
@@ -2846,16 +3028,16 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1 /* cr_synchronous */) * sizeof(__be32);
}
-static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_offload_status_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size +
2 /* osr_count */ +
1 /* osr_complete<1> optional 0 for now */) * sizeof(__be32);
}
-static inline u32 nfsd4_copy_notify_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_copy_notify_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size +
3 /* cnr_lease_time */ +
@@ -2870,12 +3052,10 @@ static inline u32 nfsd4_copy_notify_rsize(struct svc_rqst *rqstp,
}
#ifdef CONFIG_NFSD_PNFS
-static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_getdeviceinfo_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
- u32 maxcount = 0, rlen = 0;
-
- maxcount = svc_max_payload(rqstp);
- rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount);
+ u32 rlen = min(op->u.getdeviceinfo.gd_maxcount, nfsd4_max_payload(rqstp));
return (op_encode_hdr_size +
1 /* gd_layout_type*/ +
@@ -2888,7 +3068,8 @@ static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4
* so we need to define an arbitrary upper bound here.
*/
#define MAX_LAYOUT_SIZE 128
-static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_layoutget_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size +
1 /* logr_return_on_close */ +
@@ -2897,14 +3078,16 @@ static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op
MAX_LAYOUT_SIZE) * sizeof(__be32);
}
-static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_layoutcommit_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size +
1 /* locr_newsize */ +
2 /* ns_size */) * sizeof(__be32);
}
-static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_layoutreturn_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size +
1 /* lrs_stateid */ +
@@ -2913,41 +3096,36 @@ static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_
#endif /* CONFIG_NFSD_PNFS */
-static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_seek_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + 3) * sizeof(__be32);
}
-static inline u32 nfsd4_getxattr_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_getxattr_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
- u32 maxcount, rlen;
-
- maxcount = svc_max_payload(rqstp);
- rlen = min_t(u32, XATTR_SIZE_MAX, maxcount);
+ u32 rlen = min_t(u32, XATTR_SIZE_MAX, nfsd4_max_payload(rqstp));
return (op_encode_hdr_size + 1 + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
-static inline u32 nfsd4_setxattr_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_setxattr_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz)
* sizeof(__be32);
}
-static inline u32 nfsd4_listxattrs_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_listxattrs_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
- u32 maxcount, rlen;
-
- maxcount = svc_max_payload(rqstp);
- rlen = min(op->u.listxattrs.lsxa_maxcount, maxcount);
+ u32 rlen = min(op->u.listxattrs.lsxa_maxcount, nfsd4_max_payload(rqstp));
return (op_encode_hdr_size + 4 + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
-static inline u32 nfsd4_removexattr_rsize(struct svc_rqst *rqstp,
- struct nfsd4_op *op)
+static u32 nfsd4_removexattr_rsize(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz)
* sizeof(__be32);
@@ -3436,6 +3614,7 @@ static const struct svc_procedure nfsd_procedures4[2] = {
.pc_decode = nfssvc_decode_voidarg,
.pc_encode = nfssvc_encode_voidres,
.pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_argzero = sizeof(struct nfsd_voidargs),
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 1,
@@ -3446,6 +3625,7 @@ static const struct svc_procedure nfsd_procedures4[2] = {
.pc_decode = nfs4svc_decode_compoundargs,
.pc_encode = nfs4svc_encode_compoundres,
.pc_argsize = sizeof(struct nfsd4_compoundargs),
+ .pc_argzero = offsetof(struct nfsd4_compoundargs, iops),
.pc_ressize = sizeof(struct nfsd4_compoundres),
.pc_release = nfsd4_release_compoundargs,
.pc_cachetype = RC_NOCACHE,
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index c634483d85d2..78b8cd9651d5 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -266,7 +266,7 @@ struct nfs4_dir_ctx {
struct list_head names;
};
-static int
+static bool
nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
@@ -275,14 +275,14 @@ nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen,
struct name_list *entry;
if (namlen != HEXDIR_LEN - 1)
- return 0;
+ return true;
entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
if (entry == NULL)
- return -ENOMEM;
+ return false;
memcpy(entry->name, name, HEXDIR_LEN - 1);
entry->name[HEXDIR_LEN - 1] = '\0';
list_add(&entry->list, &ctx->names);
- return 0;
+ return true;
}
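
The int-to-bool change follows the tree-wide filldir_t conversion:
directory-iteration actors now return true to continue and false to
stop, instead of 0/-errno, leaving any real error for the caller to
record in its own context. A minimal actor under the new contract
(struct name_count_ctx and its fields are hypothetical):

	static bool count_names(struct dir_context *ctx, const char *name,
				int namlen, loff_t offset, u64 ino,
				unsigned int d_type)
	{
		struct name_count_ctx *nctx =
			container_of(ctx, struct name_count_ctx, ctx);

		nctx->count++;	/* record this entry */
		return true;	/* false would terminate the iteration */
	}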
static int
@@ -807,16 +807,18 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
if (get_user(namelen, &ci->cc_name.cn_len))
return -EFAULT;
name.data = memdup_user(&ci->cc_name.cn_id, namelen);
- if (IS_ERR_OR_NULL(name.data))
- return -EFAULT;
+ if (IS_ERR(name.data))
+ return PTR_ERR(name.data);
name.len = namelen;
get_user(princhashlen, &ci->cc_princhash.cp_len);
if (princhashlen > 0) {
princhash.data = memdup_user(
&ci->cc_princhash.cp_data,
princhashlen);
- if (IS_ERR_OR_NULL(princhash.data))
- return -EFAULT;
+ if (IS_ERR(princhash.data)) {
+ kfree(name.data);
+ return PTR_ERR(princhash.data);
+ }
princhash.len = princhashlen;
} else
princhash.len = 0;
@@ -827,8 +829,8 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
if (get_user(namelen, &cnm->cn_len))
return -EFAULT;
name.data = memdup_user(&cnm->cn_id, namelen);
- if (IS_ERR_OR_NULL(name.data))
- return -EFAULT;
+ if (IS_ERR(name.data))
+ return PTR_ERR(name.data);
name.len = namelen;
}
if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) {
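
The IS_ERR_OR_NULL to IS_ERR conversions are sound because memdup_user()
never returns NULL: it yields either the copied buffer or an ERR_PTR
(typically -EFAULT or -ENOMEM), so PTR_ERR() propagates the real errno
instead of flattening everything to -EFAULT; the added kfree(name.data)
also plugs a leak when the second copy fails. The resulting pattern:

	buf = memdup_user(uptr, len);
	if (IS_ERR(buf))
		return PTR_ERR(buf);	/* -EFAULT, -ENOMEM, ... never NULL */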
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 234e852fcdfa..198d7abf34e4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -125,6 +125,23 @@ static void free_session(struct nfsd4_session *);
static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops;
+static struct workqueue_struct *laundry_wq;
+
+int nfsd4_create_laundry_wq(void)
+{
+ int rc = 0;
+
+ laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
+ if (laundry_wq == NULL)
+ rc = -ENOMEM;
+ return rc;
+}
+
+void nfsd4_destroy_laundry_wq(void)
+{
+ destroy_workqueue(laundry_wq);
+}
+
static bool is_session_dead(struct nfsd4_session *ses)
{
return ses->se_flags & NFS4_SESSION_DEAD;
@@ -143,6 +160,13 @@ static bool is_client_expired(struct nfs4_client *clp)
return clp->cl_time == 0;
}
+static void nfsd4_dec_courtesy_client_count(struct nfsd_net *nn,
+ struct nfs4_client *clp)
+{
+ if (clp->cl_state != NFSD4_ACTIVE)
+ atomic_add_unless(&nn->nfsd_courtesy_clients, -1, 0);
+}
+
static __be32 get_client_locked(struct nfs4_client *clp)
{
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -152,6 +176,8 @@ static __be32 get_client_locked(struct nfs4_client *clp)
if (is_client_expired(clp))
return nfserr_expired;
atomic_inc(&clp->cl_rpc_users);
+ nfsd4_dec_courtesy_client_count(nn, clp);
+ clp->cl_state = NFSD4_ACTIVE;
return nfs_ok;
}
@@ -172,6 +198,8 @@ renew_client_locked(struct nfs4_client *clp)
list_move_tail(&clp->cl_lru, &nn->client_lru);
clp->cl_time = ktime_get_boottime_seconds();
+ nfsd4_dec_courtesy_client_count(nn, clp);
+ clp->cl_state = NFSD4_ACTIVE;
}
static void put_client_renew_locked(struct nfs4_client *clp)
@@ -338,6 +366,8 @@ nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb)
static int
nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
+ trace_nfsd_cb_notify_lock_done(&zero_stateid, task);
+
/*
* Since this is just an optimization, we don't try very hard if it
* turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and
@@ -690,6 +720,57 @@ static unsigned int file_hashval(struct svc_fh *fh)
static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
+/*
+ * Check if courtesy clients have conflicting access and resolve it if possible
+ *
+ * access: is op_share_access if share_access is true.
+ * Check if access mode, op_share_access, would conflict with
+ * the current deny mode of the file 'fp'.
+ * access: is op_share_deny if share_access is false.
+ * Check if the deny mode, op_share_deny, would conflict with
+ * current access of the file 'fp'.
+ * stp: skip checking this entry.
+ * new_stp: normal open, not open upgrade.
+ *
+ * Function returns:
+ * false - access/deny mode conflict with normal client.
+ * true - no conflict or conflict with courtesy client(s) is resolved.
+ */
+static bool
+nfs4_resolve_deny_conflicts_locked(struct nfs4_file *fp, bool new_stp,
+ struct nfs4_ol_stateid *stp, u32 access, bool share_access)
+{
+ struct nfs4_ol_stateid *st;
+ bool resolvable = true;
+ unsigned char bmap;
+ struct nfsd_net *nn;
+ struct nfs4_client *clp;
+
+ lockdep_assert_held(&fp->fi_lock);
+ list_for_each_entry(st, &fp->fi_stateids, st_perfile) {
+ /* ignore lock stateid */
+ if (st->st_openstp)
+ continue;
+ if (st == stp && new_stp)
+ continue;
+ /* check file access against deny mode or vice versa */
+ bmap = share_access ? st->st_deny_bmap : st->st_access_bmap;
+ if (!(access & bmap_to_share_mode(bmap)))
+ continue;
+ clp = st->st_stid.sc_client;
+ if (try_to_expire_client(clp))
+ continue;
+ resolvable = false;
+ break;
+ }
+ if (resolvable) {
+ clp = stp->st_stid.sc_client;
+ nn = net_generic(clp->net, nfsd_net_id);
+ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
+ }
+ return resolvable;
+}
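+
+The net effect, as used by nfs4_get_vfs_file() and nfs4_upgrade_open()
+further down: when every conflicting opener is a courtesy client that
+try_to_expire_client() can mark expirable, the laundromat is kicked
+immediately and the OPEN fails with nfserr_jukebox (NFS4ERR_DELAY)
+rather than NFS4ERR_SHARE_DENIED, so the client retries once the stale
+state has been reaped. Caller-side sketch:
+
+	if (status == nfserr_share_denied &&
+	    nfs4_resolve_deny_conflicts_locked(fp, new_stp, stp,
+					       open->op_share_deny, false))
+		status = nfserr_jukebox;	/* retryable by the client */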
+
static void
__nfs4_file_get_access(struct nfs4_file *fp, u32 access)
{
@@ -750,9 +831,9 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
swap(f2, fp->fi_fds[O_RDWR]);
spin_unlock(&fp->fi_lock);
if (f1)
- nfsd_file_put(f1);
+ nfsd_file_close(f1);
if (f2)
- nfsd_file_put(f2);
+ nfsd_file_close(f2);
}
}
@@ -893,19 +974,19 @@ out_free:
* Create a unique stateid_t to represent each COPY.
*/
static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid,
- unsigned char sc_type)
+ unsigned char cs_type)
{
int new_id;
- stid->stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time;
- stid->stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id;
- stid->sc_type = sc_type;
+ stid->cs_stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time;
+ stid->cs_stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id;
+ stid->cs_type = cs_type;
idr_preload(GFP_KERNEL);
spin_lock(&nn->s2s_cp_lock);
new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT);
- stid->stid.si_opaque.so_id = new_id;
- stid->stid.si_generation = 1;
+ stid->cs_stid.si_opaque.so_id = new_id;
+ stid->cs_stid.si_generation = 1;
spin_unlock(&nn->s2s_cp_lock);
idr_preload_end();
if (new_id < 0)
@@ -927,7 +1008,7 @@ struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn,
if (!cps)
return NULL;
cps->cpntf_time = ktime_get_boottime_seconds();
- refcount_set(&cps->cp_stateid.sc_count, 1);
+ refcount_set(&cps->cp_stateid.cs_count, 1);
if (!nfs4_init_cp_state(nn, &cps->cp_stateid, NFS4_COPYNOTIFY_STID))
goto out_free;
spin_lock(&nn->s2s_cp_lock);
@@ -943,11 +1024,11 @@ void nfs4_free_copy_state(struct nfsd4_copy *copy)
{
struct nfsd_net *nn;
- WARN_ON_ONCE(copy->cp_stateid.sc_type != NFS4_COPY_STID);
+ WARN_ON_ONCE(copy->cp_stateid.cs_type != NFS4_COPY_STID);
nn = net_generic(copy->cp_clp->net, nfsd_net_id);
spin_lock(&nn->s2s_cp_lock);
idr_remove(&nn->s2s_cp_stateids,
- copy->cp_stateid.stid.si_opaque.so_id);
+ copy->cp_stateid.cs_stid.si_opaque.so_id);
spin_unlock(&nn->s2s_cp_lock);
}
@@ -979,6 +1060,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
static void nfs4_free_deleg(struct nfs4_stid *stid)
{
+ struct nfs4_delegation *dp = delegstateid(stid);
+
+ WARN_ON_ONCE(!list_empty(&stid->sc_cp_list));
+ WARN_ON_ONCE(!list_empty(&dp->dl_perfile));
+ WARN_ON_ONCE(!list_empty(&dp->dl_perclnt));
+ WARN_ON_ONCE(!list_empty(&dp->dl_recall_lru));
kmem_cache_free(deleg_slab, stid);
atomic_long_dec(&num_delegations);
}
@@ -1061,7 +1148,6 @@ static void block_delegations(struct knfsd_fh *fh)
static struct nfs4_delegation *
alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
- struct svc_fh *current_fh,
struct nfs4_clnt_odstate *odstate)
{
struct nfs4_delegation *dp;
@@ -1071,7 +1157,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
n = atomic_long_inc_return(&num_delegations);
if (n < 0 || n > max_delegations)
goto out_dec;
- if (delegation_blocked(&current_fh->fh_handle))
+ if (delegation_blocked(&fp->fi_fhandle))
goto out_dec;
dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg));
if (dp == NULL)
@@ -1090,6 +1176,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
get_clnt_odstate(odstate);
dp->dl_type = NFS4_OPEN_DELEGATE_READ;
dp->dl_retries = 1;
+ dp->dl_recalled = false;
nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
&nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
get_nfs4_file(fp);
@@ -1392,6 +1479,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
release_all_access(stp);
if (stp->st_stateowner)
nfs4_put_stateowner(stp->st_stateowner);
+ WARN_ON(!list_empty(&stid->sc_cp_list));
kmem_cache_free(stateid_slab, stid);
}
@@ -1982,11 +2070,16 @@ STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
* This type of memory management is somewhat inefficient, but we use it
* anyway since SETCLIENTID is not a common operation.
*/
-static struct nfs4_client *alloc_client(struct xdr_netobj name)
+static struct nfs4_client *alloc_client(struct xdr_netobj name,
+ struct nfsd_net *nn)
{
struct nfs4_client *clp;
int i;
+ if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) {
+ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
+ return NULL;
+ }
clp = kmem_cache_zalloc(client_slab, GFP_KERNEL);
if (clp == NULL)
return NULL;
@@ -2004,6 +2097,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
idr_init(&clp->cl_stateids);
atomic_set(&clp->cl_rpc_users, 0);
clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+ clp->cl_state = NFSD4_ACTIVE;
+ atomic_inc(&nn->nfs4_client_count);
+ atomic_set(&clp->cl_delegs_in_recall, 0);
INIT_LIST_HEAD(&clp->cl_idhash);
INIT_LIST_HEAD(&clp->cl_openowners);
INIT_LIST_HEAD(&clp->cl_delegations);
@@ -2110,6 +2206,7 @@ static __be32 mark_client_expired_locked(struct nfs4_client *clp)
static void
__destroy_client(struct nfs4_client *clp)
{
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
int i;
struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
@@ -2153,6 +2250,8 @@ __destroy_client(struct nfs4_client *clp)
nfsd4_shutdown_callback(clp);
if (clp->cl_cb_conn.cb_xprt)
svc_xprt_put(clp->cl_cb_conn.cb_xprt);
+ atomic_add_unless(&nn->nfs4_client_count, -1, 0);
+ nfsd4_dec_courtesy_client_count(nn, clp);
free_client(clp);
wake_up_all(&expiry_wq);
}
@@ -2398,7 +2497,7 @@ static const char *cb_state2str(int state)
static int client_info_show(struct seq_file *m, void *v)
{
- struct inode *inode = m->private;
+ struct inode *inode = file_inode(m->file);
struct nfs4_client *clp;
u64 clid;
@@ -2408,10 +2507,17 @@ static int client_info_show(struct seq_file *m, void *v)
memcpy(&clid, &clp->cl_clientid, sizeof(clid));
seq_printf(m, "clientid: 0x%llx\n", clid);
seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr);
- if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
+
+ if (clp->cl_state == NFSD4_COURTESY)
+ seq_puts(m, "status: courtesy\n");
+ else if (clp->cl_state == NFSD4_EXPIRABLE)
+ seq_puts(m, "status: expirable\n");
+ else if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
seq_puts(m, "status: confirmed\n");
else
seq_puts(m, "status: unconfirmed\n");
+ seq_printf(m, "seconds from last renew: %lld\n",
+ ktime_get_boottime_seconds() - clp->cl_time);
seq_printf(m, "name: ");
seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len);
seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion);
@@ -2431,17 +2537,7 @@ static int client_info_show(struct seq_file *m, void *v)
return 0;
}
-static int client_info_open(struct inode *inode, struct file *file)
-{
- return single_open(file, client_info_show, inode);
-}
-
-static const struct file_operations client_info_fops = {
- .open = client_info_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(client_info);
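
DEFINE_SHOW_ATTRIBUTE(client_info) generates exactly the boilerplate
removed above: an open routine wired through single_open() plus a
file_operations using seq_read/seq_lseek/single_release. Note that the
macro passes inode->i_private as the seq_file payload, which is why
client_info_show() now derives the inode from file_inode(m->file)
instead of m->private. Roughly, it expands to:

	static int client_info_open(struct inode *inode, struct file *file)
	{
		return single_open(file, client_info_show, inode->i_private);
	}

	static const struct file_operations client_info_fops = {
		.owner		= THIS_MODULE,
		.open		= client_info_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};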
static void *states_start(struct seq_file *s, loff_t *pos)
__acquires(&clp->cl_lock)
@@ -2484,7 +2580,7 @@ static void nfs4_show_fname(struct seq_file *s, struct nfsd_file *f)
static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f)
{
- struct inode *inode = f->nf_inode;
+ struct inode *inode = file_inode(f->nf_file);
seq_printf(s, "superblock: \"%02x:%02x:%ld\"",
MAJOR(inode->i_sb->s_dev),
@@ -2768,7 +2864,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct dentry *dentries[ARRAY_SIZE(client_files)];
- clp = alloc_client(name);
+ clp = alloc_client(name, nn);
if (clp == NULL)
return NULL;
@@ -4250,6 +4346,59 @@ out:
return -ENOMEM;
}
+static unsigned long
+nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cnt;
+ struct nfsd_net *nn = container_of(shrink,
+ struct nfsd_net, nfsd_client_shrinker);
+
+ cnt = atomic_read(&nn->nfsd_courtesy_clients);
+ if (cnt > 0)
+ mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
+ return (unsigned long)cnt;
+}
+
+static unsigned long
+nfsd_courtesy_client_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ return SHRINK_STOP;
+}
+
+int
+nfsd4_init_leases_net(struct nfsd_net *nn)
+{
+ struct sysinfo si;
+ u64 max_clients;
+
+ nn->nfsd4_lease = 90; /* default lease time */
+ nn->nfsd4_grace = 90;
+ nn->somebody_reclaimed = false;
+ nn->track_reclaim_completes = false;
+ nn->clverifier_counter = prandom_u32();
+ nn->clientid_base = prandom_u32();
+ nn->clientid_counter = nn->clientid_base + 1;
+ nn->s2s_cp_cl_id = nn->clientid_counter++;
+
+ atomic_set(&nn->nfs4_client_count, 0);
+ si_meminfo(&si);
+ max_clients = (u64)si.totalram * si.mem_unit / (1024 * 1024 * 1024);
+ max_clients *= NFS4_CLIENTS_PER_GB;
+ nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB);
+
+ atomic_set(&nn->nfsd_courtesy_clients, 0);
+ nn->nfsd_client_shrinker.scan_objects = nfsd_courtesy_client_scan;
+ nn->nfsd_client_shrinker.count_objects = nfsd_courtesy_client_count;
+ nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
+ return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client");
+}
+
+void
+nfsd4_leases_net_shutdown(struct nfsd_net *nn)
+{
+ unregister_shrinker(&nn->nfsd_client_shrinker);
+}
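+
+A worked example of the sizing in nfsd4_init_leases_net(), assuming
+NFS4_CLIENTS_PER_GB is 1024: a 16 GiB server gets nfs4_max_clients =
+16 * 1024 = 16384, while on a 512 MiB guest the integer division yields
+0 and max_t() enforces the one-gigabyte floor of 1024 clients.
+alloc_client() then refuses new clients over this cap and kicks the
+laundromat to reap courtesy clients first.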
+
static void init_nfs4_replay(struct nfs4_replay *rp)
{
rp->rp_status = nfserr_serverfault;
@@ -4607,6 +4756,35 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
return ret;
}
+static bool nfsd4_deleg_present(const struct inode *inode)
+{
+ struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx);
+
+ return ctx && !list_empty_careful(&ctx->flc_lease);
+}
+
+/**
+ * nfsd_wait_for_delegreturn - wait for delegations to be returned
+ * @rqstp: the RPC transaction being executed
+ * @inode: in-core inode of the file being waited for
+ *
+ * The timeout prevents deadlock if all nfsd threads happen to be
+ * tied up waiting for returning delegations.
+ *
+ * Return values:
+ * %true: delegation was returned
+ * %false: timed out waiting for delegreturn
+ */
+bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode)
+{
+ long __maybe_unused timeo;
+
+ timeo = wait_var_event_timeout(inode, !nfsd4_deleg_present(inode),
+ NFSD_DELEGRETURN_TIMEOUT);
+ trace_nfsd_delegret_wakeup(rqstp, inode, timeo);
+ return timeo > 0;
+}
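+
+wait_var_event_timeout() sleeps on a hashed waitqueue keyed by the
+address of its first argument, so the delegation-return path must pair
+it with a wake_up_var() on the same address once the last delegation on
+the inode goes away; the placement below is a sketch of that contract,
+not the literal call site:
+
+	/* delegation-return side: let waiters re-check the condition */
+	if (!nfsd4_deleg_present(inode))
+		wake_up_var(inode);	/* wakes nfsd_wait_for_delegreturn() */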
+
static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
{
struct nfs4_delegation *dp = cb_to_delegation(cb);
@@ -4635,6 +4813,8 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
{
struct nfs4_delegation *dp = cb_to_delegation(cb);
+ trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task);
+
if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID ||
dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID)
return 1;
@@ -4680,23 +4860,31 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
* We're assuming the state code never drops its reference
* without first removing the lease. Since we're in this lease
* callback (and since the lease code is serialized by the
- * i_lock) we know the server hasn't removed the lease yet, and
+ * flc_lock) we know the server hasn't removed the lease yet, and
* we know it's safe to take a reference.
*/
refcount_inc(&dp->dl_stid.sc_count);
- nfsd4_run_cb(&dp->dl_recall);
+ WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall));
}
-/* Called from break_lease() with i_lock held. */
+/* Called from break_lease() with flc_lock held. */
static bool
nfsd_break_deleg_cb(struct file_lock *fl)
{
- bool ret = false;
struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
struct nfs4_file *fp = dp->dl_stid.sc_file;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
+ struct nfsd_net *nn;
trace_nfsd_cb_recall(&dp->dl_stid);
+ dp->dl_recalled = true;
+ atomic_inc(&clp->cl_delegs_in_recall);
+ if (try_to_expire_client(clp)) {
+ nn = net_generic(clp->net, nfsd_net_id);
+ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
+ }
+
/*
* We don't want the locks code to timeout the lease for us;
* we'll remove it ourself if a delegation isn't returned
@@ -4708,7 +4896,7 @@ nfsd_break_deleg_cb(struct file_lock *fl)
fp->fi_had_conflict = true;
nfsd_break_one_deleg(dp);
spin_unlock(&fp->fi_lock);
- return ret;
+ return false;
}
/**
@@ -4739,9 +4927,14 @@ static int
nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
struct list_head *dispose)
{
- if (arg & F_UNLCK)
+ struct nfs4_delegation *dp = (struct nfs4_delegation *)onlist->fl_owner;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
+
+ if (arg & F_UNLCK) {
+ if (dp->dl_recalled)
+ atomic_dec(&clp->cl_delegs_in_recall);
return lease_modify(onlist, arg, dispose);
- else
+ } else
return -EAGAIN;
}
@@ -4938,16 +5131,19 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
.ia_valid = ATTR_SIZE,
.ia_size = 0,
};
+ struct nfsd_attrs attrs = {
+ .na_iattr = &iattr,
+ };
if (!open->op_truncate)
return 0;
if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
return nfserr_inval;
- return nfsd_setattr(rqstp, fh, &iattr, 0, (time64_t)0);
+ return nfsd_setattr(rqstp, fh, &attrs, 0, (time64_t)0);
}
static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
- struct nfsd4_open *open)
+ struct nfsd4_open *open, bool new_stp)
{
struct nfsd_file *nf = NULL;
__be32 status;
@@ -4963,6 +5159,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
*/
status = nfs4_file_check_deny(fp, open->op_share_deny);
if (status != nfs_ok) {
+ if (status != nfserr_share_denied) {
+ spin_unlock(&fp->fi_lock);
+ goto out;
+ }
+ if (nfs4_resolve_deny_conflicts_locked(fp, new_stp,
+ stp, open->op_share_deny, false))
+ status = nfserr_jukebox;
spin_unlock(&fp->fi_lock);
goto out;
}
@@ -4970,6 +5173,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
/* set access to the file */
status = nfs4_file_get_access(fp, open->op_share_access);
if (status != nfs_ok) {
+ if (status != nfserr_share_denied) {
+ spin_unlock(&fp->fi_lock);
+ goto out;
+ }
+ if (nfs4_resolve_deny_conflicts_locked(fp, new_stp,
+ stp, open->op_share_access, true))
+ status = nfserr_jukebox;
spin_unlock(&fp->fi_lock);
goto out;
}
@@ -4985,9 +5195,20 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
if (!fp->fi_fds[oflag]) {
spin_unlock(&fp->fi_lock);
- status = nfsd_file_acquire(rqstp, cur_fh, access, &nf);
- if (status)
- goto out_put_access;
+
+ if (!open->op_filp) {
+ status = nfsd_file_acquire(rqstp, cur_fh, access, &nf);
+ if (status != nfs_ok)
+ goto out_put_access;
+ } else {
+ status = nfsd_file_create(rqstp, cur_fh, access, &nf);
+ if (status != nfs_ok)
+ goto out_put_access;
+ nf->nf_file = open->op_filp;
+ open->op_filp = NULL;
+ trace_nfsd_file_create(rqstp, access, nf);
+ }
+
spin_lock(&fp->fi_lock);
if (!fp->fi_fds[oflag]) {
fp->fi_fds[oflag] = nf;
@@ -5016,21 +5237,29 @@ out_put_access:
}
static __be32
-nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
+nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp,
+ struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
+ struct nfsd4_open *open)
{
__be32 status;
unsigned char old_deny_bmap = stp->st_deny_bmap;
if (!test_access(open->op_share_access, stp))
- return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open);
+ return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open, false);
/* test and set deny mode */
spin_lock(&fp->fi_lock);
status = nfs4_file_check_deny(fp, open->op_share_deny);
- if (status == nfs_ok) {
- set_deny(open->op_share_deny, stp);
- fp->fi_share_deny |=
+ switch (status) {
+ case nfs_ok:
+ set_deny(open->op_share_deny, stp);
+ fp->fi_share_deny |=
(open->op_share_deny & NFS4_SHARE_DENY_BOTH);
+ break;
+ case nfserr_share_denied:
+ if (nfs4_resolve_deny_conflicts_locked(fp, false,
+ stp, open->op_share_deny, false))
+ status = nfserr_jukebox;
+ break;
}
spin_unlock(&fp->fi_lock);
@@ -5133,11 +5362,41 @@ static int nfsd4_check_conflicting_opens(struct nfs4_client *clp,
return 0;
}
+/*
+ * It's possible that between opening the dentry and setting the delegation,
+ * that it has been renamed or unlinked. Redo the lookup to verify that this
+ * hasn't happened.
+ */
+static int
+nfsd4_verify_deleg_dentry(struct nfsd4_open *open, struct nfs4_file *fp,
+ struct svc_fh *parent)
+{
+ struct svc_export *exp;
+ struct dentry *child;
+ __be32 err;
+
+ err = nfsd_lookup_dentry(open->op_rqstp, parent,
+ open->op_fname, open->op_fnamelen,
+ &exp, &child);
+
+ if (err)
+ return -EAGAIN;
+
+ dput(child);
+ if (child != file_dentry(fp->fi_deleg_file->nf_file))
+ return -EAGAIN;
+
+ return 0;
+}
+
static struct nfs4_delegation *
-nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
- struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate)
+nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
+ struct svc_fh *parent)
{
int status = 0;
+ struct nfs4_client *clp = stp->st_stid.sc_client;
+ struct nfs4_file *fp = stp->st_stid.sc_file;
+ struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate;
struct nfs4_delegation *dp;
struct nfsd_file *nf;
struct file_lock *fl;
@@ -5179,7 +5438,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
return ERR_PTR(status);
status = -ENOMEM;
- dp = alloc_init_deleg(clp, fp, fh, odstate);
+ dp = alloc_init_deleg(clp, fp, odstate);
if (!dp)
goto out_delegees;
@@ -5192,6 +5451,13 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
locks_free_lock(fl);
if (status)
goto out_clnt_odstate;
+
+ if (parent) {
+ status = nfsd4_verify_deleg_dentry(open, fp, parent);
+ if (status)
+ goto out_unlock;
+ }
+
status = nfsd4_check_conflicting_opens(clp, fp);
if (status)
goto out_unlock;
@@ -5247,12 +5513,13 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
* proper support for them.
*/
static void
-nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
- struct nfs4_ol_stateid *stp)
+nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
+ struct svc_fh *currentfh)
{
struct nfs4_delegation *dp;
struct nfs4_openowner *oo = openowner(stp->st_stateowner);
struct nfs4_client *clp = stp->st_stid.sc_client;
+ struct svc_fh *parent = NULL;
int cb_up;
int status = 0;
@@ -5266,6 +5533,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
goto out_no_deleg;
break;
case NFS4_OPEN_CLAIM_NULL:
+ parent = currentfh;
+ fallthrough;
case NFS4_OPEN_CLAIM_FH:
/*
* Let's not give out any delegations till everyone's
@@ -5280,7 +5549,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
default:
goto out_no_deleg;
}
- dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file, stp->st_clnt_odstate);
+ dp = nfs4_set_delegation(open, stp, parent);
if (IS_ERR(dp))
goto out_no_deleg;
@@ -5322,6 +5591,18 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
*/
}
+/**
+ * nfsd4_process_open2 - finish open processing
+ * @rqstp: the RPC transaction being executed
+ * @current_fh: NFSv4 COMPOUND's current filehandle
+ * @open: OPEN arguments
+ *
+ * If successful, (1) truncate the file if open->op_truncate was
+ * set, (2) set open->op_stateid, (3) set open->op_delegation.
+ *
+ * Returns %nfs_ok on success; otherwise an nfs4stat value in
+ * network byte order is returned.
+ */
__be32
nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
@@ -5371,7 +5652,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
goto out;
}
} else {
- status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
+ status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true);
if (status) {
stp->st_stid.sc_type = NFS4_CLOSED_STID;
release_open_stateid(stp);
@@ -5400,7 +5681,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
* Attempt to hand out a delegation. No error return, because the
* OPEN succeeds even if we fail.
*/
- nfs4_open_delegation(current_fh, open, stp);
+ nfs4_open_delegation(open, stp, &resp->cstate.current_fh);
nodeleg:
status = nfs_ok;
trace_nfsd_open(&stp->st_stid.sc_stateid);
@@ -5605,10 +5886,131 @@ static void nfsd4_ssc_expire_umount(struct nfsd_net *nn)
}
#endif
+/* Check if any lock belonging to this lockowner has any blockers */
+static bool
+nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo)
+{
+ struct file_lock_context *ctx;
+ struct nfs4_ol_stateid *stp;
+ struct nfs4_file *nf;
+
+ list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) {
+ nf = stp->st_stid.sc_file;
+ ctx = nf->fi_inode->i_flctx;
+ if (!ctx)
+ continue;
+ if (locks_owner_has_blockers(ctx, lo))
+ return true;
+ }
+ return false;
+}
+
+static bool
+nfs4_anylock_blockers(struct nfs4_client *clp)
+{
+ int i;
+ struct nfs4_stateowner *so;
+ struct nfs4_lockowner *lo;
+
+ if (atomic_read(&clp->cl_delegs_in_recall))
+ return true;
+ spin_lock(&clp->cl_lock);
+ for (i = 0; i < OWNER_HASH_SIZE; i++) {
+ list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[i],
+ so_strhash) {
+ if (so->so_is_open_owner)
+ continue;
+ lo = lockowner(so);
+ if (nfs4_lockowner_has_blockers(lo)) {
+ spin_unlock(&clp->cl_lock);
+ return true;
+ }
+ }
+ }
+ spin_unlock(&clp->cl_lock);
+ return false;
+}
+
+static void
+nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist,
+ struct laundry_time *lt)
+{
+ unsigned int maxreap, reapcnt = 0;
+ struct list_head *pos, *next;
+ struct nfs4_client *clp;
+
+ maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ?
+ NFSD_CLIENT_MAX_TRIM_PER_RUN : 0;
+ INIT_LIST_HEAD(reaplist);
+ spin_lock(&nn->client_lock);
+ list_for_each_safe(pos, next, &nn->client_lru) {
+ clp = list_entry(pos, struct nfs4_client, cl_lru);
+ if (clp->cl_state == NFSD4_EXPIRABLE)
+ goto exp_client;
+ if (!state_expired(lt, clp->cl_time))
+ break;
+ if (!atomic_read(&clp->cl_rpc_users)) {
+ if (clp->cl_state == NFSD4_ACTIVE)
+ atomic_inc(&nn->nfsd_courtesy_clients);
+ clp->cl_state = NFSD4_COURTESY;
+ }
+ if (!client_has_state(clp))
+ goto exp_client;
+ if (!nfs4_anylock_blockers(clp))
+ if (reapcnt >= maxreap)
+ continue;
+exp_client:
+ if (!mark_client_expired_locked(clp)) {
+ list_add(&clp->cl_lru, reaplist);
+ reapcnt++;
+ }
+ }
+ spin_unlock(&nn->client_lock);
+}
+
+static void
+nfs4_get_courtesy_client_reaplist(struct nfsd_net *nn,
+ struct list_head *reaplist)
+{
+ unsigned int maxreap = 0, reapcnt = 0;
+ struct list_head *pos, *next;
+ struct nfs4_client *clp;
+
+ maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN;
+ INIT_LIST_HEAD(reaplist);
+
+ spin_lock(&nn->client_lock);
+ list_for_each_safe(pos, next, &nn->client_lru) {
+ clp = list_entry(pos, struct nfs4_client, cl_lru);
+ if (clp->cl_state == NFSD4_ACTIVE)
+ break;
+ if (reapcnt >= maxreap)
+ break;
+ if (!mark_client_expired_locked(clp)) {
+ list_add(&clp->cl_lru, reaplist);
+ reapcnt++;
+ }
+ }
+ spin_unlock(&nn->client_lock);
+}
+
+static void
+nfs4_process_client_reaplist(struct list_head *reaplist)
+{
+ struct list_head *pos, *next;
+ struct nfs4_client *clp;
+
+ list_for_each_safe(pos, next, reaplist) {
+ clp = list_entry(pos, struct nfs4_client, cl_lru);
+ trace_nfsd_clid_purged(&clp->cl_clientid);
+ list_del_init(&clp->cl_lru);
+ expire_client(clp);
+ }
+}
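
The three helpers above split client expiry into the usual two-phase teardown: candidates are moved onto a private reaplist while nn->client_lock is held, and the blocking expire_client() calls run only after the lock is dropped (mark_client_expired_locked() unhashes the client, which takes it off the LRU, so the list_add() is safe). A condensed sketch of the pattern, for reference only:

    static void reap_expired_clients(struct nfsd_net *nn)
    {
            struct nfs4_client *clp, *next;
            LIST_HEAD(reaplist);

            spin_lock(&nn->client_lock);
            list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru)
                    if (!mark_client_expired_locked(clp))
                            list_add(&clp->cl_lru, &reaplist);
            spin_unlock(&nn->client_lock);

            /* no locks held: expire_client() may block */
            list_for_each_entry_safe(clp, next, &reaplist, cl_lru) {
                    list_del_init(&clp->cl_lru);
                    expire_client(clp);
            }
    }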
+
static time64_t
nfs4_laundromat(struct nfsd_net *nn)
{
- struct nfs4_client *clp;
struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
struct nfs4_ol_stateid *stp;
@@ -5627,33 +6029,18 @@ nfs4_laundromat(struct nfsd_net *nn)
goto out;
}
nfsd4_end_grace(nn);
- INIT_LIST_HEAD(&reaplist);
spin_lock(&nn->s2s_cp_lock);
idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid);
- if (cps->cp_stateid.sc_type == NFS4_COPYNOTIFY_STID &&
+ if (cps->cp_stateid.cs_type == NFS4_COPYNOTIFY_STID &&
state_expired(&lt, cps->cpntf_time))
_free_cpntf_state_locked(nn, cps);
}
spin_unlock(&nn->s2s_cp_lock);
+ nfs4_get_client_reaplist(nn, &reaplist, &lt);
+ nfs4_process_client_reaplist(&reaplist);
- spin_lock(&nn->client_lock);
- list_for_each_safe(pos, next, &nn->client_lru) {
- clp = list_entry(pos, struct nfs4_client, cl_lru);
- if (!state_expired(&lt, clp->cl_time))
- break;
- if (mark_client_expired_locked(clp))
- continue;
- list_add(&clp->cl_lru, &reaplist);
- }
- spin_unlock(&nn->client_lock);
- list_for_each_safe(pos, next, &reaplist) {
- clp = list_entry(pos, struct nfs4_client, cl_lru);
- trace_nfsd_clid_purged(&clp->cl_clientid);
- list_del_init(&clp->cl_lru);
- expire_client(clp);
- }
spin_lock(&state_lock);
list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
@@ -5722,7 +6109,6 @@ out:
return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
}
-static struct workqueue_struct *laundry_wq;
static void laundromat_main(struct work_struct *);
static void
@@ -5737,6 +6123,18 @@ laundromat_main(struct work_struct *laundry)
queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ);
}
+static void
+courtesy_client_reaper(struct work_struct *reaper)
+{
+ struct list_head reaplist;
+ struct delayed_work *dwork = to_delayed_work(reaper);
+ struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
+ nfsd_shrinker_work);
+
+ nfs4_get_courtesy_client_reaplist(nn, &reaplist);
+ nfs4_process_client_reaplist(&reaplist);
+}
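
courtesy_client_reaper() runs off the new nfsd_shrinker_work delayed work; the shrinker that queues it is wired up elsewhere in the series. A hypothetical sketch of that trigger (the nfsd_client_shrinker field name is assumed, not shown in this diff):

    /* Hypothetical: a shrinker callback kicking the courtesy reaper. */
    static unsigned long
    nfsd_courtesy_client_scan(struct shrinker *shrink, struct shrink_control *sc)
    {
            struct nfsd_net *nn = container_of(shrink, struct nfsd_net,
                                               nfsd_client_shrinker);

            /* run courtesy_client_reaper() on laundry_wq right away */
            mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
            return SHRINK_STOP;
    }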
+
static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
{
if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle))
@@ -5872,6 +6270,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
struct nfs4_stid **s, struct nfsd_net *nn)
{
__be32 status;
+ struct nfs4_stid *stid;
bool return_revoked = false;
/*
@@ -5894,15 +6293,16 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
}
if (status)
return status;
- *s = find_stateid_by_type(cstate->clp, stateid, typemask);
- if (!*s)
+ stid = find_stateid_by_type(cstate->clp, stateid, typemask);
+ if (!stid)
return nfserr_bad_stateid;
- if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) {
- nfs4_put_stid(*s);
+ if ((stid->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) {
+ nfs4_put_stid(stid);
if (cstate->minorversion)
return nfserr_deleg_revoked;
return nfserr_bad_stateid;
}
+ *s = stid;
return nfs_ok;
}
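
With the temporary stid, the *s out-parameter is assigned only on the success path; previously *s was set before the revoked-delegation check, so that error path could leave the caller holding a pointer whose reference had already been put. Caller-side, the contract now looks like this (illustrative):

    struct nfs4_stid *s = NULL;

    status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
    if (status)
            return status;  /* s untouched: still NULL */
    /* s is valid, and referenced, only when nfs_ok was returned */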
@@ -5967,12 +6367,12 @@ out:
static void
_free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps)
{
- WARN_ON_ONCE(cps->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID);
- if (!refcount_dec_and_test(&cps->cp_stateid.sc_count))
+ WARN_ON_ONCE(cps->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID);
+ if (!refcount_dec_and_test(&cps->cp_stateid.cs_count))
return;
list_del(&cps->cp_list);
idr_remove(&nn->s2s_cp_stateids,
- cps->cp_stateid.stid.si_opaque.so_id);
+ cps->cp_stateid.cs_stid.si_opaque.so_id);
kfree(cps);
}
/*
@@ -5994,12 +6394,12 @@ __be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st,
if (cps_t) {
state = container_of(cps_t, struct nfs4_cpntf_state,
cp_stateid);
- if (state->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID) {
+ if (state->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID) {
state = NULL;
goto unlock;
}
if (!clp)
- refcount_inc(&state->cp_stateid.sc_count);
+ refcount_inc(&state->cp_stateid.cs_count);
else
_free_cpntf_state_locked(nn, state);
}
@@ -6407,6 +6807,7 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
struct nfs4_client *clp = s->st_stid.sc_client;
bool unhashed;
LIST_HEAD(reaplist);
+ struct nfs4_ol_stateid *stp;
spin_lock(&clp->cl_lock);
unhashed = unhash_open_stateid(s, &reaplist);
@@ -6415,6 +6816,8 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
if (unhashed)
put_ol_stateid_locked(s, &reaplist);
spin_unlock(&clp->cl_lock);
+ list_for_each_entry(stp, &reaplist, st_locks)
+ nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
free_ol_stateid_reaplist(&reaplist);
} else {
spin_unlock(&clp->cl_lock);
@@ -6498,6 +6901,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto put_stateid;
+ wake_up_var(d_inode(cstate->current_fh.fh_dentry));
destroy_delegation(dp);
put_stateid:
nfs4_put_stid(&dp->dl_stid);
@@ -6551,6 +6955,29 @@ nfsd4_lm_put_owner(fl_owner_t owner)
nfs4_put_stateowner(&lo->lo_owner);
}
+/* return true (and kick the laundromat) if the lock's owner is an expirable client */
+static bool
+nfsd4_lm_lock_expirable(struct file_lock *cfl)
+{
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)cfl->fl_owner;
+ struct nfs4_client *clp = lo->lo_owner.so_client;
+ struct nfsd_net *nn;
+
+ if (try_to_expire_client(clp)) {
+ nn = net_generic(clp->net, nfsd_net_id);
+ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
+ return true;
+ }
+ return false;
+}
+
+/* wait for the laundromat to complete; nfsd4_lm_lock_expirable() scheduled it */
+static void
+nfsd4_lm_expire_lock(void)
+{
+ flush_workqueue(laundry_wq);
+}
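
These two hooks let the generic file-locking code resolve a conflict against a courtesy client instead of failing the request outright. A rough sketch of the consumer side, assuming a helper in fs/locks.c along these lines (not shown in this excerpt):

    /* Assumed consumer: try to expire the owner of a conflicting lock. */
    static bool try_expire_lock_owner(struct file_lock *conflict)
    {
            const struct lock_manager_operations *ops = conflict->fl_lmops;

            if (ops && ops->lm_lock_expirable &&
                ops->lm_lock_expirable(conflict)) {
                    /* laundromat was queued; wait for it to reap, then
                     * let the caller retry with no spinlocks held */
                    ops->lm_expire_lock();
                    return true;
            }
            return false;
    }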
+
static void
nfsd4_lm_notify(struct file_lock *fl)
{
@@ -6577,9 +7004,12 @@ nfsd4_lm_notify(struct file_lock *fl)
}
static const struct lock_manager_operations nfsd_posix_mng_ops = {
+ .lm_mod_owner = THIS_MODULE,
.lm_notify = nfsd4_lm_notify,
.lm_get_owner = nfsd4_lm_get_owner,
.lm_put_owner = nfsd4_lm_put_owner,
+ .lm_lock_expirable = nfsd4_lm_lock_expirable,
+ .lm_expire_lock = nfsd4_lm_expire_lock,
};
static inline void
@@ -7094,21 +7524,22 @@ out:
static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
{
struct nfsd_file *nf;
+ struct inode *inode;
__be32 err;
err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
if (err)
return err;
- fh_lock(fhp); /* to block new leases till after test_lock: */
- err = nfserrno(nfsd_open_break_lease(fhp->fh_dentry->d_inode,
- NFSD_MAY_READ));
+ inode = fhp->fh_dentry->d_inode;
+ inode_lock(inode); /* to block new leases till after test_lock: */
+ err = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
if (err)
goto out;
lock->fl_file = nf->nf_file;
err = nfserrno(vfs_test_lock(nf->nf_file, lock));
lock->fl_file = NULL;
out:
- fh_unlock(fhp);
+ inode_unlock(inode);
nfsd_file_put(nf);
return err;
}
@@ -7297,22 +7728,36 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
return status;
}
+/**
+ * nfsd4_release_lockowner - process NFSv4.0 RELEASE_LOCKOWNER operations
+ * @rqstp: RPC transaction
+ * @cstate: NFSv4 COMPOUND state
+ * @u: RELEASE_LOCKOWNER arguments
+ *
+ * The lockowner's so_count is bumped when a lock record is added
+ * or when copying a conflicting lock. The latter case is brief,
+ * but can lead to fleeting false positives when looking for
+ * locks-in-use.
+ *
+ * Return values:
+ * %nfs_ok: lockowner released or not found
+ * %nfserr_locks_held: lockowner still in use
+ * %nfserr_stale_clientid: clientid no longer active
+ * %nfserr_expired: clientid not recognized
+ */
__be32
nfsd4_release_lockowner(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
clientid_t *clid = &rlockowner->rl_clientid;
- struct nfs4_stateowner *sop;
- struct nfs4_lockowner *lo = NULL;
struct nfs4_ol_stateid *stp;
- struct xdr_netobj *owner = &rlockowner->rl_owner;
- unsigned int hashval = ownerstr_hashval(owner);
- __be32 status;
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfs4_lockowner *lo;
struct nfs4_client *clp;
- LIST_HEAD (reaplist);
+ LIST_HEAD(reaplist);
+ __be32 status;
dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
clid->cl_boot, clid->cl_id);
@@ -7320,34 +7765,19 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
status = set_client(clid, cstate, nn);
if (status)
return status;
-
clp = cstate->clp;
- /* Find the matching lock stateowner */
- spin_lock(&clp->cl_lock);
- list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval],
- so_strhash) {
-
- if (sop->so_is_open_owner || !same_owner_str(sop, owner))
- continue;
- /* see if there are still any locks associated with it */
- lo = lockowner(sop);
- list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) {
- if (check_for_locks(stp->st_stid.sc_file, lo)) {
- status = nfserr_locks_held;
- spin_unlock(&clp->cl_lock);
- return status;
- }
- }
-
- nfs4_get_stateowner(sop);
- break;
- }
+ spin_lock(&clp->cl_lock);
+ lo = find_lockowner_str_locked(clp, &rlockowner->rl_owner);
if (!lo) {
spin_unlock(&clp->cl_lock);
- return status;
+ return nfs_ok;
+ }
+ if (atomic_read(&lo->lo_owner.so_count) != 2) {
+ spin_unlock(&clp->cl_lock);
+ nfs4_put_stateowner(&lo->lo_owner);
+ return nfserr_locks_held;
}
-
unhash_lockowner_locked(lo);
while (!list_empty(&lo->lo_owner.so_stateids)) {
stp = list_first_entry(&lo->lo_owner.so_stateids,
@@ -7357,11 +7787,11 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
put_ol_stateid_locked(stp, &reaplist);
}
spin_unlock(&clp->cl_lock);
+
free_ol_stateid_reaplist(&reaplist);
remove_blocked_locks(lo);
nfs4_put_stateowner(&lo->lo_owner);
-
- return status;
+ return nfs_ok;
}
static inline struct nfs4_client_reclaim *
@@ -7527,6 +7957,7 @@ static int nfs4_state_create_net(struct net *net)
INIT_LIST_HEAD(&nn->blocked_locks_lru);
INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
+ INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, courtesy_client_reaper);
get_net(net);
return 0;
@@ -7602,22 +8033,12 @@ nfs4_state_start(void)
{
int ret;
- laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
- if (laundry_wq == NULL) {
- ret = -ENOMEM;
- goto out;
- }
ret = nfsd4_create_callback_queue();
if (ret)
- goto out_free_laundry;
+ return ret;
set_max_delegations();
return 0;
-
-out_free_laundry:
- destroy_workqueue(laundry_wq);
-out:
- return ret;
}
void
@@ -7654,7 +8075,6 @@ nfs4_state_shutdown_net(struct net *net)
void
nfs4_state_shutdown(void)
{
- destroy_workqueue(laundry_wq);
nfsd4_destroy_callback_queue();
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index da92e7d2ab6a..bcfeb1a922c0 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -42,6 +42,8 @@
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/addr.h>
#include <linux/xattr.h>
+#include <linux/vmalloc.h>
+
#include <uapi/linux/xattr.h>
#include "idmap.h"
@@ -470,6 +472,15 @@ nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen,
return nfserr_bad_xdr;
}
}
+ if (bmval[1] & FATTR4_WORD1_TIME_CREATE) {
+ struct timespec64 ts;
+
+ /* No Linux filesystem supports setting this attribute. */
+ bmval[1] &= ~FATTR4_WORD1_TIME_CREATE;
+ status = nfsd4_decode_nfstime4(argp, &ts);
+ if (status)
+ return status;
+ }
if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
u32 set_it;
@@ -782,6 +793,7 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit
return nfserr_bad_xdr;
if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0)
return nfserr_bad_xdr;
+ memset(&commit->co_verf, 0, sizeof(commit->co_verf));
return nfs_ok;
}
@@ -790,6 +802,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
{
__be32 *p, status;
+ memset(create, 0, sizeof(*create));
if (xdr_stream_decode_u32(argp->xdr, &create->cr_type) < 0)
return nfserr_bad_xdr;
switch (create->cr_type) {
@@ -839,6 +852,7 @@ nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegretu
static inline __be32
nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr)
{
+ memset(getattr, 0, sizeof(*getattr));
return nfsd4_decode_bitmap4(argp, getattr->ga_bmval,
ARRAY_SIZE(getattr->ga_bmval));
}
@@ -846,6 +860,7 @@ nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *geta
static __be32
nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
{
+ memset(link, 0, sizeof(*link));
return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen);
}
@@ -894,6 +909,7 @@ nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
static __be32
nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
{
+ memset(lock, 0, sizeof(*lock));
if (xdr_stream_decode_u32(argp->xdr, &lock->lk_type) < 0)
return nfserr_bad_xdr;
if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT))
@@ -910,6 +926,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
static __be32
nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
{
+ memset(lockt, 0, sizeof(*lockt));
if (xdr_stream_decode_u32(argp->xdr, &lockt->lt_type) < 0)
return nfserr_bad_xdr;
if ((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT))
@@ -1131,11 +1148,8 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
__be32 status;
u32 dummy;
- memset(open->op_bmval, 0, sizeof(open->op_bmval));
- open->op_iattr.ia_valid = 0;
- open->op_openowner = NULL;
+ memset(open, 0, sizeof(*open));
- open->op_xdr_error = 0;
if (xdr_stream_decode_u32(argp->xdr, &open->op_seqid) < 0)
return nfserr_bad_xdr;
/* deleg_want is ignored */
@@ -1170,6 +1184,8 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
if (xdr_stream_decode_u32(argp->xdr, &open_conf->oc_seqid) < 0)
return nfserr_bad_xdr;
+ memset(&open_conf->oc_resp_stateid, 0,
+ sizeof(open_conf->oc_resp_stateid));
return nfs_ok;
}
@@ -1178,6 +1194,7 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
{
__be32 status;
+ memset(open_down, 0, sizeof(*open_down));
status = nfsd4_decode_stateid4(argp, &open_down->od_stateid);
if (status)
return status;
@@ -1207,6 +1224,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
if (!putfh->pf_fhval)
return nfserr_jukebox;
+ putfh->no_verify = false;
return nfs_ok;
}
@@ -1223,6 +1241,7 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
{
__be32 status;
+ memset(read, 0, sizeof(*read));
status = nfsd4_decode_stateid4(argp, &read->rd_stateid);
if (status)
return status;
@@ -1239,6 +1258,7 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read
{
__be32 status;
+ memset(readdir, 0, sizeof(*readdir));
if (xdr_stream_decode_u64(argp->xdr, &readdir->rd_cookie) < 0)
return nfserr_bad_xdr;
status = nfsd4_decode_verifier4(argp, &readdir->rd_verf);
@@ -1258,6 +1278,7 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read
static __be32
nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove)
{
+ memset(&remove->rm_cinfo, 0, sizeof(remove->rm_cinfo));
return nfsd4_decode_component4(argp, &remove->rm_name, &remove->rm_namelen);
}
@@ -1266,6 +1287,7 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
{
__be32 status;
+ memset(rename, 0, sizeof(*rename));
status = nfsd4_decode_component4(argp, &rename->rn_sname, &rename->rn_snamelen);
if (status)
return status;
@@ -1282,6 +1304,7 @@ static __be32
nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
struct nfsd4_secinfo *secinfo)
{
+ secinfo->si_exp = NULL;
return nfsd4_decode_component4(argp, &secinfo->si_name, &secinfo->si_namelen);
}
@@ -1290,6 +1313,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
{
__be32 status;
+ memset(setattr, 0, sizeof(*setattr));
status = nfsd4_decode_stateid4(argp, &setattr->sa_stateid);
if (status)
return status;
@@ -1304,6 +1328,8 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
{
__be32 *p, status;
+ memset(setclientid, 0, sizeof(*setclientid));
+
if (argp->minorversion >= 1)
return nfserr_notsupp;
@@ -1360,6 +1386,8 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify
{
__be32 *p, status;
+ memset(verify, 0, sizeof(*verify));
+
status = nfsd4_decode_bitmap4(argp, verify->ve_bmval,
ARRAY_SIZE(verify->ve_bmval));
if (status)
@@ -1399,6 +1427,9 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
if (!xdr_stream_subsegment(argp->xdr, &write->wr_payload, write->wr_buflen))
return nfserr_bad_xdr;
+ write->wr_bytes_written = 0;
+ write->wr_how_written = 0;
+ memset(&write->wr_verifier, 0, sizeof(write->wr_verifier));
return nfs_ok;
}
@@ -1423,6 +1454,7 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
{
+ memset(bc, 0, sizeof(*bc));
if (xdr_stream_decode_u32(argp->xdr, &bc->bc_cb_program) < 0)
return nfserr_bad_xdr;
return nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
@@ -1433,6 +1465,7 @@ static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp,
u32 use_conn_in_rdma_mode;
__be32 status;
+ memset(bcts, 0, sizeof(*bcts));
status = nfsd4_decode_sessionid4(argp, &bcts->sessionid);
if (status)
return status;
@@ -1574,6 +1607,7 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
{
__be32 status;
+ memset(exid, 0, sizeof(*exid));
status = nfsd4_decode_verifier4(argp, &exid->verifier);
if (status)
return status;
@@ -1626,6 +1660,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
{
__be32 status;
+ memset(sess, 0, sizeof(*sess));
status = nfsd4_decode_clientid4(argp, &sess->clientid);
if (status)
return status;
@@ -1641,11 +1676,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
return status;
if (xdr_stream_decode_u32(argp->xdr, &sess->callback_prog) < 0)
return nfserr_bad_xdr;
- status = nfsd4_decode_cb_sec(argp, &sess->cb_sec);
- if (status)
- return status;
-
- return nfs_ok;
+ return nfsd4_decode_cb_sec(argp, &sess->cb_sec);
}
static __be32
@@ -1669,6 +1700,7 @@ nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
{
__be32 status;
+ memset(gdev, 0, sizeof(*gdev));
status = nfsd4_decode_deviceid4(argp, &gdev->gd_devid);
if (status)
return status;
@@ -1689,6 +1721,7 @@ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
{
__be32 *p, status;
+ memset(lcp, 0, sizeof(*lcp));
if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.offset) < 0)
return nfserr_bad_xdr;
if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.length) < 0)
@@ -1724,6 +1757,7 @@ nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
{
__be32 status;
+ memset(lgp, 0, sizeof(*lgp));
if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_signal) < 0)
return nfserr_bad_xdr;
if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_layout_type) < 0)
@@ -1749,6 +1783,7 @@ static __be32
nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
struct nfsd4_layoutreturn *lrp)
{
+ memset(lrp, 0, sizeof(*lrp));
if (xdr_stream_decode_bool(argp->xdr, &lrp->lr_reclaim) < 0)
return nfserr_bad_xdr;
if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_layout_type) < 0)
@@ -1764,6 +1799,8 @@ static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
{
if (xdr_stream_decode_u32(argp->xdr, &sin->sin_style) < 0)
return nfserr_bad_xdr;
+
+ sin->sin_exp = NULL;
return nfs_ok;
}
@@ -1784,6 +1821,7 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
seq->maxslots = be32_to_cpup(p++);
seq->cachethis = be32_to_cpup(p);
+ seq->status_flags = 0;
return nfs_ok;
}
@@ -1794,6 +1832,7 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta
__be32 status;
u32 i;
+ memset(test_stateid, 0, sizeof(*test_stateid));
if (xdr_stream_decode_u32(argp->xdr, &test_stateid->ts_num_ids) < 0)
return nfserr_bad_xdr;
@@ -1801,7 +1840,7 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta
for (i = 0; i < test_stateid->ts_num_ids; i++) {
stateid = svcxdr_tmpalloc(argp, sizeof(*stateid));
if (!stateid)
- return nfserrno(-ENOMEM); /* XXX: not jukebox? */
+ return nfserr_jukebox;
INIT_LIST_HEAD(&stateid->ts_id_list);
list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list);
status = nfsd4_decode_stateid4(argp, &stateid->ts_id_stateid);
@@ -1887,10 +1926,11 @@ static __be32 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
{
+ u32 consecutive, i, count, sync;
struct nl4_server *ns_dummy;
- u32 consecutive, i, count;
__be32 status;
+ memset(copy, 0, sizeof(*copy));
status = nfsd4_decode_stateid4(argp, &copy->cp_src_stateid);
if (status)
return status;
@@ -1906,25 +1946,28 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
/* ca_consecutive: we always do consecutive copies */
if (xdr_stream_decode_u32(argp->xdr, &consecutive) < 0)
return nfserr_bad_xdr;
- if (xdr_stream_decode_u32(argp->xdr, &copy->cp_synchronous) < 0)
+ if (xdr_stream_decode_bool(argp->xdr, &sync) < 0)
return nfserr_bad_xdr;
+ nfsd4_copy_set_sync(copy, sync);
if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
return nfserr_bad_xdr;
- copy->cp_intra = false;
+ copy->cp_src = svcxdr_tmpalloc(argp, sizeof(*copy->cp_src));
+ if (copy->cp_src == NULL)
+ return nfserr_jukebox;
if (count == 0) { /* intra-server copy */
- copy->cp_intra = true;
+ __set_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
return nfs_ok;
}
/* decode all the supplied server addresses but use only the first */
- status = nfsd4_decode_nl4_server(argp, &copy->cp_src);
+ status = nfsd4_decode_nl4_server(argp, copy->cp_src);
if (status)
return status;
ns_dummy = kmalloc(sizeof(struct nl4_server), GFP_KERNEL);
if (ns_dummy == NULL)
- return nfserrno(-ENOMEM); /* XXX: jukebox? */
+ return nfserr_jukebox;
for (i = 0; i < count - 1; i++) {
status = nfsd4_decode_nl4_server(argp, ns_dummy);
if (status) {
@@ -1943,16 +1986,26 @@ nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
{
__be32 status;
+ memset(cn, 0, sizeof(*cn));
+ cn->cpn_src = svcxdr_tmpalloc(argp, sizeof(*cn->cpn_src));
+ if (cn->cpn_src == NULL)
+ return nfserr_jukebox;
+ cn->cpn_dst = svcxdr_tmpalloc(argp, sizeof(*cn->cpn_dst));
+ if (cn->cpn_dst == NULL)
+ return nfserr_jukebox;
+
status = nfsd4_decode_stateid4(argp, &cn->cpn_src_stateid);
if (status)
return status;
- return nfsd4_decode_nl4_server(argp, &cn->cpn_dst);
+ return nfsd4_decode_nl4_server(argp, cn->cpn_dst);
}
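
The decoders lean on svcxdr_tmpalloc() so none of these allocations need an error-path kfree(): buffers are chained off the compound and released in bulk after the reply is sent, which the to_free loop in nfsd4_release_compoundargs() below confirms. A simplified sketch of the helper:

    /* Simplified: one-shot allocations tied to the compound's lifetime. */
    static void *svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len)
    {
            struct svcxdr_tmpbuf *tb;

            tb = kmalloc(sizeof(*tb) + len, GFP_KERNEL);
            if (!tb)
                    return NULL;
            tb->next = argp->to_free;       /* chain for bulk free */
            argp->to_free = tb;
            return tb->buf;
    }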
static __be32
nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp,
struct nfsd4_offload_status *os)
{
+ os->count = 0;
+ os->status = 0;
return nfsd4_decode_stateid4(argp, &os->stateid);
}
@@ -1969,6 +2022,8 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
if (xdr_stream_decode_u32(argp->xdr, &seek->seek_whence) < 0)
return nfserr_bad_xdr;
+ seek->seek_eof = 0;
+ seek->seek_pos = 0;
return nfs_ok;
}
@@ -2104,6 +2159,7 @@ nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp,
__be32 status;
u32 maxcount;
+ memset(getxattr, 0, sizeof(*getxattr));
status = nfsd4_decode_xattr_name(argp, &getxattr->getxa_name);
if (status)
return status;
@@ -2112,8 +2168,7 @@ nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp,
maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount);
getxattr->getxa_len = maxcount;
-
- return status;
+ return nfs_ok;
}
static __be32
@@ -2123,6 +2178,8 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp,
u32 flags, maxcount, size;
__be32 status;
+ memset(setxattr, 0, sizeof(*setxattr));
+
if (xdr_stream_decode_u32(argp->xdr, &flags) < 0)
return nfserr_bad_xdr;
@@ -2161,6 +2218,8 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
{
u32 maxcount;
+ memset(listxattrs, 0, sizeof(*listxattrs));
+
if (xdr_stream_decode_u64(argp->xdr, &listxattrs->lsxa_cookie) < 0)
return nfserr_bad_xdr;
@@ -2188,6 +2247,7 @@ static __be32
nfsd4_decode_removexattr(struct nfsd4_compoundargs *argp,
struct nfsd4_removexattr *removexattr)
{
+ memset(removexattr, 0, sizeof(*removexattr));
return nfsd4_decode_xattr_name(argp, &removexattr->rmxa_name);
}
@@ -2338,22 +2398,15 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0)
return false;
- if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0)
+ if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0)
return false;
-
- /*
- * NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS
- * here, so we return success at the xdr level so that
- * nfsd4_proc can handle this is an NFS-level error.
- */
- if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
- return true;
+ argp->opcnt = min_t(u32, argp->client_opcnt,
+ NFSD_MAX_OPS_PER_COMPOUND);
if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
- argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
+ argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops));
if (!argp->ops) {
argp->ops = argp->iops;
- dprintk("nfsd: couldn't allocate room for COMPOUND\n");
return false;
}
}
@@ -2411,7 +2464,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
- clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
+ __clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
return true;
}
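
Two things change here: the client's advertised operation count is preserved in client_opcnt while opcnt is clamped to NFSD_MAX_OPS_PER_COMPOUND, and the overflow array moves from kzalloc() to vcalloc()/vfree(). vcalloc() (from <linux/vmalloc.h>, now included at the top of this file) rejects n * size multiplication overflow and allocates from vmalloc space, which suits a client-controlled count. Illustrative shape of the contract:

    #include <linux/vmalloc.h>

    /* Illustrative: overflow-safe array allocation for an untrusted n. */
    static struct nfsd4_op *alloc_op_array(u32 n)
    {
            /* returns NULL on n * sizeof() overflow or allocation failure */
            return vcalloc(n, sizeof(struct nfsd4_op));
    }
    /* pair with vfree(), as nfsd4_release_compoundargs() now does */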
@@ -2755,9 +2808,10 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32
}
-static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
+static int nfsd4_get_mounted_on_ino(struct svc_export *exp, u64 *pino)
{
struct path path = exp->ex_path;
+ struct kstat stat;
int err;
path_get(&path);
@@ -2765,8 +2819,10 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
if (path.dentry != path.mnt->mnt_root)
break;
}
- err = vfs_getattr(&path, stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
+ err = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
path_put(&path);
+ if (!err)
+ *pino = stat.ino;
return err;
}
@@ -2819,10 +2875,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
struct kstat stat;
struct svc_fh *tempfh = NULL;
struct kstatfs statfs;
- __be32 *p;
+ __be32 *p, *attrlen_p;
int starting_len = xdr->buf->len;
int attrlen_offset;
- __be32 attrlen;
u32 dummy;
u64 dummy64;
u32 rdattr_err = 0;
@@ -2910,10 +2965,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
goto out;
attrlen_offset = xdr->buf->len;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ attrlen_p = xdr_reserve_space(xdr, XDR_UNIT);
+ if (!attrlen_p)
goto out_resource;
- p++; /* to be backfilled later */
if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
u32 supp[3];
@@ -3265,22 +3319,21 @@ out_acl:
*p++ = cpu_to_be32(stat.btime.tv_nsec);
}
if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
- struct kstat parent_stat;
u64 ino = stat.ino;
p = xdr_reserve_space(xdr, 8);
if (!p)
goto out_resource;
/*
- * Get parent's attributes if not ignoring crossmount
- * and this is the root of a cross-mounted filesystem.
+ * Get ino of mountpoint in parent filesystem, if not ignoring
+ * crossmount and this is the root of a cross-mounted
+ * filesystem.
*/
if (ignore_crossmnt == 0 &&
dentry == exp->ex_path.mnt->mnt_root) {
- err = get_parent_attributes(exp, &parent_stat);
+ err = nfsd4_get_mounted_on_ino(exp, &ino);
if (err)
goto out_nfserr;
- ino = parent_stat.ino;
}
p = xdr_encode_hyper(p, ino);
}
@@ -3335,8 +3388,7 @@ out_acl:
*p++ = cpu_to_be32(err == 0);
}
- attrlen = htonl(xdr->buf->len - attrlen_offset - 4);
- write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4);
+ *attrlen_p = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT);
status = nfs_ok;
out:
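
The attribute-length word now uses reserve-then-backfill through a saved pointer instead of a write_bytes_to_xdr_buf() round trip. The shape of the pattern, sketched (valid because the reserved word remains addressable in the already-encoded segment of the stream):

    __be32 *attrlen_p;
    int attrlen_offset = xdr->buf->len;

    attrlen_p = xdr_reserve_space(xdr, XDR_UNIT);   /* placeholder word */
    if (!attrlen_p)
            return nfserr_resource;
    /* ... encode the variable-length attribute payload ... */
    *attrlen_p = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT);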
@@ -3873,16 +3925,15 @@ static __be32 nfsd4_encode_splice_read(
struct xdr_stream *xdr = resp->xdr;
struct xdr_buf *buf = xdr->buf;
int status, space_left;
- u32 eof;
__be32 nfserr;
- __be32 *p = xdr->p - 2;
/* Make sure there will be room for padding if needed */
if (xdr->end - xdr->p < 1)
return nfserr_resource;
nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp,
- file, read->rd_offset, &maxcount, &eof);
+ file, read->rd_offset, &maxcount,
+ &read->rd_eof);
read->rd_length = maxcount;
if (nfserr)
goto out_err;
@@ -3893,9 +3944,6 @@ static __be32 nfsd4_encode_splice_read(
goto out_err;
}
- *(p++) = htonl(eof);
- *(p++) = htonl(maxcount);
-
buf->page_len = maxcount;
buf->len += maxcount;
xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
@@ -3937,11 +3985,9 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
struct file *file, unsigned long maxcount)
{
struct xdr_stream *xdr = resp->xdr;
- u32 eof;
- int starting_len = xdr->buf->len - 8;
+ unsigned int starting_len = xdr->buf->len;
+ __be32 zero = xdr_zero;
__be32 nfserr;
- __be32 tmp;
- int pad;
read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount);
if (read->rd_vlen < 0)
@@ -3949,31 +3995,24 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
resp->rqstp->rq_vec, read->rd_vlen, &maxcount,
- &eof);
+ &read->rd_eof);
read->rd_length = maxcount;
if (nfserr)
return nfserr;
- if (svc_encode_result_payload(resp->rqstp, starting_len + 8, maxcount))
+ if (svc_encode_result_payload(resp->rqstp, starting_len, maxcount))
return nfserr_io;
- xdr_truncate_encode(xdr, starting_len + 8 + xdr_align_size(maxcount));
-
- tmp = htonl(eof);
- write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
- tmp = htonl(maxcount);
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
-
- tmp = xdr_zero;
- pad = (maxcount&3) ? 4 - (maxcount&3) : 0;
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount,
- &tmp, pad);
- return 0;
+ xdr_truncate_encode(xdr, starting_len + xdr_align_size(maxcount));
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + maxcount, &zero,
+ xdr_pad_size(maxcount));
+ return nfs_ok;
}
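
The open-coded padding math is replaced by xdr_align_size()/xdr_pad_size(). For reference, those helpers amount to round-up-to-quad arithmetic; restated standalone for illustration (not the kernel's own copy):

    /* Illustration only: behavior of the XDR alignment helpers. */
    static inline size_t demo_xdr_align_size(size_t n)
    {
            return (n + 3) & ~(size_t)3;    /* round up to 4 bytes */
    }

    static inline size_t demo_xdr_pad_size(size_t n)
    {
            return demo_xdr_align_size(n) - n;      /* 0..3 zero bytes */
    }

    /* e.g. n = 5 -> align 8, pad 3; n = 8 -> align 8, pad 0 */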
static __be32
nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_read *read)
{
+ bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
unsigned long maxcount;
struct xdr_stream *xdr = resp->xdr;
struct file *file;
@@ -3986,44 +4025,42 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */
if (!p) {
- WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags));
+ WARN_ON_ONCE(splice_ok);
return nfserr_resource;
}
- if (resp->xdr->buf->page_len &&
- test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) {
+ if (resp->xdr->buf->page_len && splice_ok) {
WARN_ON_ONCE(1);
- return nfserr_resource;
+ return nfserr_serverfault;
}
xdr_commit_encode(xdr);
maxcount = min_t(unsigned long, read->rd_length,
(xdr->buf->buflen - xdr->buf->len));
- if (file->f_op->splice_read &&
- test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
+ if (file->f_op->splice_read && splice_ok)
nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
else
nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
-
- if (nfserr)
+ if (nfserr) {
xdr_truncate_encode(xdr, starting_len);
+ return nfserr;
+ }
- return nfserr;
+ p = xdr_encode_bool(p, read->rd_eof);
+ *p = cpu_to_be32(read->rd_length);
+ return nfs_ok;
}
static __be32
nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
{
- int maxcount;
- __be32 wire_count;
- int zero = 0;
+ __be32 *p, *maxcount_p, zero = xdr_zero;
struct xdr_stream *xdr = resp->xdr;
int length_offset = xdr->buf->len;
- int status;
- __be32 *p;
+ int maxcount, status;
- p = xdr_reserve_space(xdr, 4);
- if (!p)
+ maxcount_p = xdr_reserve_space(xdr, XDR_UNIT);
+ if (!maxcount_p)
return nfserr_resource;
maxcount = PAGE_SIZE;
@@ -4048,14 +4085,11 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
nfserr = nfserrno(status);
goto out_err;
}
-
- wire_count = htonl(maxcount);
- write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4);
- xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4));
- if (maxcount & 3)
- write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount,
- &zero, 4 - (maxcount&3));
- return 0;
+ *maxcount_p = cpu_to_be32(maxcount);
+ xdr_truncate_encode(xdr, length_offset + 4 + xdr_align_size(maxcount));
+ write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, &zero,
+ xdr_pad_size(maxcount));
+ return nfs_ok;
out_err:
xdr_truncate_encode(xdr, length_offset);
@@ -4706,13 +4740,13 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
__be32 *p;
nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
- !!copy->cp_synchronous);
+ nfsd4_copy_is_sync(copy));
if (nfserr)
return nfserr;
p = xdr_reserve_space(resp->xdr, 4 + 4);
*p++ = xdr_one; /* cr_consecutive */
- *p++ = cpu_to_be32(copy->cp_synchronous);
+ *p = nfsd4_copy_is_sync(copy) ? xdr_one : xdr_zero;
return 0;
}
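
copy->cp_synchronous becomes a bit in cp_flags, reached through nfsd4_copy_set_sync()/nfsd4_copy_is_sync(). The helpers live in the xdr4 header, outside this excerpt; presumably they are thin bitop wrappers, roughly as below (the flag name is assumed by analogy with NFSD4_COPY_F_INTRA used earlier):

    static inline void nfsd4_copy_set_sync(struct nfsd4_copy *copy, bool sync)
    {
            if (sync)
                    set_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
            else
                    clear_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
    }

    static inline bool nfsd4_copy_is_sync(const struct nfsd4_copy *copy)
    {
            return test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
    }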
@@ -4910,7 +4944,8 @@ nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
*p++ = cpu_to_be32(1);
- return nfsd42_encode_nl4_server(resp, &cn->cpn_src);
+ nfserr = nfsd42_encode_nl4_server(resp, cn->cpn_src);
+ return nfserr;
}
static __be32
@@ -5364,8 +5399,7 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
so->so_replay.rp_buf, len);
}
status:
- /* Note that op->status is already in network byte order: */
- write_bytes_to_xdr_buf(xdr->buf, post_err_offset - 4, &op->status, 4);
+ *p = op->status;
}
/*
@@ -5396,7 +5430,7 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
struct nfsd4_compoundargs *args = rqstp->rq_argp;
if (args->ops != args->iops) {
- kfree(args->ops);
+ vfree(args->ops);
args->ops = args->iops;
}
while (args->to_free) {
@@ -5425,12 +5459,8 @@ bool
nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
struct nfsd4_compoundres *resp = rqstp->rq_resp;
- struct xdr_buf *buf = xdr->buf;
__be32 *p;
- WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
- buf->tail[0].iov_len);
-
/*
* Send buffer space for the following items is reserved
* at the top of nfsd4_proc_compound().
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 0b3f12aa37ff..3e64a3d50a1c 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -176,7 +176,8 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
nn->nfsd_reply_cache_shrinker.seeks = 1;
- status = register_shrinker(&nn->nfsd_reply_cache_shrinker);
+ status = register_shrinker(&nn->nfsd_reply_cache_shrinker,
+ "nfsd-reply:%s", nn->nfsd_name);
if (status)
goto out_stats_destroy;
@@ -206,7 +207,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
struct svc_cacherep *rp;
unsigned int i;
- nfsd_reply_cache_stats_destroy(nn);
unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
for (i = 0; i < nn->drc_hashsize; i++) {
@@ -217,6 +217,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
rp, nn);
}
}
+ nfsd_reply_cache_stats_destroy(nn);
kvfree(nn->drc_hashtbl);
nn->drc_hashtbl = NULL;
@@ -603,9 +604,10 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
* scraping this file for info should test the labels to ensure they're
* getting the correct field.
*/
-static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
+int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
{
- struct nfsd_net *nn = m->private;
+ struct nfsd_net *nn = net_generic(file_inode(m->file)->i_sb->s_fs_info,
+ nfsd_net_id);
seq_printf(m, "max entries: %u\n", nn->max_drc_entries);
seq_printf(m, "num entries: %u\n",
@@ -625,11 +627,3 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize);
return 0;
}
-
-int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file)
-{
- struct nfsd_net *nn = net_generic(file_inode(file)->i_sb->s_fs_info,
- nfsd_net_id);
-
- return single_open(file, nfsd_reply_cache_stats_show, nn);
-}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 16920e4512bd..6a29bcfc9390 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -25,6 +25,7 @@
#include "state.h"
#include "netns.h"
#include "pnfs.h"
+#include "filecache.h"
/*
* We have a single directory with several nodes in it.
@@ -45,6 +46,7 @@ enum {
NFSD_Ports,
NFSD_MaxBlkSize,
NFSD_MaxConnections,
+ NFSD_Filecache,
NFSD_SupportedEnctypes,
/*
* The below MUST come last. Otherwise we leave a hole in nfsd_files[]
@@ -183,17 +185,7 @@ static int export_features_show(struct seq_file *m, void *v)
return 0;
}
-static int export_features_open(struct inode *inode, struct file *file)
-{
- return single_open(file, export_features_show, NULL);
-}
-
-static const struct file_operations export_features_operations = {
- .open = export_features_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(export_features);
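
DEFINE_SHOW_ATTRIBUTE() (from <linux/seq_file.h>) generates essentially the boilerplate being deleted:

    /* What DEFINE_SHOW_ATTRIBUTE(export_features) expands to, roughly: */
    static int export_features_open(struct inode *inode, struct file *file)
    {
            return single_open(file, export_features_show, inode->i_private);
    }

    static const struct file_operations export_features_fops = {
            .owner   = THIS_MODULE,
            .open    = export_features_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release,
    };

The generated name is <name>_fops, which is why the nfsd_files[] table further down switches to export_features_fops and friends.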
#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
static int supported_enctypes_show(struct seq_file *m, void *v)
@@ -202,17 +194,7 @@ static int supported_enctypes_show(struct seq_file *m, void *v)
return 0;
}
-static int supported_enctypes_open(struct inode *inode, struct file *file)
-{
- return single_open(file, supported_enctypes_show, NULL);
-}
-
-static const struct file_operations supported_enctypes_ops = {
- .open = supported_enctypes_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(supported_enctypes);
#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
static const struct file_operations pool_stats_operations = {
@@ -222,12 +204,9 @@ static const struct file_operations pool_stats_operations = {
.release = nfsd_pool_stats_release,
};
-static const struct file_operations reply_cache_stats_operations = {
- .open = nfsd_reply_cache_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(nfsd_reply_cache_stats);
+
+DEFINE_SHOW_ATTRIBUTE(nfsd_file_cache_stats);
/*----------------------------------------------------------------------------*/
/*
@@ -633,7 +612,6 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
}
/* Now write current state into reply buffer */
- len = 0;
sep = "";
remaining = SIMPLE_TRANSACTION_LIMIT;
for (num=2 ; num <= 4 ; num++) {
@@ -1357,7 +1335,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
/* Per-export io stats use same ops as exports file */
[NFSD_Export_Stats] = {"export_stats", &exports_nfsd_operations, S_IRUGO},
[NFSD_Export_features] = {"export_features",
- &export_features_operations, S_IRUGO},
+ &export_features_fops, S_IRUGO},
[NFSD_FO_UnlockIP] = {"unlock_ip",
&transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_FO_UnlockFS] = {"unlock_filesystem",
@@ -1366,13 +1344,16 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
- [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO},
+ [NFSD_Reply_Cache_Stats] = {"reply_cache_stats",
+ &nfsd_reply_cache_stats_fops, S_IRUGO},
[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
[NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
+ [NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO},
#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
- [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
+ [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes",
+ &supported_enctypes_fops, S_IRUGO},
#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
#ifdef CONFIG_NFSD_V4
[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
@@ -1472,18 +1453,12 @@ static __net_init int nfsd_init_net(struct net *net)
goto out_idmap_error;
nn->nfsd_versions = NULL;
nn->nfsd4_minorversions = NULL;
+ retval = nfsd4_init_leases_net(nn);
+ if (retval)
+ goto out_drc_error;
retval = nfsd_reply_cache_init(nn);
if (retval)
goto out_drc_error;
- nn->nfsd4_lease = 90; /* default lease time */
- nn->nfsd4_grace = 90;
- nn->somebody_reclaimed = false;
- nn->track_reclaim_completes = false;
- nn->clverifier_counter = prandom_u32();
- nn->clientid_base = prandom_u32();
- nn->clientid_counter = nn->clientid_base + 1;
- nn->s2s_cp_cl_id = nn->clientid_counter++;
-
get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
seqlock_init(&nn->writeverf_lock);
@@ -1505,6 +1480,7 @@ static __net_exit void nfsd_exit_net(struct net *net)
nfsd_idmap_shutdown(net);
nfsd_export_shutdown(net);
nfsd_netns_free_versions(net_generic(net, nfsd_net_id));
+ nfsd4_leases_net_shutdown(nn);
}
static struct pernet_operations nfsd_net_ops = {
@@ -1517,7 +1493,6 @@ static struct pernet_operations nfsd_net_ops = {
static int __init init_nfsd(void)
{
int retval;
- printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
retval = nfsd4_init_slabs();
if (retval)
@@ -1535,20 +1510,25 @@ static int __init init_nfsd(void)
retval = create_proc_exports_entry();
if (retval)
goto out_free_lockd;
- retval = register_filesystem(&nfsd_fs_type);
- if (retval)
- goto out_free_exports;
retval = register_pernet_subsys(&nfsd_net_ops);
if (retval < 0)
- goto out_free_filesystem;
+ goto out_free_exports;
retval = register_cld_notifier();
if (retval)
+ goto out_free_subsys;
+ retval = nfsd4_create_laundry_wq();
+ if (retval)
+ goto out_free_cld;
+ retval = register_filesystem(&nfsd_fs_type);
+ if (retval)
goto out_free_all;
return 0;
out_free_all:
+ nfsd4_destroy_laundry_wq();
+out_free_cld:
+ unregister_cld_notifier();
+out_free_subsys:
unregister_pernet_subsys(&nfsd_net_ops);
-out_free_filesystem:
- unregister_filesystem(&nfsd_fs_type);
out_free_exports:
remove_proc_entry("fs/nfs/exports", NULL);
remove_proc_entry("fs/nfs", NULL);
@@ -1566,6 +1546,8 @@ out_free_slabs:
static void __exit exit_nfsd(void)
{
+ unregister_filesystem(&nfsd_fs_type);
+ nfsd4_destroy_laundry_wq();
unregister_cld_notifier();
unregister_pernet_subsys(&nfsd_net_ops);
nfsd_drc_slab_free();
@@ -1575,7 +1557,6 @@ static void __exit exit_nfsd(void)
nfsd_lockd_shutdown();
nfsd4_free_slabs();
nfsd4_exit_pnfs();
- unregister_filesystem(&nfsd_fs_type);
}
MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 4fc1fd639527..09726c5b9a31 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -162,6 +162,9 @@ void nfs4_state_shutdown_net(struct net *net);
int nfs4_reset_recoverydir(char *recdir);
char * nfs4_recoverydir(void);
bool nfsd4_spo_must_allow(struct svc_rqst *rqstp);
+int nfsd4_create_laundry_wq(void);
+void nfsd4_destroy_laundry_wq(void);
+bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode);
#else
static inline int nfsd4_init_slabs(void) { return 0; }
static inline void nfsd4_free_slabs(void) { }
@@ -175,6 +178,13 @@ static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
{
return false;
}
+static inline int nfsd4_create_laundry_wq(void) { return 0; };
+static inline void nfsd4_destroy_laundry_wq(void) {};
+static inline bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp,
+ struct inode *inode)
+{
+ return false;
+}
#endif
/*
@@ -336,6 +346,10 @@ void nfsd_lockd_shutdown(void);
#define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */
#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */
+#define NFSD_COURTESY_CLIENT_TIMEOUT (24 * 60 * 60) /* seconds */
+#define NFSD_CLIENT_MAX_TRIM_PER_RUN 128
+#define NFS4_CLIENTS_PER_GB 1024
+#define NFSD_DELEGRETURN_TIMEOUT (HZ / 34) /* 30ms */
/*
* The following attributes are currently not supported by the NFSv4 server:
@@ -460,7 +474,8 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL)
#define NFSD_WRITEABLE_ATTRS_WORD1 \
(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
- | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_CREATE \
+ | FATTR4_WORD1_TIME_MODIFY_SET)
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
#define MAYBE_FATTR4_WORD2_SECURITY_LABEL \
FATTR4_WORD2_SECURITY_LABEL
@@ -490,12 +505,18 @@ extern void unregister_cld_notifier(void);
extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn);
#endif
+extern int nfsd4_init_leases_net(struct nfsd_net *nn);
+extern void nfsd4_leases_net_shutdown(struct nfsd_net *nn);
+
#else /* CONFIG_NFSD_V4 */
static inline int nfsd4_is_junction(struct dentry *dentry)
{
return 0;
}
+static inline int nfsd4_init_leases_net(struct nfsd_net *nn) { return 0; };
+static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn) {};
+
#define register_cld_notifier() 0
#define unregister_cld_notifier() do { } while(0)
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index c29baa03dfaf..d73434200df9 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -331,8 +331,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
struct dentry *dentry;
__be32 error;
- dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp));
-
if (!fhp->fh_dentry) {
error = nfsd_set_fh_dentry(rqstp, fhp);
if (error)
@@ -340,6 +338,9 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
}
dentry = fhp->fh_dentry;
exp = fhp->fh_export;
+
+ trace_nfsd_fh_verify(rqstp, fhp, type, access);
+
/*
* We still have to do all these permission checks, even when
* fh_dentry is already set:
@@ -391,13 +392,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
skip_pseudoflavor_check:
/* Finally, check access permissions. */
error = nfsd_permission(rqstp, exp, dentry, access);
-
- if (error) {
- dprintk("fh_verify: %pd2 permission failure, "
- "acc=%x, error=%d\n",
- dentry,
- access, ntohl(error));
- }
+ trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error);
out:
if (error == nfserr_stale)
nfsd_stats_fh_stale_inc(exp);
@@ -548,7 +543,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
if (ref_fh == fhp)
fh_put(ref_fh);
- if (fhp->fh_locked || fhp->fh_dentry) {
+ if (fhp->fh_dentry) {
printk(KERN_ERR "fh_compose: fh %pd2 not initialized!\n",
dentry);
}
@@ -671,6 +666,25 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
nfsd4_change_attribute(&fhp->fh_post_attr, inode);
}
+/**
+ * fh_fill_both_attrs - Fill pre-op and post-op attributes
+ * @fhp: file handle to be updated
+ *
+ * This is used when the directory wasn't changed, but wcc attributes
+ * are needed anyway.
+ */
+void fh_fill_both_attrs(struct svc_fh *fhp)
+{
+ fh_fill_post_attrs(fhp);
+ if (!fhp->fh_post_saved)
+ return;
+ fhp->fh_pre_change = fhp->fh_post_change;
+ fhp->fh_pre_mtime = fhp->fh_post_attr.mtime;
+ fhp->fh_pre_ctime = fhp->fh_post_attr.ctime;
+ fhp->fh_pre_size = fhp->fh_post_attr.size;
+ fhp->fh_pre_saved = true;
+}
+
/*
* Release a file handle.
*/
@@ -680,7 +694,6 @@ fh_put(struct svc_fh *fhp)
struct dentry * dentry = fhp->fh_dentry;
struct svc_export * exp = fhp->fh_export;
if (dentry) {
- fh_unlock(fhp);
fhp->fh_dentry = NULL;
dput(dentry);
fh_clear_pre_post_attrs(fhp);
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index fb9d358a267e..c3ae6414fc5c 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -81,7 +81,6 @@ typedef struct svc_fh {
struct dentry * fh_dentry; /* validated dentry */
struct svc_export * fh_export; /* export pointer */
- bool fh_locked; /* inode locked by us */
bool fh_want_write; /* remount protection taken */
bool fh_no_wcc; /* no wcc data needed */
bool fh_no_atomic_attr;
@@ -93,7 +92,7 @@ typedef struct svc_fh {
bool fh_post_saved; /* post-op attrs saved */
bool fh_pre_saved; /* pre-op attrs saved */
- /* Pre-op attributes saved during fh_lock */
+ /* Pre-op attributes saved when inode is locked */
__u64 fh_pre_size; /* size before operation */
struct timespec64 fh_pre_mtime; /* mtime before oper */
struct timespec64 fh_pre_ctime; /* ctime before oper */
@@ -103,7 +102,7 @@ typedef struct svc_fh {
*/
u64 fh_pre_change;
- /* Post-op attributes saved in fh_unlock */
+ /* Post-op attributes saved in fh_fill_post_attrs() */
struct kstat fh_post_attr; /* full attrs after operation */
u64 fh_post_change; /* nfsv4 change; see above */
} svc_fh;
@@ -223,8 +222,8 @@ void fh_put(struct svc_fh *);
static __inline__ struct svc_fh *
fh_copy(struct svc_fh *dst, struct svc_fh *src)
{
- WARN_ON(src->fh_dentry || src->fh_locked);
-
+ WARN_ON(src->fh_dentry);
+
*dst = *src;
return dst;
}
@@ -322,52 +321,5 @@ static inline u64 nfsd4_change_attribute(struct kstat *stat,
extern void fh_fill_pre_attrs(struct svc_fh *fhp);
extern void fh_fill_post_attrs(struct svc_fh *fhp);
-
-
-/*
- * Lock a file handle/inode
- * NOTE: both fh_lock and fh_unlock are done "by hand" in
- * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once
- * so, any changes here should be reflected there.
- */
-
-static inline void
-fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
-{
- struct dentry *dentry = fhp->fh_dentry;
- struct inode *inode;
-
- BUG_ON(!dentry);
-
- if (fhp->fh_locked) {
- printk(KERN_WARNING "fh_lock: %pd2 already locked!\n",
- dentry);
- return;
- }
-
- inode = d_inode(dentry);
- inode_lock_nested(inode, subclass);
- fh_fill_pre_attrs(fhp);
- fhp->fh_locked = true;
-}
-
-static inline void
-fh_lock(struct svc_fh *fhp)
-{
- fh_lock_nested(fhp, I_MUTEX_NORMAL);
-}
-
-/*
- * Unlock a file handle/inode
- */
-static inline void
-fh_unlock(struct svc_fh *fhp)
-{
- if (fhp->fh_locked) {
- fh_fill_post_attrs(fhp);
- inode_unlock(d_inode(fhp->fh_dentry));
- fhp->fh_locked = false;
- }
-}
-
+extern void fh_fill_both_attrs(struct svc_fh *fhp);
#endif /* _LINUX_NFSD_NFSFH_H */
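
With fh_lock()/fh_unlock() gone from the header, each caller takes i_rwsem itself and captures wcc attributes explicitly, as the nfsproc.c hunks below show. The replacement pattern for a typical directory-modifying operation, sketched:

    /* Sketch: explicit locking replacing fh_lock()/fh_unlock(). */
    struct inode *dir = d_inode(fhp->fh_dentry);

    inode_lock_nested(dir, I_MUTEX_PARENT);
    fh_fill_pre_attrs(fhp);         /* pre-op wcc data */

    /* ... create/unlink/rename under i_rwsem ... */

    fh_fill_post_attrs(fhp);        /* post-op wcc data */
    inode_unlock(dir);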
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index fcdab8a8a41f..82b3ddeacc33 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -51,6 +51,9 @@ nfsd_proc_setattr(struct svc_rqst *rqstp)
struct nfsd_sattrargs *argp = rqstp->rq_argp;
struct nfsd_attrstat *resp = rqstp->rq_resp;
struct iattr *iap = &argp->attrs;
+ struct nfsd_attrs attrs = {
+ .na_iattr = iap,
+ };
struct svc_fh *fhp;
dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n",
@@ -100,7 +103,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp)
}
}
- resp->status = nfsd_setattr(rqstp, fhp, iap, 0, (time64_t)0);
+ resp->status = nfsd_setattr(rqstp, fhp, &attrs, 0, (time64_t)0);
if (resp->status != nfs_ok)
goto out;
@@ -182,6 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
argp->count, argp->offset);
argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2);
+ argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen);
v = 0;
len = argp->count;
@@ -260,6 +264,9 @@ nfsd_proc_create(struct svc_rqst *rqstp)
svc_fh *dirfhp = &argp->fh;
svc_fh *newfhp = &resp->fh;
struct iattr *attr = &argp->attrs;
+ struct nfsd_attrs attrs = {
+ .na_iattr = attr,
+ };
struct inode *inode;
struct dentry *dchild;
int type, mode;
@@ -285,7 +292,7 @@ nfsd_proc_create(struct svc_rqst *rqstp)
goto done;
}
- fh_lock_nested(dirfhp, I_MUTEX_PARENT);
+ inode_lock_nested(dirfhp->fh_dentry->d_inode, I_MUTEX_PARENT);
dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
if (IS_ERR(dchild)) {
resp->status = nfserrno(PTR_ERR(dchild));
@@ -384,9 +391,8 @@ nfsd_proc_create(struct svc_rqst *rqstp)
resp->status = nfs_ok;
if (!inode) {
/* File doesn't exist. Create it and set attrs */
- resp->status = nfsd_create_locked(rqstp, dirfhp, argp->name,
- argp->len, attr, type, rdev,
- newfhp);
+ resp->status = nfsd_create_locked(rqstp, dirfhp, &attrs, type,
+ rdev, newfhp);
} else if (type == S_IFREG) {
dprintk("nfsd: existing %s, valid=%x, size=%ld\n",
argp->name, attr->ia_valid, (long) attr->ia_size);
@@ -396,13 +402,12 @@ nfsd_proc_create(struct svc_rqst *rqstp)
*/
attr->ia_valid &= ATTR_SIZE;
if (attr->ia_valid)
- resp->status = nfsd_setattr(rqstp, newfhp, attr, 0,
+ resp->status = nfsd_setattr(rqstp, newfhp, &attrs, 0,
(time64_t)0);
}
out_unlock:
- /* We don't really need to unlock, as fh_put does it. */
- fh_unlock(dirfhp);
+ inode_unlock(dirfhp->fh_dentry->d_inode);
fh_drop_write(dirfhp);
done:
fh_put(dirfhp);
@@ -472,6 +477,9 @@ nfsd_proc_symlink(struct svc_rqst *rqstp)
{
struct nfsd_symlinkargs *argp = rqstp->rq_argp;
struct nfsd_stat *resp = rqstp->rq_resp;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &argp->attrs,
+ };
struct svc_fh newfh;
if (argp->tlen > NFS_MAXPATHLEN) {
@@ -493,7 +501,7 @@ nfsd_proc_symlink(struct svc_rqst *rqstp)
fh_init(&newfh, NFS_FHSIZE);
resp->status = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen,
- argp->tname, &newfh);
+ argp->tname, &attrs, &newfh);
kfree(argp->tname);
fh_put(&argp->ffh);
@@ -511,6 +519,9 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp)
{
struct nfsd_createargs *argp = rqstp->rq_argp;
struct nfsd_diropres *resp = rqstp->rq_resp;
+ struct nfsd_attrs attrs = {
+ .na_iattr = &argp->attrs,
+ };
dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
@@ -522,7 +533,7 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp)
argp->attrs.ia_valid &= ~ATTR_SIZE;
fh_init(&resp->fh, NFS_FHSIZE);
resp->status = nfsd_create(rqstp, &argp->fh, argp->name, argp->len,
- &argp->attrs, S_IFDIR, 0, &resp->fh);
+ &attrs, S_IFDIR, 0, &resp->fh);
fh_put(&argp->fh);
if (resp->status != nfs_ok)
goto out;
@@ -556,24 +567,15 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp,
struct xdr_buf *buf = &resp->dirlist;
struct xdr_stream *xdr = &resp->xdr;
- count = clamp(count, (u32)(XDR_UNIT * 2), svc_max_payload(rqstp));
-
memset(buf, 0, sizeof(*buf));
/* Reserve room for the NULL ptr & eof flag (-2 words) */
- buf->buflen = count - XDR_UNIT * 2;
+ buf->buflen = clamp(count, (u32)(XDR_UNIT * 2), (u32)PAGE_SIZE);
+ buf->buflen -= XDR_UNIT * 2;
buf->pages = rqstp->rq_next_page;
rqstp->rq_next_page++;
- /* This is xdr_init_encode(), but it assumes that
- * the head kvec has already been consumed. */
- xdr_set_scratch_buffer(xdr, NULL, 0);
- xdr->buf = buf;
- xdr->page_ptr = buf->pages;
- xdr->iov = NULL;
- xdr->p = page_address(*buf->pages);
- xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE);
- xdr->rqst = NULL;
+ xdr_init_encode_pages(xdr, buf, buf->pages, NULL);
}
/*
@@ -635,6 +637,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_voidarg,
.pc_encode = nfssvc_encode_voidres,
.pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_argzero = sizeof(struct nfsd_voidargs),
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
@@ -646,6 +649,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_encode = nfssvc_encode_attrstatres,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_argzero = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
@@ -657,6 +661,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_encode = nfssvc_encode_attrstatres,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_sattrargs),
+ .pc_argzero = sizeof(struct nfsd_sattrargs),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+AT,
@@ -667,6 +672,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_voidarg,
.pc_encode = nfssvc_encode_voidres,
.pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_argzero = sizeof(struct nfsd_voidargs),
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
@@ -678,6 +684,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_encode = nfssvc_encode_diropres,
.pc_release = nfssvc_release_diropres,
.pc_argsize = sizeof(struct nfsd_diropargs),
+ .pc_argzero = sizeof(struct nfsd_diropargs),
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+FH+AT,
@@ -688,6 +695,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_fhandleargs,
.pc_encode = nfssvc_encode_readlinkres,
.pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_argzero = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_readlinkres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+1+NFS_MAXPATHLEN/4,
@@ -699,6 +707,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_encode = nfssvc_encode_readres,
.pc_release = nfssvc_release_readres,
.pc_argsize = sizeof(struct nfsd_readargs),
+ .pc_argzero = sizeof(struct nfsd_readargs),
.pc_ressize = sizeof(struct nfsd_readres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
@@ -709,6 +718,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_voidarg,
.pc_encode = nfssvc_encode_voidres,
.pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_argzero = sizeof(struct nfsd_voidargs),
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
@@ -720,6 +730,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_encode = nfssvc_encode_attrstatres,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_writeargs),
+ .pc_argzero = sizeof(struct nfsd_writeargs),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+AT,
@@ -731,6 +742,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_encode = nfssvc_encode_diropres,
.pc_release = nfssvc_release_diropres,
.pc_argsize = sizeof(struct nfsd_createargs),
+ .pc_argzero = sizeof(struct nfsd_createargs),
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+FH+AT,
@@ -741,6 +753,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_diropargs,
.pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_diropargs),
+ .pc_argzero = sizeof(struct nfsd_diropargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
@@ -751,6 +764,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_renameargs,
.pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_renameargs),
+ .pc_argzero = sizeof(struct nfsd_renameargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
@@ -761,6 +775,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_linkargs,
.pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_linkargs),
+ .pc_argzero = sizeof(struct nfsd_linkargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
@@ -771,6 +786,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_symlinkargs,
.pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_symlinkargs),
+ .pc_argzero = sizeof(struct nfsd_symlinkargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
@@ -782,6 +798,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_encode = nfssvc_encode_diropres,
.pc_release = nfssvc_release_diropres,
.pc_argsize = sizeof(struct nfsd_createargs),
+ .pc_argzero = sizeof(struct nfsd_createargs),
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+FH+AT,
@@ -792,6 +809,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_diropargs,
.pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_diropargs),
+ .pc_argzero = sizeof(struct nfsd_diropargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
@@ -802,6 +820,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_readdirargs,
.pc_encode = nfssvc_encode_readdirres,
.pc_argsize = sizeof(struct nfsd_readdirargs),
+ .pc_argzero = sizeof(struct nfsd_readdirargs),
.pc_ressize = sizeof(struct nfsd_readdirres),
.pc_cachetype = RC_NOCACHE,
.pc_name = "READDIR",
@@ -811,6 +830,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_decode = nfssvc_decode_fhandleargs,
.pc_encode = nfssvc_encode_statfsres,
.pc_argsize = sizeof(struct nfsd_fhandle),
+ .pc_argzero = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_statfsres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+5,
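
Every NFSv2 procedure gains a .pc_argzero equal to its .pc_argsize. The field is assumed to tell the sunrpc dispatcher how many leading bytes of the argument struct must be cleared before decoding, so versions whose decoders initialize everything themselves can ask for less than the full sizeof(). A sketch of the expected use in svc_process_common():

    /* Zero only as much of the argument buffer as the decoder
     * relies on being clear, then decode the call.
     */
    memset(rqstp->rq_argp, 0, procp->pc_argzero);
    if (!procp->pc_decode(rqstp, &rqstp->rq_arg_stream))
            goto err_garbage_args;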
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 4bb5baa17040..bfbd9f672f59 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -799,7 +799,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
if (nrservs == 0 && nn->nfsd_serv == NULL)
goto out;
- strlcpy(nn->nfsd_name, utsname()->nodename,
+ strscpy(nn->nfsd_name, utsname()->nodename,
sizeof(nn->nfsd_name));
error = nfsd_create_serv(net);
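
strlcpy() is retired in favour of strscpy(), which never reads the source past the destination size and signals truncation. For reference, the semantics being relied on (well-established kernel string API):

    /* strscpy() copies at most sizeof(nn->nfsd_name) - 1 bytes,
     * always NUL-terminates, and returns the number of bytes
     * copied -- or -E2BIG if the nodename had to be truncated.
     */
    ssize_t n = strscpy(nn->nfsd_name, utsname()->nodename,
                        sizeof(nn->nfsd_name));
    if (n < 0)
            ;       /* truncated: acceptable here, the name is advisory */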
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index aba8520b4b8b..caf6355b18fa 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -338,10 +338,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
return false;
if (args->len > NFSSVC_MAXBLKSIZE_V2)
return false;
- if (!xdr_stream_subsegment(xdr, &args->payload, args->len))
- return false;
- return true;
+ return xdr_stream_subsegment(xdr, &args->payload, args->len);
}
bool
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 95457cfd37fc..e2daef3cc003 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -57,11 +57,11 @@ typedef struct {
} stateid_t;
typedef struct {
- stateid_t stid;
+ stateid_t cs_stid;
#define NFS4_COPY_STID 1
#define NFS4_COPYNOTIFY_STID 2
- unsigned char sc_type;
- refcount_t sc_count;
+ unsigned char cs_type;
+ refcount_t cs_count;
} copy_stateid_t;
struct nfsd4_callback {
@@ -149,6 +149,7 @@ struct nfs4_delegation {
/* For recall: */
int dl_retries;
struct nfsd4_callback dl_recall;
+ bool dl_recalled;
};
#define cb_to_delegation(cb) \
@@ -174,7 +175,7 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
/* Maximum number of slots per session. 160 is useful for long haul TCP */
#define NFSD_MAX_SLOTS_PER_SESSION 160
/* Maximum number of operations per session compound */
-#define NFSD_MAX_OPS_PER_COMPOUND 16
+#define NFSD_MAX_OPS_PER_COMPOUND 50
/* Maximum session per slot cache size */
#define NFSD_SLOT_CACHE_SIZE 2048
/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
@@ -283,6 +284,28 @@ struct nfsd4_sessionid {
#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */
/*
+ * State Meaning Where set
+ * --------------------------------------------------------------------------
+ * | NFSD4_ACTIVE | Confirmed, active | Default |
+ * |------------------------------------------------------------------------|
+ * | NFSD4_COURTESY | Courtesy state. | nfs4_get_client_reaplist |
+ * | | Lease/lock/share | |
+ * | | reservation conflict | |
+ * | | can cause Courtesy | |
+ * | | client to be expired | |
+ * |------------------------------------------------------------------------|
+ * | NFSD4_EXPIRABLE | Courtesy client to be| nfs4_laundromat |
+ * | | expired by Laundromat| try_to_expire_client |
+ * | | due to conflict | |
+ * |------------------------------------------------------------------------|
+ */
+enum {
+ NFSD4_ACTIVE = 0,
+ NFSD4_COURTESY,
+ NFSD4_EXPIRABLE,
+};
+
+/*
* struct nfs4_client - one per client. Clientids live here.
*
* The initial object created by an NFS client using SETCLIENTID (for NFSv4.0)
@@ -385,6 +408,9 @@ struct nfs4_client {
struct list_head async_copies; /* list of async copies */
spinlock_t async_lock; /* lock for async copies */
atomic_t cl_cb_inflight; /* Outstanding callbacks */
+
+ unsigned int cl_state;
+ atomic_t cl_delegs_in_recall;
};
/* struct nfs4_client_reset
@@ -666,18 +692,16 @@ extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
-extern void nfsd4_run_cb(struct nfsd4_callback *cb);
+extern bool nfsd4_run_cb(struct nfsd4_callback *cb);
extern int nfsd4_create_callback_queue(void);
extern void nfsd4_destroy_callback_queue(void);
extern void nfsd4_shutdown_callback(struct nfs4_client *);
extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
-extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name,
struct xdr_netobj princhash, struct nfsd_net *nn);
extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
void put_nfs4_file(struct nfs4_file *fi);
-extern void nfs4_put_copy(struct nfsd4_copy *copy);
extern struct nfsd4_copy *
find_async_copy(struct nfs4_client *clp, stateid_t *stateid);
extern void nfs4_put_cpntf_state(struct nfsd_net *nn,
@@ -702,4 +726,9 @@ extern void nfsd4_client_record_remove(struct nfs4_client *clp);
extern int nfsd4_client_record_check(struct nfs4_client *clp);
extern void nfsd4_record_grace_done(struct nfsd_net *nn);
+static inline bool try_to_expire_client(struct nfs4_client *clp)
+{
+ cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE);
+ return clp->cl_state == NFSD4_EXPIRABLE;
+}
#endif /* NFSD4_STATE_H */
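
try_to_expire_client() relies on cmpxchg() so that only a client already in NFSD4_COURTESY is promoted to NFSD4_EXPIRABLE; an ACTIVE client is left alone, and the return value says whether expiry may proceed. An illustrative caller, following the laundromat pattern from the state table above (laundry_wq/laundromat_work as used elsewhere in nfsd):

    /* A lock or lease request conflicts with state held by clp. */
    if (try_to_expire_client(clp)) {
            /* clp was a courtesy client: kick the laundromat so it
             * expires the client instead of failing the conflict.
             */
            mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
    }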
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index a8c5a02a84f0..777e24e5da33 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -32,7 +32,7 @@ struct svc_stat nfsd_svcstats = {
.program = &nfsd_program,
};
-static int nfsd_proc_show(struct seq_file *seq, void *v)
+static int nfsd_show(struct seq_file *seq, void *v)
{
int i;
@@ -72,17 +72,7 @@ static int nfsd_proc_show(struct seq_file *seq, void *v)
return 0;
}
-static int nfsd_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, nfsd_proc_show, NULL);
-}
-
-static const struct proc_ops nfsd_proc_ops = {
- .proc_open = nfsd_proc_open,
- .proc_read = seq_read,
- .proc_lseek = seq_lseek,
- .proc_release = single_release,
-};
+DEFINE_PROC_SHOW_ATTRIBUTE(nfsd);
int nfsd_percpu_counters_init(struct percpu_counter counters[], int num)
{
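
DEFINE_PROC_SHOW_ATTRIBUTE(nfsd) generates the boilerplate removed above from the _show routine's prefix. Roughly, per its definition in <linux/proc_fs.h> (details vary slightly between kernel versions):

    static int nfsd_open(struct inode *inode, struct file *file)
    {
            return single_open(file, nfsd_show, pde_data(inode));
    }

    static const struct proc_ops nfsd_proc_ops = {
            .proc_open      = nfsd_open,
            .proc_read      = seq_read,
            .proc_lseek     = seq_lseek,
            .proc_release   = single_release,
    };

Both symbol names are derived from the "nfsd" prefix, which is why nfsd_proc_show() had to be renamed nfsd_show().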
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 242fa123e0e9..06a96e955bd0 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -84,19 +84,26 @@ DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
{ NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" })
TRACE_EVENT(nfsd_compound,
- TP_PROTO(const struct svc_rqst *rqst,
- u32 args_opcnt),
- TP_ARGS(rqst, args_opcnt),
+ TP_PROTO(
+ const struct svc_rqst *rqst,
+ const char *tag,
+ u32 taglen,
+ u32 opcnt
+ ),
+ TP_ARGS(rqst, tag, taglen, opcnt),
TP_STRUCT__entry(
__field(u32, xid)
- __field(u32, args_opcnt)
+ __field(u32, opcnt)
+ __string_len(tag, tag, taglen)
),
TP_fast_assign(
__entry->xid = be32_to_cpu(rqst->rq_xid);
- __entry->args_opcnt = args_opcnt;
+ __entry->opcnt = opcnt;
+ __assign_str_len(tag, tag, taglen);
),
- TP_printk("xid=0x%08x opcnt=%u",
- __entry->xid, __entry->args_opcnt)
+ TP_printk("xid=0x%08x opcnt=%u tag=%s",
+ __entry->xid, __entry->opcnt, __get_str(tag)
+ )
)
TRACE_EVENT(nfsd_compound_status,
@@ -171,6 +178,94 @@ TRACE_EVENT(nfsd_compound_encode_err,
__entry->opnum, __entry->status)
);
+#define show_fs_file_type(x) \
+ __print_symbolic(x, \
+ { S_IFLNK, "LNK" }, \
+ { S_IFREG, "REG" }, \
+ { S_IFDIR, "DIR" }, \
+ { S_IFCHR, "CHR" }, \
+ { S_IFBLK, "BLK" }, \
+ { S_IFIFO, "FIFO" }, \
+ { S_IFSOCK, "SOCK" })
+
+TRACE_EVENT(nfsd_fh_verify,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ umode_t type,
+ int access
+ ),
+ TP_ARGS(rqstp, fhp, type, access),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __sockaddr(server, rqstp->rq_xprt->xpt_remotelen)
+ __sockaddr(client, rqstp->rq_xprt->xpt_remotelen)
+ __field(u32, xid)
+ __field(u32, fh_hash)
+ __field(const void *, inode)
+ __field(unsigned long, type)
+ __field(unsigned long, access)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = SVC_NET(rqstp)->ns.inum;
+ __assign_sockaddr(server, &rqstp->rq_xprt->xpt_local,
+ rqstp->rq_xprt->xpt_locallen);
+ __assign_sockaddr(client, &rqstp->rq_xprt->xpt_remote,
+ rqstp->rq_xprt->xpt_remotelen);
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->inode = d_inode(fhp->fh_dentry);
+ __entry->type = type;
+ __entry->access = access;
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x type=%s access=%s",
+ __entry->xid, __entry->fh_hash,
+ show_fs_file_type(__entry->type),
+ show_nfsd_may_flags(__entry->access)
+ )
+);
+
+TRACE_EVENT_CONDITION(nfsd_fh_verify_err,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ umode_t type,
+ int access,
+ __be32 error
+ ),
+ TP_ARGS(rqstp, fhp, type, access, error),
+ TP_CONDITION(error),
+ TP_STRUCT__entry(
+ __field(unsigned int, netns_ino)
+ __sockaddr(server, rqstp->rq_xprt->xpt_remotelen)
+ __sockaddr(client, rqstp->rq_xprt->xpt_remotelen)
+ __field(u32, xid)
+ __field(u32, fh_hash)
+ __field(const void *, inode)
+ __field(unsigned long, type)
+ __field(unsigned long, access)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->netns_ino = SVC_NET(rqstp)->ns.inum;
+ __assign_sockaddr(server, &rqstp->rq_xprt->xpt_local,
+ rqstp->rq_xprt->xpt_locallen);
+ __assign_sockaddr(client, &rqstp->rq_xprt->xpt_remote,
+ rqstp->rq_xprt->xpt_remotelen);
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->inode = d_inode(fhp->fh_dentry);
+ __entry->type = type;
+ __entry->access = access;
+ __entry->error = be32_to_cpu(error);
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x type=%s access=%s error=%d",
+ __entry->xid, __entry->fh_hash,
+ show_fs_file_type(__entry->type),
+ show_nfsd_may_flags(__entry->access),
+ __entry->error
+ )
+);
DECLARE_EVENT_CLASS(nfsd_fh_err_class,
TP_PROTO(struct svc_rqst *rqstp,
@@ -443,6 +538,29 @@ DEFINE_NFSD_COPY_ERR_EVENT(clone_file_range_err);
#include "filecache.h"
#include "vfs.h"
+TRACE_EVENT(nfsd_delegret_wakeup,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct inode *inode,
+ long timeo
+ ),
+ TP_ARGS(rqstp, inode, timeo),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(const void *, inode)
+ __field(long, timeo)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->inode = inode;
+ __entry->timeo = timeo;
+ ),
+ TP_printk("xid=0x%08x inode=%p%s",
+ __entry->xid, __entry->inode,
+ __entry->timeo == 0 ? " (timed out)" : ""
+ )
+);
+
DECLARE_EVENT_CLASS(nfsd_stateid_class,
TP_PROTO(stateid_t *stp),
TP_ARGS(stp),
@@ -692,25 +810,16 @@ DEFINE_CLID_EVENT(confirmed_r);
/*
* from fs/nfsd/filecache.h
*/
-TRACE_DEFINE_ENUM(NFSD_FILE_HASHED);
-TRACE_DEFINE_ENUM(NFSD_FILE_PENDING);
-TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_READ);
-TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_WRITE);
-TRACE_DEFINE_ENUM(NFSD_FILE_REFERENCED);
-
#define show_nf_flags(val) \
__print_flags(val, "|", \
{ 1 << NFSD_FILE_HASHED, "HASHED" }, \
{ 1 << NFSD_FILE_PENDING, "PENDING" }, \
- { 1 << NFSD_FILE_BREAK_READ, "BREAK_READ" }, \
- { 1 << NFSD_FILE_BREAK_WRITE, "BREAK_WRITE" }, \
{ 1 << NFSD_FILE_REFERENCED, "REFERENCED"})
DECLARE_EVENT_CLASS(nfsd_file_class,
TP_PROTO(struct nfsd_file *nf),
TP_ARGS(nf),
TP_STRUCT__entry(
- __field(unsigned int, nf_hashval)
__field(void *, nf_inode)
__field(int, nf_ref)
__field(unsigned long, nf_flags)
@@ -718,15 +827,13 @@ DECLARE_EVENT_CLASS(nfsd_file_class,
__field(struct file *, nf_file)
),
TP_fast_assign(
- __entry->nf_hashval = nf->nf_hashval;
__entry->nf_inode = nf->nf_inode;
__entry->nf_ref = refcount_read(&nf->nf_ref);
__entry->nf_flags = nf->nf_flags;
__entry->nf_may = nf->nf_may;
__entry->nf_file = nf->nf_file;
),
- TP_printk("hash=0x%x inode=%p ref=%d flags=%s may=%s file=%p",
- __entry->nf_hashval,
+ TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p",
__entry->nf_inode,
__entry->nf_ref,
show_nf_flags(__entry->nf_flags),
@@ -739,34 +846,59 @@ DEFINE_EVENT(nfsd_file_class, name, \
TP_PROTO(struct nfsd_file *nf), \
TP_ARGS(nf))
-DEFINE_NFSD_FILE_EVENT(nfsd_file_alloc);
DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
-DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_release_locked);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose);
+
+TRACE_EVENT(nfsd_file_alloc,
+ TP_PROTO(
+ const struct nfsd_file *nf
+ ),
+ TP_ARGS(nf),
+ TP_STRUCT__entry(
+ __field(const void *, nf_inode)
+ __field(unsigned long, nf_flags)
+ __field(unsigned long, nf_may)
+ __field(unsigned int, nf_ref)
+ ),
+ TP_fast_assign(
+ __entry->nf_inode = nf->nf_inode;
+ __entry->nf_flags = nf->nf_flags;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->nf_may = nf->nf_may;
+ ),
+ TP_printk("inode=%p ref=%u flags=%s may=%s",
+ __entry->nf_inode, __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ show_nfsd_may_flags(__entry->nf_may)
+ )
+);
TRACE_EVENT(nfsd_file_acquire,
- TP_PROTO(struct svc_rqst *rqstp, unsigned int hash,
- struct inode *inode, unsigned int may_flags,
- struct nfsd_file *nf, __be32 status),
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct inode *inode,
+ unsigned int may_flags,
+ const struct nfsd_file *nf,
+ __be32 status
+ ),
- TP_ARGS(rqstp, hash, inode, may_flags, nf, status),
+ TP_ARGS(rqstp, inode, may_flags, nf, status),
TP_STRUCT__entry(
__field(u32, xid)
- __field(unsigned int, hash)
- __field(void *, inode)
+ __field(const void *, inode)
__field(unsigned long, may_flags)
- __field(int, nf_ref)
+ __field(unsigned int, nf_ref)
__field(unsigned long, nf_flags)
__field(unsigned long, nf_may)
- __field(struct file *, nf_file)
+ __field(const void *, nf_file)
__field(u32, status)
),
TP_fast_assign(
__entry->xid = be32_to_cpu(rqstp->rq_xid);
- __entry->hash = hash;
__entry->inode = inode;
__entry->may_flags = may_flags;
__entry->nf_ref = nf ? refcount_read(&nf->nf_ref) : 0;
@@ -776,39 +908,186 @@ TRACE_EVENT(nfsd_file_acquire,
__entry->status = be32_to_cpu(status);
),
- TP_printk("xid=0x%x hash=0x%x inode=%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=%p status=%u",
- __entry->xid, __entry->hash, __entry->inode,
+ TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p status=%u",
+ __entry->xid, __entry->inode,
show_nfsd_may_flags(__entry->may_flags),
__entry->nf_ref, show_nf_flags(__entry->nf_flags),
show_nfsd_may_flags(__entry->nf_may),
- __entry->nf_file, __entry->status)
+ __entry->nf_file, __entry->status
+ )
);
+TRACE_EVENT(nfsd_file_create,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ unsigned int may_flags,
+ const struct nfsd_file *nf
+ ),
+
+ TP_ARGS(rqstp, may_flags, nf),
+
+ TP_STRUCT__entry(
+ __field(const void *, nf_inode)
+ __field(const void *, nf_file)
+ __field(unsigned long, may_flags)
+ __field(unsigned long, nf_flags)
+ __field(unsigned long, nf_may)
+ __field(unsigned int, nf_ref)
+ __field(u32, xid)
+ ),
+
+ TP_fast_assign(
+ __entry->nf_inode = nf->nf_inode;
+ __entry->nf_file = nf->nf_file;
+ __entry->may_flags = may_flags;
+ __entry->nf_flags = nf->nf_flags;
+ __entry->nf_may = nf->nf_may;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ ),
+
+ TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p",
+ __entry->xid, __entry->nf_inode,
+ show_nfsd_may_flags(__entry->may_flags),
+ __entry->nf_ref, show_nf_flags(__entry->nf_flags),
+ show_nfsd_may_flags(__entry->nf_may), __entry->nf_file
+ )
+);
+
+TRACE_EVENT(nfsd_file_insert_err,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct inode *inode,
+ unsigned int may_flags,
+ long error
+ ),
+ TP_ARGS(rqstp, inode, may_flags, error),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(const void *, inode)
+ __field(unsigned long, may_flags)
+ __field(long, error)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->inode = inode;
+ __entry->may_flags = may_flags;
+ __entry->error = error;
+ ),
+ TP_printk("xid=0x%x inode=%p may_flags=%s error=%ld",
+ __entry->xid, __entry->inode,
+ show_nfsd_may_flags(__entry->may_flags),
+ __entry->error
+ )
+);
+
+TRACE_EVENT(nfsd_file_cons_err,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct inode *inode,
+ unsigned int may_flags,
+ const struct nfsd_file *nf
+ ),
+ TP_ARGS(rqstp, inode, may_flags, nf),
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(const void *, inode)
+ __field(unsigned long, may_flags)
+ __field(unsigned int, nf_ref)
+ __field(unsigned long, nf_flags)
+ __field(unsigned long, nf_may)
+ __field(const void *, nf_file)
+ ),
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->inode = inode;
+ __entry->may_flags = may_flags;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->nf_flags = nf->nf_flags;
+ __entry->nf_may = nf->nf_may;
+ __entry->nf_file = nf->nf_file;
+ ),
+ TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p",
+ __entry->xid, __entry->inode,
+ show_nfsd_may_flags(__entry->may_flags), __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ show_nfsd_may_flags(__entry->nf_may), __entry->nf_file
+ )
+);
+
+TRACE_EVENT(nfsd_file_open,
+ TP_PROTO(struct nfsd_file *nf, __be32 status),
+ TP_ARGS(nf, status),
+ TP_STRUCT__entry(
+ __field(void *, nf_inode) /* cannot be dereferenced */
+ __field(int, nf_ref)
+ __field(unsigned long, nf_flags)
+ __field(unsigned long, nf_may)
+ __field(void *, nf_file) /* cannot be dereferenced */
+ ),
+ TP_fast_assign(
+ __entry->nf_inode = nf->nf_inode;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->nf_flags = nf->nf_flags;
+ __entry->nf_may = nf->nf_may;
+ __entry->nf_file = nf->nf_file;
+ ),
+ TP_printk("inode=%p ref=%d flags=%s may=%s file=%p",
+ __entry->nf_inode,
+ __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ show_nfsd_may_flags(__entry->nf_may),
+ __entry->nf_file)
+)
+
DECLARE_EVENT_CLASS(nfsd_file_search_class,
- TP_PROTO(struct inode *inode, unsigned int hash, int found),
- TP_ARGS(inode, hash, found),
+ TP_PROTO(
+ const struct inode *inode,
+ unsigned int count
+ ),
+ TP_ARGS(inode, count),
TP_STRUCT__entry(
- __field(struct inode *, inode)
- __field(unsigned int, hash)
- __field(int, found)
+ __field(const struct inode *, inode)
+ __field(unsigned int, count)
),
TP_fast_assign(
__entry->inode = inode;
- __entry->hash = hash;
- __entry->found = found;
+ __entry->count = count;
),
- TP_printk("hash=0x%x inode=%p found=%d", __entry->hash,
- __entry->inode, __entry->found)
+ TP_printk("inode=%p count=%u",
+ __entry->inode, __entry->count)
);
#define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \
DEFINE_EVENT(nfsd_file_search_class, name, \
- TP_PROTO(struct inode *inode, unsigned int hash, int found), \
- TP_ARGS(inode, hash, found))
+ TP_PROTO( \
+ const struct inode *inode, \
+ unsigned int count \
+ ), \
+ TP_ARGS(inode, count))
DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync);
DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode);
-DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_is_cached);
+
+TRACE_EVENT(nfsd_file_is_cached,
+ TP_PROTO(
+ const struct inode *inode,
+ int found
+ ),
+ TP_ARGS(inode, found),
+ TP_STRUCT__entry(
+ __field(const struct inode *, inode)
+ __field(int, found)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ __entry->found = found;
+ ),
+ TP_printk("inode=%p is %scached",
+ __entry->inode,
+ __entry->found ? "" : "not "
+ )
+);
TRACE_EVENT(nfsd_file_fsnotify_handle_event,
TP_PROTO(struct inode *inode, u32 mask),
@@ -829,6 +1108,76 @@ TRACE_EVENT(nfsd_file_fsnotify_handle_event,
__entry->nlink, __entry->mode, __entry->mask)
);
+DECLARE_EVENT_CLASS(nfsd_file_gc_class,
+ TP_PROTO(
+ const struct nfsd_file *nf
+ ),
+ TP_ARGS(nf),
+ TP_STRUCT__entry(
+ __field(void *, nf_inode)
+ __field(void *, nf_file)
+ __field(int, nf_ref)
+ __field(unsigned long, nf_flags)
+ ),
+ TP_fast_assign(
+ __entry->nf_inode = nf->nf_inode;
+ __entry->nf_file = nf->nf_file;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->nf_flags = nf->nf_flags;
+ ),
+ TP_printk("inode=%p ref=%d nf_flags=%s nf_file=%p",
+ __entry->nf_inode, __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ __entry->nf_file
+ )
+);
+
+#define DEFINE_NFSD_FILE_GC_EVENT(name) \
+DEFINE_EVENT(nfsd_file_gc_class, name, \
+ TP_PROTO( \
+ const struct nfsd_file *nf \
+ ), \
+ TP_ARGS(nf))
+
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add_disposed);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_hashed);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_disposed);
+
+DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class,
+ TP_PROTO(
+ unsigned long removed,
+ unsigned long remaining
+ ),
+ TP_ARGS(removed, remaining),
+ TP_STRUCT__entry(
+ __field(unsigned long, removed)
+ __field(unsigned long, remaining)
+ ),
+ TP_fast_assign(
+ __entry->removed = removed;
+ __entry->remaining = remaining;
+ ),
+ TP_printk("%lu entries removed, %lu remaining",
+ __entry->removed, __entry->remaining)
+);
+
+#define DEFINE_NFSD_FILE_LRUWALK_EVENT(name) \
+DEFINE_EVENT(nfsd_file_lruwalk_class, name, \
+ TP_PROTO( \
+ unsigned long removed, \
+ unsigned long remaining \
+ ), \
+ TP_ARGS(removed, remaining))
+
+DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed);
+DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed);
+
#include "cache.h"
TRACE_DEFINE_ENUM(RC_DROPIT);
@@ -1122,6 +1471,45 @@ TRACE_EVENT(nfsd_cb_offload,
__entry->fh_hash, __entry->count, __entry->status)
);
+DECLARE_EVENT_CLASS(nfsd_cb_done_class,
+ TP_PROTO(
+ const stateid_t *stp,
+ const struct rpc_task *task
+ ),
+ TP_ARGS(stp, task),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, si_id)
+ __field(u32, si_generation)
+ __field(int, status)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
+ __entry->si_id = stp->si_opaque.so_id;
+ __entry->si_generation = stp->si_generation;
+ __entry->status = task->tk_status;
+ ),
+ TP_printk("client %08x:%08x stateid %08x:%08x status=%d",
+ __entry->cl_boot, __entry->cl_id, __entry->si_id,
+ __entry->si_generation, __entry->status
+ )
+);
+
+#define DEFINE_NFSD_CB_DONE_EVENT(name) \
+DEFINE_EVENT(nfsd_cb_done_class, name, \
+ TP_PROTO( \
+ const stateid_t *stp, \
+ const struct rpc_task *task \
+ ), \
+ TP_ARGS(stp, task))
+
+DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_recall_done);
+DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_notify_lock_done);
+DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_layout_done);
+DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_offload_done);
+
#endif /* _NFSD_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c22ad0532e8e..f650afedd67f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -199,27 +199,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out_nfserr;
}
} else {
- /*
- * In the nfsd4_open() case, this may be held across
- * subsequent open and delegation acquisition which may
- * need to take the child's i_mutex:
- */
- fh_lock_nested(fhp, I_MUTEX_PARENT);
- dentry = lookup_one_len(name, dparent, len);
+ dentry = lookup_one_len_unlocked(name, dparent, len);
host_err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_nfserr;
if (nfsd_mountpoint(dentry, exp)) {
- /*
- * We don't need the i_mutex after all. It's
- * still possible we could open this (regular
- * files can be mountpoints too), but the
- * i_mutex is just there to prevent renames of
- * something that we might be about to delegate,
- * and a mountpoint won't be renamed:
- */
- fh_unlock(fhp);
- if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
+ host_err = nfsd_cross_mnt(rqstp, &dentry, &exp);
+ if (host_err) {
dput(dentry);
goto out_nfserr;
}
@@ -234,7 +220,15 @@ out_nfserr:
return nfserrno(host_err);
}
-/*
+/**
+ * nfsd_lookup - look up a single path component for nfsd
+ *
+ * @rqstp: the request context
+ * @fhp: the file handle of the directory
+ * @name: the component name, or %NULL to look up parent
+ * @len: length of name to examine
+ * @resfh: pointer to pre-initialised filehandle to hold result.
+ *
* Look up one component of a pathname.
* N.B. After this call _both_ fhp and resfh need an fh_put
*
@@ -244,11 +238,11 @@ out_nfserr:
* returned. Otherwise the covered directory is returned.
* NOTE: this mountpoint crossing is not supported properly by all
* clients and is explicitly disallowed for NFSv3
- * NeilBrown <neilb@cse.unsw.edu.au>
+ *
*/
__be32
nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
- unsigned int len, struct svc_fh *resfh)
+ unsigned int len, struct svc_fh *resfh)
{
struct svc_export *exp;
struct dentry *dentry;
@@ -306,6 +300,10 @@ commit_metadata(struct svc_fh *fhp)
static void
nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
{
+ /* Ignore mode updates on symlinks */
+ if (S_ISLNK(inode->i_mode))
+ iap->ia_valid &= ~ATTR_MODE;
+
/* sanitize the mode change */
if (iap->ia_valid & ATTR_MODE) {
iap->ia_mode &= S_IALLUGO;
@@ -345,21 +343,77 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
return nfserrno(get_write_access(inode));
}
-/*
- * Set various file attributes. After this call fhp needs an fh_put.
+static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap)
+{
+ int host_err;
+
+ if (iap->ia_valid & ATTR_SIZE) {
+ /*
+ * RFC5661, Section 18.30.4:
+ * Changing the size of a file with SETATTR indirectly
+ * changes the time_modify and change attributes.
+ *
+ * (and similar for the older RFCs)
+ */
+ struct iattr size_attr = {
+ .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
+ .ia_size = iap->ia_size,
+ };
+
+ if (iap->ia_size < 0)
+ return -EFBIG;
+
+ host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL);
+ if (host_err)
+ return host_err;
+ iap->ia_valid &= ~ATTR_SIZE;
+
+ /*
+ * Avoid the additional setattr call below if the only other
+ * attribute that the client sends is the mtime, as we update
+ * it as part of the size change above.
+ */
+ if ((iap->ia_valid & ~ATTR_MTIME) == 0)
+ return 0;
+ }
+
+ if (!iap->ia_valid)
+ return 0;
+
+ iap->ia_valid |= ATTR_CTIME;
+ return notify_change(&init_user_ns, dentry, iap, NULL);
+}
+
+/**
+ * nfsd_setattr - Set various file attributes.
+ * @rqstp: controlling RPC transaction
+ * @fhp: filehandle of target
+ * @attr: attributes to set
+ * @check_guard: set to 1 if guardtime is a valid timestamp
+ * @guardtime: do not act if ctime.tv_sec does not match this timestamp
+ *
+ * This call may adjust the contents of @attr (in particular, this
+ * call may change the bits in the na_iattr.ia_valid field).
+ *
+ * Returns nfs_ok on success, otherwise an NFS status code is
+ * returned. Caller must release @fhp by calling fh_put in either
+ * case.
*/
__be32
-nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
+nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_attrs *attr,
int check_guard, time64_t guardtime)
{
struct dentry *dentry;
struct inode *inode;
+ struct iattr *iap = attr->na_iattr;
int accmode = NFSD_MAY_SATTR;
umode_t ftype = 0;
__be32 err;
int host_err;
bool get_write_count;
bool size_change = (iap->ia_valid & ATTR_SIZE);
+ int retries;
if (iap->ia_valid & ATTR_SIZE) {
accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
@@ -395,13 +449,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
dentry = fhp->fh_dentry;
inode = d_inode(dentry);
- /* Ignore any mode updates on symlinks */
- if (S_ISLNK(inode->i_mode))
- iap->ia_valid &= ~ATTR_MODE;
-
- if (!iap->ia_valid)
- return 0;
-
nfsd_sanitize_attrs(inode, iap);
if (check_guard && guardtime != inode->i_ctime.tv_sec)
@@ -420,43 +467,27 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
return err;
}
- fh_lock(fhp);
- if (size_change) {
- /*
- * RFC5661, Section 18.30.4:
- * Changing the size of a file with SETATTR indirectly
- * changes the time_modify and change attributes.
- *
- * (and similar for the older RFCs)
- */
- struct iattr size_attr = {
- .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
- .ia_size = iap->ia_size,
- };
-
- host_err = -EFBIG;
- if (iap->ia_size < 0)
- goto out_unlock;
-
- host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL);
- if (host_err)
- goto out_unlock;
- iap->ia_valid &= ~ATTR_SIZE;
-
- /*
- * Avoid the additional setattr call below if the only other
- * attribute that the client sends is the mtime, as we update
- * it as part of the size change above.
- */
- if ((iap->ia_valid & ~ATTR_MTIME) == 0)
- goto out_unlock;
+ inode_lock(inode);
+ for (retries = 1;;) {
+ host_err = __nfsd_setattr(dentry, iap);
+ if (host_err != -EAGAIN || !retries--)
+ break;
+ if (!nfsd_wait_for_delegreturn(rqstp, inode))
+ break;
}
-
- iap->ia_valid |= ATTR_CTIME;
- host_err = notify_change(&init_user_ns, dentry, iap, NULL);
-
-out_unlock:
- fh_unlock(fhp);
+ if (attr->na_seclabel && attr->na_seclabel->len)
+ attr->na_labelerr = security_inode_setsecctx(dentry,
+ attr->na_seclabel->data, attr->na_seclabel->len);
+ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl)
+ attr->na_aclerr = set_posix_acl(&init_user_ns,
+ inode, ACL_TYPE_ACCESS,
+ attr->na_pacl);
+ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) &&
+ !attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode))
+ attr->na_aclerr = set_posix_acl(&init_user_ns,
+ inode, ACL_TYPE_DEFAULT,
+ attr->na_dpacl);
+ inode_unlock(inode);
if (size_change)
put_write_access(inode);
out:
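
notify_change() can now fail with -EAGAIN when a conflicting delegation has to be recalled first. The loop above retries exactly once: if nfsd_wait_for_delegreturn() observes the delegation returned in time, the setattr is repeated; otherwise the error propagates and the client ultimately sees NFS4ERR_DELAY. The same pattern reappears in the vfs_rename() and vfs_unlink() hunks below; annotated form of the loop:

    for (retries = 1;;) {
            host_err = __nfsd_setattr(dentry, iap);   /* may return -EAGAIN  */
            if (host_err != -EAGAIN || !retries--)
                    break;  /* done, a different error, or the one retry used */
            if (!nfsd_wait_for_delegreturn(rqstp, inode))
                    break;  /* no delegreturn arrived within the timeout      */
    }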
@@ -494,32 +525,6 @@ int nfsd4_is_junction(struct dentry *dentry)
return 0;
return 1;
}
-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
-__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct xdr_netobj *label)
-{
- __be32 error;
- int host_error;
- struct dentry *dentry;
-
- error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
- if (error)
- return error;
-
- dentry = fhp->fh_dentry;
-
- inode_lock(d_inode(dentry));
- host_error = security_inode_setsecctx(dentry, label->data, label->len);
- inode_unlock(d_inode(dentry));
- return nfserrno(host_error);
-}
-#else
-__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct xdr_netobj *label)
-{
- return nfserr_notsupp;
-}
-#endif
static struct nfsd4_compound_state *nfsd4_get_cstate(struct svc_rqst *rqstp)
{
@@ -577,6 +582,7 @@ out_err:
ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
u64 dst_pos, u64 count)
{
+ ssize_t ret;
/*
* Limit copy to 4MB to prevent indefinitely blocking an nfsd
@@ -587,7 +593,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
* limit like this and pipeline multiple COPY requests.
*/
count = min_t(u64, count, 1 << 22);
- return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+ ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+
+ if (ret == -EOPNOTSUPP || ret == -EXDEV)
+ ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
+ count, 0);
+ return ret;
}
__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
@@ -827,14 +838,23 @@ retry:
return err;
}
+/**
+ * nfsd_open_verified - Open a regular file for the filecache
+ * @rqstp: RPC request
+ * @fhp: NFS filehandle of the file to open
+ * @may_flags: internal permission flags
+ * @filp: OUT: open "struct file *"
+ *
+ * Returns an nfsstat value in network byte order.
+ */
__be32
-nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
- int may_flags, struct file **filp)
+nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
+ struct file **filp)
{
__be32 err;
validate_process_creds();
- err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
+ err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
validate_process_creds();
return err;
}
@@ -849,17 +869,15 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct splice_desc *sd)
{
struct svc_rqst *rqstp = sd->u.data;
- struct page **pp = rqstp->rq_next_page;
- struct page *page = buf->page;
-
- if (rqstp->rq_res.page_len == 0) {
- svc_rqst_replace_page(rqstp, page);
- rqstp->rq_res.page_base = buf->offset;
- } else if (page != pp[-1]) {
- svc_rqst_replace_page(rqstp, page);
- }
+ struct page *page = buf->page; // may be a compound one
+ unsigned offset = buf->offset;
+
+ page += offset / PAGE_SIZE;
+ for (int i = sd->len; i > 0; i -= PAGE_SIZE)
+ svc_rqst_replace_page(rqstp, page++);
+ if (rqstp->rq_res.page_len == 0) // first call
+ rqstp->rq_res.page_base = offset % PAGE_SIZE;
rqstp->rq_res.page_len += sd->len;
-
return sd->len;
}
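
The actor now handles compound pages: buf->page is stepped forward by whole pages of buf->offset, every page touched by the sd->len bytes is installed with svc_rqst_replace_page(), and only the sub-page remainder lands in page_base. Worked example: with buf->offset = 5000 and sd->len = 12288, page advances by one (5000 / 4096), the loop installs three pages (i = 12288, 8192, 4096), and on the first call page_base becomes 904 (5000 % 4096).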
@@ -1170,6 +1188,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
nfsd_copy_write_verifier(verf, nn);
err2 = filemap_check_wb_err(nf->nf_file->f_mapping,
since);
+ err = nfserrno(err2);
break;
case -EINVAL:
err = nfserr_notsupp;
@@ -1177,8 +1196,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
default:
nfsd_reset_write_verifier(nn);
trace_nfsd_writeverf_reset(nn, rqstp, err2);
+ err = nfserrno(err2);
}
- err = nfserrno(err2);
} else
nfsd_copy_write_verifier(verf, nn);
@@ -1187,14 +1206,27 @@ out:
return err;
}
-static __be32
-nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
- struct iattr *iap)
+/**
+ * nfsd_create_setattr - Set a created file's attributes
+ * @rqstp: RPC transaction being executed
+ * @fhp: NFS filehandle of parent directory
+ * @resfhp: NFS filehandle of new object
+ * @attrs: requested attributes of new object
+ *
+ * Returns nfs_ok on success, or an nfsstat in network byte order.
+ */
+__be32
+nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct svc_fh *resfhp, struct nfsd_attrs *attrs)
{
+ struct iattr *iap = attrs->na_iattr;
+ __be32 status;
+
/*
- * Mode has already been set earlier in create:
+ * Mode has already been set by file creation.
*/
iap->ia_valid &= ~ATTR_MODE;
+
/*
* Setting uid/gid works only for root. Irix appears to
* send along the gid on create when it tries to implement
@@ -1202,10 +1234,31 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
*/
if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID))
iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
+
+ /*
+ * Callers expect new file metadata to be committed even
+ * if the attributes have not changed.
+ */
if (iap->ia_valid)
- return nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0);
- /* Callers expect file metadata to be committed here */
- return nfserrno(commit_metadata(resfhp));
+ status = nfsd_setattr(rqstp, resfhp, attrs, 0, (time64_t)0);
+ else
+ status = nfserrno(commit_metadata(resfhp));
+
+ /*
+ * Transactional filesystems had a chance to commit changes
+ * for both parent and child simultaneously making the
+ * following commit_metadata a noop in many cases.
+ */
+ if (!status)
+ status = nfserrno(commit_metadata(fhp));
+
+ /*
+ * Update the new filehandle to pick up the new attributes.
+ */
+ if (!status)
+ status = fh_update(resfhp);
+
+ return status;
}
/* HPUX client sometimes creates a file in mode 000, and sets size to 0.
@@ -1226,26 +1279,19 @@ nfsd_check_ignore_resizing(struct iattr *iap)
/* The parent directory should already be locked: */
__be32
nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
- char *fname, int flen, struct iattr *iap,
- int type, dev_t rdev, struct svc_fh *resfhp)
+ struct nfsd_attrs *attrs,
+ int type, dev_t rdev, struct svc_fh *resfhp)
{
struct dentry *dentry, *dchild;
struct inode *dirp;
+ struct iattr *iap = attrs->na_iattr;
__be32 err;
- __be32 err2;
int host_err;
dentry = fhp->fh_dentry;
dirp = d_inode(dentry);
dchild = dget(resfhp->fh_dentry);
- if (!fhp->fh_locked) {
- WARN_ONCE(1, "nfsd_create: parent %pd2 not locked!\n",
- dentry);
- err = nfserr_io;
- goto out;
- }
-
err = nfsd_permission(rqstp, fhp->fh_export, dentry, NFSD_MAY_CREATE);
if (err)
goto out;
@@ -1305,22 +1351,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (host_err < 0)
goto out_nfserr;
- err = nfsd_create_setattr(rqstp, resfhp, iap);
+ err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
- /*
- * nfsd_create_setattr already committed the child. Transactional
- * filesystems had a chance to commit changes for both parent and
- * child simultaneously making the following commit_metadata a
- * noop.
- */
- err2 = nfserrno(commit_metadata(fhp));
- if (err2)
- err = err2;
- /*
- * Update the file handle to get the new inode info.
- */
- if (!err)
- err = fh_update(resfhp);
out:
dput(dchild);
return err;
@@ -1338,8 +1370,8 @@ out_nfserr:
*/
__be32
nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
- char *fname, int flen, struct iattr *iap,
- int type, dev_t rdev, struct svc_fh *resfhp)
+ char *fname, int flen, struct nfsd_attrs *attrs,
+ int type, dev_t rdev, struct svc_fh *resfhp)
{
struct dentry *dentry, *dchild = NULL;
__be32 err;
@@ -1358,11 +1390,13 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (host_err)
return nfserrno(host_err);
- fh_lock_nested(fhp, I_MUTEX_PARENT);
+ inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT);
dchild = lookup_one_len(fname, dentry, flen);
host_err = PTR_ERR(dchild);
- if (IS_ERR(dchild))
- return nfserrno(host_err);
+ if (IS_ERR(dchild)) {
+ err = nfserrno(host_err);
+ goto out_unlock;
+ }
err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
/*
* We unconditionally drop our ref to dchild as fh_compose will have
@@ -1370,175 +1404,13 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
*/
dput(dchild);
if (err)
- return err;
- return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type,
- rdev, resfhp);
-}
-
-/*
- * NFSv3 and NFSv4 version of nfsd_create
- */
-__be32
-do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
- char *fname, int flen, struct iattr *iap,
- struct svc_fh *resfhp, int createmode, u32 *verifier,
- bool *truncp, bool *created)
-{
- struct dentry *dentry, *dchild = NULL;
- struct inode *dirp;
- __be32 err;
- int host_err;
- __u32 v_mtime=0, v_atime=0;
-
- err = nfserr_perm;
- if (!flen)
- goto out;
- err = nfserr_exist;
- if (isdotent(fname, flen))
- goto out;
- if (!(iap->ia_valid & ATTR_MODE))
- iap->ia_mode = 0;
- err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
- if (err)
- goto out;
-
- dentry = fhp->fh_dentry;
- dirp = d_inode(dentry);
-
- host_err = fh_want_write(fhp);
- if (host_err)
- goto out_nfserr;
-
- fh_lock_nested(fhp, I_MUTEX_PARENT);
-
- /*
- * Compose the response file handle.
- */
- dchild = lookup_one_len(fname, dentry, flen);
- host_err = PTR_ERR(dchild);
- if (IS_ERR(dchild))
- goto out_nfserr;
-
- /* If file doesn't exist, check for permissions to create one */
- if (d_really_is_negative(dchild)) {
- err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
- if (err)
- goto out;
- }
-
- err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
- if (err)
- goto out;
-
- if (nfsd_create_is_exclusive(createmode)) {
- /* solaris7 gets confused (bugid 4218508) if these have
- * the high bit set, as do xfs filesystems without the
- * "bigtime" feature. So just clear the high bits. If this is
- * ever changed to use different attrs for storing the
- * verifier, then do_open_lookup() will also need to be fixed
- * accordingly.
- */
- v_mtime = verifier[0]&0x7fffffff;
- v_atime = verifier[1]&0x7fffffff;
- }
-
- if (d_really_is_positive(dchild)) {
- err = 0;
-
- switch (createmode) {
- case NFS3_CREATE_UNCHECKED:
- if (! d_is_reg(dchild))
- goto out;
- else if (truncp) {
- /* in nfsv4, we need to treat this case a little
- * differently. we don't want to truncate the
- * file now; this would be wrong if the OPEN
- * fails for some other reason. furthermore,
- * if the size is nonzero, we should ignore it
- * according to spec!
- */
- *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size;
- }
- else {
- iap->ia_valid &= ATTR_SIZE;
- goto set_attr;
- }
- break;
- case NFS3_CREATE_EXCLUSIVE:
- if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
- && d_inode(dchild)->i_atime.tv_sec == v_atime
- && d_inode(dchild)->i_size == 0 ) {
- if (created)
- *created = true;
- break;
- }
- fallthrough;
- case NFS4_CREATE_EXCLUSIVE4_1:
- if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
- && d_inode(dchild)->i_atime.tv_sec == v_atime
- && d_inode(dchild)->i_size == 0 ) {
- if (created)
- *created = true;
- goto set_attr;
- }
- fallthrough;
- case NFS3_CREATE_GUARDED:
- err = nfserr_exist;
- }
- fh_drop_write(fhp);
- goto out;
- }
-
- if (!IS_POSIXACL(dirp))
- iap->ia_mode &= ~current_umask();
-
- host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
- if (host_err < 0) {
- fh_drop_write(fhp);
- goto out_nfserr;
- }
- if (created)
- *created = true;
-
- nfsd_check_ignore_resizing(iap);
-
- if (nfsd_create_is_exclusive(createmode)) {
- /* Cram the verifier into atime/mtime */
- iap->ia_valid = ATTR_MTIME|ATTR_ATIME
- | ATTR_MTIME_SET|ATTR_ATIME_SET;
- /* XXX someone who knows this better please fix it for nsec */
- iap->ia_mtime.tv_sec = v_mtime;
- iap->ia_atime.tv_sec = v_atime;
- iap->ia_mtime.tv_nsec = 0;
- iap->ia_atime.tv_nsec = 0;
- }
-
- set_attr:
- err = nfsd_create_setattr(rqstp, resfhp, iap);
-
- /*
- * nfsd_create_setattr already committed the child
- * (and possibly also the parent).
- */
- if (!err)
- err = nfserrno(commit_metadata(fhp));
-
- /*
- * Update the filehandle to get the new inode info.
- */
- if (!err)
- err = fh_update(resfhp);
-
- out:
- fh_unlock(fhp);
- if (dchild && !IS_ERR(dchild))
- dput(dchild);
- fh_drop_write(fhp);
- return err;
-
- out_nfserr:
- err = nfserrno(host_err);
- goto out;
+ goto out_unlock;
+ fh_fill_pre_attrs(fhp);
+ err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp);
+ fh_fill_post_attrs(fhp);
+out_unlock:
+ inode_unlock(dentry->d_inode);
+ return err;
}
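
With fh_lock()/fh_unlock() gone, callers take inode_lock() themselves and bracket the directory-modifying call with fh_fill_pre_attrs()/fh_fill_post_attrs(), which snapshot the change/ctime information that weak cache consistency replies need. The shape shared by the create, link, unlink and symlink paths in this patch (do_dir_op() is a stand-in, not a real helper):

    inode_lock_nested(dir, I_MUTEX_PARENT);  /* lock the parent dir  */
    fh_fill_pre_attrs(fhp);                  /* pre-op WCC snapshot  */
    err = do_dir_op(...);                    /* the actual VFS call  */
    fh_fill_post_attrs(fhp);                 /* post-op WCC snapshot */
    inode_unlock(dir);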
/*
@@ -1579,15 +1451,25 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
return 0;
}
-/*
- * Create a symlink and look up its inode
+/**
+ * nfsd_symlink - Create a symlink and look up its inode
+ * @rqstp: RPC transaction being executed
+ * @fhp: NFS filehandle of parent directory
+ * @fname: filename of the new symlink
+ * @flen: length of @fname
+ * @path: content of the new symlink (NUL-terminated)
+ * @attrs: requested attributes of new object
+ * @resfhp: NFS filehandle of new object
+ *
* N.B. After this call _both_ fhp and resfhp need an fh_put
+ *
+ * Returns nfs_ok on success, or an nfsstat in network byte order.
*/
__be32
nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
- char *fname, int flen,
- char *path,
- struct svc_fh *resfhp)
+ char *fname, int flen,
+ char *path, struct nfsd_attrs *attrs,
+ struct svc_fh *resfhp)
{
struct dentry *dentry, *dnew;
__be32 err, cerr;
@@ -1605,33 +1487,35 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
host_err = fh_want_write(fhp);
- if (host_err)
- goto out_nfserr;
+ if (host_err) {
+ err = nfserrno(host_err);
+ goto out;
+ }
- fh_lock(fhp);
dentry = fhp->fh_dentry;
+ inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT);
dnew = lookup_one_len(fname, dentry, flen);
- host_err = PTR_ERR(dnew);
- if (IS_ERR(dnew))
- goto out_nfserr;
-
+ if (IS_ERR(dnew)) {
+ err = nfserrno(PTR_ERR(dnew));
+ inode_unlock(dentry->d_inode);
+ goto out_drop_write;
+ }
+ fh_fill_pre_attrs(fhp);
host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path);
err = nfserrno(host_err);
- fh_unlock(fhp);
+ cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
+ if (!err)
+ nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
+ fh_fill_post_attrs(fhp);
+ inode_unlock(dentry->d_inode);
if (!err)
err = nfserrno(commit_metadata(fhp));
-
- fh_drop_write(fhp);
-
- cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
dput(dnew);
if (err==0) err = cerr;
+out_drop_write:
+ fh_drop_write(fhp);
out:
return err;
-
-out_nfserr:
- err = nfserrno(host_err);
- goto out;
}
/*
@@ -1669,22 +1553,25 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
goto out;
}
- fh_lock_nested(ffhp, I_MUTEX_PARENT);
ddir = ffhp->fh_dentry;
dirp = d_inode(ddir);
+ inode_lock_nested(dirp, I_MUTEX_PARENT);
dnew = lookup_one_len(name, ddir, len);
- host_err = PTR_ERR(dnew);
- if (IS_ERR(dnew))
- goto out_nfserr;
+ if (IS_ERR(dnew)) {
+ err = nfserrno(PTR_ERR(dnew));
+ goto out_unlock;
+ }
dold = tfhp->fh_dentry;
err = nfserr_noent;
if (d_really_is_negative(dold))
goto out_dput;
+ fh_fill_pre_attrs(ffhp);
host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL);
- fh_unlock(ffhp);
+ fh_fill_post_attrs(ffhp);
+ inode_unlock(dirp);
if (!host_err) {
err = nfserrno(commit_metadata(ffhp));
if (!err)
@@ -1695,17 +1582,17 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
else
err = nfserrno(host_err);
}
-out_dput:
dput(dnew);
-out_unlock:
- fh_unlock(ffhp);
+out_drop_write:
fh_drop_write(tfhp);
out:
return err;
-out_nfserr:
- err = nfserrno(host_err);
- goto out_unlock;
+out_dput:
+ dput(dnew);
+out_unlock:
+ inode_unlock(dirp);
+ goto out_drop_write;
}
static void
@@ -1766,10 +1653,7 @@ retry:
goto out;
}
- /* cannot use fh_lock as we need deadlock protective ordering
- * so do it by hand */
trap = lock_rename(tdentry, fdentry);
- ffhp->fh_locked = tfhp->fh_locked = true;
fh_fill_pre_attrs(ffhp);
fh_fill_pre_attrs(tfhp);
@@ -1812,7 +1696,15 @@ retry:
.new_dir = tdir,
.new_dentry = ndentry,
};
- host_err = vfs_rename(&rd);
+ int retries;
+
+ for (retries = 1;;) {
+ host_err = vfs_rename(&rd);
+ if (host_err != -EAGAIN || !retries--)
+ break;
+ if (!nfsd_wait_for_delegreturn(rqstp, d_inode(odentry)))
+ break;
+ }
if (!host_err) {
host_err = commit_metadata(tfhp);
if (!host_err)
@@ -1825,17 +1717,12 @@ retry:
dput(odentry);
out_nfserr:
err = nfserrno(host_err);
- /*
- * We cannot rely on fh_unlock on the two filehandles,
- * as that would do the wrong thing if the two directories
- * were the same, so again we do it by hand.
- */
+
if (!close_cached) {
fh_fill_post_attrs(ffhp);
fh_fill_post_attrs(tfhp);
}
unlock_rename(tdentry, fdentry);
- ffhp->fh_locked = tfhp->fh_locked = false;
fh_drop_write(ffhp);
/*
@@ -1879,19 +1766,19 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
if (host_err)
goto out_nfserr;
- fh_lock_nested(fhp, I_MUTEX_PARENT);
dentry = fhp->fh_dentry;
dirp = d_inode(dentry);
+ inode_lock_nested(dirp, I_MUTEX_PARENT);
rdentry = lookup_one_len(fname, dentry, flen);
host_err = PTR_ERR(rdentry);
if (IS_ERR(rdentry))
- goto out_drop_write;
+ goto out_unlock;
if (d_really_is_negative(rdentry)) {
dput(rdentry);
host_err = -ENOENT;
- goto out_drop_write;
+ goto out_unlock;
}
rinode = d_inode(rdentry);
ihold(rinode);
@@ -1899,15 +1786,26 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
if (!type)
type = d_inode(rdentry)->i_mode & S_IFMT;
+ fh_fill_pre_attrs(fhp);
if (type != S_IFDIR) {
+ int retries;
+
if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK)
nfsd_close_cached_files(rdentry);
- host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL);
+
+ for (retries = 1;;) {
+ host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL);
+ if (host_err != -EAGAIN || !retries--)
+ break;
+ if (!nfsd_wait_for_delegreturn(rqstp, rinode))
+ break;
+ }
} else {
host_err = vfs_rmdir(&init_user_ns, dirp, rdentry);
}
+ fh_fill_post_attrs(fhp);
- fh_unlock(fhp);
+ inode_unlock(dirp);
if (!host_err)
host_err = commit_metadata(fhp);
dput(rdentry);
@@ -1929,6 +1827,9 @@ out_nfserr:
}
out:
return err;
+out_unlock:
+ inode_unlock(dirp);
+ goto out_drop_write;
}
/*
@@ -1953,7 +1854,7 @@ struct readdir_data {
int full;
};
-static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
+static bool nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
int namlen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -1965,7 +1866,7 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64));
if (buf->used + reclen > PAGE_SIZE) {
buf->full = 1;
- return -EINVAL;
+ return false;
}
de->namlen = namlen;
@@ -1975,7 +1876,7 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
memcpy(de->name, name, namlen);
buf->used += reclen;
- return 0;
+ return true;
}
static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp,
@@ -2282,13 +2183,16 @@ out:
return err;
}
-/*
- * Removexattr and setxattr need to call fh_lock to both lock the inode
- * and set the change attribute. Since the top-level vfs_removexattr
- * and vfs_setxattr calls already do their own inode_lock calls, call
- * the _locked variant. Pass in a NULL pointer for delegated_inode,
- * and let the client deal with NFS4ERR_DELAY (same as with e.g.
- * setattr and remove).
+/**
+ * nfsd_removexattr - Remove an extended attribute
+ * @rqstp: RPC transaction being executed
+ * @fhp: NFS filehandle of object with xattr to remove
+ * @name: name of xattr to remove (NUL-terminated)
+ *
+ * Pass in a NULL pointer for delegated_inode, and let the client deal
+ * with NFS4ERR_DELAY (same as with e.g. setattr and remove).
+ *
+ * Returns nfs_ok on success, or an nfsstat in network byte order.
*/
__be32
nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name)
@@ -2304,12 +2208,14 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name)
if (ret)
return nfserrno(ret);
- fh_lock(fhp);
+ inode_lock(fhp->fh_dentry->d_inode);
+ fh_fill_pre_attrs(fhp);
ret = __vfs_removexattr_locked(&init_user_ns, fhp->fh_dentry,
name, NULL);
- fh_unlock(fhp);
+ fh_fill_post_attrs(fhp);
+ inode_unlock(fhp->fh_dentry->d_inode);
fh_drop_write(fhp);
return nfsd_xattr_errno(ret);
@@ -2329,12 +2235,13 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
ret = fh_want_write(fhp);
if (ret)
return nfserrno(ret);
- fh_lock(fhp);
+ inode_lock(fhp->fh_dentry->d_inode);
+ fh_fill_pre_attrs(fhp);
ret = __vfs_setxattr_locked(&init_user_ns, fhp->fh_dentry, name, buf,
len, flags, NULL);
-
- fh_unlock(fhp);
+ fh_fill_post_attrs(fhp);
+ inode_unlock(fhp->fh_dentry->d_inode);
fh_drop_write(fhp);
return nfsd_xattr_errno(ret);
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index ccb87b2864f6..120521bc7b24 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -6,6 +6,8 @@
#ifndef LINUX_NFSD_VFS_H
#define LINUX_NFSD_VFS_H
+#include <linux/fs.h>
+#include <linux/posix_acl.h>
#include "nfsfh.h"
#include "nfsd.h"
@@ -42,6 +44,22 @@ struct nfsd_file;
typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
/* nfsd/vfs.c */
+struct nfsd_attrs {
+ struct iattr *na_iattr; /* input */
+ struct xdr_netobj *na_seclabel; /* input */
+ struct posix_acl *na_pacl; /* input */
+ struct posix_acl *na_dpacl; /* input */
+
+ int na_labelerr; /* output */
+ int na_aclerr; /* output */
+};
+
+static inline void nfsd_attrs_free(struct nfsd_attrs *attrs)
+{
+ posix_acl_release(attrs->na_pacl);
+ posix_acl_release(attrs->na_dpacl);
+}
+
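
struct nfsd_attrs bundles the iattr with the optional security label and POSIX ACLs, so a single nfsd_setattr()/nfsd_create() call can apply them all under one inode lock, reporting per-item failures through na_labelerr/na_aclerr. Illustrative stack usage, mirroring the NFSv2 procedures earlier in this series (iap, label and fhp stand in for decoded arguments):

    struct nfsd_attrs attrs = {
            .na_iattr    = &iap,    /* plain attributes, always set   */
            .na_seclabel = label,   /* optional NFSv4 security label  */
    };
    __be32 status;

    status = nfsd_setattr(rqstp, fhp, &attrs, 0, (time64_t)0);
    /* drop references to any ACLs the XDR decoder attached */
    nfsd_attrs_free(&attrs);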
int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
struct svc_export **expp);
__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *,
@@ -50,11 +68,9 @@ __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
const char *, unsigned int,
struct svc_export **, struct dentry **);
__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *,
- struct iattr *, int, time64_t);
+ struct nfsd_attrs *, int, time64_t);
int nfsd_mountpoint(struct dentry *, struct svc_export *);
#ifdef CONFIG_NFSD_V4
-__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
- struct xdr_netobj *);
__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
struct file *, loff_t, loff_t, int);
__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
@@ -63,16 +79,14 @@ __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
u64 count, bool sync);
#endif /* CONFIG_NFSD_V4 */
__be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *,
- char *name, int len, struct iattr *attrs,
- int type, dev_t rdev, struct svc_fh *res);
+ struct nfsd_attrs *attrs, int type, dev_t rdev,
+ struct svc_fh *res);
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
- char *name, int len, struct iattr *attrs,
+ char *name, int len, struct nfsd_attrs *attrs,
int type, dev_t rdev, struct svc_fh *res);
__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
-__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
- char *name, int len, struct iattr *attrs,
- struct svc_fh *res, int createmode,
- u32 *verifier, bool *truncp, bool *created);
+__be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct svc_fh *resfhp, struct nfsd_attrs *iap);
__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp,
u64 offset, u32 count, __be32 *verf);
#ifdef CONFIG_NFSD_V4
@@ -88,7 +102,7 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
int nfsd_open_break_lease(struct inode *, int);
__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
int, struct file **);
-__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t,
+__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *,
int, struct file **);
__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
@@ -112,8 +126,9 @@ __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
char *, int *);
__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
- char *name, int len, char *path,
- struct svc_fh *res);
+ char *name, int len, char *path,
+ struct nfsd_attrs *attrs,
+ struct svc_fh *res);
__be32 nfsd_link(struct svc_rqst *, struct svc_fh *,
char *, int, struct svc_fh *);
ssize_t nfsd_copy_file_range(struct file *, u64,
@@ -159,10 +174,4 @@ static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat)
AT_STATX_SYNC_AS_STAT));
}
-static inline int nfsd_create_is_exclusive(int createmode)
-{
- return createmode == NFS3_CREATE_EXCLUSIVE
- || createmode == NFS4_CREATE_EXCLUSIVE4_1;
-}
-
#endif /* LINUX_NFSD_VFS_H */
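
The new struct nfsd_attrs bundles the iattr, security label, and both posix ACLs that previously traveled as separate arguments, with the na_labelerr/na_aclerr fields carrying partial-failure results back out. A minimal sketch of the call pattern, assuming live rqstp/fhp/resfhp handles and a directory create (the error-reporting detail is an assumption, not taken from this diff):

	struct iattr iap = { .ia_valid = ATTR_MODE, .ia_mode = S_IFDIR | 0755 };
	struct nfsd_attrs attrs = {
		.na_iattr = &iap,	/* plain attributes, always applied */
		/* .na_seclabel / .na_pacl / .na_dpacl stay NULL here */
	};
	__be32 status;

	status = nfsd_create(rqstp, fhp, name, namelen, &attrs,
			     S_IFDIR, 0, resfhp);
	if (attrs.na_labelerr || attrs.na_aclerr) {
		/* label/ACL application failed without failing the create;
		 * a caller can report that separately rather than erroring
		 * the whole operation */
	}
	nfsd_attrs_free(&attrs);	/* drops the two posix_acl references */
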
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 846ab6df9d48..0eb00105d845 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -273,11 +273,13 @@ struct nfsd4_open {
bool op_truncate; /* used during processing */
bool op_created; /* used during processing */
struct nfs4_openowner *op_openowner; /* used during processing */
+ struct file *op_filp; /* used during processing */
struct nfs4_file *op_file; /* used during processing */
struct nfs4_ol_stateid *op_stp; /* used during processing */
struct nfs4_clnt_odstate *op_odstate; /* used during processing */
struct nfs4_acl *op_acl;
struct xdr_netobj op_label;
+ struct svc_rqst *op_rqstp;
};
struct nfsd4_open_confirm {
@@ -301,9 +303,10 @@ struct nfsd4_read {
u32 rd_length; /* request */
int rd_vlen;
struct nfsd_file *rd_nf;
-
+
struct svc_rqst *rd_rqstp; /* response */
- struct svc_fh *rd_fhp; /* response */
+ struct svc_fh *rd_fhp; /* response */
+ u32 rd_eof; /* response */
};
struct nfsd4_readdir {
@@ -531,6 +534,13 @@ struct nfsd42_write_res {
stateid_t cb_stateid;
};
+struct nfsd4_cb_offload {
+ struct nfsd4_callback co_cb;
+ struct nfsd42_write_res co_res;
+ __be32 co_nfserr;
+ struct knfsd_fh co_fh;
+};
+
struct nfsd4_copy {
/* request */
stateid_t cp_src_stateid;
@@ -538,18 +548,16 @@ struct nfsd4_copy {
u64 cp_src_pos;
u64 cp_dst_pos;
u64 cp_count;
- struct nl4_server cp_src;
- bool cp_intra;
+ struct nl4_server *cp_src;
- /* both */
- u32 cp_synchronous;
+ unsigned long cp_flags;
+#define NFSD4_COPY_F_STOPPED (0)
+#define NFSD4_COPY_F_INTRA (1)
+#define NFSD4_COPY_F_SYNCHRONOUS (2)
+#define NFSD4_COPY_F_COMMITTED (3)
/* response */
struct nfsd42_write_res cp_res;
-
- /* for cb_offload */
- struct nfsd4_callback cp_cb;
- __be32 nfserr;
struct knfsd_fh fh;
struct nfs4_client *cp_clp;
@@ -562,14 +570,35 @@ struct nfsd4_copy {
struct list_head copies;
struct task_struct *copy_task;
refcount_t refcount;
- bool stopped;
struct vfsmount *ss_mnt;
struct nfs_fh c_fh;
nfs4_stateid stateid;
- bool committed;
};
+static inline void nfsd4_copy_set_sync(struct nfsd4_copy *copy, bool sync)
+{
+ if (sync)
+ set_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+ else
+ clear_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+}
+
+static inline bool nfsd4_copy_is_sync(const struct nfsd4_copy *copy)
+{
+ return test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+}
+
+static inline bool nfsd4_copy_is_async(const struct nfsd4_copy *copy)
+{
+ return !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+}
+
+static inline bool nfsd4_ssc_is_inter(const struct nfsd4_copy *copy)
+{
+ return !test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
+}
+
struct nfsd4_seek {
/* request */
stateid_t seek_stateid;
@@ -593,19 +622,20 @@ struct nfsd4_offload_status {
struct nfsd4_copy_notify {
/* request */
stateid_t cpn_src_stateid;
- struct nl4_server cpn_dst;
+ struct nl4_server *cpn_dst;
/* response */
stateid_t cpn_cnr_stateid;
u64 cpn_sec;
u32 cpn_nsec;
- struct nl4_server cpn_src;
+ struct nl4_server *cpn_src;
};
struct nfsd4_op {
u32 opnum;
- const struct nfsd4_operation * opdesc;
__be32 status;
+ const struct nfsd4_operation *opdesc;
+ struct nfs4_replay *replay;
union nfsd4_op_u {
struct nfsd4_access access;
struct nfsd4_close close;
@@ -669,7 +699,6 @@ struct nfsd4_op {
struct nfsd4_listxattrs listxattrs;
struct nfsd4_removexattr removexattr;
} u;
- struct nfs4_replay * replay;
};
bool nfsd4_cache_this_op(struct nfsd4_op *);
@@ -688,13 +717,13 @@ struct nfsd4_compoundargs {
struct svcxdr_tmpbuf *to_free;
struct svc_rqst *rqstp;
- u32 taglen;
char * tag;
+ u32 taglen;
u32 minorversion;
+ u32 client_opcnt;
u32 opcnt;
struct nfsd4_op *ops;
struct nfsd4_op iops[8];
- int cachetype;
};
struct nfsd4_compoundres {
@@ -703,8 +732,8 @@ struct nfsd4_compoundres {
struct svc_rqst * rqstp;
__be32 *statusp;
- u32 taglen;
char * tag;
+ u32 taglen;
u32 opcnt;
struct nfsd4_compound_state cstate;
@@ -859,7 +888,8 @@ struct nfsd4_operation {
u32 op_flags;
char *op_name;
/* Try to get response size before operation */
- u32 (*op_rsize_bop)(struct svc_rqst *, struct nfsd4_op *);
+ u32 (*op_rsize_bop)(const struct svc_rqst *rqstp,
+ const struct nfsd4_op *op);
void (*op_get_currentstateid)(struct nfsd4_compound_state *,
union nfsd4_op_u *);
void (*op_set_currentstateid)(struct nfsd4_compound_state *,
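
The four bools that used to live in struct nfsd4_copy (cp_intra, cp_synchronous, stopped, committed) are folded into the single cp_flags word and manipulated with the standard atomic bitops, which the inline helpers above wrap. A sketch of the resulting call pattern, assuming copy points at a live struct nfsd4_copy; queue_offload_callback() is a hypothetical follow-up, not from this diff:

	/* replaces 'copy->cp_synchronous = sync' */
	nfsd4_copy_set_sync(copy, sync);

	/* replaces the old 'stopped' bool, atomic against concurrent users */
	if (test_and_set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags))
		return;		/* already being shut down */

	if (nfsd4_copy_is_async(copy) &&
	    !test_and_set_bit(NFSD4_COPY_F_COMMITTED, &copy->cp_flags))
		queue_offload_callback(copy);	/* hypothetical */
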
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index ca611ac09f7c..e74fda212620 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -70,7 +70,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
}
int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
- sector_t pblocknr, int mode, int mode_flags,
+ sector_t pblocknr, blk_opf_t opf,
struct buffer_head **pbh, sector_t *submit_ptr)
{
struct buffer_head *bh;
@@ -103,13 +103,13 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
}
}
- if (mode_flags & REQ_RAHEAD) {
+ if (opf & REQ_RAHEAD) {
if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) {
err = -EBUSY; /* internal code */
brelse(bh);
goto out_locked;
}
- } else { /* mode == READ */
+ } else { /* opf == REQ_OP_READ */
lock_buffer(bh);
}
if (buffer_uptodate(bh)) {
@@ -122,7 +122,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
bh->b_blocknr = pblocknr; /* set block address for read */
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
- submit_bh(mode, mode_flags, bh);
+ submit_bh(opf, bh);
bh->b_blocknr = blocknr; /* set back to the given block address */
*submit_ptr = pblocknr;
err = 0;
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index bd5544e63a01..4bc5612dff94 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -34,8 +34,8 @@ void nilfs_init_btnc_inode(struct inode *btnc_inode);
void nilfs_btnode_cache_clear(struct address_space *);
struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
__u64 blocknr);
-int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int,
- int, struct buffer_head **, sector_t *);
+int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
+ blk_opf_t, struct buffer_head **, sector_t *);
void nilfs_btnode_delete(struct buffer_head *);
int nilfs_btnode_prepare_change_key(struct address_space *,
struct nilfs_btnode_chkey_ctxt *);
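
With the switch to blk_opf_t, the request operation and its flags travel in one word: callers OR them together, callees test the flag bits directly and mask the bare op back out where needed. A condensed sketch of both sides, using the names from the hunks above (bh and the surrounding error paths assumed):

	blk_opf_t opf = REQ_OP_READ | REQ_RAHEAD;  /* op and flags, one word */

	if (opf & REQ_RAHEAD) {			/* flag bits test directly */
		if (!trylock_buffer(bh))
			return -EBUSY;		/* never block for readahead */
	} else {				/* opf == REQ_OP_READ */
		lock_buffer(bh);
	}
	submit_bh(opf, bh);			/* no separate flags argument */
	trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff,
				      opf & REQ_OP_MASK);  /* bare op only */
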
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index f544c22fff78..9f4d9432d38a 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -477,7 +477,7 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
sector_t submit_ptr = 0;
int ret;
- ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, 0, &bh,
+ ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, &bh,
&submit_ptr);
if (ret) {
if (ret != -EEXIST)
@@ -495,8 +495,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax);
ret = nilfs_btnode_submit_block(btnc, ptr2, 0,
- REQ_OP_READ, REQ_RAHEAD,
- &ra_bh, &submit_ptr);
+ REQ_OP_READ | REQ_RAHEAD,
+ &ra_bh, &submit_ptr);
if (likely(!ret || ret == -EEXIST))
brelse(ra_bh);
else if (ret != -EBUSY)
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index f8f4c2ff52f4..decd6471300b 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -194,7 +194,7 @@ static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
if (!IS_ERR(page)) {
kmap(page);
if (unlikely(!PageChecked(page))) {
- if (PageError(page) || !nilfs_check_page(page))
+ if (!nilfs_check_page(page))
goto fail;
}
}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 04fdd420eae7..b0d22ff24b67 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -92,7 +92,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
bh->b_blocknr = pbn;
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
- submit_bh(REQ_OP_READ, 0, bh);
+ submit_bh(REQ_OP_READ, bh);
if (vbn)
bh->b_blocknr = vbn;
out:
@@ -129,9 +129,8 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
struct inode *btnc_inode = NILFS_I(inode)->i_assoc_inode;
int ret;
- ret = nilfs_btnode_submit_block(btnc_inode->i_mapping,
- vbn ? : pbn, pbn, REQ_OP_READ, 0,
- out_bh, &pbn);
+ ret = nilfs_btnode_submit_block(btnc_inode->i_mapping, vbn ? : pbn, pbn,
+ REQ_OP_READ, out_bh, &pbn);
if (ret == -EEXIST) /* internal code (cache hit) */
ret = 0;
return ret;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6045cea21f52..67f63cfeade5 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -63,10 +63,10 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n)
/**
* nilfs_get_block() - get a file block on the filesystem (callback function)
- * @inode - inode struct of the target file
- * @blkoff - file block number
- * @bh_result - buffer head to be mapped on
- * @create - indicate whether allocating the block or not when it has not
+ * @inode: inode struct of the target file
+ * @blkoff: file block number
+ * @bh_result: buffer head to be mapped on
+ * @create: indicate whether allocating the block or not when it has not
* been allocated yet.
*
* This function does not issue actual read request of the specified data
@@ -140,14 +140,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
}
/**
- * nilfs_readpage() - implement readpage() method of nilfs_aops {}
+ * nilfs_read_folio() - implement read_folio() method of nilfs_aops {}
* address_space_operations.
- * @file - file struct of the file to be read
- * @page - the page to be read
+ * @file: file struct of the file to be read
+ * @folio: the folio to be read
*/
-static int nilfs_readpage(struct file *file, struct page *page)
+static int nilfs_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, nilfs_get_block);
+ return mpage_read_folio(folio, nilfs_get_block);
}
static void nilfs_readahead(struct readahead_control *rac)
@@ -248,7 +248,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to)
}
static int nilfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
@@ -258,8 +258,7 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
if (unlikely(err))
return err;
- err = block_write_begin(mapping, pos, len, flags, pagep,
- nilfs_get_block);
+ err = block_write_begin(mapping, pos, len, pagep, nilfs_get_block);
if (unlikely(err)) {
nilfs_write_failed(mapping, pos + len);
nilfs_transaction_abort(inode->i_sb);
@@ -299,13 +298,12 @@ nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
const struct address_space_operations nilfs_aops = {
.writepage = nilfs_writepage,
- .readpage = nilfs_readpage,
+ .read_folio = nilfs_read_folio,
.writepages = nilfs_writepages,
.dirty_folio = nilfs_dirty_folio,
.readahead = nilfs_readahead,
.write_begin = nilfs_write_begin,
.write_end = nilfs_write_end,
- /* .releasepage = nilfs_releasepage, */
.invalidate_folio = block_invalidate_folio,
.direct_IO = nilfs_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -1088,6 +1086,7 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
/**
* nilfs_dirty_inode - reflect changes on given inode to an inode block.
* @inode: inode of the file to be registered.
+ * @flags: flags to determine the dirty state of the inode
*
* nilfs_dirty_inode() loads an inode block containing the specified
* @inode and copies data from a nilfs_inode to a corresponding inode
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index fec194a666f4..87e1004b606d 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1052,20 +1052,20 @@ out:
static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp)
{
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
- struct request_queue *q = bdev_get_queue(nilfs->ns_bdev);
struct fstrim_range range;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(nilfs->ns_bdev))
return -EOPNOTSUPP;
if (copy_from_user(&range, argp, sizeof(range)))
return -EFAULT;
- range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity);
+ range.minlen = max_t(u64, range.minlen,
+ bdev_discard_granularity(nilfs->ns_bdev));
down_read(&nilfs->ns_segctor_sem);
ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index d29a0f2b9c16..cbf4fa60eea2 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -111,8 +111,8 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
}
static int
-nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
- int mode, int mode_flags, struct buffer_head **out_bh)
+nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, blk_opf_t opf,
+ struct buffer_head **out_bh)
{
struct buffer_head *bh;
__u64 blknum = 0;
@@ -126,12 +126,12 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
if (buffer_uptodate(bh))
goto out;
- if (mode_flags & REQ_RAHEAD) {
+ if (opf & REQ_RAHEAD) {
if (!trylock_buffer(bh)) {
ret = -EBUSY;
goto failed_bh;
}
- } else /* mode == READ */
+ } else /* opf == REQ_OP_READ */
lock_buffer(bh);
if (buffer_uptodate(bh)) {
@@ -148,10 +148,11 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
- submit_bh(mode, mode_flags, bh);
+ submit_bh(opf, bh);
ret = 0;
- trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode);
+ trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff,
+ opf & REQ_OP_MASK);
out:
get_bh(bh);
*out_bh = bh;
@@ -172,7 +173,7 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS;
int err;
- err = nilfs_mdt_submit_block(inode, block, REQ_OP_READ, 0, &first_bh);
+ err = nilfs_mdt_submit_block(inode, block, REQ_OP_READ, &first_bh);
if (err == -EEXIST) /* internal code */
goto out;
@@ -182,8 +183,8 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
if (readahead) {
blkoff = block + 1;
for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
- err = nilfs_mdt_submit_block(inode, blkoff, REQ_OP_READ,
- REQ_RAHEAD, &bh);
+ err = nilfs_mdt_submit_block(inode, blkoff,
+ REQ_OP_READ | REQ_RAHEAD, &bh);
if (likely(!err || err == -EEXIST))
brelse(bh);
else if (err != -EBUSY)
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 1344f7d475d3..aecda4fc95f5 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -198,6 +198,9 @@ static inline int nilfs_acl_chmod(struct inode *inode)
static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
{
+ if (S_ISLNK(inode->i_mode))
+ return 0;
+
inode->i_mode &= ~current_umask();
return 0;
}
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a8e88cc38e16..3267e96c256c 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -294,57 +294,57 @@ repeat:
void nilfs_copy_back_pages(struct address_space *dmap,
struct address_space *smap)
{
- struct pagevec pvec;
+ struct folio_batch fbatch;
unsigned int i, n;
- pgoff_t index = 0;
+ pgoff_t start = 0;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
repeat:
- n = pagevec_lookup(&pvec, smap, &index);
+ n = filemap_get_folios(smap, &start, ~0UL, &fbatch);
if (!n)
return;
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i], *dpage;
- pgoff_t offset = page->index;
-
- lock_page(page);
- dpage = find_lock_page(dmap, offset);
- if (dpage) {
- /* overwrite existing page in the destination cache */
- WARN_ON(PageDirty(dpage));
- nilfs_copy_page(dpage, page, 0);
- unlock_page(dpage);
- put_page(dpage);
- /* Do we not need to remove page from smap here? */
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i], *dfolio;
+ pgoff_t index = folio->index;
+
+ folio_lock(folio);
+ dfolio = filemap_lock_folio(dmap, index);
+ if (dfolio) {
+ /* overwrite existing folio in the destination cache */
+ WARN_ON(folio_test_dirty(dfolio));
+ nilfs_copy_page(&dfolio->page, &folio->page, 0);
+ folio_unlock(dfolio);
+ folio_put(dfolio);
+ /* Do we not need to remove folio from smap here? */
} else {
- struct page *p;
+ struct folio *f;
- /* move the page to the destination cache */
+ /* move the folio to the destination cache */
xa_lock_irq(&smap->i_pages);
- p = __xa_erase(&smap->i_pages, offset);
- WARN_ON(page != p);
+ f = __xa_erase(&smap->i_pages, index);
+ WARN_ON(folio != f);
smap->nrpages--;
xa_unlock_irq(&smap->i_pages);
xa_lock_irq(&dmap->i_pages);
- p = __xa_store(&dmap->i_pages, offset, page, GFP_NOFS);
- if (unlikely(p)) {
+ f = __xa_store(&dmap->i_pages, index, folio, GFP_NOFS);
+ if (unlikely(f)) {
/* Probably -ENOMEM */
- page->mapping = NULL;
- put_page(page);
+ folio->mapping = NULL;
+ folio_put(folio);
} else {
- page->mapping = dmap;
+ folio->mapping = dmap;
dmap->nrpages++;
- if (PageDirty(page))
- __xa_set_mark(&dmap->i_pages, offset,
+ if (folio_test_dirty(folio))
+ __xa_set_mark(&dmap->i_pages, index,
PAGECACHE_TAG_DIRTY);
}
xa_unlock_irq(&dmap->i_pages);
}
- unlock_page(page);
+ folio_unlock(folio);
}
- pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
goto repeat;
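
The pagevec-to-folio_batch conversion above follows the generic lookup-loop shape: filemap_get_folios() fills the batch and advances the start index past what it returned, so the loop terminates once the mapping is exhausted. A self-contained sketch of that skeleton, with the per-folio work left as a placeholder:

	struct folio_batch fbatch;
	pgoff_t start = 0;
	unsigned int i;

	folio_batch_init(&fbatch);
	while (filemap_get_folios(mapping, &start, ~0UL, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			folio_lock(folio);
			/* per-folio work goes here */
			folio_unlock(folio);
		}
		folio_batch_release(&fbatch);	/* drop the batch references */
		cond_resched();
	}
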
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 9e2ed76c0f25..0955b657938f 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -511,7 +511,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
pos = rb->blkoff << inode->i_blkbits;
err = block_write_begin(inode->i_mapping, pos, blocksize,
- 0, &page, nilfs_get_block);
+ &page, nilfs_get_block);
if (unlikely(err)) {
loff_t isize = inode->i_size;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index e385cca2004a..77ff8e95421f 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -1100,7 +1100,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (ret < 0) {
put_bh(su_bh);
goto out_sem;
@@ -1134,7 +1134,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (!ret)
ndiscarded += nblocks;
}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index dd48a8f74d57..3b4a079c9617 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -672,7 +672,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (ret < 0)
return ret;
nblocks = 0;
@@ -682,7 +682,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, 0);
+ GFP_NOFS);
return ret;
}
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 829dd4a61b66..190aa717fa32 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -168,7 +168,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
return;
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
- mutex_lock(&dnotify_group->mark_mutex);
+ fsnotify_group_lock(dnotify_group);
spin_lock(&fsn_mark->lock);
prev = &dn_mark->dn;
@@ -191,7 +191,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
free = true;
}
- mutex_unlock(&dnotify_group->mark_mutex);
+ fsnotify_group_unlock(dnotify_group);
if (free)
fsnotify_free_mark(fsn_mark);
@@ -324,7 +324,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
new_dn_mark->dn = NULL;
/* this is needed to prevent the fcntl/close race described below */
- mutex_lock(&dnotify_group->mark_mutex);
+ fsnotify_group_lock(dnotify_group);
/* add the new_fsn_mark or find an old one. */
fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
@@ -334,7 +334,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
} else {
error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0);
if (error) {
- mutex_unlock(&dnotify_group->mark_mutex);
+ fsnotify_group_unlock(dnotify_group);
goto out_err;
}
spin_lock(&new_fsn_mark->lock);
@@ -383,7 +383,7 @@ out:
if (destroy)
fsnotify_detach_mark(fsn_mark);
- mutex_unlock(&dnotify_group->mark_mutex);
+ fsnotify_group_unlock(dnotify_group);
if (destroy)
fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
@@ -401,7 +401,8 @@ static int __init dnotify_init(void)
SLAB_PANIC|SLAB_ACCOUNT);
dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT);
- dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
+ dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops,
+ FSNOTIFY_GROUP_NOFS);
if (IS_ERR(dnotify_group))
panic("unable to allocate fsnotify group for dnotify\n");
dnotify_sysctl_init();
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 985e995d2a39..a2a15bc4df28 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -18,7 +18,7 @@
#include "fanotify.h"
-static bool fanotify_path_equal(struct path *p1, struct path *p2)
+static bool fanotify_path_equal(const struct path *p1, const struct path *p2)
{
return p1->mnt == p2->mnt && p1->dentry == p2->dentry;
}
@@ -295,12 +295,13 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
const void *data, int data_type,
struct inode *dir)
{
- __u32 marks_mask = 0, marks_ignored_mask = 0;
+ __u32 marks_mask = 0, marks_ignore_mask = 0;
__u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS |
FANOTIFY_EVENT_FLAGS;
const struct path *path = fsnotify_data_path(data, data_type);
unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
struct fsnotify_mark *mark;
+ bool ondir = event_mask & FAN_ONDIR;
int type;
pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
@@ -315,31 +316,21 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
return 0;
} else if (!(fid_mode & FAN_REPORT_FID)) {
/* Do we have a directory inode to report? */
- if (!dir && !(event_mask & FS_ISDIR))
+ if (!dir && !ondir)
return 0;
}
- fsnotify_foreach_iter_type(type) {
- if (!fsnotify_iter_should_report_type(iter_info, type))
- continue;
- mark = iter_info->marks[type];
-
- /* Apply ignore mask regardless of ISDIR and ON_CHILD flags */
- marks_ignored_mask |= mark->ignored_mask;
-
+ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
/*
- * If the event is on dir and this mark doesn't care about
- * events on dir, don't send it!
+ * Apply ignore mask depending on event flags in ignore mask.
*/
- if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR))
- continue;
+ marks_ignore_mask |=
+ fsnotify_effective_ignore_mask(mark, ondir, type);
/*
- * If the event is on a child and this mark is on a parent not
- * watching children, don't send it!
+ * Send the event depending on event flags in mark mask.
*/
- if (type == FSNOTIFY_ITER_TYPE_PARENT &&
- !(mark->mask & FS_EVENT_ON_CHILD))
+ if (!fsnotify_mask_applicable(mark->mask, ondir, type))
continue;
marks_mask |= mark->mask;
@@ -348,7 +339,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
*match_mask |= 1U << type;
}
- test_mask = event_mask & marks_mask & ~marks_ignored_mask;
+ test_mask = event_mask & marks_mask & ~marks_ignore_mask;
/*
* For dirent modification events (create/delete/move) that do not carry
@@ -849,16 +840,14 @@ out:
*/
static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
{
+ struct fsnotify_mark *mark;
int type;
__kernel_fsid_t fsid = {};
- fsnotify_foreach_iter_type(type) {
+ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
struct fsnotify_mark_connector *conn;
- if (!fsnotify_iter_should_report_type(iter_info, type))
- continue;
-
- conn = READ_ONCE(iter_info->marks[type]->connector);
+ conn = READ_ONCE(mark->connector);
/* Mark is just getting destroyed or created? */
if (!conn)
continue;
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index a3d5b751cac5..57f51a9a3015 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -452,13 +452,7 @@ static inline bool fanotify_is_error_event(u32 mask)
return mask & FAN_FS_ERROR;
}
-static inline bool fanotify_event_has_path(struct fanotify_event *event)
-{
- return event->type == FANOTIFY_EVENT_TYPE_PATH ||
- event->type == FANOTIFY_EVENT_TYPE_PATH_PERM;
-}
-
-static inline struct path *fanotify_event_path(struct fanotify_event *event)
+static inline const struct path *fanotify_event_path(struct fanotify_event *event)
{
if (event->type == FANOTIFY_EVENT_TYPE_PATH)
return &FANOTIFY_PE(event)->path;
@@ -490,3 +484,17 @@ static inline unsigned int fanotify_event_hash_bucket(
{
return event->hash & FANOTIFY_HTABLE_MASK;
}
+
+static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
+{
+ unsigned int mflags = 0;
+
+ if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
+ mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
+ if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)
+ mflags |= FAN_MARK_EVICTABLE;
+ if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
+ mflags |= FAN_MARK_IGNORE;
+
+ return mflags;
+}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9b32b76a9c30..4546da4a54f9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -249,7 +249,7 @@ out:
return event;
}
-static int create_fd(struct fsnotify_group *group, struct path *path,
+static int create_fd(struct fsnotify_group *group, const struct path *path,
struct file **file)
{
int client_fd;
@@ -264,7 +264,7 @@ static int create_fd(struct fsnotify_group *group, struct path *path,
* originally opened O_WRONLY.
*/
new_file = dentry_open(path,
- group->fanotify_data.f_flags | FMODE_NONOTIFY,
+ group->fanotify_data.f_flags | __FMODE_NONOTIFY,
current_cred());
if (IS_ERR(new_file)) {
/*
@@ -619,7 +619,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
char __user *buf, size_t count)
{
struct fanotify_event_metadata metadata;
- struct path *path = fanotify_event_path(event);
+ const struct path *path = fanotify_event_path(event);
struct fanotify_info *info = fanotify_event_info(event);
unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
@@ -1009,10 +1009,10 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
mask &= ~umask;
spin_lock(&fsn_mark->lock);
oldmask = fsnotify_calc_mask(fsn_mark);
- if (!(flags & FAN_MARK_IGNORED_MASK)) {
+ if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) {
fsn_mark->mask &= ~mask;
} else {
- fsn_mark->ignored_mask &= ~mask;
+ fsn_mark->ignore_mask &= ~mask;
}
newmask = fsnotify_calc_mask(fsn_mark);
/*
@@ -1021,7 +1021,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
* changes to the mask.
* Destroy mark when only umask bits remain.
*/
- *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
+ *destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask);
spin_unlock(&fsn_mark->lock);
return oldmask & ~newmask;
@@ -1035,10 +1035,10 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
__u32 removed;
int destroy_mark;
- mutex_lock(&group->mark_mutex);
+ fsnotify_group_lock(group);
fsn_mark = fsnotify_find_mark(connp, group);
if (!fsn_mark) {
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
return -ENOENT;
}
@@ -1048,7 +1048,7 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
fsnotify_recalc_mask(fsn_mark->connector);
if (destroy_mark)
fsnotify_detach_mark(fsn_mark);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
if (destroy_mark)
fsnotify_free_mark(fsn_mark);
@@ -1081,47 +1081,72 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
flags, umask);
}
-static void fanotify_mark_add_ignored_mask(struct fsnotify_mark *fsn_mark,
- __u32 mask, unsigned int flags,
- __u32 *removed)
+static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
+ unsigned int fan_flags)
{
- fsn_mark->ignored_mask |= mask;
+ bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE);
+ unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS;
+ bool recalc = false;
+
+ /*
+ * When using FAN_MARK_IGNORE for the first time, mark starts using
+ * independent event flags in ignore mask. After that, trying to
+ * update the ignore mask with the old FAN_MARK_IGNORED_MASK API
+ * will result in EEXIST error.
+ */
+ if (ignore == FAN_MARK_IGNORE)
+ fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS;
/*
* Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
* the removal of the FS_MODIFY bit in calculated mask if it was set
- * because of an ignored mask that is now going to survive FS_MODIFY.
+ * because of an ignore mask that is now going to survive FS_MODIFY.
*/
- if ((flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+ if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
!(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
if (!(fsn_mark->mask & FS_MODIFY))
- *removed = FS_MODIFY;
+ recalc = true;
}
+
+ if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE ||
+ want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
+ return recalc;
+
+ /*
+ * NO_IREF may be removed from a mark, but not added.
+ * When removed, fsnotify_recalc_mask() will take the inode ref.
+ */
+ WARN_ON_ONCE(!want_iref);
+ fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF;
+
+ return true;
}
-static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
- __u32 mask, unsigned int flags,
- __u32 *removed)
+static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
+ __u32 mask, unsigned int fan_flags)
{
- __u32 oldmask, newmask;
+ bool recalc;
spin_lock(&fsn_mark->lock);
- oldmask = fsnotify_calc_mask(fsn_mark);
- if (!(flags & FAN_MARK_IGNORED_MASK)) {
+ if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS))
fsn_mark->mask |= mask;
- } else {
- fanotify_mark_add_ignored_mask(fsn_mark, mask, flags, removed);
- }
- newmask = fsnotify_calc_mask(fsn_mark);
+ else
+ fsn_mark->ignore_mask |= mask;
+
+ recalc = fsnotify_calc_mask(fsn_mark) &
+ ~fsnotify_conn_mask(fsn_mark->connector);
+
+ recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags);
spin_unlock(&fsn_mark->lock);
- return newmask & ~oldmask;
+ return recalc;
}
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp,
unsigned int obj_type,
+ unsigned int fan_flags,
__kernel_fsid_t *fsid)
{
struct ucounts *ucounts = group->fanotify_data.ucounts;
@@ -1144,6 +1169,9 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
}
fsnotify_init_mark(mark, group);
+ if (fan_flags & FAN_MARK_EVICTABLE)
+ mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
+
ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid);
if (ret) {
fsnotify_put_mark(mark);
@@ -1168,41 +1196,81 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group)
sizeof(struct fanotify_error_event));
}
+static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
+ unsigned int fan_flags)
+{
+ /*
+ * Non evictable mark cannot be downgraded to evictable mark.
+ */
+ if (fan_flags & FAN_MARK_EVICTABLE &&
+ !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
+ return -EEXIST;
+
+ /*
+ * New ignore mask semantics cannot be downgraded to old semantics.
+ */
+ if (fan_flags & FAN_MARK_IGNORED_MASK &&
+ fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
+ return -EEXIST;
+
+ /*
+ * An ignore mask that survives modify could never be downgraded to not
+ * survive modify. With new FAN_MARK_IGNORE semantics we make that rule
+ * explicit and return an error when trying to update the ignore mask
+ * without the original FAN_MARK_IGNORED_SURV_MODIFY value.
+ */
+ if (fan_flags & FAN_MARK_IGNORE &&
+ !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+ fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
+ return -EEXIST;
+
+ return 0;
+}
+
static int fanotify_add_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp, unsigned int obj_type,
- __u32 mask, unsigned int flags,
+ __u32 mask, unsigned int fan_flags,
__kernel_fsid_t *fsid)
{
struct fsnotify_mark *fsn_mark;
- __u32 added, removed = 0;
+ bool recalc;
int ret = 0;
- mutex_lock(&group->mark_mutex);
+ fsnotify_group_lock(group);
fsn_mark = fsnotify_find_mark(connp, group);
if (!fsn_mark) {
- fsn_mark = fanotify_add_new_mark(group, connp, obj_type, fsid);
+ fsn_mark = fanotify_add_new_mark(group, connp, obj_type,
+ fan_flags, fsid);
if (IS_ERR(fsn_mark)) {
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
return PTR_ERR(fsn_mark);
}
}
/*
+ * Check if requested mark flags conflict with an existing mark flags.
+ */
+ ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags);
+ if (ret)
+ goto out;
+
+ /*
* Error events are pre-allocated per group, only if strictly
* needed (i.e. FAN_FS_ERROR was requested).
*/
- if (!(flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
+ if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) &&
+ (mask & FAN_FS_ERROR)) {
ret = fanotify_group_init_error_pool(group);
if (ret)
goto out;
}
- added = fanotify_mark_add_to_mask(fsn_mark, mask, flags, &removed);
- if (removed || (added & ~fsnotify_conn_mask(fsn_mark->connector)))
+ recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags);
+ if (recalc)
fsnotify_recalc_mask(fsn_mark->connector);
out:
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
fsnotify_put_mark(fsn_mark);
return ret;
@@ -1232,10 +1300,10 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
/*
* If some other task has this inode open for write we should not add
- * an ignored mark, unless that ignored mark is supposed to survive
+ * an ignore mask, unless that ignore mask is supposed to survive
* modification changes anyway.
*/
- if ((flags & FAN_MARK_IGNORED_MASK) &&
+ if ((flags & FANOTIFY_MARK_IGNORE_BITS) &&
!(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
inode_is_open_for_write(inode))
return 0;
@@ -1348,14 +1416,15 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
(!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
return -EINVAL;
- f_flags = O_RDWR | FMODE_NONOTIFY;
+ f_flags = O_RDWR | __FMODE_NONOTIFY;
if (flags & FAN_CLOEXEC)
f_flags |= O_CLOEXEC;
if (flags & FAN_NONBLOCK)
f_flags |= O_NONBLOCK;
/* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
- group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
+ group = fsnotify_alloc_group(&fanotify_fsnotify_ops,
+ FSNOTIFY_GROUP_USER | FSNOTIFY_GROUP_NOFS);
if (IS_ERR(group)) {
return PTR_ERR(group);
}
@@ -1483,8 +1552,16 @@ static int fanotify_test_fid(struct dentry *dentry)
return 0;
}
-static int fanotify_events_supported(struct path *path, __u64 mask)
+static int fanotify_events_supported(struct fsnotify_group *group,
+ const struct path *path, __u64 mask,
+ unsigned int flags)
{
+ unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
+ /* Strict validation of events in non-dir inode mask with v5.17+ APIs */
+ bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
+ (mask & FAN_RENAME) ||
+ (flags & FAN_MARK_IGNORE);
+
/*
* Some filesystems such as 'proc' acquire unusual locks when opening
* files. For them fanotify permission events have high chances of
@@ -1496,6 +1573,16 @@ static int fanotify_events_supported(struct path *path, __u64 mask)
if (mask & FANOTIFY_PERM_EVENTS &&
path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
return -EINVAL;
+
+ /*
+ * We shouldn't have allowed setting dirent events and the directory
+ * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode,
+ * but because we always allowed it, error only when using new APIs.
+ */
+ if (strict_dir_events && mark_type == FAN_MARK_INODE &&
+ !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
+ return -ENOTDIR;
+
return 0;
}
@@ -1510,7 +1597,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
__kernel_fsid_t __fsid, *fsid = NULL;
u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
- bool ignored = flags & FAN_MARK_IGNORED_MASK;
+ unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
+ unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
unsigned int obj_type, fid_mode;
u32 umask = 0;
int ret;
@@ -1539,7 +1627,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
return -EINVAL;
}
- switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
+ switch (mark_cmd) {
case FAN_MARK_ADD:
case FAN_MARK_REMOVE:
if (!mask)
@@ -1559,9 +1647,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (mask & ~valid_mask)
return -EINVAL;
- /* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */
- if (ignored)
+
+ /* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */
+ if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK))
+ return -EINVAL;
+
+ /*
+ * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with
+ * FAN_MARK_IGNORED_MASK.
+ */
+ if (ignore == FAN_MARK_IGNORED_MASK) {
mask &= ~FANOTIFY_EVENT_FLAGS;
+ umask = FANOTIFY_EVENT_FLAGS;
+ }
f = fdget(fanotify_fd);
if (unlikely(!f.file))
@@ -1598,6 +1696,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
goto fput_and_out;
/*
+ * Evictable is only relevant for inode marks, because only inode object
+ * can be evicted on memory pressure.
+ */
+ if (flags & FAN_MARK_EVICTABLE &&
+ mark_type != FAN_MARK_INODE)
+ goto fput_and_out;
+
+ /*
* Events that do not carry enough information to report
* event->fd require a group that supports reporting fid. Those
* events are not supported on a mount mark, because they do not
@@ -1617,7 +1723,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
goto fput_and_out;
- if (flags & FAN_MARK_FLUSH) {
+ if (mark_cmd == FAN_MARK_FLUSH) {
ret = 0;
if (mark_type == FAN_MARK_MOUNT)
fsnotify_clear_vfsmount_marks_by_group(group);
@@ -1633,8 +1739,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (ret)
goto fput_and_out;
- if (flags & FAN_MARK_ADD) {
- ret = fanotify_events_supported(&path, mask);
+ if (mark_cmd == FAN_MARK_ADD) {
+ ret = fanotify_events_supported(group, &path, mask, flags);
if (ret)
goto path_put_and_out;
}
@@ -1657,6 +1763,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
else
mnt = path.mnt;
+ ret = mnt ? -EINVAL : -EISDIR;
+ /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
+ if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE &&
+ (mnt || S_ISDIR(inode->i_mode)) &&
+ !(flags & FAN_MARK_IGNORED_SURV_MODIFY))
+ goto path_put_and_out;
+
/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
if (mnt || !S_ISDIR(inode->i_mode)) {
mask &= ~FAN_EVENT_ON_CHILD;
@@ -1666,12 +1779,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
* events with parent/name info for non-directory.
*/
if ((fid_mode & FAN_REPORT_DIR_FID) &&
- (flags & FAN_MARK_ADD) && !ignored)
+ (flags & FAN_MARK_ADD) && !ignore)
mask |= FAN_EVENT_ON_CHILD;
}
/* create/update an inode mark */
- switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
+ switch (mark_cmd) {
case FAN_MARK_ADD:
if (mark_type == FAN_MARK_MOUNT)
ret = fanotify_add_vfsmount_mark(group, mnt, mask,
@@ -1749,7 +1862,7 @@ static int __init fanotify_user_setup(void)
BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
SLAB_PANIC|SLAB_ACCOUNT);
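
From userspace, the new FAN_MARK_IGNORE command replaces FAN_MARK_IGNORED_MASK with stricter rules enforced above: the two cannot be combined, event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) are honored inside the ignore mask, and directory/mount/filesystem marks must also pass FAN_MARK_IGNORED_SURV_MODIFY. A minimal sketch, assuming headers that already define FAN_MARK_IGNORE and with error handling omitted:

	#include <fcntl.h>		/* AT_FDCWD */
	#include <sys/fanotify.h>

	int main(void)
	{
		int fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);

		/* ignore opens of the directory itself; SURV_MODIFY is
		 * mandatory here because /tmp is a directory mark */
		fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_IGNORE |
				  FAN_MARK_IGNORED_SURV_MODIFY,
			      FAN_OPEN | FAN_ONDIR, AT_FDCWD, "/tmp");
		return 0;
	}
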
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 57f0d5d9f934..55081ae3a6ec 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -14,6 +14,7 @@
#include <linux/exportfs.h>
#include "inotify/inotify.h"
+#include "fanotify/fanotify.h"
#include "fdinfo.h"
#include "fsnotify.h"
@@ -28,13 +29,13 @@ static void show_fdinfo(struct seq_file *m, struct file *f,
struct fsnotify_group *group = f->private_data;
struct fsnotify_mark *mark;
- mutex_lock(&group->mark_mutex);
+ fsnotify_group_lock(group);
list_for_each_entry(mark, &group->marks_list, g_list) {
show(m, mark);
if (seq_has_overflowed(m))
break;
}
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
}
#if defined(CONFIG_EXPORTFS)
@@ -83,16 +84,9 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
inode = igrab(fsnotify_conn_inode(mark->connector));
if (inode) {
- /*
- * IN_ALL_EVENTS represents all of the mask bits
- * that we expose to userspace. There is at
- * least one bit (FS_EVENT_ON_CHILD) which is
- * used only internally to the kernel.
- */
- u32 mask = mark->mask & IN_ALL_EVENTS;
- seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
+ seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ",
inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
- mask, mark->ignored_mask);
+ inotify_mark_user_mask(mark));
show_mark_fhandle(m, inode);
seq_putc(m, '\n');
iput(inode);
@@ -110,19 +104,16 @@ void inotify_show_fdinfo(struct seq_file *m, struct file *f)
static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
{
- unsigned int mflags = 0;
+ unsigned int mflags = fanotify_mark_user_flags(mark);
struct inode *inode;
- if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
- mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
-
if (mark->connector->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = igrab(fsnotify_conn_inode(mark->connector));
if (!inode)
return;
seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
inode->i_ino, inode->i_sb->s_dev,
- mflags, mark->mask, mark->ignored_mask);
+ mflags, mark->mask, mark->ignore_mask);
show_mark_fhandle(m, inode);
seq_putc(m, '\n');
iput(inode);
@@ -130,12 +121,12 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
struct mount *mnt = fsnotify_conn_mount(mark->connector);
seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
- mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
+ mnt->mnt_id, mflags, mark->mask, mark->ignore_mask);
} else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) {
struct super_block *sb = fsnotify_conn_sb(mark->connector);
seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n",
- sb->s_dev, mflags, mark->mask, mark->ignored_mask);
+ sb->s_dev, mflags, mark->mask, mark->ignore_mask);
}
}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 70a8516b78bc..7974e91ffe13 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -100,7 +100,7 @@ void fsnotify_sb_delete(struct super_block *sb)
* Given an inode, first check if we care what happens to our children. Inotify
* and dnotify both tell their parents about events. If we care about any event
* on a child we run all of our children and set a dentry flag saying that the
- * parent cares. Thus when an event happens on a child it can quickly tell if
+ * parent cares. Thus when an event happens on a child it can quickly tell
* if there is a need to find a parent and send the event to the parent.
*/
void __fsnotify_update_child_dentry_flags(struct inode *inode)
@@ -253,7 +253,7 @@ static int fsnotify_handle_inode_event(struct fsnotify_group *group,
if (WARN_ON_ONCE(!inode && !dir))
return 0;
- if ((inode_mark->mask & FS_EXCL_UNLINK) &&
+ if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) &&
path && d_unlinked(path->dentry))
return 0;
@@ -290,22 +290,15 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
}
if (parent_mark) {
- /*
- * parent_mark indicates that the parent inode is watching
- * children and interested in this event, which is an event
- * possible on child. But is *this mark* watching children and
- * interested in this event?
- */
- if (parent_mark->mask & FS_EVENT_ON_CHILD) {
- ret = fsnotify_handle_inode_event(group, parent_mark, mask,
- data, data_type, dir, name, 0);
- if (ret)
- return ret;
- }
- if (!inode_mark)
- return 0;
+ ret = fsnotify_handle_inode_event(group, parent_mark, mask,
+ data, data_type, dir, name, 0);
+ if (ret)
+ return ret;
}
+ if (!inode_mark)
+ return 0;
+
if (mask & FS_EVENT_ON_CHILD) {
/*
* Some events can be sent on both parent dir and child marks
@@ -331,42 +324,36 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
struct fsnotify_group *group = NULL;
__u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
__u32 marks_mask = 0;
- __u32 marks_ignored_mask = 0;
+ __u32 marks_ignore_mask = 0;
+ bool is_dir = mask & FS_ISDIR;
struct fsnotify_mark *mark;
int type;
- if (WARN_ON(!iter_info->report_mask))
+ if (!iter_info->report_mask)
return 0;
/* clear ignored on inode modification */
if (mask & FS_MODIFY) {
- fsnotify_foreach_iter_type(type) {
- if (!fsnotify_iter_should_report_type(iter_info, type))
- continue;
- mark = iter_info->marks[type];
- if (mark &&
- !(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
- mark->ignored_mask = 0;
+ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
+ if (!(mark->flags &
+ FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
+ mark->ignore_mask = 0;
}
}
- fsnotify_foreach_iter_type(type) {
- if (!fsnotify_iter_should_report_type(iter_info, type))
- continue;
- mark = iter_info->marks[type];
- /* does the object mark tell us to do something? */
- if (mark) {
- group = mark->group;
- marks_mask |= mark->mask;
- marks_ignored_mask |= mark->ignored_mask;
- }
+ /* Are any of the group marks interested in this event? */
+ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
+ group = mark->group;
+ marks_mask |= mark->mask;
+ marks_ignore_mask |=
+ fsnotify_effective_ignore_mask(mark, is_dir, type);
}
- pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
- __func__, group, mask, marks_mask, marks_ignored_mask,
+ pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
+ __func__, group, mask, marks_mask, marks_ignore_mask,
data, data_type, dir, cookie);
- if (!(test_mask & marks_mask & ~marks_ignored_mask))
+ if (!(test_mask & marks_mask & ~marks_ignore_mask))
return 0;
if (group->ops->handle_event) {
@@ -403,11 +390,11 @@ static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
/*
* iter_info is a multi head priority queue of marks.
- * Pick a subset of marks from queue heads, all with the
- * same group and set the report_mask for selected subset.
- * Returns the report_mask of the selected subset.
+ * Pick a subset of marks from queue heads, all with the same group
+ * and set the report_mask to a subset of the selected marks.
+ * Returns false if there are no more groups to iterate.
*/
-static unsigned int fsnotify_iter_select_report_types(
+static bool fsnotify_iter_select_report_types(
struct fsnotify_iter_info *iter_info)
{
struct fsnotify_group *max_prio_group = NULL;
@@ -423,30 +410,49 @@ static unsigned int fsnotify_iter_select_report_types(
}
if (!max_prio_group)
- return 0;
+ return false;
/* Set the report mask for marks from same group as max prio group */
+ iter_info->current_group = max_prio_group;
iter_info->report_mask = 0;
fsnotify_foreach_iter_type(type) {
mark = iter_info->marks[type];
- if (mark &&
- fsnotify_compare_groups(max_prio_group, mark->group) == 0)
+ if (mark && mark->group == iter_info->current_group) {
+ /*
+ * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode
+ * is watching children and interested in this event,
+ * which is an event possible on child.
+ * But is *this mark* watching children?
+ */
+ if (type == FSNOTIFY_ITER_TYPE_PARENT &&
+ !(mark->mask & FS_EVENT_ON_CHILD) &&
+ !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD))
+ continue;
+
fsnotify_iter_set_report_type(iter_info, type);
+ }
}
- return iter_info->report_mask;
+ return true;
}
/*
- * Pop from iter_info multi head queue, the marks that were iterated in the
+ * Pop from iter_info multi head queue, the marks that belong to the group of
* current iteration step.
*/
static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
{
+ struct fsnotify_mark *mark;
int type;
+ /*
+ * We cannot use fsnotify_foreach_iter_mark_type() here because we
+ * may need to advance a mark of type X that belongs to current_group
+ * but was not selected for reporting.
+ */
fsnotify_foreach_iter_type(type) {
- if (fsnotify_iter_should_report_type(iter_info, type))
+ mark = iter_info->marks[type];
+ if (mark && mark->group == iter_info->current_group)
iter_info->marks[type] =
fsnotify_next_mark(iter_info->marks[type]);
}
@@ -529,8 +535,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
/*
- * If this is a modify event we may need to clear some ignored masks.
- * In that case, the object with ignored masks will have the FS_MODIFY
+ * If this is a modify event we may need to clear some ignore masks.
+ * In that case, the object with ignore masks will have the FS_MODIFY
* event in its mask.
* Otherwise, return if none of the marks care about this type of event.
*/
@@ -581,7 +587,7 @@ static __init int fsnotify_init(void)
{
int ret;
- BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23);
ret = init_srcu_struct(&fsnotify_mark_srcu);
if (ret)
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 87d8a50ee803..fde74eb333cc 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -76,10 +76,6 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
*/
extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
-/* allocate and destroy and event holder to attach events to notification/access queues */
-extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
-extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);
-
extern struct kmem_cache *fsnotify_mark_connector_cachep;
#endif /* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/group.c b/fs/notify/group.c
index b7d4d64f87c2..1de6631a3925 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -112,8 +112,10 @@ void fsnotify_put_group(struct fsnotify_group *group)
EXPORT_SYMBOL_GPL(fsnotify_put_group);
static struct fsnotify_group *__fsnotify_alloc_group(
- const struct fsnotify_ops *ops, gfp_t gfp)
+ const struct fsnotify_ops *ops,
+ int flags, gfp_t gfp)
{
+ static struct lock_class_key nofs_marks_lock;
struct fsnotify_group *group;
group = kzalloc(sizeof(struct fsnotify_group), gfp);
@@ -133,6 +135,17 @@ static struct fsnotify_group *__fsnotify_alloc_group(
INIT_LIST_HEAD(&group->marks_list);
group->ops = ops;
+ group->flags = flags;
+ /*
+ * For most backends, eviction of inode with a mark is not expected,
+ * because marks hold a refcount on the inode against eviction.
+ *
+ * Use a different lockdep class for groups that support evictable
+ * inode marks, because with evictable marks, mark_mutex is NOT
+ * fs-reclaim safe - the mutex is taken when evicting inodes.
+ */
+ if (flags & FSNOTIFY_GROUP_NOFS)
+ lockdep_set_class(&group->mark_mutex, &nofs_marks_lock);
return group;
}
@@ -140,20 +153,15 @@ static struct fsnotify_group *__fsnotify_alloc_group(
/*
* Create a new fsnotify_group and hold a reference for the group returned.
*/
-struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
+struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops,
+ int flags)
{
- return __fsnotify_alloc_group(ops, GFP_KERNEL);
-}
-EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
+ gfp_t gfp = (flags & FSNOTIFY_GROUP_USER) ? GFP_KERNEL_ACCOUNT :
+ GFP_KERNEL;
-/*
- * Create a new fsnotify_group and hold a reference for the group returned.
- */
-struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops)
-{
- return __fsnotify_alloc_group(ops, GFP_KERNEL_ACCOUNT);
+ return __fsnotify_alloc_group(ops, flags, gfp);
}
-EXPORT_SYMBOL_GPL(fsnotify_alloc_user_group);
+EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
int fsnotify_fasync(int fd, struct file *file, int on)
{
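
After this merge, every backend allocates its group through one entry point and declares its behavior via flags: FSNOTIFY_GROUP_USER selects GFP_KERNEL_ACCOUNT, and FSNOTIFY_GROUP_NOFS picks the lockdep class for groups whose mark_mutex may be taken during inode eviction. A sketch of a backend init using the same flag combination as fanotify above; demo_fsnotify_ops is an illustrative name only:

	static struct fsnotify_group *demo_group;

	demo_group = fsnotify_alloc_group(&demo_fsnotify_ops, /* hypothetical ops */
					  FSNOTIFY_GROUP_USER |
					  FSNOTIFY_GROUP_NOFS);
	if (IS_ERR(demo_group))
		return PTR_ERR(demo_group);
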
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 2007e3711916..7d5df7a21539 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -22,6 +22,25 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
return container_of(fse, struct inotify_event_info, fse);
}
+/*
+ * INOTIFY_USER_MASK represents all of the mask bits that we expose to
+ * userspace. There is at least one bit (FS_EVENT_ON_CHILD) which is
+ * used only internally to the kernel.
+ */
+#define INOTIFY_USER_MASK (IN_ALL_EVENTS)
+
+static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark)
+{
+ __u32 mask = fsn_mark->mask & INOTIFY_USER_MASK;
+
+ if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK)
+ mask |= IN_EXCL_UNLINK;
+ if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT)
+ mask |= IN_ONESHOT;
+
+ return mask;
+}
+
extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
struct fsnotify_group *group);
extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark,
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index d92d7b0adc9a..49cfe2ae6d23 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -122,7 +122,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
fsnotify_destroy_event(group, fsn_event);
}
- if (inode_mark->mask & IN_ONESHOT)
+ if (inode_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT)
fsnotify_destroy_mark(inode_mark, group);
return 0;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 54583f62dc44..1c4bfdab008d 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -110,18 +110,33 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg)
mask |= FS_EVENT_ON_CHILD;
/* mask off the flags used to open the fd */
- mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK));
+ mask |= (arg & INOTIFY_USER_MASK);
return mask;
}
+#define INOTIFY_MARK_FLAGS \
+ (FSNOTIFY_MARK_FLAG_EXCL_UNLINK | FSNOTIFY_MARK_FLAG_IN_ONESHOT)
+
+static inline unsigned int inotify_arg_to_flags(u32 arg)
+{
+ unsigned int flags = 0;
+
+ if (arg & IN_EXCL_UNLINK)
+ flags |= FSNOTIFY_MARK_FLAG_EXCL_UNLINK;
+ if (arg & IN_ONESHOT)
+ flags |= FSNOTIFY_MARK_FLAG_IN_ONESHOT;
+
+ return flags;
+}
+
static inline u32 inotify_mask_to_arg(__u32 mask)
{
return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED |
IN_Q_OVERFLOW);
}
-/* intofiy userspace file descriptor functions */
+/* inotify userspace file descriptor functions */
static __poll_t inotify_poll(struct file *file, poll_table *wait)
{
struct fsnotify_group *group = file->private_data;
@@ -526,13 +541,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
struct fsnotify_mark *fsn_mark;
struct inotify_inode_mark *i_mark;
__u32 old_mask, new_mask;
- __u32 mask;
- int add = (arg & IN_MASK_ADD);
+ int replace = !(arg & IN_MASK_ADD);
int create = (arg & IN_MASK_CREATE);
int ret;
- mask = inotify_arg_to_mask(inode, arg);
-
fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
if (!fsn_mark)
return -ENOENT;
@@ -545,10 +557,12 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
spin_lock(&fsn_mark->lock);
old_mask = fsn_mark->mask;
- if (add)
- fsn_mark->mask |= mask;
- else
- fsn_mark->mask = mask;
+ if (replace) {
+ fsn_mark->mask = 0;
+ fsn_mark->flags &= ~INOTIFY_MARK_FLAGS;
+ }
+ fsn_mark->mask |= inotify_arg_to_mask(inode, arg);
+ fsn_mark->flags |= inotify_arg_to_flags(arg);
new_mask = fsn_mark->mask;
spin_unlock(&fsn_mark->lock);
@@ -579,19 +593,17 @@ static int inotify_new_watch(struct fsnotify_group *group,
u32 arg)
{
struct inotify_inode_mark *tmp_i_mark;
- __u32 mask;
int ret;
struct idr *idr = &group->inotify_data.idr;
spinlock_t *idr_lock = &group->inotify_data.idr_lock;
- mask = inotify_arg_to_mask(inode, arg);
-
tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
if (unlikely(!tmp_i_mark))
return -ENOMEM;
fsnotify_init_mark(&tmp_i_mark->fsn_mark, group);
- tmp_i_mark->fsn_mark.mask = mask;
+ tmp_i_mark->fsn_mark.mask = inotify_arg_to_mask(inode, arg);
+ tmp_i_mark->fsn_mark.flags = inotify_arg_to_flags(arg);
tmp_i_mark->wd = -1;
ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
@@ -628,13 +640,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
{
int ret = 0;
- mutex_lock(&group->mark_mutex);
+ fsnotify_group_lock(group);
/* try to update an existing watch with the new arg */
ret = inotify_update_existing_watch(group, inode, arg);
/* no mark present, try to add a new one */
if (ret == -ENOENT)
ret = inotify_new_watch(group, inode, arg);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
return ret;
}
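fsnotify_group_lock()/unlock() are introduced elsewhere in this series (include/linux/fsnotify_backend.h); they wrap mark_mutex and, for groups flagged FSNOTIFY_GROUP_NOFS, enter a memalloc_nofs section, which is also why the mutex_lock_nested(..., SINGLE_DEPTH_NESTING) variants can go away. Roughly (field names per the series):

    static inline void fsnotify_group_lock(struct fsnotify_group *group)
    {
    	mutex_lock(&group->mark_mutex);
    	if (group->flags & FSNOTIFY_GROUP_NOFS)
    		group->owner_flags = memalloc_nofs_save();
    }

    static inline void fsnotify_group_unlock(struct fsnotify_group *group)
    {
    	if (group->flags & FSNOTIFY_GROUP_NOFS)
    		memalloc_nofs_restore(group->owner_flags);
    	mutex_unlock(&group->mark_mutex);
    }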
@@ -644,7 +656,8 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
struct fsnotify_group *group;
struct inotify_event_info *oevent;
- group = fsnotify_alloc_user_group(&inotify_fsnotify_ops);
+ group = fsnotify_alloc_group(&inotify_fsnotify_ops,
+ FSNOTIFY_GROUP_USER);
if (IS_ERR(group))
return group;
@@ -845,9 +858,7 @@ static int __init inotify_user_setup(void)
BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
- BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
- BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 22);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 4853184f7dde..c74ef947447d 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -116,20 +116,64 @@ __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn)
return *fsnotify_conn_mask_p(conn);
}
-static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
+static void fsnotify_get_inode_ref(struct inode *inode)
+{
+ ihold(inode);
+ atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
+}
+
+/*
+ * Grab or drop inode reference for the connector if needed.
+ *
+ * When it's time to drop the reference, we only clear the HAS_IREF flag and
+ * return the inode object. fsnotify_drop_object() will be resonsible for doing
+ * iput() outside of spinlocks. This happens when last mark that wanted iref is
+ * detached.
+ */
+static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
+ bool want_iref)
+{
+ bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
+ struct inode *inode = NULL;
+
+ if (conn->type != FSNOTIFY_OBJ_TYPE_INODE ||
+ want_iref == has_iref)
+ return NULL;
+
+ if (want_iref) {
+ /* Pin inode if any mark wants inode refcount held */
+ fsnotify_get_inode_ref(fsnotify_conn_inode(conn));
+ conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF;
+ } else {
+ /* Unpin inode after detach of last mark that wanted iref */
+ inode = fsnotify_conn_inode(conn);
+ conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF;
+ }
+
+ return inode;
+}
+
+static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
u32 new_mask = 0;
+ bool want_iref = false;
struct fsnotify_mark *mark;
assert_spin_locked(&conn->lock);
/* We can get detached connector here when inode is getting unlinked. */
if (!fsnotify_valid_obj_type(conn->type))
- return;
+ return NULL;
hlist_for_each_entry(mark, &conn->list, obj_list) {
- if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)
- new_mask |= fsnotify_calc_mask(mark);
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED))
+ continue;
+ new_mask |= fsnotify_calc_mask(mark);
+ if (conn->type == FSNOTIFY_OBJ_TYPE_INODE &&
+ !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
+ want_iref = true;
}
*fsnotify_conn_mask_p(conn) = new_mask;
+
+ return fsnotify_update_iref(conn, want_iref);
}
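The new return value threads the deferred iput() out to the caller, since it cannot be done under conn->lock. Condensed from the fsnotify_put_mark() hunk below, the calling pattern is:

    spin_lock(&conn->lock);
    objp = __fsnotify_recalc_mask(conn); /* may return an inode to unpin */
    type = conn->type;
    spin_unlock(&conn->lock);
    fsnotify_drop_object(type, objp);    /* does the iput() outside the lock */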
/*
@@ -169,12 +213,6 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
}
}
-static void fsnotify_get_inode_ref(struct inode *inode)
-{
- ihold(inode);
- atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
-}
-
static void fsnotify_put_inode_ref(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
@@ -213,6 +251,10 @@ static void *fsnotify_detach_connector_from_object(
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = fsnotify_conn_inode(conn);
inode->i_fsnotify_mask = 0;
+
+ /* Unpin inode when detaching from connector */
+ if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF))
+ inode = NULL;
} else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
} else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
@@ -274,7 +316,8 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
objp = fsnotify_detach_connector_from_object(conn, &type);
free_conn = true;
} else {
- __fsnotify_recalc_mask(conn);
+ objp = __fsnotify_recalc_mask(conn);
+ type = conn->type;
}
WRITE_ONCE(mark->connector, NULL);
spin_unlock(&conn->lock);
@@ -398,9 +441,7 @@ void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info)
*/
void fsnotify_detach_mark(struct fsnotify_mark *mark)
{
- struct fsnotify_group *group = mark->group;
-
- WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex));
+ fsnotify_group_assert_locked(mark->group);
WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) &&
refcount_read(&mark->refcnt) < 1 +
!!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED));
@@ -452,9 +493,9 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group)
{
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ fsnotify_group_lock(group);
fsnotify_detach_mark(mark);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
fsnotify_free_mark(mark);
}
EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);
@@ -499,7 +540,6 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
unsigned int obj_type,
__kernel_fsid_t *fsid)
{
- struct inode *inode = NULL;
struct fsnotify_mark_connector *conn;
conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL);
@@ -507,6 +547,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
return -ENOMEM;
spin_lock_init(&conn->lock);
INIT_HLIST_HEAD(&conn->list);
+ conn->flags = 0;
conn->type = obj_type;
conn->obj = connp;
/* Cache fsid of filesystem containing the object */
@@ -517,10 +558,6 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
conn->fsid.val[0] = conn->fsid.val[1] = 0;
conn->flags = 0;
}
- if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
- inode = fsnotify_conn_inode(conn);
- fsnotify_get_inode_ref(inode);
- }
fsnotify_get_sb_connectors(conn);
/*
@@ -529,8 +566,6 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
*/
if (cmpxchg(connp, NULL, conn)) {
/* Someone else created list structure for us */
- if (inode)
- fsnotify_put_inode_ref(inode);
fsnotify_put_sb_connectors(conn);
kmem_cache_free(fsnotify_mark_connector_cachep, conn);
}
@@ -574,7 +609,7 @@ out:
static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
fsnotify_connp_t *connp,
unsigned int obj_type,
- int allow_dups, __kernel_fsid_t *fsid)
+ int add_flags, __kernel_fsid_t *fsid)
{
struct fsnotify_mark *lmark, *last = NULL;
struct fsnotify_mark_connector *conn;
@@ -633,7 +668,7 @@ restart:
if ((lmark->group == mark->group) &&
(lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) &&
- !allow_dups) {
+ !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) {
err = -EEXIST;
goto out_err;
}
@@ -668,12 +703,12 @@ out_err:
*/
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int obj_type,
- int allow_dups, __kernel_fsid_t *fsid)
+ int add_flags, __kernel_fsid_t *fsid)
{
struct fsnotify_group *group = mark->group;
int ret = 0;
- BUG_ON(!mutex_is_locked(&group->mark_mutex));
+ fsnotify_group_assert_locked(group);
/*
* LOCKING ORDER!!!!
@@ -688,12 +723,11 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
- ret = fsnotify_add_mark_list(mark, connp, obj_type, allow_dups, fsid);
+ ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid);
if (ret)
goto err;
- if (mark->mask || mark->ignored_mask)
- fsnotify_recalc_mask(mark->connector);
+ fsnotify_recalc_mask(mark->connector);
return ret;
err:
@@ -708,15 +742,15 @@ err:
}
int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
- unsigned int obj_type, int allow_dups,
+ unsigned int obj_type, int add_flags,
__kernel_fsid_t *fsid)
{
int ret;
struct fsnotify_group *group = mark->group;
- mutex_lock(&group->mark_mutex);
- ret = fsnotify_add_mark_locked(mark, connp, obj_type, allow_dups, fsid);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_lock(group);
+ ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid);
+ fsnotify_group_unlock(group);
return ret;
}
EXPORT_SYMBOL_GPL(fsnotify_add_mark);
@@ -770,24 +804,24 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
* move the marks to be freed onto the to_free list in one go and then free
* the marks on the to_free list one by one.
*/
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ fsnotify_group_lock(group);
list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
if (mark->connector->type == obj_type)
list_move(&mark->g_list, &to_free);
}
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
clear:
while (1) {
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ fsnotify_group_lock(group);
if (list_empty(head)) {
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
break;
}
mark = list_first_entry(head, struct fsnotify_mark, g_list);
fsnotify_get_mark(mark);
fsnotify_detach_mark(mark);
- mutex_unlock(&group->mark_mutex);
+ fsnotify_group_unlock(group);
fsnotify_free_mark(mark);
fsnotify_put_mark(mark);
}
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 800c1d0eb0d0..3506f6074288 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -28,7 +28,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
struct inode *inode = d_inode(dentry);
const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
- return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
+ return dynamic_dname(buffer, buflen, "%s:[%lu]",
ns_ops->name, inode->i_ino);
}
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 90e3dad8ee45..9364d35b4a10 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -159,7 +159,7 @@ still_busy:
*
* Return 0 on success and -errno on error.
*
- * Contains an adapted version of fs/buffer.c::block_read_full_page().
+ * Contains an adapted version of fs/buffer.c::block_read_full_folio().
*/
static int ntfs_read_block(struct page *page)
{
@@ -342,7 +342,7 @@ handle_zblock:
for (i = 0; i < nr; i++) {
tbh = arr[i];
if (likely(!buffer_uptodate(tbh)))
- submit_bh(REQ_OP_READ, 0, tbh);
+ submit_bh(REQ_OP_READ, tbh);
else
ntfs_end_buffer_async_read(tbh, 1);
}
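These one-line changes track the block layer's new submit_bh() signature, which drops the separate flags argument; request flags, where needed, are now or-ed into the opcode. For example:

    /* before: submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); */
    submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); /* op and flags in one argument */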
@@ -358,16 +358,16 @@ handle_zblock:
}
/**
- * ntfs_readpage - fill a @page of a @file with data from the device
- * @file: open file to which the page @page belongs or NULL
- * @page: page cache page to fill with data
+ * ntfs_read_folio - fill a @folio of a @file with data from the device
+ * @file: open file to which the folio @folio belongs or NULL
+ * @folio: page cache folio to fill with data
*
- * For non-resident attributes, ntfs_readpage() fills the @page of the open
- * file @file by calling the ntfs version of the generic block_read_full_page()
+ * For non-resident attributes, ntfs_read_folio() fills the @folio of the open
+ * file @file by calling the ntfs version of the generic block_read_full_folio()
* function, ntfs_read_block(), which in turn creates and reads in the buffers
- * associated with the page asynchronously.
+ * associated with the folio asynchronously.
*
- * For resident attributes, OTOH, ntfs_readpage() fills @page by copying the
+ * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the
* data from the mft record (which at this stage is most likely in memory) and
* fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
* even if the mft record is not cached at this point in time, we need to wait
@@ -375,8 +375,9 @@ handle_zblock:
*
* Return 0 on success and -errno on error.
*/
-static int ntfs_readpage(struct file *file, struct page *page)
+static int ntfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
loff_t i_size;
struct inode *vi;
ntfs_inode *ni, *base_ni;
@@ -458,7 +459,7 @@ retry_readpage:
}
/*
* If a parallel write made the attribute non-resident, drop the mft
- * record and retry the readpage.
+ * record and retry the read_folio.
*/
if (unlikely(NInoNonResident(ni))) {
unmap_mft_record(base_ni);
@@ -637,10 +638,11 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
if (unlikely((block >= iblock) &&
(initialized_size < i_size))) {
/*
- * If this page is fully outside initialized size, zero
- * out all pages between the current initialized size
- * and the current page. Just use ntfs_readpage() to do
- * the zeroing transparently.
+ * If this page is fully outside initialized
+ * size, zero out all pages between the current
+ * initialized size and the current page. Just
+ * use ntfs_read_folio() to do the zeroing
+ * transparently.
*/
if (block > iblock) {
// TODO:
@@ -798,7 +800,7 @@ lock_retry_remap:
/* For the error case, need to reset bh to the beginning. */
bh = head;
- /* Just an optimization, so ->readpage() is not called later. */
+ /* Just an optimization, so ->read_folio() is not called later. */
if (unlikely(!PageUptodate(page))) {
int uptodate = 1;
do {
@@ -857,7 +859,7 @@ lock_retry_remap:
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh(REQ_OP_WRITE, 0, bh);
+ submit_bh(REQ_OP_WRITE, bh);
need_end_writeback = false;
}
bh = next;
@@ -1185,7 +1187,7 @@ lock_retry_remap:
BUG_ON(!buffer_mapped(tbh));
get_bh(tbh);
tbh->b_end_io = end_buffer_write_sync;
- submit_bh(REQ_OP_WRITE, 0, tbh);
+ submit_bh(REQ_OP_WRITE, tbh);
}
/* Synchronize the mft mirror now if not @sync. */
if (is_mft && !sync)
@@ -1329,7 +1331,7 @@ done:
* vfs inode dirty code path for the inode the mft record belongs to or via the
* vm page dirty code path for the page the mft record is in.
*
- * Based on ntfs_readpage() and fs/buffer.c::block_write_full_page().
+ * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_page().
*
* Return 0 on success and -errno on error.
*/
@@ -1651,13 +1653,13 @@ hole:
* attributes.
*/
const struct address_space_operations ntfs_normal_aops = {
- .readpage = ntfs_readpage,
+ .read_folio = ntfs_read_folio,
#ifdef NTFS_RW
.writepage = ntfs_writepage,
.dirty_folio = block_dirty_folio,
#endif /* NTFS_RW */
.bmap = ntfs_bmap,
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
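For the common non-resident path the conversion is mechanical. A minimal sketch of the equivalent ->read_folio for a generic block-based filesystem (example_get_block is a hypothetical get_block_t helper, not part of this patch):

    static int example_read_folio(struct file *file, struct folio *folio)
    {
    	/* block_read_full_folio() is the folio-based replacement for the
    	 * old block_read_full_page() helper. */
    	return block_read_full_folio(folio, example_get_block);
    }

    static const struct address_space_operations example_aops = {
    	.read_folio	= example_read_folio,
    	.migrate_folio	= buffer_migrate_folio,
    };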
@@ -1666,12 +1668,12 @@ const struct address_space_operations ntfs_normal_aops = {
* ntfs_compressed_aops - address space operations for compressed inodes
*/
const struct address_space_operations ntfs_compressed_aops = {
- .readpage = ntfs_readpage,
+ .read_folio = ntfs_read_folio,
#ifdef NTFS_RW
.writepage = ntfs_writepage,
.dirty_folio = block_dirty_folio,
#endif /* NTFS_RW */
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
@@ -1681,12 +1683,12 @@ const struct address_space_operations ntfs_compressed_aops = {
* and attributes
*/
const struct address_space_operations ntfs_mst_aops = {
- .readpage = ntfs_readpage, /* Fill page with data. */
+ .read_folio = ntfs_read_folio, /* Fill page with data. */
#ifdef NTFS_RW
.writepage = ntfs_writepage, /* Write dirty page to disk. */
.dirty_folio = filemap_dirty_folio,
#endif /* NTFS_RW */
- .migratepage = buffer_migrate_page,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index f0962d46bd67..0cac5458c023 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -37,9 +37,9 @@ static inline void ntfs_unmap_page(struct page *page)
* Read a page from the page cache of the address space @mapping at position
* @index, where @index is in units of PAGE_SIZE, and not in bytes.
*
- * If the page is not in memory it is loaded from disk first using the readpage
- * method defined in the address space operations of @mapping and the page is
- * added to the page cache of @mapping in the process.
+ * If the page is not in memory it is loaded from disk first using the
+ * read_folio method defined in the address space operations of @mapping
+ * and the page is added to the page cache of @mapping in the process.
*
* If the page belongs to an mst protected attribute and it is marked as such
* in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no
@@ -74,13 +74,8 @@ static inline struct page *ntfs_map_page(struct address_space *mapping,
{
struct page *page = read_mapping_page(mapping, index, NULL);
- if (!IS_ERR(page)) {
+ if (!IS_ERR(page))
kmap(page);
- if (!PageError(page))
- return page;
- ntfs_unmap_page(page);
- return ERR_PTR(-EIO);
- }
return page;
}
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 2911c04a33e0..52615e6090e1 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -592,8 +592,12 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
a = (ATTR_RECORD*)((u8*)ctx->attr +
le32_to_cpu(ctx->attr->length));
for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
- if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
- le32_to_cpu(ctx->mrec->bytes_allocated))
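+ /* Sanity: the attribute record and its name must both lie
+  * entirely within the MFT record. */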
+ u8 *mrec_end = (u8 *)ctx->mrec +
+ le32_to_cpu(ctx->mrec->bytes_allocated);
+ u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
+ a->name_length * sizeof(ntfschar);
+ if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end ||
+ name_end > mrec_end)
break;
ctx->attr = a;
if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
@@ -1719,7 +1723,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
vi->i_blocks = ni->allocated_size >> 9;
write_unlock_irqrestore(&ni->size_lock, flags);
/*
- * This needs to be last since the address space operations ->readpage
+ * This needs to be last since the address space operations ->read_folio
* and ->writepage can run concurrently with us as they are not
* serialized on i_mutex. Note, we are not allowed to fail once we flip
* this switch, which is another reason to do this last.
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index d2f9d6a0ee32..587e9b187873 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -658,7 +658,7 @@ lock_retry_remap:
}
get_bh(tbh);
tbh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, 0, tbh);
+ submit_bh(REQ_OP_READ, tbh);
}
/* Wait for io completion on all buffer heads. */
@@ -780,12 +780,12 @@ lock_retry_remap:
/* Uncompressed cb, copy it to the destination pages. */
/*
* TODO: As a big optimization, we could detect this case
- * before we read all the pages and use block_read_full_page()
+ * before we read all the pages and use block_read_full_folio()
* on all full pages instead (we still have to treat partial
* pages especially but at least we are getting rid of the
* synchronous io for the majority of pages).
* Or if we choose not to do the read-ahead/-behind stuff, we
- * could just return block_read_full_page(pages[xpage]) as long
+ * could just return block_read_full_folio(pages[xpage]) as long
* as PAGE_SIZE <= cb_size.
*/
if (cb_max_ofs)
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 2ae25e48a41a..c481b14e4fd9 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -219,11 +219,6 @@ do_non_resident_extend:
err = PTR_ERR(page);
goto init_err_out;
}
- if (unlikely(PageError(page))) {
- put_page(page);
- err = -EIO;
- goto init_err_out;
- }
/*
* Update the initialized size in the ntfs inode. This is
* enough to make ntfs_writepage() work.
@@ -251,14 +246,14 @@ do_non_resident_extend:
*
* TODO: For sparse pages could optimize this workload by using
* the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This
- * would be set in readpage for sparse pages and here we would
+ * would be set in read_folio for sparse pages and here we would
* not need to mark dirty any pages which have this bit set.
* The only caveat is that we have to clear the bit everywhere
* where we allocate any clusters that lie in the page or that
* contain the page.
*
* TODO: An even greater optimization would be for us to only
- * call readpage() on pages which are not in sparse regions as
+ * call read_folio() on pages which are not in sparse regions as
* determined from the runlist. This would greatly reduce the
* number of pages we read and make dirty in the case of sparse
* files.
@@ -532,12 +527,12 @@ err_out:
goto out;
}
-static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
+static inline void ntfs_submit_bh_for_read(struct buffer_head *bh)
{
lock_buffer(bh);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
- return submit_bh(REQ_OP_READ, 0, bh);
+ submit_bh(REQ_OP_READ, bh);
}
/**
@@ -1772,11 +1767,11 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
last_vcn = -1;
do {
VCN vcn;
- pgoff_t idx, start_idx;
+ pgoff_t start_idx;
unsigned ofs, do_pages, u;
size_t copied;
- start_idx = idx = pos >> PAGE_SHIFT;
+ start_idx = pos >> PAGE_SHIFT;
ofs = pos & ~PAGE_MASK;
bytes = PAGE_SIZE - ofs;
do_pages = 1;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index efe0602b4e51..db0f1995aedd 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1832,7 +1832,7 @@ int ntfs_read_inode_mount(struct inode *vi)
/* Need this to sanity check attribute list references to $MFT. */
vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
- /* Provides readpage() for map_mft_record(). */
+ /* Provides read_folio() for map_mft_record(). */
vi->i_mapping->a_ops = &ntfs_mst_aops;
ctx = ntfs_attr_get_search_ctx(ni, m);
@@ -2503,7 +2503,7 @@ retry_truncate:
* between the old data_size, i.e. old_size, and the new_size
* has not been zeroed. Fortunately, we do not need to zero it
* either since on one hand it will either already be zero due
- * to both readpage and writepage clearing partial page data
+ * to both read_folio and writepage clearing partial page data
* beyond i_size in which case there is nothing to do or in the
* case of the file being mmap()ped at the same time, POSIX
* specifies that the behaviour is unspecified thus we do not
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index bc1bf217b38e..6ce60ffc6ac0 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -807,7 +807,7 @@ map_vcn:
* completed ignore errors afterwards as we can assume
* that if one buffer worked all of them will work.
*/
- submit_bh(REQ_OP_WRITE, 0, bh);
+ submit_bh(REQ_OP_WRITE, bh);
if (should_wait) {
should_wait = false;
wait_on_buffer(bh);
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 0d62cd5bb7f8..f7bf5ce960cc 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -583,7 +583,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
clear_buffer_dirty(tbh);
get_bh(tbh);
tbh->b_end_io = end_buffer_write_sync;
- submit_bh(REQ_OP_WRITE, 0, tbh);
+ submit_bh(REQ_OP_WRITE, tbh);
}
/* Wait on i/o completion of buffers. */
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
@@ -780,7 +780,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
clear_buffer_dirty(tbh);
get_bh(tbh);
tbh->b_end_io = end_buffer_write_sync;
- submit_bh(REQ_OP_WRITE, 0, tbh);
+ submit_bh(REQ_OP_WRITE, tbh);
}
/* Synchronize the mft mirror now if not @sync. */
if (!sync && ni->mft_no < vol->mftmirr_size)
diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h
index 17bfefc30271..49c001af16ed 100644
--- a/fs/ntfs/mft.h
+++ b/fs/ntfs/mft.h
@@ -79,7 +79,7 @@ extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync);
* paths and via the page cache write back code paths or between writing
* neighbouring mft records residing in the same page.
*
- * Locking the page also serializes us against ->readpage() if the page is not
+ * Locking the page also serializes us against ->read_folio() if the page is not
* uptodate.
*
* On success, clean the mft record and return 0. On error, leave the mft
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 5ae8de09b271..001f4e053c85 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2092,7 +2092,8 @@ get_ctx_vol_failed:
// TODO: Initialize security.
/* Get the extended system files' directory inode. */
vol->extend_ino = ntfs_iget(sb, FILE_Extend);
- if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) {
+ if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) ||
+ !S_ISDIR(vol->extend_ino->i_mode)) {
if (!IS_ERR(vol->extend_ino))
iput(vol->extend_ino);
ntfs_error(sb, "Failed to load $Extend.");
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index e8c00dda42ad..71f870d497ae 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -84,8 +84,8 @@ static inline bool attr_must_be_resident(struct ntfs_sb_info *sbi,
/*
* attr_load_runs - Load all runs stored in @attr.
*/
-int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni,
- struct runs_tree *run, const CLST *vcn)
+static int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni,
+ struct runs_tree *run, const CLST *vcn)
{
int err;
CLST svcn = le64_to_cpu(attr->nres.svcn);
@@ -140,7 +140,10 @@ failed:
}
if (lcn != SPARSE_LCN) {
- mark_as_free_ex(sbi, lcn, clen, trim);
+ if (sbi) {
+ /* Mark the bitmap range [lcn, lcn + clen) as free and trim the clusters. */
+ mark_as_free_ex(sbi, lcn, clen, trim);
+ }
dn += clen;
}
@@ -173,7 +176,6 @@ int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
{
int err;
CLST flen, vcn0 = vcn, pre = pre_alloc ? *pre_alloc : 0;
- struct wnd_bitmap *wnd = &sbi->used.bitmap;
size_t cnt = run->count;
for (;;) {
@@ -196,9 +198,7 @@ int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
/* Add new fragment into run storage. */
if (!run_add_entry(run, vcn, lcn, flen, opt == ALLOCATE_MFT)) {
/* Undo last 'ntfs_look_for_free_space' */
- down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
- wnd_set_free(wnd, lcn, flen);
- up_write(&wnd->rw_lock);
+ mark_as_free_ex(sbi, lcn, len, false);
err = -ENOMEM;
goto out;
}
@@ -320,7 +320,7 @@ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr,
err = ni_insert_nonresident(ni, attr_s->type, attr_name(attr_s),
attr_s->name_len, run, 0, alen,
- attr_s->flags, &attr, NULL);
+ attr_s->flags, &attr, NULL, NULL);
if (err)
goto out3;
@@ -419,40 +419,44 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type,
struct mft_inode *mi, *mi_b;
CLST alen, vcn, lcn, new_alen, old_alen, svcn, evcn;
CLST next_svcn, pre_alloc = -1, done = 0;
- bool is_ext;
+ bool is_ext, is_bad = false;
u32 align;
struct MFT_REC *rec;
again:
+ alen = 0;
le_b = NULL;
attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len, NULL,
&mi_b);
if (!attr_b) {
err = -ENOENT;
- goto out;
+ goto bad_inode;
}
if (!attr_b->non_res) {
err = attr_set_size_res(ni, attr_b, le_b, mi_b, new_size, run,
&attr_b);
- if (err || !attr_b->non_res)
- goto out;
+ if (err)
+ return err;
+
+ /* Return if file is still resident. */
+ if (!attr_b->non_res)
+ goto ok1;
/* Layout of records may be changed, so do a full search. */
goto again;
}
is_ext = is_attr_ext(attr_b);
-
-again_1:
align = sbi->cluster_size;
-
if (is_ext)
align <<= attr_b->nres.c_unit;
old_valid = le64_to_cpu(attr_b->nres.valid_size);
old_size = le64_to_cpu(attr_b->nres.data_size);
old_alloc = le64_to_cpu(attr_b->nres.alloc_size);
+
+again_1:
old_alen = old_alloc >> cluster_bits;
new_alloc = (new_size + align - 1) & ~(u64)(align - 1);
@@ -475,24 +479,27 @@ again_1:
mi = mi_b;
} else if (!le_b) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
} else {
le = le_b;
attr = ni_find_attr(ni, attr_b, &le, type, name, name_len, &vcn,
&mi);
if (!attr) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
}
next_le_1:
svcn = le64_to_cpu(attr->nres.svcn);
evcn = le64_to_cpu(attr->nres.evcn);
}
-
+ /*
+ * Here we have:
+ * attr,mi,le - last attribute segment (containing 'vcn').
+ * attr_b,mi_b,le_b - base (primary) attribute segment.
+ */
next_le:
rec = mi->mrec;
-
err = attr_load_runs(attr, ni, run, NULL);
if (err)
goto out;
@@ -507,6 +514,13 @@ next_le:
goto ok;
}
+ /*
+ * Add clusters. In the simple case we have to:
+ * - allocate space (vcn, lcn, len)
+ * - update packed run in 'mi'
+ * - update attr->nres.evcn
+ * - update attr_b->nres.data_size/attr_b->nres.alloc_size
+ */
to_allocate = new_alen - old_alen;
add_alloc_in_same_attr_seg:
lcn = 0;
@@ -520,9 +534,11 @@ add_alloc_in_same_attr_seg:
pre_alloc = 0;
if (type == ATTR_DATA && !name_len &&
sbi->options->prealloc) {
- CLST new_alen2 = bytes_to_cluster(
- sbi, get_pre_allocated(new_size));
- pre_alloc = new_alen2 - new_alen;
+ pre_alloc =
+ bytes_to_cluster(
+ sbi,
+ get_pre_allocated(new_size)) -
+ new_alen;
}
/* Get the last LCN to allocate from. */
@@ -580,7 +596,7 @@ add_alloc_in_same_attr_seg:
pack_runs:
err = mi_pack_runs(mi, attr, run, vcn - svcn);
if (err)
- goto out;
+ goto undo_1;
next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
new_alloc_tmp = (u64)next_svcn << cluster_bits;
@@ -614,7 +630,7 @@ pack_runs:
if (type == ATTR_LIST) {
err = ni_expand_list(ni);
if (err)
- goto out;
+ goto undo_2;
if (next_svcn < vcn)
goto pack_runs;
@@ -624,8 +640,9 @@ pack_runs:
if (!ni->attr_list.size) {
err = ni_create_attr_list(ni);
+ /* In case of error layout of records is not changed. */
if (err)
- goto out;
+ goto undo_2;
/* Layout of records has changed. */
}
@@ -637,48 +654,57 @@ pack_runs:
/* Insert new attribute segment. */
err = ni_insert_nonresident(ni, type, name, name_len, run,
next_svcn, vcn - next_svcn,
- attr_b->flags, &attr, &mi);
- if (err)
- goto out;
-
- if (!is_mft)
- run_truncate_head(run, evcn + 1);
-
- svcn = le64_to_cpu(attr->nres.svcn);
- evcn = le64_to_cpu(attr->nres.evcn);
+ attr_b->flags, &attr, &mi, NULL);
- le_b = NULL;
/*
* Layout of records may have changed.
* Find base attribute to update.
*/
+ le_b = NULL;
attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len,
NULL, &mi_b);
if (!attr_b) {
- err = -ENOENT;
- goto out;
+ err = -EINVAL;
+ goto bad_inode;
}
- attr_b->nres.alloc_size = cpu_to_le64((u64)vcn << cluster_bits);
- attr_b->nres.data_size = attr_b->nres.alloc_size;
- attr_b->nres.valid_size = attr_b->nres.alloc_size;
+ if (err) {
+ /* ni_insert_nonresident failed. */
+ attr = NULL;
+ goto undo_2;
+ }
+
+ if (!is_mft)
+ run_truncate_head(run, evcn + 1);
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn = le64_to_cpu(attr->nres.evcn);
+
+ /*
+ * The attribute is now in a consistent state.
+ * Save this point so we can restore to it if the next steps fail.
+ */
+ old_valid = old_size = old_alloc = (u64)vcn << cluster_bits;
+ attr_b->nres.valid_size = attr_b->nres.data_size =
+ attr_b->nres.alloc_size = cpu_to_le64(old_size);
mi_b->dirty = true;
goto again_1;
}
if (new_size != old_size ||
(new_alloc != old_alloc && !keep_prealloc)) {
+ /*
+ * Truncate clusters. In the simple case we have to:
+ * - update packed run in 'mi'
+ * - update attr->nres.evcn
+ * - update attr_b->nres.data_size/attr_b->nres.alloc_size
+ * - mark and trim clusters as free (vcn, lcn, len)
+ */
+ CLST dlen = 0;
+
vcn = max(svcn, new_alen);
new_alloc_tmp = (u64)vcn << cluster_bits;
- alen = 0;
- err = run_deallocate_ex(sbi, run, vcn, evcn - vcn + 1, &alen,
- true);
- if (err)
- goto out;
-
- run_truncate(run, vcn);
-
if (vcn > svcn) {
err = mi_pack_runs(mi, attr, run, vcn - svcn);
if (err)
@@ -697,7 +723,7 @@ pack_runs:
if (!al_remove_le(ni, le)) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
}
le = (struct ATTR_LIST_ENTRY *)((u8 *)le - le_sz);
@@ -723,12 +749,20 @@ pack_runs:
attr_b->nres.valid_size =
attr_b->nres.alloc_size;
}
+ mi_b->dirty = true;
- if (is_ext)
+ err = run_deallocate_ex(sbi, run, vcn, evcn - vcn + 1, &dlen,
+ true);
+ if (err)
+ goto out;
+
+ if (is_ext) {
+ /* dlen is the number of clusters actually deallocated. */
le64_sub_cpu(&attr_b->nres.total_size,
- ((u64)alen << cluster_bits));
+ ((u64)dlen << cluster_bits));
+ }
- mi_b->dirty = true;
+ run_truncate(run, vcn);
if (new_alloc_tmp <= new_alloc)
goto ok;
@@ -747,7 +781,7 @@ pack_runs:
if (le->type != type || le->name_len != name_len ||
memcmp(le_name(le), name, name_len * sizeof(short))) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
}
err = ni_load_mi(ni, le, &mi);
@@ -757,7 +791,7 @@ pack_runs:
attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id);
if (!attr) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
}
goto next_le_1;
}
@@ -772,13 +806,13 @@ ok:
}
}
-out:
- if (!err && attr_b && ret)
+ok1:
+ if (ret)
*ret = attr_b;
/* Update inode_set_bytes. */
- if (!err && ((type == ATTR_DATA && !name_len) ||
- (type == ATTR_ALLOC && name == I30_NAME))) {
+ if (((type == ATTR_DATA && !name_len) ||
+ (type == ATTR_ALLOC && name == I30_NAME))) {
bool dirty = false;
if (ni->vfs_inode.i_size != new_size) {
@@ -786,7 +820,7 @@ out:
dirty = true;
}
- if (attr_b && attr_b->non_res) {
+ if (attr_b->non_res) {
new_alloc = le64_to_cpu(attr_b->nres.alloc_size);
if (inode_get_bytes(&ni->vfs_inode) != new_alloc) {
inode_set_bytes(&ni->vfs_inode, new_alloc);
@@ -800,6 +834,47 @@ out:
}
}
+ return 0;
+
+undo_2:
+ vcn -= alen;
+ attr_b->nres.data_size = cpu_to_le64(old_size);
+ attr_b->nres.valid_size = cpu_to_le64(old_valid);
+ attr_b->nres.alloc_size = cpu_to_le64(old_alloc);
+
+ /* Restore 'attr' and 'mi'. */
+ if (attr)
+ goto restore_run;
+
+ if (le64_to_cpu(attr_b->nres.svcn) <= svcn &&
+ svcn <= le64_to_cpu(attr_b->nres.evcn)) {
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ } else if (!le_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ } else {
+ le = le_b;
+ attr = ni_find_attr(ni, attr_b, &le, type, name, name_len,
+ &svcn, &mi);
+ if (!attr)
+ goto bad_inode;
+ }
+
+restore_run:
+ if (mi_pack_runs(mi, attr, run, evcn - svcn + 1))
+ is_bad = true;
+
+undo_1:
+ run_deallocate_ex(sbi, run, vcn, alen, NULL, false);
+
+ run_truncate(run, vcn);
+out:
+ if (is_bad) {
+bad_inode:
+ _ntfs_bad_inode(&ni->vfs_inode);
+ }
return err;
}
@@ -855,7 +930,7 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
goto out;
}
- asize = le64_to_cpu(attr_b->nres.alloc_size) >> sbi->cluster_bits;
+ asize = le64_to_cpu(attr_b->nres.alloc_size) >> cluster_bits;
if (vcn >= asize) {
err = -EINVAL;
goto out;
@@ -1047,7 +1122,7 @@ ins_ext:
if (evcn1 > next_svcn) {
err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run,
next_svcn, evcn1 - next_svcn,
- attr_b->flags, &attr, &mi);
+ attr_b->flags, &attr, &mi, NULL);
if (err)
goto out;
}
@@ -1173,7 +1248,7 @@ int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type,
{
struct ntfs_sb_info *sbi = ni->mi.sbi;
u8 cluster_bits = sbi->cluster_bits;
- CLST vcn = from >> cluster_bits;
+ CLST vcn;
CLST vcn_last = (to - 1) >> cluster_bits;
CLST lcn, clen;
int err;
@@ -1647,7 +1722,7 @@ ins_ext:
if (evcn1 > next_svcn) {
err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run,
next_svcn, evcn1 - next_svcn,
- attr_b->flags, &attr, &mi);
+ attr_b->flags, &attr, &mi, NULL);
if (err)
goto out;
}
@@ -1812,18 +1887,12 @@ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
err = ni_insert_nonresident(
ni, ATTR_DATA, NULL, 0, run, next_svcn,
evcn1 - eat - next_svcn, a_flags, &attr,
- &mi);
+ &mi, &le);
if (err)
goto out;
/* Layout of records may have changed. */
attr_b = NULL;
- le = al_find_ex(ni, NULL, ATTR_DATA, NULL, 0,
- &next_svcn);
- if (!le) {
- err = -EINVAL;
- goto out;
- }
}
/* Free all allocated memory. */
@@ -1918,7 +1987,7 @@ next_attr:
out:
up_write(&ni->file.run_lock);
if (err)
- make_bad_inode(&ni->vfs_inode);
+ _ntfs_bad_inode(&ni->vfs_inode);
return err;
}
@@ -1936,9 +2005,11 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
struct ATTRIB *attr = NULL, *attr_b;
struct ATTR_LIST_ENTRY *le, *le_b;
struct mft_inode *mi, *mi_b;
- CLST svcn, evcn1, vcn, len, end, alen, dealloc;
+ CLST svcn, evcn1, vcn, len, end, alen, hole, next_svcn;
u64 total_size, alloc_size;
u32 mask;
+ __le16 a_flags;
+ struct runs_tree run2;
if (!bytes)
return 0;
@@ -1990,6 +2061,9 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
}
down_write(&ni->file.run_lock);
+ run_init(&run2);
+ run_truncate(run, 0);
+
/*
* Enumerate all attribute segments and punch hole where necessary.
*/
@@ -1997,10 +2071,11 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
vcn = vbo >> sbi->cluster_bits;
len = bytes >> sbi->cluster_bits;
end = vcn + len;
- dealloc = 0;
+ hole = 0;
svcn = le64_to_cpu(attr_b->nres.svcn);
evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
+ a_flags = attr_b->flags;
if (svcn <= vcn && vcn < evcn1) {
attr = attr_b;
@@ -2008,14 +2083,14 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
mi = mi_b;
} else if (!le_b) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
} else {
le = le_b;
attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn,
&mi);
if (!attr) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
}
svcn = le64_to_cpu(attr->nres.svcn);
@@ -2023,49 +2098,91 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
}
while (svcn < end) {
- CLST vcn1, zero, dealloc2;
+ CLST vcn1, zero, hole2 = hole;
err = attr_load_runs(attr, ni, run, &svcn);
if (err)
- goto out;
+ goto done;
vcn1 = max(vcn, svcn);
zero = min(end, evcn1) - vcn1;
- dealloc2 = dealloc;
- err = run_deallocate_ex(sbi, run, vcn1, zero, &dealloc, true);
+ /*
+ * Check the range [vcn1, vcn1 + zero).
+ * Calculate how many clusters there are.
+ * Don't do any destructive actions.
+ */
+ err = run_deallocate_ex(NULL, run, vcn1, zero, &hole2, false);
if (err)
- goto out;
+ goto done;
- if (dealloc2 == dealloc) {
- /* Looks like the required range is already sparsed. */
- } else {
- if (!run_add_entry(run, vcn1, SPARSE_LCN, zero,
- false)) {
- err = -ENOMEM;
- goto out;
- }
+ /* Check if the required range is already a hole. */
+ if (hole2 == hole)
+ goto next_attr;
+
+ /* Make a clone of run to undo. */
+ err = run_clone(run, &run2);
+ if (err)
+ goto done;
+
+ /* Make the range [vcn1, vcn1 + zero) a sparse hole. */
+ if (!run_add_entry(run, vcn1, SPARSE_LCN, zero, false)) {
+ err = -ENOMEM;
+ goto done;
+ }
- err = mi_pack_runs(mi, attr, run, evcn1 - svcn);
+ /* Update run in attribute segment. */
+ err = mi_pack_runs(mi, attr, run, evcn1 - svcn);
+ if (err)
+ goto done;
+ next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
+ if (next_svcn < evcn1) {
+ /* Insert new attribute segment. */
+ err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run,
+ next_svcn,
+ evcn1 - next_svcn, a_flags,
+ &attr, &mi, &le);
if (err)
- goto out;
+ goto undo_punch;
+
+ /* Layout of records may have changed. */
+ attr_b = NULL;
}
+
+ /* The real deallocation. Should not fail. */
+ run_deallocate_ex(sbi, &run2, vcn1, zero, &hole, true);
+
+next_attr:
/* Free all allocated memory. */
run_truncate(run, 0);
if (evcn1 >= alen)
break;
+ /* Get next attribute segment. */
attr = ni_enum_attr_ex(ni, attr, &le, &mi);
if (!attr) {
err = -EINVAL;
- goto out;
+ goto bad_inode;
}
svcn = le64_to_cpu(attr->nres.svcn);
evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
}
- total_size -= (u64)dealloc << sbi->cluster_bits;
+done:
+ if (!hole)
+ goto out;
+
+ if (!attr_b) {
+ attr_b = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL,
+ &mi_b);
+ if (!attr_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+ }
+
+ total_size -= (u64)hole << sbi->cluster_bits;
attr_b->nres.total_size = cpu_to_le64(total_size);
mi_b->dirty = true;
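In outline, the reworked per-segment punch-hole above is a check-then-commit sequence; a condensed reading of the hunk:

    /*
     * Condensed flow of the hunk above:
     *  1. run_deallocate_ex(NULL, ...)   - dry run: count clusters only,
     *     a NULL sbi means no bitmap changes are made;
     *  2. run_clone(run, &run2)          - keep an undo copy of the mapping;
     *  3. run_add_entry(.., SPARSE_LCN..) - replace the range with a hole;
     *  4. mi_pack_runs()/ni_insert_nonresident() - commit the new mapping,
     *     with undo_punch repacking run2 on failure;
     *  5. run_deallocate_ex(sbi, &run2, ...) - real deallocation, done last,
     *     only once the metadata is consistent.
     */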
@@ -2075,9 +2192,263 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
mark_inode_dirty(&ni->vfs_inode);
out:
+ run_close(&run2);
up_write(&ni->file.run_lock);
+ return err;
+
+bad_inode:
+ _ntfs_bad_inode(&ni->vfs_inode);
+ goto out;
+
+undo_punch:
+ /*
+ * Restore packed runs.
+ * 'mi_pack_runs' should not fail here because we restore the original.
+ */
+ if (mi_pack_runs(mi, attr, &run2, evcn1 - svcn))
+ goto bad_inode;
+
+ goto done;
+}
+
+/*
+ * attr_insert_range - Insert range (hole) in file.
+ * Not for normal files.
+ */
+int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
+{
+ int err = 0;
+ struct runs_tree *run = &ni->file.run;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct ATTRIB *attr = NULL, *attr_b;
+ struct ATTR_LIST_ENTRY *le, *le_b;
+ struct mft_inode *mi, *mi_b;
+ CLST vcn, svcn, evcn1, len, next_svcn;
+ u64 data_size, alloc_size;
+ u32 mask;
+ __le16 a_flags;
+
+ if (!bytes)
+ return 0;
+
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b);
+ if (!attr_b)
+ return -ENOENT;
+
+ if (!is_attr_ext(attr_b)) {
+ /* It was checked above. See fallocate. */
+ return -EOPNOTSUPP;
+ }
+
+ if (!attr_b->non_res) {
+ data_size = le32_to_cpu(attr_b->res.data_size);
+ alloc_size = data_size;
+ mask = sbi->cluster_mask; /* cluster_size - 1 */
+ } else {
+ data_size = le64_to_cpu(attr_b->nres.data_size);
+ alloc_size = le64_to_cpu(attr_b->nres.alloc_size);
+ mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1;
+ }
+
+ if (vbo > data_size) {
+ /* Inserting a range beyond the end of the file is not allowed. */
+ return -EINVAL;
+ }
+
+ if ((vbo & mask) || (bytes & mask)) {
+ /* Only frame-aligned ranges may be inserted. */
+ return -EINVAL;
+ }
+
+ /*
+ * valid_size <= data_size <= alloc_size
+ * Check alloc_size for maximum possible.
+ */
+ if (bytes > sbi->maxbytes_sparse - alloc_size)
+ return -EFBIG;
+
+ vcn = vbo >> sbi->cluster_bits;
+ len = bytes >> sbi->cluster_bits;
+
+ down_write(&ni->file.run_lock);
+
+ if (!attr_b->non_res) {
+ err = attr_set_size(ni, ATTR_DATA, NULL, 0, run,
+ data_size + bytes, NULL, false, NULL);
+
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL,
+ &mi_b);
+ if (!attr_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+ if (err)
+ goto out;
+
+ if (!attr_b->non_res) {
+ /* Still resident. */
+ char *data = Add2Ptr(attr_b, attr_b->res.data_off);
+
+ memmove(data + bytes, data, bytes);
+ memset(data, 0, bytes);
+ goto done;
+ }
+
+ /* The resident file became nonresident. */
+ data_size = le64_to_cpu(attr_b->nres.data_size);
+ alloc_size = le64_to_cpu(attr_b->nres.alloc_size);
+ }
+
+ /*
+ * Enumerate all attribute segments and shift start vcn.
+ */
+ a_flags = attr_b->flags;
+ svcn = le64_to_cpu(attr_b->nres.svcn);
+ evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
+
+ if (svcn <= vcn && vcn < evcn1) {
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ } else if (!le_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ } else {
+ le = le_b;
+ attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn,
+ &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ }
+
+ run_truncate(run, 0); /* clear cached values. */
+ err = attr_load_runs(attr, ni, run, NULL);
+ if (err)
+ goto out;
+
+ if (!run_insert_range(run, vcn, len)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Try to pack in current record as much as possible. */
+ err = mi_pack_runs(mi, attr, run, evcn1 + len - svcn);
if (err)
- make_bad_inode(&ni->vfs_inode);
+ goto out;
+
+ next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
+
+ while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi)) &&
+ attr->type == ATTR_DATA && !attr->name_len) {
+ le64_add_cpu(&attr->nres.svcn, len);
+ le64_add_cpu(&attr->nres.evcn, len);
+ if (le) {
+ le->vcn = attr->nres.svcn;
+ ni->attr_list.dirty = true;
+ }
+ mi->dirty = true;
+ }
+
+ if (next_svcn < evcn1 + len) {
+ err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run,
+ next_svcn, evcn1 + len - next_svcn,
+ a_flags, NULL, NULL, NULL);
+
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL,
+ &mi_b);
+ if (!attr_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+ if (err) {
+ /* ni_insert_nonresident failed. Try to undo. */
+ goto undo_insert_range;
+ }
+ }
+
+ /*
+ * Update primary attribute segment.
+ */
+ if (vbo <= ni->i_valid)
+ ni->i_valid += bytes;
+
+ attr_b->nres.data_size = cpu_to_le64(data_size + bytes);
+ attr_b->nres.alloc_size = cpu_to_le64(alloc_size + bytes);
+
+ /* ni->i_valid may temporarily differ from valid_size. */
+ if (ni->i_valid > data_size + bytes)
+ attr_b->nres.valid_size = attr_b->nres.data_size;
+ else
+ attr_b->nres.valid_size = cpu_to_le64(ni->i_valid);
+ mi_b->dirty = true;
+
+done:
+ ni->vfs_inode.i_size += bytes;
+ ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+ mark_inode_dirty(&ni->vfs_inode);
+
+out:
+ run_truncate(run, 0); /* clear cached values. */
+
+ up_write(&ni->file.run_lock);
return err;
+
+bad_inode:
+ _ntfs_bad_inode(&ni->vfs_inode);
+ goto out;
+
+undo_insert_range:
+ svcn = le64_to_cpu(attr_b->nres.svcn);
+ evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
+
+ if (svcn <= vcn && vcn < evcn1) {
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ } else if (!le_b) {
+ goto bad_inode;
+ } else {
+ le = le_b;
+ attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn,
+ &mi);
+ if (!attr) {
+ goto bad_inode;
+ }
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ }
+
+ if (attr_load_runs(attr, ni, run, NULL))
+ goto bad_inode;
+
+ if (!run_collapse_range(run, vcn, len))
+ goto bad_inode;
+
+ if (mi_pack_runs(mi, attr, run, evcn1 + len - svcn))
+ goto bad_inode;
+
+ while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi)) &&
+ attr->type == ATTR_DATA && !attr->name_len) {
+ le64_sub_cpu(&attr->nres.svcn, len);
+ le64_sub_cpu(&attr->nres.evcn, len);
+ if (le) {
+ le->vcn = attr->nres.svcn;
+ ni->attr_list.dirty = true;
+ }
+ mi->dirty = true;
+ }
+
+ goto out;
}
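From userspace this surfaces as FALLOC_FL_INSERT_RANGE support; per the checks at the top of the function, both offset and length must be frame (cluster) aligned and the offset must not be past EOF. A minimal sketch:

    #define _GNU_SOURCE
    #include <fcntl.h>		/* fallocate() */
    #include <linux/falloc.h>	/* FALLOC_FL_INSERT_RANGE */

    /* Insert 'len' bytes of zeroes at 'off', shifting the tail of the
     * file upward; off and len must be cluster/frame aligned. */
    int err = fallocate(fd, FALLOC_FL_INSERT_RANGE, off, len);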
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index aa184407520f..5d44ceac855b 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -51,11 +51,6 @@ void ntfs3_exit_bitmap(void)
kmem_cache_destroy(ntfs_enode_cachep);
}
-static inline u32 wnd_bits(const struct wnd_bitmap *wnd, size_t i)
-{
- return i + 1 == wnd->nwnd ? wnd->bits_last : wnd->sb->s_blocksize * 8;
-}
-
/*
* wnd_scan
*
@@ -1333,9 +1328,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
if (!new_free)
return -ENOMEM;
- if (new_free != wnd->free_bits)
- memcpy(new_free, wnd->free_bits,
- wnd->nwnd * sizeof(short));
+ memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short));
memset(new_free + wnd->nwnd, 0,
(new_wnd - wnd->nwnd) * sizeof(short));
kfree(wnd->free_bits);
@@ -1395,9 +1388,8 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
void wnd_zone_set(struct wnd_bitmap *wnd, size_t lcn, size_t len)
{
- size_t zlen;
+ size_t zlen = wnd->zone_end - wnd->zone_bit;
- zlen = wnd->zone_end - wnd->zone_bit;
if (zlen)
wnd_add_free_ext(wnd, wnd->zone_bit, zlen, false);
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 787b53b984ee..4f2ffc7ef296 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -22,20 +22,20 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg)
{
struct fstrim_range __user *user_range;
struct fstrim_range range;
- struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
int err;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(sbi->sb->s_bdev))
return -EOPNOTSUPP;
user_range = (struct fstrim_range __user *)arg;
if (copy_from_user(&range, user_range, sizeof(range)))
return -EFAULT;
- range.minlen = max_t(u32, range.minlen, q->limits.discard_granularity);
+ range.minlen = max_t(u32, range.minlen,
+ bdev_discard_granularity(sbi->sb->s_bdev));
err = ntfs_trim_fs(sbi, &range);
if (err < 0)
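blk_queue_discard() and direct queue-limit access were removed from the block layer; bdev_max_discard_sectors() and bdev_discard_granularity() are the replacements. The userspace entry point is unchanged; for reference, a minimal FITRIM call:

    #include <sys/ioctl.h>
    #include <linux/fs.h>	/* FITRIM, struct fstrim_range */
    #include <limits.h>		/* ULLONG_MAX */

    struct fstrim_range range = {
    	.start	= 0,
    	.len	= ULLONG_MAX,	/* trim the whole filesystem */
    	.minlen	= 0,		/* raised to the discard granularity */
    };
    int err = ioctl(fd, FITRIM, &range); /* fd: any file on the fs */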
@@ -115,7 +115,6 @@ static int ntfs_extend_initialized_size(struct file *file,
for (;;) {
u32 zerofrom, len;
struct page *page;
- void *fsdata;
u8 bits;
CLST vcn, lcn, clen;
@@ -157,16 +156,14 @@ static int ntfs_extend_initialized_size(struct file *file,
if (pos + len > new_valid)
len = new_valid - pos;
- err = pagecache_write_begin(file, mapping, pos, len, 0, &page,
- &fsdata);
+ err = ntfs_write_begin(file, mapping, pos, len, &page, NULL);
if (err)
goto out;
zero_user_segment(page, zerofrom, PAGE_SIZE);
/* This function puts the page in any case. */
- err = pagecache_write_end(file, mapping, pos, len, len, page,
- fsdata);
+ err = ntfs_write_end(file, mapping, pos, len, len, page, NULL);
if (err < 0)
goto out;
pos += len;
@@ -245,7 +242,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
lock_buffer(bh);
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
- submit_bh(REQ_OP_READ, 0, bh);
+ submit_bh(REQ_OP_READ, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
@@ -495,7 +492,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size)
down_write(&ni->file.run_lock);
err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size,
- &new_valid, true, NULL);
+ &new_valid, ni->mi.sbi->options->prealloc, NULL);
up_write(&ni->file.run_lock);
if (new_valid < ni->i_valid)
@@ -533,21 +530,35 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size)
static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
{
struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
struct ntfs_sb_info *sbi = sb->s_fs_info;
struct ntfs_inode *ni = ntfs_i(inode);
loff_t end = vbo + len;
loff_t vbo_down = round_down(vbo, PAGE_SIZE);
- loff_t i_size;
+ bool is_supported_holes = is_sparsed(ni) || is_compressed(ni);
+ loff_t i_size, new_size;
+ bool map_locked;
int err;
/* No support for dir. */
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- /* Return error if mode is not supported. */
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_COLLAPSE_RANGE)) {
+ /*
+ * vfs_fallocate checks all possible combinations of mode.
+ * Do additional checks here before ntfs_set_state(dirty).
+ */
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ if (!is_supported_holes)
+ return -EOPNOTSUPP;
+ } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ } else if (mode & FALLOC_FL_INSERT_RANGE) {
+ if (!is_supported_holes)
+ return -EOPNOTSUPP;
+ } else if (mode &
+ ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)) {
ntfs_inode_warn(inode, "fallocate(0x%x) is not supported",
mode);
return -EOPNOTSUPP;
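For reference, the mode matrix ntfs3 now accepts maps to the following userspace calls (vfs_fallocate() has already rejected invalid combinations, such as PUNCH_HOLE without KEEP_SIZE, before the driver sees them):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/falloc.h>

    /* punch a hole, size unchanged; sparse/compressed files only */
    fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len);
    /* remove a range and shift the tail down */
    fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, off, len);
    /* insert a hole and shift the tail up (new in this patch) */
    fallocate(fd, FALLOC_FL_INSERT_RANGE, off, len);
    /* plain preallocation, here without changing i_size */
    fallocate(fd, FALLOC_FL_KEEP_SIZE, off, len);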
@@ -557,6 +568,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
inode_lock(inode);
i_size = inode->i_size;
+ new_size = max(end, i_size);
+ map_locked = false;
if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
/* Should never be here, see ntfs_file_open. */
@@ -564,38 +577,27 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
goto out;
}
+ if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |
+ FALLOC_FL_INSERT_RANGE)) {
+ inode_dio_wait(inode);
+ filemap_invalidate_lock(mapping);
+ map_locked = true;
+ }
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
u32 frame_size;
loff_t mask, vbo_a, end_a, tmp;
- if (!(mode & FALLOC_FL_KEEP_SIZE)) {
- err = -EINVAL;
- goto out;
- }
-
- err = filemap_write_and_wait_range(inode->i_mapping, vbo,
- end - 1);
+ err = filemap_write_and_wait_range(mapping, vbo, end - 1);
if (err)
goto out;
- err = filemap_write_and_wait_range(inode->i_mapping, end,
- LLONG_MAX);
+ err = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
if (err)
goto out;
- inode_dio_wait(inode);
-
truncate_pagecache(inode, vbo_down);
- if (!is_sparsed(ni) && !is_compressed(ni)) {
- /*
- * Normal file, can't make hole.
- * TODO: Try to find way to save info about hole.
- */
- err = -EOPNOTSUPP;
- goto out;
- }
-
ni_lock(ni);
err = attr_punch_hole(ni, vbo, len, &frame_size);
ni_unlock(ni);
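The inode_dio_wait() + filemap_invalidate_lock() pairing above replaces the per-branch dio waits: every destructive mode now drains direct I/O and holds the invalidate lock across the page-cache truncation and extent manipulation, mirroring what other filesystems do. The pattern, condensed:

    inode_dio_wait(inode);			/* drain in-flight direct I/O */
    filemap_invalidate_lock(mapping);	/* block page faults on the range */
    /* ... filemap_write_and_wait_range(), truncate_pagecache(),
     *     attr_punch_hole()/attr_collapse_range()/attr_insert_range() ... */
    filemap_invalidate_unlock(mapping);	/* done in the common exit path */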
@@ -627,17 +629,11 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
ni_unlock(ni);
}
} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- if (mode & ~FALLOC_FL_COLLAPSE_RANGE) {
- err = -EINVAL;
- goto out;
- }
-
/*
* Write out the tail of the last page before the removed range, since
* it will be removed from the page cache below.
*/
- err = filemap_write_and_wait_range(inode->i_mapping, vbo_down,
- vbo);
+ err = filemap_write_and_wait_range(mapping, vbo_down, vbo);
if (err)
goto out;
@@ -645,28 +641,58 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
* Write out the data that will be shifted so that it is preserved
* when the page cache is discarded below.
*/
- err = filemap_write_and_wait_range(inode->i_mapping, end,
- LLONG_MAX);
+ err = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
if (err)
goto out;
- /* Wait for existing dio to complete. */
- inode_dio_wait(inode);
-
truncate_pagecache(inode, vbo_down);
ni_lock(ni);
err = attr_collapse_range(ni, vbo, len);
ni_unlock(ni);
+ } else if (mode & FALLOC_FL_INSERT_RANGE) {
+ /* Check new size. */
+ err = inode_newsize_ok(inode, new_size);
+ if (err)
+ goto out;
+
+ /* Write out all dirty pages. */
+ err = filemap_write_and_wait_range(mapping, vbo_down,
+ LLONG_MAX);
+ if (err)
+ goto out;
+ truncate_pagecache(inode, vbo_down);
+
+ ni_lock(ni);
+ err = attr_insert_range(ni, vbo, len);
+ ni_unlock(ni);
} else {
+ /* Check new size. */
+
+ /* generic/213: expected -ENOSPC instead of -EFBIG. */
+ if (!is_supported_holes) {
+ loff_t to_alloc = new_size - inode_get_bytes(inode);
+
+ if (to_alloc > 0 &&
+ (to_alloc >> sbi->cluster_bits) >
+ wnd_zeroes(&sbi->used.bitmap)) {
+ err = -ENOSPC;
+ goto out;
+ }
+ }
+
+ err = inode_newsize_ok(inode, new_size);
+ if (err)
+ goto out;
+
/*
- * Normal file: Allocate clusters, do not change 'valid' size.
+ * Allocate clusters, do not change 'valid' size.
*/
- err = ntfs_set_size(inode, max(end, i_size));
+ err = ntfs_set_size(inode, new_size);
if (err)
goto out;
- if (is_sparsed(ni) || is_compressed(ni)) {
+ if (is_supported_holes) {
CLST vcn_v = ni->i_valid >> sbi->cluster_bits;
CLST vcn = vbo >> sbi->cluster_bits;
CLST cend = bytes_to_cluster(sbi, end);
@@ -714,8 +740,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
}
out:
- if (err == -EFBIG)
- err = -ENOSPC;
+ if (map_locked)
+ filemap_invalidate_unlock(mapping);
if (!err) {
inode->i_ctime = inode->i_mtime = current_time(inode);
@@ -762,7 +788,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
inode_dio_wait(inode);
- if (attr->ia_size < oldsize)
+ if (attr->ia_size <= oldsize)
err = ntfs_truncate(inode, attr->ia_size);
else if (attr->ia_size > oldsize)
err = ntfs_extend(inode, attr->ia_size, 0, NULL);
@@ -986,7 +1012,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
if (bytes > count)
bytes = count;
- frame = pos >> frame_bits;
frame_vbo = pos & ~(frame_size - 1);
index = frame_vbo >> PAGE_SHIFT;
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 6f47a9c17f89..381a38a06ec2 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -7,6 +7,7 @@
#include <linux/fiemap.h>
#include <linux/fs.h>
+#include <linux/minmax.h>
#include <linux/vmalloc.h>
#include "debug.h"
@@ -468,7 +469,7 @@ ni_ins_new_attr(struct ntfs_inode *ni, struct mft_inode *mi,
&ref, &le);
if (err) {
/* No memory or no space. */
- return NULL;
+ return ERR_PTR(err);
}
le_added = true;
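ni_ins_new_attr() now distinguishes a hard failure (ERR_PTR) from simply not fitting in the current record (NULL), and the callers below are adjusted to match. The resulting pattern, as used in ni_ins_attr_ext():

    attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize,
    			   name_off, svcn, ins_le);
    if (IS_ERR(attr))
    	return PTR_ERR(attr);	/* real error, e.g. no memory or space */
    if (!attr)
    	continue;		/* no room in this record, try the next */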
@@ -649,6 +650,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
struct mft_inode *mi;
u32 asize, free;
struct MFT_REF ref;
+ struct MFT_REC *mrec;
__le16 id;
if (!ni->attr_list.dirty)
@@ -692,11 +694,17 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
free -= asize;
}
+ /* Make a copy of primary record to restore if error. */
+ mrec = kmemdup(ni->mi.mrec, sbi->record_size, GFP_NOFS);
+ if (!mrec)
+ return 0; /* Not critical. */
+
/* It seems that attribute list can be removed from primary record. */
mi_remove_attr(NULL, &ni->mi, attr_list);
/*
- * Repeat the cycle above and move all attributes to primary record.
+ * Repeat the cycle above and copy all attributes to primary record.
+ * Do not remove original attributes from subrecords!
* It should be success!
*/
le = NULL;
@@ -707,14 +715,14 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
mi = ni_find_mi(ni, ino_get(&le->ref));
if (!mi) {
/* Should never happen; already checked above. */
- goto bad;
+ goto out;
}
attr = mi_find_attr(mi, NULL, le->type, le_name(le),
le->name_len, &le->id);
if (!attr) {
/* Should never happen; already checked above. */
- goto bad;
+ goto out;
}
asize = le32_to_cpu(attr->size);
@@ -724,18 +732,33 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
le16_to_cpu(attr->name_off));
if (!attr_ins) {
/*
- * Internal error.
- * Either no space in primary record (already checked).
- * Either tried to insert another
- * non indexed attribute (logic error).
+ * No space in primary record (already checked).
*/
- goto bad;
+ goto out;
}
/* Copy all except id. */
id = attr_ins->id;
memcpy(attr_ins, attr, asize);
attr_ins->id = id;
+ }
+
+ /*
+ * Repeat the cycle above and remove all attributes from subrecords.
+ */
+ le = NULL;
+ while ((le = al_enumerate(ni, le))) {
+ if (!memcmp(&le->ref, &ref, sizeof(ref)))
+ continue;
+
+ mi = ni_find_mi(ni, ino_get(&le->ref));
+ if (!mi)
+ continue;
+
+ attr = mi_find_attr(mi, NULL, le->type, le_name(le),
+ le->name_len, &le->id);
+ if (!attr)
+ continue;
/* Remove from original record. */
mi_remove_attr(NULL, mi, attr);
@@ -748,11 +771,13 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
ni->attr_list.le = NULL;
ni->attr_list.dirty = false;
+ kfree(mrec);
+ return 0;
+out:
+ /* Restore primary record. */
+ swap(mrec, ni->mi.mrec);
+ kfree(mrec);
return 0;
-bad:
- ntfs_inode_err(&ni->vfs_inode, "Internal error");
- make_bad_inode(&ni->vfs_inode);
- return -EINVAL;
}
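/*
 * Illustrative sketch (editor's note, not part of the patch): the
 * save/restore idiom ni_try_remove_attr_list() now uses. 'rewrite_rec'
 * is a hypothetical stand-in for the attribute-moving loops above;
 * swap() comes from <linux/minmax.h>, which the patch adds.
 */
static int modify_rec_safely(struct ntfs_inode *ni, u32 rec_size)
{
	struct MFT_REC *mrec = kmemdup(ni->mi.mrec, rec_size, GFP_NOFS);

	if (!mrec)
		return 0;	/* Not critical: just skip the optimization. */

	if (rewrite_rec(ni)) {
		/* Failed half-way: put the saved copy back. */
		swap(mrec, ni->mi.mrec);
	}

	kfree(mrec);	/* Frees the stale copy in either case. */
	return 0;
}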
/*
@@ -986,6 +1011,8 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le,
name_off, svcn, ins_le);
if (!attr)
continue;
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
if (ins_attr)
*ins_attr = attr;
@@ -1007,8 +1034,15 @@ insert_ext:
attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize,
name_off, svcn, ins_le);
- if (!attr)
+ if (!attr) {
+ err = -EINVAL;
+ goto out2;
+ }
+
+ if (IS_ERR(attr)) {
+ err = PTR_ERR(attr);
goto out2;
+ }
if (ins_attr)
*ins_attr = attr;
@@ -1020,10 +1054,9 @@ insert_ext:
out2:
ni_remove_mi(ni, mi);
mi_put(mi);
- err = -EINVAL;
out1:
- ntfs_mark_rec_free(sbi, rno);
+ ntfs_mark_rec_free(sbi, rno, is_mft);
out:
return err;
@@ -1076,6 +1109,11 @@ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
if (asize <= free) {
attr = ni_ins_new_attr(ni, &ni->mi, NULL, type, name, name_len,
asize, name_off, svcn, ins_le);
+ if (IS_ERR(attr)) {
+ err = PTR_ERR(attr);
+ goto out;
+ }
+
if (attr) {
if (ins_attr)
*ins_attr = attr;
@@ -1173,6 +1211,11 @@ static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
goto out;
}
+ if (IS_ERR(attr)) {
+ err = PTR_ERR(attr);
+ goto out;
+ }
+
if (ins_attr)
*ins_attr = attr;
if (ins_mi)
@@ -1218,7 +1261,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni)
mft_min = mft_new;
mi_min = mi_new;
} else {
- ntfs_mark_rec_free(sbi, mft_new);
+ ntfs_mark_rec_free(sbi, mft_new, true);
mft_new = 0;
ni_remove_mi(ni, mi_new);
}
@@ -1262,7 +1305,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni)
done = asize - run_size - SIZEOF_NONRESIDENT;
le32_sub_cpu(&ni->mi.mrec->used, done);
- /* Estimate the size of second part: run_buf=NULL. */
+ /* Estimate packed size (run_buf=NULL). */
err = run_pack(run, svcn, evcn + 1 - svcn, NULL, sbi->record_size,
&plen);
if (err < 0)
@@ -1288,10 +1331,16 @@ static int ni_expand_mft_list(struct ntfs_inode *ni)
goto out;
}
+ if (IS_ERR(attr)) {
+ err = PTR_ERR(attr);
+ goto out;
+ }
+
attr->non_res = 1;
attr->name_off = SIZEOF_NONRESIDENT_LE;
attr->flags = 0;
+ /* This call can't fail because the packed size was already checked above. */
run_pack(run, svcn, evcn + 1 - svcn, Add2Ptr(attr, SIZEOF_NONRESIDENT),
run_size, &plen);
@@ -1301,7 +1350,7 @@ static int ni_expand_mft_list(struct ntfs_inode *ni)
out:
if (mft_new) {
- ntfs_mark_rec_free(sbi, mft_new);
+ ntfs_mark_rec_free(sbi, mft_new, true);
ni_remove_mi(ni, mi_new);
}
@@ -1367,8 +1416,6 @@ int ni_expand_list(struct ntfs_inode *ni)
/* Split MFT data as much as possible. */
err = ni_expand_mft_list(ni);
- if (err)
- goto out;
out:
return !err && !done ? -EOPNOTSUPP : err;
@@ -1381,7 +1428,7 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
const __le16 *name, u8 name_len,
const struct runs_tree *run, CLST svcn, CLST len,
__le16 flags, struct ATTRIB **new_attr,
- struct mft_inode **mi)
+ struct mft_inode **mi, struct ATTR_LIST_ENTRY **le)
{
int err;
CLST plen;
@@ -1394,6 +1441,7 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
u32 run_size, asize;
struct ntfs_sb_info *sbi = ni->mi.sbi;
+ /* Estimate packed size (run_buf=NULL). */
err = run_pack(run, svcn, len, NULL, sbi->max_bytes_per_attr - run_off,
&plen);
if (err < 0)
@@ -1414,7 +1462,7 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
}
err = ni_insert_attr(ni, type, name, name_len, asize, name_off, svcn,
- &attr, mi, NULL);
+ &attr, mi, le);
if (err)
goto out;
@@ -1423,12 +1471,12 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
attr->name_off = cpu_to_le16(name_off);
attr->flags = flags;
+ /* This call can't fail because the packed size was already checked above. */
run_pack(run, svcn, len, Add2Ptr(attr, run_off), run_size, &plen);
attr->nres.svcn = cpu_to_le64(svcn);
attr->nres.evcn = cpu_to_le64((u64)svcn + len - 1);
- err = 0;
if (new_attr)
*new_attr = attr;
@@ -1560,7 +1608,7 @@ int ni_delete_all(struct ntfs_inode *ni)
mi->dirty = true;
mi_write(mi, 0);
- ntfs_mark_rec_free(sbi, mi->rno);
+ ntfs_mark_rec_free(sbi, mi->rno, false);
ni_remove_mi(ni, mi);
mi_put(mi);
node = next;
@@ -1571,7 +1619,7 @@ int ni_delete_all(struct ntfs_inode *ni)
ni->mi.dirty = true;
err = mi_write(&ni->mi, 0);
- ntfs_mark_rec_free(sbi, ni->mi.rno);
+ ntfs_mark_rec_free(sbi, ni->mi.rno, false);
return err;
}
@@ -1589,7 +1637,8 @@ struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni,
struct ATTRIB *attr = NULL;
struct ATTR_FILE_NAME *fname;
- *le = NULL;
+ if (le)
+ *le = NULL;
/* Enumerate all names. */
next:
@@ -1605,7 +1654,7 @@ next:
goto next;
if (!uni)
- goto next;
+ return fname;
if (uni->len != fname->name_len)
goto next;
@@ -1964,10 +2013,8 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo,
vcn += clen;
- if (vbo + bytes >= end) {
+ if (vbo + bytes >= end)
bytes = end - vbo;
- flags |= FIEMAP_EXTENT_LAST;
- }
if (vbo + bytes <= valid) {
;
@@ -1977,6 +2024,9 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo,
/* vbo < valid && valid < vbo + bytes */
u64 dlen = valid - vbo;
+ if (vbo + dlen >= end)
+ flags |= FIEMAP_EXTENT_LAST;
+
err = fiemap_fill_next_extent(fieinfo, vbo, lbo, dlen,
flags);
if (err < 0)
@@ -1995,6 +2045,9 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo,
flags |= FIEMAP_EXTENT_UNWRITTEN;
}
+ if (vbo + bytes >= end)
+ flags |= FIEMAP_EXTENT_LAST;
+
err = fiemap_fill_next_extent(fieinfo, vbo, lbo, bytes, flags);
if (err < 0)
break;
@@ -2298,10 +2351,8 @@ remove_wof:
out:
kfree(pages);
- if (err) {
- make_bad_inode(inode);
- ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
- }
+ if (err)
+ _ntfs_bad_inode(inode);
return err;
}
@@ -2940,7 +2991,7 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
}
/*
- * ni_add_name - Add new name in MFT and in directory.
+ * ni_add_name - Add a new name into the MFT and into the directory.
*/
int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
struct NTFS_DE *de)
@@ -2949,13 +3000,20 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
struct ATTRIB *attr;
struct ATTR_LIST_ENTRY *le;
struct mft_inode *mi;
+ struct ATTR_FILE_NAME *fname;
struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1);
u16 de_key_size = le16_to_cpu(de->key_size);
mi_get_ref(&ni->mi, &de->ref);
mi_get_ref(&dir_ni->mi, &de_name->home);
- /* Insert new name in MFT. */
+ /* Fill duplicate from any ATTR_NAME. */
+ fname = ni_fname_name(ni, NULL, NULL, NULL, NULL);
+ if (fname)
+ memcpy(&de_name->dup, &fname->dup, sizeof(fname->dup));
+ de_name->dup.fa = ni->std_fa;
+
+ /* Insert new name into MFT. */
err = ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, &attr,
&mi, &le);
if (err)
@@ -2963,7 +3021,7 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de_name, de_key_size);
- /* Insert new name in directory. */
+ /* Insert new name into directory. */
err = indx_insert_entry(&dir_ni->dir, dir_ni, de, ni->mi.sbi, NULL, 0);
if (err)
ni_remove_attr_le(ni, attr, mi, le);
@@ -2987,7 +3045,7 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
* 1) Add new name and remove old name.
* 2) Remove old name and add new name.
*
- * In most cases (not all!) adding new name in MFT and in directory can
+ * In most cases (not all!) adding a new name into the MFT and the directory can
* allocate additional cluster(s).
* The second way may result in a bad inode if we can't add the new name
* and then can't restore (add) the old name.
@@ -3257,7 +3315,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
err = err2;
if (is_empty) {
- ntfs_mark_rec_free(sbi, mi->rno);
+ ntfs_mark_rec_free(sbi, mi->rno, false);
rb_erase(node, &ni->mi_tree);
mi_put(mi);
}
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index 06492f088d60..e7c494005122 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -1185,8 +1185,6 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
if (!r_page)
return -ENOMEM;
- memset(info, 0, sizeof(struct restart_info));
-
/* Determine which restart area we are looking for. */
if (first) {
vbo = 0;
@@ -3791,10 +3789,11 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
if (!log)
return -ENOMEM;
+ memset(&rst_info, 0, sizeof(struct restart_info));
+
log->ni = ni;
log->l_size = l_size;
log->one_page_buf = kmalloc(page_size, GFP_NOFS);
-
if (!log->one_page_buf) {
err = -ENOMEM;
goto out;
@@ -3842,7 +3841,10 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
if (rst_info.vbo)
goto check_restart_area;
+ memset(&rst_info2, 0, sizeof(struct restart_info));
err = log_read_rst(log, l_size, false, &rst_info2);
+ if (err)
+ goto out;
/* Determine which restart area to use. */
if (!rst_info2.restart || rst_info2.last_lsn <= rst_info.last_lsn)
@@ -4085,8 +4087,10 @@ process_log:
if (client == LFS_NO_CLIENT_LE) {
/* Insert "NTFS" client LogFile. */
client = ra->client_idx[0];
- if (client == LFS_NO_CLIENT_LE)
- return -EINVAL;
+ if (client == LFS_NO_CLIENT_LE) {
+ err = -EINVAL;
+ goto out;
+ }
t16 = le16_to_cpu(client);
cr = ca + t16;
@@ -5055,7 +5059,7 @@ undo_action_next:
goto add_allocated_vcns;
vcn = le64_to_cpu(lrh->target_vcn);
- vcn &= ~(log->clst_per_page - 1);
+ vcn &= ~(u64)(log->clst_per_page - 1);
add_allocated_vcns:
for (i = 0, vcn = le64_to_cpu(lrh->target_vcn),
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 3de5700a9b83..4ed15f64b17f 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -703,12 +703,14 @@ out:
/*
* ntfs_mark_rec_free - Mark record as free.
+ * is_mft - true if we are changing the MFT (its bitmap lock is then already held)
*/
-void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno)
+void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft)
{
struct wnd_bitmap *wnd = &sbi->mft.bitmap;
- down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT);
+ if (!is_mft)
+ down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT);
if (rno >= wnd->nbits)
goto out;
@@ -727,7 +729,8 @@ void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno)
sbi->mft.next_free = rno;
out:
- up_write(&wnd->rw_lock);
+ if (!is_mft)
+ up_write(&wnd->rw_lock);
}
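/*
 * Sketch of the resulting locking contract (editor's note, inferred from
 * the change above): a caller that is already modifying the MFT holds the
 * bitmap lock itself and passes is_mft == true so it is not re-taken.
 */
	down_write_nested(&sbi->mft.bitmap.rw_lock, BITMAP_MUTEX_MFT);
	/* ... manipulate the MFT ... */
	ntfs_mark_rec_free(sbi, rno, true);	/* Lock already held. */
	up_write(&sbi->mft.bitmap.rw_lock);

	/* Any other caller lets the function take the lock itself: */
	ntfs_mark_rec_free(sbi, rno, false);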
/*
@@ -780,7 +783,7 @@ out:
*/
int ntfs_refresh_zone(struct ntfs_sb_info *sbi)
{
- CLST zone_limit, zone_max, lcn, vcn, len;
+ CLST lcn, vcn, len;
size_t lcn_s, zlen;
struct wnd_bitmap *wnd = &sbi->used.bitmap;
struct ntfs_inode *ni = sbi->mft.ni;
@@ -789,16 +792,6 @@ int ntfs_refresh_zone(struct ntfs_sb_info *sbi)
if (wnd_zone_len(wnd))
return 0;
- /*
- * Compute the MFT zone at two steps.
- * It would be nice if we are able to allocate 1/8 of
- * total clusters for MFT but not more then 512 MB.
- */
- zone_limit = (512 * 1024 * 1024) >> sbi->cluster_bits;
- zone_max = wnd->nbits >> 3;
- if (zone_max > zone_limit)
- zone_max = zone_limit;
-
vcn = bytes_to_cluster(sbi,
(u64)sbi->mft.bitmap.nbits << sbi->record_bits);
@@ -812,13 +805,7 @@ int ntfs_refresh_zone(struct ntfs_sb_info *sbi)
lcn_s = lcn + 1;
/* Try to allocate clusters after last MFT run. */
- zlen = wnd_find(wnd, zone_max, lcn_s, 0, &lcn_s);
- if (!zlen) {
- ntfs_notice(sbi->sb, "MftZone: unavailable");
- return 0;
- }
-
- /* Truncate too large zone. */
+ zlen = wnd_find(wnd, sbi->zone_max, lcn_s, 0, &lcn_s);
wnd_zone_set(wnd, lcn_s, zlen);
return 0;
@@ -827,16 +814,21 @@ int ntfs_refresh_zone(struct ntfs_sb_info *sbi)
/*
* ntfs_update_mftmirr - Update $MFTMirr data.
*/
-int ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
+void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
{
int err;
struct super_block *sb = sbi->sb;
- u32 blocksize = sb->s_blocksize;
+ u32 blocksize;
sector_t block1, block2;
u32 bytes;
+ if (!sb)
+ return;
+
+ blocksize = sb->s_blocksize;
+
if (!(sbi->flags & NTFS_FLAGS_MFTMIRR))
- return 0;
+ return;
err = 0;
bytes = sbi->mft.recs_mirr << sbi->record_bits;
@@ -847,16 +839,13 @@ int ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
struct buffer_head *bh1, *bh2;
bh1 = sb_bread(sb, block1++);
- if (!bh1) {
- err = -EIO;
- goto out;
- }
+ if (!bh1)
+ return;
bh2 = sb_getblk(sb, block2++);
if (!bh2) {
put_bh(bh1);
- err = -EIO;
- goto out;
+ return;
}
if (buffer_locked(bh2))
@@ -876,13 +865,24 @@ int ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
put_bh(bh2);
if (err)
- goto out;
+ return;
}
sbi->flags &= ~NTFS_FLAGS_MFTMIRR;
+}
-out:
- return err;
+/*
+ * ntfs_bad_inode
+ *
+ * Marks inode as bad and marks fs as 'dirty'
+ */
+void ntfs_bad_inode(struct inode *inode, const char *hint)
+{
+ struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
+
+ ntfs_inode_err(inode, "%s", hint);
+ make_bad_inode(inode);
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
}
/*
@@ -1395,7 +1395,7 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr,
if (buffer_locked(bh))
__wait_on_buffer(bh);
- lock_buffer(nb->bh[idx]);
+ lock_buffer(bh);
bh_data = bh->b_data + off;
end_data = Add2Ptr(bh_data, op);
@@ -1448,7 +1448,7 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr,
*/
int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run,
struct page **pages, u32 nr_pages, u64 vbo, u32 bytes,
- u32 op)
+ enum req_op op)
{
int err = 0;
struct bio *new, *bio = NULL;
@@ -2424,7 +2424,7 @@ static inline void ntfs_unmap_and_discard(struct ntfs_sb_info *sbi, CLST lcn,
void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim)
{
- CLST end, i;
+ CLST end, i, zone_len, zlen;
struct wnd_bitmap *wnd = &sbi->used.bitmap;
down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
@@ -2459,6 +2459,28 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim)
ntfs_unmap_and_discard(sbi, lcn, len);
wnd_set_free(wnd, lcn, len);
+ /* Append to the MFT zone, if possible. */
+ zone_len = wnd_zone_len(wnd);
+ zlen = min(zone_len + len, sbi->zone_max);
+
+ if (zlen == zone_len) {
+ /* MFT zone already has maximum size. */
+ } else if (!zone_len) {
+ /* Create MFT zone only if 'zlen' is large enough. */
+ if (zlen == sbi->zone_max)
+ wnd_zone_set(wnd, lcn, zlen);
+ } else {
+ CLST zone_lcn = wnd_zone_bit(wnd);
+
+ if (lcn + len == zone_lcn) {
+ /* Grow the MFT zone at its head. */
+ wnd_zone_set(wnd, lcn, zlen);
+ } else if (zone_lcn + zone_len == lcn) {
+ /* Grow the MFT zone at its tail. */
+ wnd_zone_set(wnd, zone_lcn, zlen);
+ }
+ }
+
out:
up_write(&wnd->rw_lock);
}
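/*
 * Worked example of the zone cases above (editor's note, hypothetical
 * numbers): zone = [lcn 1000, len 16], sbi->zone_max = 64.
 *   free [984, 16]:  984 + 16 == 1000  -> zone becomes [984, 32]
 *   free [1016, 16]: 1000 + 16 == 1016 -> zone becomes [1000, 32]
 *   free elsewhere:  not adjacent      -> zone unchanged
 * If no zone exists yet, one is created only when 'len' alone already
 * reaches zone_max.
 */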
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 6f81e3a49abf..440328147e7e 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -1042,19 +1042,16 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni,
{
int err;
struct NTFS_DE *e;
- const struct INDEX_HDR *hdr;
struct indx_node *node;
if (!root)
root = indx_get_root(&ni->dir, ni, NULL, NULL);
if (!root) {
- err = -EINVAL;
- goto out;
+ /* Should not happen. */
+ return -EINVAL;
}
- hdr = &root->ihdr;
-
/* Check cache. */
e = fnd->level ? fnd->de[fnd->level - 1] : fnd->root_de;
if (e && !de_is_last(e) &&
@@ -1068,39 +1065,35 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni,
fnd_clear(fnd);
/* Lookup entry that is <= to the search value. */
- e = hdr_find_e(indx, hdr, key, key_len, ctx, diff);
+ e = hdr_find_e(indx, &root->ihdr, key, key_len, ctx, diff);
if (!e)
return -EINVAL;
fnd->root_de = e;
- err = 0;
for (;;) {
node = NULL;
- if (*diff >= 0 || !de_has_vcn_ex(e)) {
- *entry = e;
- goto out;
- }
+ if (*diff >= 0 || !de_has_vcn_ex(e))
+ break;
/* Read next level. */
err = indx_read(indx, ni, de_get_vbn(e), &node);
if (err)
- goto out;
+ return err;
/* Lookup entry that is <= to the search value. */
e = hdr_find_e(indx, &node->index->ihdr, key, key_len, ctx,
diff);
if (!e) {
- err = -EINVAL;
put_indx_node(node);
- goto out;
+ return -EINVAL;
}
fnd_push(fnd, node, e);
}
-out:
- return err;
+ *entry = e;
+ return 0;
}
int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni,
@@ -1354,7 +1347,7 @@ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
goto out;
err = ni_insert_nonresident(ni, ATTR_ALLOC, in->name, in->name_len,
- &run, 0, len, 0, &alloc, NULL);
+ &run, 0, len, 0, &alloc, NULL, NULL);
if (err)
goto out1;
@@ -1685,8 +1678,8 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
{
int err;
const struct NTFS_DE *sp;
- struct NTFS_DE *e, *de_t, *up_e = NULL;
- struct indx_node *n2 = NULL;
+ struct NTFS_DE *e, *de_t, *up_e;
+ struct indx_node *n2;
struct indx_node *n1 = fnd->nodes[level];
struct INDEX_HDR *hdr1 = &n1->index->ihdr;
struct INDEX_HDR *hdr2;
@@ -1994,7 +1987,7 @@ static int indx_free_children(struct ntfs_index *indx, struct ntfs_inode *ni,
const struct NTFS_DE *e, bool trim)
{
int err;
- struct indx_node *n;
+ struct indx_node *n = NULL;
struct INDEX_HDR *hdr;
CLST vbn = de_get_vbn(e);
size_t i;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 9eab11e3b034..26a76ebfe58f 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -430,6 +430,7 @@ end_enum:
} else if (fname && fname->home.low == cpu_to_le32(MFT_REC_EXTEND) &&
fname->home.seq == cpu_to_le16(MFT_REC_EXTEND)) {
/* Records in $Extend are not files or general directories. */
+ inode->i_op = &ntfs_file_inode_operations;
} else {
err = -EINVAL;
goto out;
@@ -500,7 +501,7 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref,
inode = ntfs_read_mft(inode, name, ref);
else if (ref->seq != ntfs_i(inode)->mi.mrec->seq) {
/* Inode overlaps? */
- make_bad_inode(inode);
+ _ntfs_bad_inode(inode);
}
return inode;
@@ -629,7 +630,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
bh->b_size = block_size;
off = vbo & (PAGE_SIZE - 1);
set_bh_page(bh, page, off);
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
err = -EIO;
@@ -676,8 +677,9 @@ static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, ntfs_get_block_bmap);
}
-static int ntfs_readpage(struct file *file, struct page *page)
+static int ntfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
int err;
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
@@ -701,7 +703,7 @@ static int ntfs_readpage(struct file *file, struct page *page)
}
/* Normal + sparse files. */
- return mpage_readpage(page, ntfs_get_block);
+ return mpage_read_folio(folio, ntfs_get_block);
}
static void ntfs_readahead(struct readahead_control *rac)
@@ -757,6 +759,7 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
loff_t vbo = iocb->ki_pos;
loff_t end;
int wr = iov_iter_rw(iter) & WRITE;
+ size_t iter_count = iov_iter_count(iter);
loff_t valid;
ssize_t ret;
@@ -770,10 +773,13 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
wr ? ntfs_get_block_direct_IO_W
: ntfs_get_block_direct_IO_R);
- if (ret <= 0)
+ if (ret > 0)
+ end = vbo + ret;
+ else if (wr && ret == -EIOCBQUEUED)
+ end = vbo + iter_count;
+ else
goto out;
- end = vbo + ret;
valid = ni->i_valid;
if (wr) {
if (end > valid && !S_ISBLK(inode->i_mode)) {
@@ -846,12 +852,10 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
static int ntfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct inode *inode = mapping->host;
- struct ntfs_inode *ni = ntfs_i(inode);
/* Redirect call to 'ntfs_writepage' for resident files. */
- get_block_t *get_block = is_resident(ni) ? NULL : &ntfs_get_block;
-
- return mpage_writepages(mapping, wbc, get_block);
+ if (is_resident(ntfs_i(mapping->host)))
+ return generic_writepages(mapping, wbc);
+ return mpage_writepages(mapping, wbc, ntfs_get_block);
}
static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn,
@@ -861,9 +865,8 @@ static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn,
bh_result, create, GET_BLOCK_WRITE_BEGIN);
}
-static int ntfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, u32 len, u32 flags, struct page **pagep,
- void **fsdata)
+int ntfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, u32 len, struct page **pagep, void **fsdata)
{
int err;
struct inode *inode = mapping->host;
@@ -872,7 +875,7 @@ static int ntfs_write_begin(struct file *file, struct address_space *mapping,
*pagep = NULL;
if (is_resident(ni)) {
struct page *page = grab_cache_page_write_begin(
- mapping, pos >> PAGE_SHIFT, flags);
+ mapping, pos >> PAGE_SHIFT);
if (!page) {
err = -ENOMEM;
@@ -894,7 +897,7 @@ static int ntfs_write_begin(struct file *file, struct address_space *mapping,
goto out;
}
- err = block_write_begin(mapping, pos, len, flags, pagep,
+ err = block_write_begin(mapping, pos, len, pagep,
ntfs_get_block_write_begin);
out:
@@ -904,10 +907,9 @@ out:
/*
* ntfs_write_end - Address_space_operations::write_end.
*/
-static int ntfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, u32 len, u32 copied, struct page *page,
- void *fsdata)
-
+int ntfs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, u32 len, u32 copied, struct page *page,
+ void *fsdata)
{
struct inode *inode = mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
@@ -975,7 +977,7 @@ int reset_log_file(struct inode *inode)
len = pos + PAGE_SIZE > log_size ? (log_size - pos) : PAGE_SIZE;
- err = block_write_begin(mapping, pos, len, 0, &page,
+ err = block_write_begin(mapping, pos, len, &page,
ntfs_get_block_write_begin);
if (err)
goto out;
@@ -1631,7 +1633,7 @@ out4:
ni->mi.dirty = false;
discard_new_inode(inode);
out3:
- ntfs_mark_rec_free(sbi, ino);
+ ntfs_mark_rec_free(sbi, ino, false);
out2:
__putname(new_de);
@@ -1654,7 +1656,6 @@ int ntfs_link_inode(struct inode *inode, struct dentry *dentry)
struct ntfs_inode *ni = ntfs_i(inode);
struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
struct NTFS_DE *de;
- struct ATTR_FILE_NAME *de_name;
/* Allocate PATH_MAX bytes. */
de = __getname();
@@ -1669,15 +1670,6 @@ int ntfs_link_inode(struct inode *inode, struct dentry *dentry)
if (err)
goto out;
- de_name = (struct ATTR_FILE_NAME *)(de + 1);
- /* Fill duplicate info. */
- de_name->dup.cr_time = de_name->dup.m_time = de_name->dup.c_time =
- de_name->dup.a_time = kernel2nt(&inode->i_ctime);
- de_name->dup.alloc_size = de_name->dup.data_size =
- cpu_to_le64(inode->i_size);
- de_name->dup.fa = ni->std_fa;
- de_name->dup.ea_size = de_name->dup.reparse = 0;
-
err = ni_add_name(ntfs_i(d_inode(dentry->d_parent)), ni, de);
out:
__putname(de);
@@ -1730,9 +1722,7 @@ int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry)
if (inode->i_nlink)
mark_inode_dirty(inode);
} else if (!ni_remove_name_undo(dir_ni, ni, de, de2, undo_remove)) {
- make_bad_inode(inode);
- ntfs_inode_err(inode, "failed to undo unlink");
- ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ _ntfs_bad_inode(inode);
} else {
if (ni_is_dirty(dir))
mark_inode_dirty(dir);
@@ -1937,12 +1927,10 @@ const struct inode_operations ntfs_link_inode_operations = {
.setattr = ntfs3_setattr,
.listxattr = ntfs_listxattr,
.permission = ntfs_permission,
- .get_acl = ntfs_get_acl,
- .set_acl = ntfs_set_acl,
};
const struct address_space_operations ntfs_aops = {
- .readpage = ntfs_readpage,
+ .read_folio = ntfs_read_folio,
.readahead = ntfs_readahead,
.writepage = ntfs_writepage,
.writepages = ntfs_writepages,
@@ -1951,10 +1939,11 @@ const struct address_space_operations ntfs_aops = {
.direct_IO = ntfs_direct_IO,
.bmap = ntfs_bmap,
.dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
};
const struct address_space_operations ntfs_aops_cmpr = {
- .readpage = ntfs_readpage,
+ .read_folio = ntfs_read_folio,
.readahead = ntfs_readahead,
};
// clang-format on
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index bc741213ad84..bc22cc321a74 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -208,7 +208,7 @@ static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
}
/*
- * ntfs_rmdir - inode_operations::rm_dir
+ * ntfs_rmdir - inode_operations::rmdir
*/
static int ntfs_rmdir(struct inode *dir, struct dentry *dentry)
{
@@ -308,9 +308,7 @@ static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir,
err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de, &is_bad);
if (is_bad) {
/* Restore after failed rename failed too. */
- make_bad_inode(inode);
- ntfs_inode_err(inode, "failed to undo rename");
- ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ _ntfs_bad_inode(inode);
} else if (!err) {
inode->i_ctime = dir->i_ctime = dir->i_mtime =
current_time(dir);
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index fb825059d488..2c791222c4e2 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -220,6 +220,7 @@ struct ntfs_sb_info {
u32 flags; // See NTFS_FLAGS_XXX.
+ CLST zone_max; // Maximum MFT zone length in clusters
CLST bad_clusters; // The count of marked bad clusters.
u16 max_bytes_per_attr; // Maximum attribute size in record.
@@ -408,8 +409,6 @@ enum REPARSE_SIGN {
};
/* Functions from attrib.c */
-int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni,
- struct runs_tree *run, const CLST *vcn);
int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
CLST vcn, CLST lcn, CLST len, CLST *pre_alloc,
enum ALLOCATE_OPT opt, CLST *alen, const size_t fr,
@@ -440,6 +439,7 @@ int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr,
int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size,
u64 new_valid);
int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes);
+int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes);
int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size);
/* Functions from attrlist.c */
@@ -528,7 +528,7 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
const __le16 *name, u8 name_len,
const struct runs_tree *run, CLST svcn, CLST len,
__le16 flags, struct ATTRIB **new_attr,
- struct mft_inode **mi);
+ struct mft_inode **mi, struct ATTR_LIST_ENTRY **le);
int ni_insert_resident(struct ntfs_inode *ni, u32 data_size,
enum ATTR_TYPE type, const __le16 *name, u8 name_len,
struct ATTRIB **new_attr, struct mft_inode **mi,
@@ -589,10 +589,12 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
enum ALLOCATE_OPT opt);
int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft,
struct ntfs_inode *ni, struct mft_inode **mi);
-void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno);
+void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft);
int ntfs_clear_mft_tail(struct ntfs_sb_info *sbi, size_t from, size_t to);
int ntfs_refresh_zone(struct ntfs_sb_info *sbi);
-int ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait);
+void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait);
+void ntfs_bad_inode(struct inode *inode, const char *hint);
+#define _ntfs_bad_inode(i) ntfs_bad_inode(i, __func__)
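/* Usage sketch (editor's note): at a call site such as ntfs_rename(),
 * _ntfs_bad_inode(inode) expands to ntfs_bad_inode(inode, "ntfs_rename"),
 * so the failing function names itself in the error message. */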
enum NTFS_DIRTY_FLAGS {
NTFS_DIRTY_CLEAR = 0,
NTFS_DIRTY_DIRTY = 1,
@@ -617,7 +619,7 @@ int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr,
struct ntfs_buffers *nb, int sync);
int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run,
struct page **pages, u32 nr_pages, u64 vbo, u32 bytes,
- u32 op);
+ enum req_op op);
int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run);
int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run,
u64 vbo, u64 *lbo, u64 *bytes);
@@ -689,6 +691,11 @@ int ntfs_set_size(struct inode *inode, u64 new_size);
int reset_log_file(struct inode *inode);
int ntfs_get_block(struct inode *inode, sector_t vbn,
struct buffer_head *bh_result, int create);
+int ntfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, u32 len, struct page **pagep, void **fsdata);
+int ntfs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, u32 len, u32 copied, struct page *page,
+ void *fsdata);
int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc);
int ntfs_sync_inode(struct inode *inode);
int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
@@ -733,7 +740,6 @@ static inline struct ATTRIB *rec_find_attr_le(struct mft_inode *rec,
int mi_write(struct mft_inode *mi, int wait);
int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno,
__le16 flags, bool is_mft);
-void mi_mark_free(struct mft_inode *mi);
struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
const __le16 *name, u8 name_len, u32 asize,
u16 name_off);
@@ -775,10 +781,10 @@ bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn,
void run_truncate(struct runs_tree *run, CLST vcn);
void run_truncate_head(struct runs_tree *run, CLST vcn);
void run_truncate_around(struct runs_tree *run, CLST vcn);
-bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *Index);
bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len,
bool is_mft);
bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len);
+bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len);
bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn,
CLST *lcn, CLST *len);
bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn);
@@ -797,6 +803,7 @@ int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
#define run_unpack_ex run_unpack
#endif
int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn);
+int run_clone(const struct runs_tree *run, struct runs_tree *new_run);
/* Globals from super.c */
void *ntfs_set_shared(void *ptr, u32 bytes);
@@ -891,13 +898,8 @@ static inline struct page *ntfs_map_page(struct address_space *mapping,
{
struct page *page = read_mapping_page(mapping, index, NULL);
- if (!IS_ERR(page)) {
+ if (!IS_ERR(page))
kmap(page);
- if (!PageError(page))
- return page;
- ntfs_unmap_page(page);
- return ERR_PTR(-EIO);
- }
return page;
}
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index 861e35791506..7d2fac5ee215 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -395,28 +395,6 @@ int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno,
}
/*
- * mi_mark_free - Mark record as unused and marks it as free in bitmap.
- */
-void mi_mark_free(struct mft_inode *mi)
-{
- CLST rno = mi->rno;
- struct ntfs_sb_info *sbi = mi->sbi;
-
- if (rno >= MFT_REC_RESERVED && rno < MFT_REC_FREE) {
- ntfs_clear_mft_tail(sbi, rno, rno + 1);
- mi->dirty = false;
- return;
- }
-
- if (mi->mrec) {
- clear_rec_inuse(mi->mrec);
- mi->dirty = true;
- mi_write(mi, 0);
- }
- ntfs_mark_rec_free(sbi, rno);
-}
-
-/*
* mi_insert_attr - Reserve space for new attribute.
*
* Return: Not full constructed attribute or NULL if not possible to create.
@@ -445,12 +423,11 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
attr = NULL;
while ((attr = mi_enum_attr(mi, attr))) {
diff = compare_attr(attr, type, name, name_len, upcase);
- if (diff > 0)
- break;
+
if (diff < 0)
continue;
- if (!is_attr_indexed(attr))
+ if (!diff && !is_attr_indexed(attr))
return NULL;
break;
}
diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c
index a8fec651f973..aaaa0d3d35a2 100644
--- a/fs/ntfs3/run.c
+++ b/fs/ntfs3/run.c
@@ -31,7 +31,7 @@ struct ntfs_run {
* If the entry is missing from the list, 'index' will be set to
* point to the insertion position for the entry in question.
*/
-bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *index)
+static bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *index)
{
size_t min_idx, max_idx, mid_idx;
struct ntfs_run *r;
@@ -547,6 +547,48 @@ bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len)
return true;
}
+/*
+ * run_insert_range
+ *
+ * Helper for attr_insert_range(),
+ * which is a helper for fallocate(insert_range).
+ */
+bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len)
+{
+ size_t index;
+ struct ntfs_run *r, *e;
+
+ if (WARN_ON(!run_lookup(run, vcn, &index)))
+ return false; /* Should never be here. */
+
+ e = run->runs + run->count;
+ r = run->runs + index;
+
+ if (vcn > r->vcn)
+ r += 1;
+
+ for (; r < e; r++)
+ r->vcn += len;
+
+ r = run->runs + index;
+
+ if (vcn > r->vcn) {
+ /* Split the fragment containing vcn. */
+ CLST len1 = vcn - r->vcn;
+ CLST len2 = r->len - len1;
+ CLST lcn2 = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len1);
+
+ r->len = len1;
+
+ if (!run_add_entry(run, vcn + len, lcn2, len2, false))
+ return false;
+ }
+
+ if (!run_add_entry(run, vcn, SPARSE_LCN, len, false))
+ return false;
+
+ return true;
+}
+
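/*
 * Illustrative effect (editor's note, hypothetical run): start with a
 * single fragment {vcn 0, lcn 100, len 8} and insert 4 clusters at vcn 2:
 *   {0, 100, 2} {2, SPARSE_LCN, 4} {6, 102, 6}
 * The tail of the split fragment keeps its on-disk clusters (lcn 102)
 * but is shifted up by 'len' in VCN space.
 */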
/*
* run_get_entry - Return index-th mapped region.
*/
@@ -778,26 +820,36 @@ int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf,
CLST next_vcn, vcn, lcn;
CLST prev_lcn = 0;
CLST evcn1 = svcn + len;
+ const struct ntfs_run *r, *r_end;
int packed_size = 0;
size_t i;
- bool ok;
s64 dlcn;
int offset_size, size_size, tmp;
- next_vcn = vcn = svcn;
-
*packed_vcns = 0;
if (!len)
goto out;
- ok = run_lookup_entry(run, vcn, &lcn, &len, &i);
+ /* Check that all required entries [svcn, evcn1) are available. */
+ if (!run_lookup(run, svcn, &i))
+ return -ENOENT;
+
+ r_end = run->runs + run->count;
+ r = run->runs + i;
- if (!ok)
- goto error;
+ for (next_vcn = r->vcn + r->len; next_vcn < evcn1;
+ next_vcn = r->vcn + r->len) {
+ if (++r >= r_end || r->vcn != next_vcn)
+ return -ENOENT;
+ }
- if (next_vcn != vcn)
- goto error;
+ /* Repeat the cycle above and pack the runs. Assume no errors. */
+ r = run->runs + i;
+ len = svcn - r->vcn;
+ vcn = svcn;
+ lcn = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len);
+ len = r->len - len;
for (;;) {
next_vcn = vcn + len;
@@ -846,12 +898,10 @@ int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf,
if (packed_size + 1 >= run_buf_size || next_vcn >= evcn1)
goto out;
- ok = run_get_entry(run, ++i, &vcn, &lcn, &len);
- if (!ok)
- goto error;
-
- if (next_vcn != vcn)
- goto error;
+ r += 1;
+ vcn = r->vcn;
+ lcn = r->lcn;
+ len = r->len;
}
out:
@@ -860,9 +910,6 @@ out:
run_buf[0] = 0;
return packed_size + 1;
-
-error:
- return -EOPNOTSUPP;
}
/*
@@ -1109,3 +1156,28 @@ int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn)
*highest_vcn = vcn64 - 1;
return 0;
}
+
+/*
+ * run_clone
+ *
+ * Make a copy of 'run' into 'new_run'.
+ */
+int run_clone(const struct runs_tree *run, struct runs_tree *new_run)
+{
+ size_t bytes = run->count * sizeof(struct ntfs_run);
+
+ if (bytes > new_run->allocated) {
+ struct ntfs_run *new_ptr = kvmalloc(bytes, GFP_KERNEL);
+
+ if (!new_ptr)
+ return -ENOMEM;
+
+ kvfree(new_run->runs);
+ new_run->runs = new_ptr;
+ new_run->allocated = bytes;
+ }
+
+ memcpy(new_run->runs, run->runs, bytes);
+ new_run->count = run->count;
+ return 0;
+}
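/*
 * Usage sketch (editor's note, assumed pairing with run_insert_range;
 * err, vcn and len as in the surrounding code): clone the mapping first
 * so a failed insert can be rolled back.
 */
	struct runs_tree saved;

	run_init(&saved);
	err = run_clone(&ni->file.run, &saved);
	if (!err && !run_insert_range(&ni->file.run, vcn, len))
		swap(ni->file.run, saved);	/* Restore original mapping. */
	run_close(&saved);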
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 278dcf502410..47012c9bf505 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -30,6 +30,7 @@
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/log2.h>
+#include <linux/minmax.h>
#include <linux/module.h>
#include <linux/nls.h>
#include <linux/seq_file.h>
@@ -390,7 +391,7 @@ static int ntfs_fs_reconfigure(struct fs_context *fc)
return -EINVAL;
}
- memcpy(sbi->options, new_opts, sizeof(*new_opts));
+ swap(sbi->options, fc->fs_private);
return 0;
}
@@ -668,9 +669,11 @@ static u32 format_size_gb(const u64 bytes, u32 *mb)
static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot)
{
- return boot->sectors_per_clusters <= 0x80
- ? boot->sectors_per_clusters
- : (1u << (0 - boot->sectors_per_clusters));
+ if (boot->sectors_per_clusters <= 0x80)
+ return boot->sectors_per_clusters;
+ if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */
+ return 1U << (0 - boot->sectors_per_clusters);
+ return -EINVAL;
}
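/*
 * Worked example of the boot-sector encoding (editor's note): values
 * <= 0x80 are a literal sector count; larger values are a negative power
 * of two when read as a signed byte, e.g. 0xf4 == -12, giving
 * 2^12 == 4096 sectors == 2 MB with 512-byte sectors, hence the
 * "limit shift to 2MB max" bound above.
 */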
/*
@@ -713,6 +716,8 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
/* cluster size: 512, 1K, 2K, 4K, ... 2M */
sct_per_clst = true_sectors_per_clst(boot);
+ if ((int)sct_per_clst < 0)
+ goto out;
if (!is_power_of_2(sct_per_clst))
goto out;
@@ -866,6 +871,13 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits;
#endif
+ /*
+ * Compute the MFT zone in two steps.
+ * It would be nice to be able to allocate 1/8 of the
+ * total clusters for the MFT, but not more than 512 MB.
+ */
+ sbi->zone_max = min_t(CLST, 0x20000000 >> sbi->cluster_bits, clusters >> 3);
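	/*
	 * Worked example (editor's note): with 4K clusters
	 * (cluster_bits == 12), 0x20000000 >> 12 == 0x20000 clusters
	 * == 512 MB; a 1 GB volume has 0x40000 clusters, so the
	 * clusters >> 3 term caps the zone at 0x8000 clusters (128 MB).
	 */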
+
err = 0;
out:
@@ -882,7 +894,6 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
int err;
struct ntfs_sb_info *sbi = sb->s_fs_info;
struct block_device *bdev = sb->s_bdev;
- struct request_queue *rq;
struct inode *inode;
struct ntfs_inode *ni;
size_t i, tt;
@@ -897,6 +908,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
ref.high = 0;
sbi->sb = sb;
+ sbi->options = fc->fs_private;
+ fc->fs_private = NULL;
sb->s_flags |= SB_NODIRATIME;
sb->s_magic = 0x7366746e; // "ntfs"
sb->s_op = &ntfs_sops;
@@ -912,15 +925,14 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
goto out;
}
- rq = bdev_get_queue(bdev);
- if (blk_queue_discard(rq) && rq->limits.discard_granularity) {
- sbi->discard_granularity = rq->limits.discard_granularity;
+ if (bdev_max_discard_sectors(bdev) && bdev_discard_granularity(bdev)) {
+ sbi->discard_granularity = bdev_discard_granularity(bdev);
sbi->discard_granularity_mask_inv =
~(u64)(sbi->discard_granularity - 1);
}
/* Parse boot. */
- err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512,
+ err = ntfs_init_from_boot(sb, bdev_logical_block_size(bdev),
bdev_nr_bytes(bdev));
if (err)
goto out;
@@ -1260,8 +1272,6 @@ load_root:
goto put_inode_out;
}
- fc->fs_private = NULL;
-
return 0;
put_inode_out:
@@ -1335,7 +1345,7 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len)
return 0;
err = blkdev_issue_discard(sb->s_bdev, start >> 9, (end - start) >> 9,
- GFP_NOFS, 0);
+ GFP_NOFS);
if (err == -EOPNOTSUPP)
sbi->flags |= NTFS_FLAGS_NODISCARD;
@@ -1376,7 +1386,7 @@ static const struct fs_context_operations ntfs_context_ops = {
/*
* ntfs_init_fs_context - Initialize spi and opts
*
- * This will called when mount/remount. We will first initiliaze
+ * This is called on mount/remount. We will first initialize
* options so that on remount we can reuse them.
*/
static int ntfs_init_fs_context(struct fs_context *fc)
@@ -1414,7 +1424,6 @@ static int ntfs_init_fs_context(struct fs_context *fc)
mutex_init(&sbi->compress.mtx_lzx);
#endif
- sbi->options = opts;
fc->s_fs_info = sbi;
ok:
fc->fs_private = opts;
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index afd0ddad826f..7de8718c68a9 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -112,13 +112,13 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
return -ENOMEM;
if (!size) {
- ;
+ /* EA info persists, but the xattr is empty. Looks like an EA problem. */
} else if (attr_ea->non_res) {
struct runs_tree run;
run_init(&run);
- err = attr_load_runs(attr_ea, ni, &run, NULL);
+ err = attr_load_runs_range(ni, ATTR_EA, NULL, 0, &run, 0, size);
if (!err)
err = ntfs_read_run_nb(sbi, &run, 0, ea_p, size, NULL);
run_close(&run);
@@ -259,7 +259,7 @@ out:
static noinline int ntfs_set_ea(struct inode *inode, const char *name,
size_t name_len, const void *value,
- size_t val_size, int flags)
+ size_t val_size, int flags, bool locked)
{
struct ntfs_inode *ni = ntfs_i(inode);
struct ntfs_sb_info *sbi = ni->mi.sbi;
@@ -278,7 +278,8 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
u64 new_sz;
void *p;
- ni_lock(ni);
+ if (!locked)
+ ni_lock(ni);
run_init(&ea_run);
@@ -443,6 +444,11 @@ update_ea:
/* Delete xattr, ATTR_EA */
ni_remove_attr_le(ni, attr, mi, le);
} else if (attr->non_res) {
+ err = attr_load_runs_range(ni, ATTR_EA, NULL, 0, &ea_run, 0,
+ size);
+ if (err)
+ goto out;
+
err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size, 0);
if (err)
goto out;
@@ -467,7 +473,8 @@ update_ea:
mark_inode_dirty(&ni->vfs_inode);
out:
- ni_unlock(ni);
+ if (!locked)
+ ni_unlock(ni);
run_close(&ea_run);
kfree(ea_all);
@@ -476,8 +483,7 @@ out:
}
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
-static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns,
- struct inode *inode, int type,
+static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type,
int locked)
{
struct ntfs_inode *ni = ntfs_i(inode);
@@ -512,7 +518,7 @@ static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns,
/* Translate extended attribute to acl. */
if (err >= 0) {
- acl = posix_acl_from_xattr(mnt_userns, buf, err);
+ acl = posix_acl_from_xattr(&init_user_ns, buf, err);
} else if (err == -ENODATA) {
acl = NULL;
} else {
@@ -535,37 +541,32 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu)
if (rcu)
return ERR_PTR(-ECHILD);
- /* TODO: init_user_ns? */
- return ntfs_get_acl_ex(&init_user_ns, inode, type, 0);
+ return ntfs_get_acl_ex(inode, type, 0);
}
static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
struct inode *inode, struct posix_acl *acl,
- int type)
+ int type, bool init_acl)
{
const char *name;
size_t size, name_len;
- void *value = NULL;
- int err = 0;
+ void *value;
+ int err;
int flags;
+ umode_t mode;
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
+ mode = inode->i_mode;
switch (type) {
case ACL_TYPE_ACCESS:
- if (acl) {
- umode_t mode = inode->i_mode;
-
+ /* Do not change i_mode when called from init_acl. */
+ if (acl && !init_acl) {
err = posix_acl_update_mode(mnt_userns, inode, &mode,
&acl);
if (err)
- goto out;
-
- if (inode->i_mode != mode) {
- inode->i_mode = mode;
- mark_inode_dirty(inode);
- }
+ return err;
}
name = XATTR_NAME_POSIX_ACL_ACCESS;
name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1;
@@ -592,17 +593,22 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
value = kmalloc(size, GFP_NOFS);
if (!value)
return -ENOMEM;
- err = posix_acl_to_xattr(mnt_userns, acl, value, size);
+ err = posix_acl_to_xattr(&init_user_ns, acl, value, size);
if (err < 0)
goto out;
flags = 0;
}
- err = ntfs_set_ea(inode, name, name_len, value, size, flags);
+ err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0);
if (err == -ENODATA && !size)
err = 0; /* Removing a non-existent xattr. */
- if (!err)
+ if (!err) {
set_cached_acl(inode, type, acl);
+ if (inode->i_mode != mode) {
+ inode->i_mode = mode;
+ mark_inode_dirty(inode);
+ }
+ }
out:
kfree(value);
@@ -616,7 +622,7 @@ out:
int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type)
{
- return ntfs_set_acl_ex(mnt_userns, inode, acl, type);
+ return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false);
}
/*
@@ -636,19 +642,19 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
if (default_acl) {
err = ntfs_set_acl_ex(mnt_userns, inode, default_acl,
- ACL_TYPE_DEFAULT);
+ ACL_TYPE_DEFAULT, true);
posix_acl_release(default_acl);
} else {
inode->i_default_acl = NULL;
}
- if (!acl)
- inode->i_acl = NULL;
- else {
+ if (acl) {
if (!err)
err = ntfs_set_acl_ex(mnt_userns, inode, acl,
- ACL_TYPE_ACCESS);
+ ACL_TYPE_ACCESS, true);
posix_acl_release(acl);
+ } else {
+ inode->i_acl = NULL;
}
return err;
@@ -898,9 +904,12 @@ set_new_fa:
}
/* Deal with NTFS extended attribute. */
- err = ntfs_set_ea(inode, name, name_len, value, size, flags);
+ err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0);
out:
+ inode->i_ctime = current_time(inode);
+ mark_inode_dirty(inode);
+
return err;
}
@@ -913,35 +922,37 @@ int ntfs_save_wsl_perm(struct inode *inode)
{
int err;
__le32 value;
+ struct ntfs_inode *ni = ntfs_i(inode);
- /* TODO: refactor this, so we don't lock 4 times in ntfs_set_ea */
+ ni_lock(ni);
value = cpu_to_le32(i_uid_read(inode));
err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value,
- sizeof(value), 0);
+ sizeof(value), 0, true); /* true == already locked. */
if (err)
goto out;
value = cpu_to_le32(i_gid_read(inode));
err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value,
- sizeof(value), 0);
+ sizeof(value), 0, true);
if (err)
goto out;
value = cpu_to_le32(inode->i_mode);
err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value,
- sizeof(value), 0);
+ sizeof(value), 0, true);
if (err)
goto out;
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
value = cpu_to_le32(inode->i_rdev);
err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value,
- sizeof(value), 0);
+ sizeof(value), 0, true);
if (err)
goto out;
}
out:
+ ni_unlock(ni);
/* In case of error, should we delete all WSL xattrs? */
return err;
}
@@ -981,7 +992,7 @@ static bool ntfs_xattr_user_list(struct dentry *dentry)
}
// clang-format off
-static const struct xattr_handler ntfs_xattr_handler = {
+static const struct xattr_handler ntfs_other_xattr_handler = {
.prefix = "",
.get = ntfs_getxattr,
.set = ntfs_setxattr,
@@ -989,7 +1000,11 @@ static const struct xattr_handler ntfs_xattr_handler = {
};
const struct xattr_handler *ntfs_xattr_handlers[] = {
- &ntfs_xattr_handler,
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ &ntfs_other_xattr_handler,
NULL,
};
// clang-format on
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 49f41074baad..51c93929a146 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7427,7 +7427,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
/*
* No need to worry about the data page here - it's been
* truncated already and inline data doesn't need it for
- * pushing zero's to disk, so we'll let readpage pick it up
+ * pushing zeros to disk, so we'll let read_folio pick it up
* later.
*/
if (trunc) {
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 4b9af65cb61b..af4157f61927 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -275,17 +275,16 @@ out:
return ret;
}
-static int ocfs2_readpage(struct file *file, struct page *page)
+static int ocfs2_read_folio(struct file *file, struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- loff_t start = (loff_t)page->index << PAGE_SHIFT;
+ loff_t start = folio_pos(folio);
int ret, unlock = 1;
- trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
- (page ? page->index : 0));
+ trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, folio->index);
- ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
+ ret = ocfs2_inode_lock_with_page(inode, NULL, 0, &folio->page);
if (ret != 0) {
if (ret == AOP_TRUNCATED_PAGE)
unlock = 0;
@@ -295,11 +294,11 @@ static int ocfs2_readpage(struct file *file, struct page *page)
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
/*
- * Unlock the page and cycle ip_alloc_sem so that we don't
+ * Unlock the folio and cycle ip_alloc_sem so that we don't
* busyloop waiting for ip_alloc_sem to unlock
*/
ret = AOP_TRUNCATED_PAGE;
- unlock_page(page);
+ folio_unlock(folio);
unlock = 0;
down_read(&oi->ip_alloc_sem);
up_read(&oi->ip_alloc_sem);
@@ -309,24 +308,24 @@ static int ocfs2_readpage(struct file *file, struct page *page)
/*
* i_size might have just been updated as we grabbed the meta lock. We
* might now be discovering a truncate that hit on another node.
- * block_read_full_page->get_block freaks out if it is asked to read
+ * block_read_full_folio->get_block freaks out if it is asked to read
* beyond the end of a file, so we check here. Callers
* (generic_file_read, vm_ops->fault) are clever enough to check i_size
- * and notice that the page they just read isn't needed.
+ * and notice that the folio they just read isn't needed.
*
* XXX sys_readahead() seems to get that wrong?
*/
if (start >= i_size_read(inode)) {
- zero_user(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
+ folio_zero_segment(folio, 0, folio_size(folio));
+ folio_mark_uptodate(folio);
ret = 0;
goto out_alloc;
}
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- ret = ocfs2_readpage_inline(inode, page);
+ ret = ocfs2_readpage_inline(inode, &folio->page);
else
- ret = block_read_full_page(page, ocfs2_get_block);
+ ret = block_read_full_folio(folio, ocfs2_get_block);
unlock = 0;
out_alloc:
@@ -335,7 +334,7 @@ out_inode_unlock:
ocfs2_inode_unlock(inode, 0);
out:
if (unlock)
- unlock_page(page);
+ folio_unlock(folio);
return ret;
}
@@ -497,11 +496,11 @@ bail:
return status;
}
-static int ocfs2_releasepage(struct page *page, gfp_t wait)
+static bool ocfs2_release_folio(struct folio *folio, gfp_t wait)
{
- if (!page_has_buffers(page))
- return 0;
- return try_to_free_buffers(page);
+ if (!folio_buffers(folio))
+ return false;
+ return try_to_free_buffers(folio);
}
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
@@ -637,7 +636,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
!buffer_new(bh) &&
ocfs2_should_read_blk(inode, page, block_start) &&
(block_start < from || block_end > to)) {
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 1, &bh);
*wait_bh++=bh;
}
@@ -1881,7 +1880,7 @@ out:
}
static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
@@ -1897,7 +1896,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
/*
* Take alloc sem here to prevent concurrent lookups. That way
* the mapping, zeroing and tree manipulation within
- * ocfs2_write() will be safe against ->readpage(). This
+ * ocfs2_write() will be safe against ->read_folio(). This
* should also serve to lock out allocation from a shared
* writeable region.
*/
@@ -2454,7 +2453,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
const struct address_space_operations ocfs2_aops = {
.dirty_folio = block_dirty_folio,
- .readpage = ocfs2_readpage,
+ .read_folio = ocfs2_read_folio,
.readahead = ocfs2_readahead,
.writepage = ocfs2_writepage,
.write_begin = ocfs2_write_begin,
@@ -2462,8 +2461,8 @@ const struct address_space_operations ocfs2_aops = {
.bmap = ocfs2_bmap,
.direct_IO = ocfs2_direct_IO,
.invalidate_folio = block_invalidate_folio,
- .releasepage = ocfs2_releasepage,
- .migratepage = buffer_migrate_page,
+ .release_folio = ocfs2_release_folio,
+ .migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index e7758778abef..196638a22b48 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -64,7 +64,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
- submit_bh(REQ_OP_WRITE, 0, bh);
+ submit_bh(REQ_OP_WRITE, bh);
wait_on_buffer(bh);
@@ -147,7 +147,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
get_bh(bh); /* for end_buffer_read_sync() */
bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, 0, bh);
+ submit_bh(REQ_OP_READ, bh);
}
read_failure:
@@ -328,7 +328,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
if (validate)
set_buffer_needs_validate(bh);
bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, 0, bh);
+ submit_bh(REQ_OP_READ, bh);
continue;
}
}
@@ -449,7 +449,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
- submit_bh(REQ_OP_WRITE, 0, bh);
+ submit_bh(REQ_OP_WRITE, bh);
wait_on_buffer(bh);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index ea0e70c0fce0..b13d344d40b6 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -235,8 +235,6 @@ struct o2hb_region {
* (hr_steady_iterations == 0) within hr_unsteady_iterations */
atomic_t hr_unsteady_iterations;
- char hr_dev_name[BDEVNAME_SIZE];
-
unsigned int hr_timeout_ms;
/* randomized as the region goes up and down so that a node
@@ -287,8 +285,8 @@ static void o2hb_write_timeout(struct work_struct *work)
container_of(work, struct o2hb_region,
hr_write_timeout_work.work);
- mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
- "milliseconds\n", reg->hr_dev_name,
+ mlog(ML_ERROR, "Heartbeat write timeout to device %pg after %u "
+ "milliseconds\n", reg->hr_bdev,
jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
if (o2hb_global_heartbeat_active()) {
@@ -383,9 +381,9 @@ static void o2hb_nego_timeout(struct work_struct *work)
if (master_node == o2nm_this_node()) {
if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
- printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n",
+ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg).\n",
o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000,
- config_item_name(&reg->hr_item), reg->hr_dev_name);
+ config_item_name(&reg->hr_item), reg->hr_bdev);
set_bit(master_node, reg->hr_nego_node_bitmap);
}
if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
@@ -399,8 +397,8 @@ static void o2hb_nego_timeout(struct work_struct *work)
return;
}
- printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n",
- config_item_name(&reg->hr_item), reg->hr_dev_name);
+ printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%pg) is down.\n",
+ config_item_name(&reg->hr_item), reg->hr_bdev);
/* approve negotiate timeout request. */
o2hb_arm_timeout(reg);
@@ -419,9 +417,9 @@ static void o2hb_nego_timeout(struct work_struct *work)
}
} else {
/* negotiate timeout with master node. */
- printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n",
+ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg), negotiate timeout with node %d.\n",
o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(&reg->hr_item),
- reg->hr_dev_name, master_node);
+ reg->hr_bdev, master_node);
ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
master_node);
if (ret)
@@ -437,8 +435,8 @@ static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
struct o2hb_nego_msg *nego_msg;
nego_msg = (struct o2hb_nego_msg *)msg->buf;
- printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n",
- nego_msg->node_num, config_item_name(&reg->hr_item), reg->hr_dev_name);
+ printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%pg).\n",
+ nego_msg->node_num, config_item_name(&reg->hr_item), reg->hr_bdev);
if (nego_msg->node_num < O2NM_MAX_NODES)
set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
else
@@ -452,8 +450,8 @@ static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
{
struct o2hb_region *reg = data;
- printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n",
- config_item_name(&reg->hr_item), reg->hr_dev_name);
+ printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%pg).\n",
+ config_item_name(&reg->hr_item), reg->hr_bdev);
o2hb_arm_timeout(reg);
return 0;
}
@@ -503,8 +501,7 @@ static void o2hb_bio_end_io(struct bio *bio)
static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
struct o2hb_bio_wait_ctxt *wc,
unsigned int *current_slot,
- unsigned int max_slots, int op,
- int op_flags)
+ unsigned int max_slots, blk_opf_t opf)
{
int len, current_page;
unsigned int vec_len, vec_start;
@@ -518,7 +515,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
* GFP_KERNEL that the local node can get fenced. It would be
* nicest if we could pre-allocate these bios and avoid this
* all together. */
- bio = bio_alloc(reg->hr_bdev, 16, op | op_flags, GFP_ATOMIC);
+ bio = bio_alloc(reg->hr_bdev, 16, opf, GFP_ATOMIC);
if (!bio) {
mlog(ML_ERROR, "Could not alloc slots BIO!\n");
bio = ERR_PTR(-ENOMEM);
@@ -566,7 +563,7 @@ static int o2hb_read_slots(struct o2hb_region *reg,
while(current_slot < max_slots) {
bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots,
- REQ_OP_READ, 0);
+ REQ_OP_READ);
if (IS_ERR(bio)) {
status = PTR_ERR(bio);
mlog_errno(status);
@@ -598,8 +595,8 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
slot = o2nm_this_node();
- bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE,
- REQ_SYNC);
+ bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1,
+ REQ_OP_WRITE | REQ_SYNC);
if (IS_ERR(bio)) {
status = PTR_ERR(bio);
mlog_errno(status);
@@ -689,8 +686,8 @@ static int o2hb_check_own_slot(struct o2hb_region *reg)
else
errstr = ERRSTR3;
- mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), "
- "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name,
+ mlog(ML_ERROR, "%s (%pg): expected(%u:0x%llx, 0x%llx), "
+ "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_bdev,
slot->ds_node_num, (unsigned long long)slot->ds_last_generation,
(unsigned long long)slot->ds_last_time, hb_block->hb_node,
(unsigned long long)le64_to_cpu(hb_block->hb_generation),
@@ -863,8 +860,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
sizeof(o2hb_live_node_bitmap)))
goto unlock;
- printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
- config_item_name(&reg->hr_item), reg->hr_dev_name);
+ printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n",
+ config_item_name(&reg->hr_item), reg->hr_bdev);
set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
@@ -922,8 +919,8 @@ static int o2hb_check_slot(struct o2hb_region *reg,
/* The node is live but pushed out a bad crc. We
* consider it a transient miss but don't populate any
* other values as they may be junk. */
- mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
- slot->ds_node_num, reg->hr_dev_name);
+ mlog(ML_ERROR, "Node %d has written a bad crc to %pg\n",
+ slot->ds_node_num, reg->hr_bdev);
o2hb_dump_slot(hb_block);
slot->ds_equal_samples++;
@@ -1002,11 +999,11 @@ fire_callbacks:
slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms);
if (slot_dead_ms && slot_dead_ms != dead_ms) {
/* TODO: Perhaps we can fail the region here. */
- mlog(ML_ERROR, "Node %d on device %s has a dead count "
+ mlog(ML_ERROR, "Node %d on device %pg has a dead count "
"of %u ms, but our count is %u ms.\n"
"Please double check your configuration values "
"for 'O2CB_HEARTBEAT_THRESHOLD'\n",
- slot->ds_node_num, reg->hr_dev_name, slot_dead_ms,
+ slot->ds_node_num, reg->hr_bdev, slot_dead_ms,
dead_ms);
}
goto out;
@@ -1145,8 +1142,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
/* Do not re-arm the write timeout on I/O error - we
* can't be sure that the new block ever made it to
* disk */
- mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
- write_wc.wc_error, reg->hr_dev_name);
+ mlog(ML_ERROR, "Write error %d on device \"%pg\"\n",
+ write_wc.wc_error, reg->hr_bdev);
ret = write_wc.wc_error;
goto bail;
}
@@ -1170,9 +1167,9 @@ bail:
if (atomic_read(&reg->hr_steady_iterations) != 0) {
if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
printk(KERN_NOTICE "o2hb: Unable to stabilize "
- "heartbeat on region %s (%s)\n",
+ "heartbeat on region %s (%pg)\n",
config_item_name(&reg->hr_item),
- reg->hr_dev_name);
+ reg->hr_bdev);
atomic_set(&reg->hr_steady_iterations, 0);
reg->hr_aborted_start = 1;
wake_up(&o2hb_steady_queue);
@@ -1494,7 +1491,7 @@ static void o2hb_region_release(struct config_item *item)
struct page *page;
struct o2hb_region *reg = to_o2hb_region(item);
- mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
+ mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg->hr_bdev);
kfree(reg->hr_tmp_block);
@@ -1641,7 +1638,7 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
unsigned int ret = 0;
if (to_o2hb_region(item)->hr_bdev)
- ret = sprintf(page, "%s\n", to_o2hb_region(item)->hr_dev_name);
+ ret = sprintf(page, "%pg\n", to_o2hb_region(item)->hr_bdev);
return ret;
}
@@ -1798,8 +1795,6 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
goto out2;
}
- bdevname(reg->hr_bdev, reg->hr_dev_name);
-
sectsize = bdev_logical_block_size(reg->hr_bdev);
if (sectsize != reg->hr_block_bytes) {
mlog(ML_ERROR,
@@ -1895,8 +1890,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
ret = -EIO;
if (hb_task && o2hb_global_heartbeat_active())
- printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
- config_item_name(&reg->hr_item), reg->hr_dev_name);
+ printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%pg)\n",
+ config_item_name(&reg->hr_item), reg->hr_bdev);
out3:
if (ret < 0) {
@@ -2088,10 +2083,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
quorum_region = 1;
clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
spin_unlock(&o2hb_live_lock);
- printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
+ printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%pg)\n",
((atomic_read(&reg->hr_steady_iterations) == 0) ?
"stopped" : "start aborted"), config_item_name(item),
- reg->hr_dev_name);
+ reg->hr_bdev);
}
/*
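The hunks above all make the same substitution: instead of caching the device name once with bdevname() into reg->hr_dev_name, the messages format reg->hr_bdev directly with the %pg printk specifier, which prints a block device's name from the struct block_device itself. A minimal sketch of the pattern, using a hypothetical my_region structure in place of o2hb_region:

#include <linux/blkdev.h>
#include <linux/printk.h>

struct my_region {
        struct block_device *bdev;      /* device being heartbeated */
};

static void my_report_hang(struct my_region *reg)
{
        /* %pg resolves the device name (e.g. "sda1") at print time */
        printk(KERN_NOTICE "heartbeat device %pg looks hung\n", reg->bdev);
}

This is also why a later hunk can drop the bdevname() call in o2hb_region_dev_store(): no cached copy of the name is needed any more.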
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 81c3d65d68fe..694471fc46b8 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2032,7 +2032,7 @@ struct ocfs2_empty_dir_priv {
unsigned seen_other;
unsigned dx_dir;
};
-static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
+static bool ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
int name_len, loff_t pos, u64 ino,
unsigned type)
{
@@ -2052,7 +2052,7 @@ static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
*/
if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
p->seen_dot = 1;
- return 0;
+ return true;
}
if (name_len == 2 && !strncmp("..", name, 2) &&
@@ -2060,13 +2060,13 @@ static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
p->seen_dot_dot = 1;
if (p->dx_dir && p->seen_dot)
- return 1;
+ return false;
- return 0;
+ return true;
}
p->seen_other = 1;
- return 1;
+ return false;
}
static int ocfs2_empty_dir_dx(struct inode *inode,
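The dir_context actor conversion here (and in journal.c below) tracks a VFS-wide change: filldir callbacks now return bool, where true means "continue iterating" and false means "stop", replacing the old 0/non-zero int convention. A hedged sketch of an actor under the new convention (hypothetical, not from this patch):

#include <linux/fs.h>

/* Stop as soon as any entry other than "." or ".." shows up. */
static bool my_filldir(struct dir_context *ctx, const char *name,
                       int name_len, loff_t pos, u64 ino, unsigned type)
{
        if (name_len == 1 && name[0] == '.')
                return true;            /* skip, keep iterating */
        if (name_len == 2 && name[0] == '.' && name[1] == '.')
                return true;            /* skip, keep iterating */
        return false;                   /* real entry found, stop */
}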
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index d442cf5dda8a..be5e9ed7da8d 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -541,7 +541,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
struct debug_lockres *dl = m->private;
struct dlm_ctxt *dlm = dl->dl_ctxt;
struct dlm_lock_resource *oldres = dl->dl_res;
- struct dlm_lock_resource *res = NULL;
+ struct dlm_lock_resource *res = NULL, *iter;
struct list_head *track_list;
spin_lock(&dlm->track_lock);
@@ -556,11 +556,11 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
}
}
- list_for_each_entry(res, track_list, tracking) {
- if (&res->tracking == &dlm->tracking_list)
- res = NULL;
- else
- dlm_lockres_get(res);
+ list_for_each_entry(iter, track_list, tracking) {
+ if (&iter->tracking != &dlm->tracking_list) {
+ dlm_lockres_get(iter);
+ res = iter;
+ }
break;
}
spin_unlock(&dlm->track_lock);
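This hunk, the dlmunlock.c one below, and the quota_local.c change further down all repair the same anti-pattern: after list_for_each_entry() completes without a break, the cursor does not become NULL — it points at a bogus entry computed from the list head — so testing the cursor (or a found flag) after the loop is fragile. The fix iterates with a dedicated variable and publishes a match through a separate pointer that stays NULL unless explicitly set. A generic sketch with a hypothetical item type:

#include <linux/list.h>

struct item {
        int key;
        struct list_head list;
};

static struct item *find_item(struct list_head *head, int key)
{
        struct item *found = NULL, *iter;

        list_for_each_entry(iter, head, list) {
                if (iter->key == key) {
                        found = iter;
                        break;
                }
        }
        /* never dereference 'iter' here; only 'found' is meaningful */
        return found;
}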
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 61103b2d69fb..7318e4794ef9 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -392,9 +392,9 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
struct dlm_ctxt *dlm = data;
struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
- struct dlm_lock *lock = NULL;
+ struct dlm_lock *lock = NULL, *iter;
enum dlm_status status = DLM_NORMAL;
- int found = 0, i;
+ int i;
struct dlm_lockstatus *lksb = NULL;
int ignore;
u32 flags;
@@ -437,7 +437,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
}
queue=&res->granted;
- found = 0;
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
spin_unlock(&res->spinlock);
@@ -461,21 +460,21 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
}
for (i=0; i<3; i++) {
- list_for_each_entry(lock, queue, list) {
- if (lock->ml.cookie == unlock->cookie &&
- lock->ml.node == unlock->node_idx) {
- dlm_lock_get(lock);
- found = 1;
+ list_for_each_entry(iter, queue, list) {
+ if (iter->ml.cookie == unlock->cookie &&
+ iter->ml.node == unlock->node_idx) {
+ dlm_lock_get(iter);
+ lock = iter;
break;
}
}
- if (found)
+ if (lock)
break;
/* scan granted -> converting -> blocked queues */
queue++;
}
spin_unlock(&res->spinlock);
- if (!found) {
+ if (!lock) {
status = DLM_IVLOCKID;
goto not_found;
}
@@ -505,7 +504,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
dlm_kick_thread(dlm, res);
not_found:
- if (!found)
+ if (!lock)
mlog(ML_ERROR, "failed to find lock to unlock! "
"cookie=%u:%llu\n",
dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)),
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index e360543ad7e7..8b2020f92b5f 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -296,17 +296,25 @@ static void dlmfs_evict_inode(struct inode *inode)
{
int status;
struct dlmfs_inode_private *ip;
+ struct user_lock_res *lockres;
+ int teardown;
clear_inode(inode);
mlog(0, "inode %lu\n", inode->i_ino);
ip = DLMFS_I(inode);
+ lockres = &ip->ip_lockres;
if (S_ISREG(inode->i_mode)) {
- status = user_dlm_destroy_lock(&ip->ip_lockres);
- if (status < 0)
- mlog_errno(status);
+ spin_lock(&lockres->l_lock);
+ teardown = !!(lockres->l_flags & USER_LOCK_IN_TEARDOWN);
+ spin_unlock(&lockres->l_lock);
+ if (!teardown) {
+ status = user_dlm_destroy_lock(lockres);
+ if (status < 0)
+ mlog_errno(status);
+ }
iput(ip->ip_parent);
goto clear_fields;
}
diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 29f183a15798..617c92e7b925 100644
--- a/fs/ocfs2/dlmfs/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -433,6 +433,11 @@ again:
}
spin_lock(&lockres->l_lock);
+ if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+ spin_unlock(&lockres->l_lock);
+ status = -EAGAIN;
+ goto bail;
+ }
/* We only compare against the currently granted level
* here. If the lock is blocked waiting on a downconvert,
@@ -595,7 +600,7 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
spin_lock(&lockres->l_lock);
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
spin_unlock(&lockres->l_lock);
- return 0;
+ goto bail;
}
lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
@@ -609,22 +614,30 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
}
if (lockres->l_ro_holders || lockres->l_ex_holders) {
+ lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN;
spin_unlock(&lockres->l_lock);
goto bail;
}
status = 0;
if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
+ /*
+ * lock was never requested; leave USER_LOCK_IN_TEARDOWN set
+ * to avoid new lock requests coming in.
+ */
spin_unlock(&lockres->l_lock);
goto bail;
}
- lockres->l_flags &= ~USER_LOCK_ATTACHED;
lockres->l_flags |= USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
if (status) {
+ spin_lock(&lockres->l_lock);
+ lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN;
+ lockres->l_flags &= ~USER_LOCK_BUSY;
+ spin_unlock(&lockres->l_lock);
user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
goto bail;
}
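Taken together, the dlmfs.c and userdlm.c changes close a race between new lock requests and teardown: once USER_LOCK_IN_TEARDOWN is set it now stays set (so a racing request bails out with -EAGAIN), and it is rolled back only when teardown itself must back off — holders still present, or the DLM unlock call failing. The core shape of the guard, sketched with hypothetical names:

#include <linux/errno.h>
#include <linux/spinlock.h>

#define MY_IN_TEARDOWN  0x01

struct my_lockres {
        spinlock_t      lock;
        unsigned int    flags;
};

/* refuse new requests once teardown has begun */
static int my_request_lock(struct my_lockres *res)
{
        spin_lock(&res->lock);
        if (res->flags & MY_IN_TEARDOWN) {
                spin_unlock(&res->lock);
                return -EAGAIN;
        }
        /* ... queue the request while the flag is known clear ... */
        spin_unlock(&res->lock);
        return 0;
}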
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 801e60bab955..c28bc983a7b1 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3403,10 +3403,12 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
- ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
- osb->cconn = NULL;
+ if (osb->cconn) {
+ ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
+ osb->cconn = NULL;
- ocfs2_dlm_shutdown_debug(osb);
+ ocfs2_dlm_shutdown_debug(osb);
+ }
}
static int ocfs2_drop_lock(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 01b7407a8893..9c67edd215d5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1146,7 +1146,7 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (status)
return status;
- if (is_quota_modification(inode, attr)) {
+ if (is_quota_modification(mnt_userns, inode, attr)) {
status = dquot_initialize(inode);
if (status)
return status;
@@ -2526,7 +2526,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
return -EOPNOTSUPP;
/*
- * buffered reads protect themselves in ->readpage(). O_DIRECT reads
+ * buffered reads protect themselves in ->read_folio(). O_DIRECT reads
* need locks to protect pending reads from racing with truncate.
*/
if (direct_io) {
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 9099d8fc7599..22da768e65b7 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -2,12 +2,13 @@
/*
* heartbeat.c
*
- * Register ourselves with the heartbaet service, keep our node maps
+ * Register ourselves with the heartbeat service, keep our node maps
* up to date, and fire off recovery when needed.
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*/
+#include <linux/bitmap.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
@@ -24,18 +25,12 @@
#include "buffer_head_io.h"
-static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
- int bit);
-static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
- int bit);
-
/* special case -1 for now
* TODO: should *really* make sure the calling func never passes -1!! */
static void ocfs2_node_map_init(struct ocfs2_node_map *map)
{
map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
- memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
- sizeof(unsigned long));
+ bitmap_zero(map->map, OCFS2_NODE_MAP_MAX_NODES);
}
void ocfs2_init_node_maps(struct ocfs2_super *osb)
@@ -65,12 +60,6 @@ void ocfs2_do_node_down(int node_num, void *data)
ocfs2_recovery_thread(osb, node_num);
}
-static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
- int bit)
-{
- set_bit(bit, map->map);
-}
-
void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit)
@@ -79,16 +68,10 @@ void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
return;
BUG_ON(bit >= map->num_nodes);
spin_lock(&osb->node_map_lock);
- __ocfs2_node_map_set_bit(map, bit);
+ set_bit(bit, map->map);
spin_unlock(&osb->node_map_lock);
}
-static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
- int bit)
-{
- clear_bit(bit, map->map);
-}
-
void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
struct ocfs2_node_map *map,
int bit)
@@ -97,7 +80,7 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
return;
BUG_ON(bit >= map->num_nodes);
spin_lock(&osb->node_map_lock);
- __ocfs2_node_map_clear_bit(map, bit);
+ clear_bit(bit, map->map);
spin_unlock(&osb->node_map_lock);
}
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 5739dc301569..bb116c39b581 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -125,6 +125,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
+ journal_t *journal = osb->journal->j_journal;
trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
sysfile_type);
@@ -171,11 +172,10 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
* part of the transaction - the inode could have been reclaimed and
* now it is reread from disk.
*/
- if (osb->journal) {
+ if (journal) {
transaction_t *transaction;
tid_t tid;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- journal_t *journal = osb->journal->j_journal;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction)
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index f59461d85da4..afd54ec66103 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -903,20 +903,19 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case FITRIM:
{
struct super_block *sb = inode->i_sb;
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(sb->s_bdev))
return -EOPNOTSUPP;
if (copy_from_user(&range, argp, sizeof(range)))
return -EFAULT;
- range.minlen = max_t(u64, q->limits.discard_granularity,
+ range.minlen = max_t(u64, bdev_discard_granularity(sb->s_bdev),
range.minlen);
ret = ocfs2_trim_fs(sb, &range);
if (ret < 0)
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 1887a2708709..126671e6caed 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -810,22 +810,20 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
write_unlock(&journal->j_state_lock);
}
-int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
+/*
+ * Allocate and initialize a skeleton for the journal structure;
+ * ocfs2_journal_init() will then give the fs its journalling ability.
+ */
+int ocfs2_journal_alloc(struct ocfs2_super *osb)
{
- int status = -1;
- struct inode *inode = NULL; /* the journal inode */
- journal_t *j_journal = NULL;
- struct ocfs2_journal *journal = NULL;
- struct ocfs2_dinode *di = NULL;
- struct buffer_head *bh = NULL;
- int inode_lock = 0;
+ int status = 0;
+ struct ocfs2_journal *journal;
- /* initialize our journal structure */
journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
if (!journal) {
mlog(ML_ERROR, "unable to alloc journal\n");
status = -ENOMEM;
- goto done;
+ goto bail;
}
osb->journal = journal;
journal->j_osb = osb;
@@ -839,6 +837,21 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
journal->j_state = OCFS2_JOURNAL_FREE;
+bail:
+ return status;
+}
+
+int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
+{
+ int status = -1;
+ struct inode *inode = NULL; /* the journal inode */
+ journal_t *j_journal = NULL;
+ struct ocfs2_journal *journal = osb->journal;
+ struct ocfs2_dinode *di = NULL;
+ struct buffer_head *bh = NULL;
+ int inode_lock = 0;
+
+ BUG_ON(!journal);
/* already have the inode for our journal */
inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
osb->slot_num);
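Journal setup is split in two here: ocfs2_journal_alloc() only allocates and fills in the in-memory skeleton, while ocfs2_journal_init() keeps the on-disk work (journal inode, jbd2 handoff). Per the FIXME added to ocfs2_initialize_super() later in this diff, the point is that osb->journal must exist early enough for code such as ocfs2_iget() to dereference it safely. The two-phase shape, sketched abstractly:

#include <linux/errno.h>
#include <linux/slab.h>

struct my_journal { int state; };
struct my_super  { struct my_journal *journal; };

static int my_journal_alloc(struct my_super *sb)
{
        /* phase 1: cheap, no I/O, safe to call very early */
        sb->journal = kzalloc(sizeof(*sb->journal), GFP_KERNEL);
        return sb->journal ? 0 : -ENOMEM;
}

static int my_journal_init(struct my_super *sb)
{
        /* phase 2: heavier work against an already-valid pointer */
        sb->journal->state = 1;
        return 0;
}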
@@ -2044,7 +2057,7 @@ struct ocfs2_orphan_filldir_priv {
enum ocfs2_orphan_reco_type orphan_reco_type;
};
-static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
+static bool ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
int name_len, loff_t pos, u64 ino,
unsigned type)
{
@@ -2053,21 +2066,21 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
struct inode *iter;
if (name_len == 1 && !strncmp(".", name, 1))
- return 0;
+ return true;
if (name_len == 2 && !strncmp("..", name, 2))
- return 0;
+ return true;
/* do not include dio entry in case of orphan scan */
if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) &&
(!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
OCFS2_DIO_ORPHAN_PREFIX_LEN)))
- return 0;
+ return true;
/* Skip bad inodes so that recovery can continue */
iter = ocfs2_iget(p->osb, ino,
OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
if (IS_ERR(iter))
- return 0;
+ return true;
if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
OCFS2_DIO_ORPHAN_PREFIX_LEN))
@@ -2077,7 +2090,7 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
* happen concurrently with unlink/rename */
if (OCFS2_I(iter)->ip_next_orphan) {
iput(iter);
- return 0;
+ return true;
}
trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
@@ -2086,7 +2099,7 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
OCFS2_I(iter)->ip_next_orphan = p->head;
p->head = iter;
- return 0;
+ return true;
}
static int ocfs2_queue_orphans(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 8dcb2f2cadbc..969d0aa28718 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -154,6 +154,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
* Journal Control:
* Initialize, Load, Shutdown, Wipe a journal.
*
+ * ocfs2_journal_alloc - Initialize skeleton for journal structure.
* ocfs2_journal_init - Initialize journal structures in the OSB.
* ocfs2_journal_load - Load the given journal off disk. Replay it if
* there's transactions still in there.
@@ -167,6 +168,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
* ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
*/
void ocfs2_set_journal_params(struct ocfs2_super *osb);
+int ocfs2_journal_alloc(struct ocfs2_super *osb);
int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty);
void ocfs2_journal_shutdown(struct ocfs2_super *osb);
int ocfs2_journal_wipe(struct ocfs2_journal *journal,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index c75fd54b9185..961d1cf54388 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -197,6 +197,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
* callers. */
if (S_ISDIR(mode))
set_nlink(inode, 2);
+ mode = mode_strip_sgid(&init_user_ns, dir, mode);
inode_init_owner(&init_user_ns, inode, dir, mode);
status = dquot_initialize(inode);
if (status)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 337527571461..740b64238312 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -277,7 +277,6 @@ enum ocfs2_mount_options
OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
- OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -673,8 +672,7 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
static inline int ocfs2_mount_local(struct ocfs2_super *osb)
{
- return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)
- || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER));
+ return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
}
static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 0b6f551a342a..dc9f76ab7e13 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -412,7 +412,7 @@ out_unlock:
goto out_err;
}
-/* Write information to global quota file. Expects exlusive lock on quota
+/* Write information to global quota file. Expects exclusive lock on quota
* file inode and quota info */
static int __ocfs2_global_write_info(struct super_block *sb, int type)
{
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index b1a8b046f4c2..5022b3e9bfcd 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -921,19 +921,19 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
{
struct mem_dqinfo *info = sb_dqinfo(sb, type);
struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
- struct ocfs2_quota_chunk *chunk;
+ struct ocfs2_quota_chunk *chunk = NULL, *iter;
struct ocfs2_local_disk_chunk *dchunk;
int found = 0, len;
- list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+ list_for_each_entry(iter, &oinfo->dqi_chunk, qc_chunk) {
dchunk = (struct ocfs2_local_disk_chunk *)
- chunk->qc_headerbh->b_data;
+ iter->qc_headerbh->b_data;
if (le32_to_cpu(dchunk->dqc_free) > 0) {
- found = 1;
+ chunk = iter;
break;
}
}
- if (!found)
+ if (!chunk)
return NULL;
if (chunk->qc_num < oinfo->dqi_chunks - 1) {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7f6355cbb587..1358981e80a3 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2961,12 +2961,14 @@ retry:
}
if (!PageUptodate(page)) {
- ret = block_read_full_page(page, ocfs2_get_block);
+ struct folio *folio = page_folio(page);
+
+ ret = block_read_full_folio(folio, ocfs2_get_block);
if (ret) {
mlog_errno(ret);
goto unlock;
}
- lock_page(page);
+ folio_lock(folio);
}
if (page_has_buffers(page)) {
@@ -3144,48 +3146,18 @@ int ocfs2_cow_sync_writeback(struct super_block *sb,
struct inode *inode,
u32 cpos, u32 num_clusters)
{
- int ret = 0;
- loff_t offset, end, map_end;
- pgoff_t page_index;
- struct page *page;
+ int ret;
+ loff_t start, end;
if (ocfs2_should_order_data(inode))
return 0;
- offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
- end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
+ start = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+ end = start + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits) - 1;
- ret = filemap_fdatawrite_range(inode->i_mapping,
- offset, end - 1);
- if (ret < 0) {
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret < 0)
mlog_errno(ret);
- return ret;
- }
-
- while (offset < end) {
- page_index = offset >> PAGE_SHIFT;
- map_end = ((loff_t)page_index + 1) << PAGE_SHIFT;
- if (map_end > end)
- map_end = end;
-
- page = find_or_create_page(inode->i_mapping,
- page_index, GFP_NOFS);
- BUG_ON(!page);
-
- wait_on_page_writeback(page);
- if (PageError(page)) {
- ret = -EIO;
- mlog_errno(ret);
- } else
- mark_page_accessed(page);
-
- unlock_page(page);
- put_page(page);
- page = NULL;
- offset = map_end;
- if (ret)
- break;
- }
return ret;
}
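The rewrite above drops a hand-rolled loop that kicked off writeback with filemap_fdatawrite_range() and then visited every page to wait on it; filemap_write_and_wait_range() does both steps, including error propagation, in one call. Note the range is inclusive of its end byte, hence the "- 1". A sketch of the call for an arbitrary byte range:

#include <linux/fs.h>

static int my_sync_range(struct inode *inode, loff_t start, loff_t len)
{
        /* the last argument is the final byte, not one past the end */
        return filemap_write_and_wait_range(inode->i_mapping,
                                            start, start + len - 1);
}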
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 769e466887b0..a9d1296d736d 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -198,7 +198,7 @@ void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
resv->r_flags |= flags;
}
-int ocfs2_resmap_init(struct ocfs2_super *osb,
+void ocfs2_resmap_init(struct ocfs2_super *osb,
struct ocfs2_reservation_map *resmap)
{
memset(resmap, 0, sizeof(*resmap));
@@ -207,8 +207,6 @@ int ocfs2_resmap_init(struct ocfs2_super *osb,
resmap->m_reservations = RB_ROOT;
/* m_bitmap_len is initialized to zero by the above memset. */
INIT_LIST_HEAD(&resmap->m_lru);
-
- return 0;
}
static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
index 677c50663595..ec8101ef5717 100644
--- a/fs/ocfs2/reservations.h
+++ b/fs/ocfs2/reservations.h
@@ -73,15 +73,10 @@ void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
/**
* ocfs2_resmap_init() - Initialize fields of a reservations bitmap
+ * @osb: struct ocfs2_super to be saved in resmap
* @resmap: struct ocfs2_reservation_map to initialize
- * @obj: unused for now
- * @ops: unused for now
- * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize)
- *
- * Only possible return value other than '0' is -ENOMEM for failure to
- * allocation mirror bitmap.
*/
-int ocfs2_resmap_init(struct ocfs2_super *osb,
+void ocfs2_resmap_init(struct ocfs2_super *osb,
struct ocfs2_reservation_map *resmap);
/**
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 0b0ae3ebb0cf..da7718cef735 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -252,16 +252,14 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
int i, ret = -ENOSPC;
if ((preferred >= 0) && (preferred < si->si_num_slots)) {
- if (!si->si_slots[preferred].sl_valid ||
- !si->si_slots[preferred].sl_node_num) {
+ if (!si->si_slots[preferred].sl_valid) {
ret = preferred;
goto out;
}
}
for(i = 0; i < si->si_num_slots; i++) {
- if (!si->si_slots[i].sl_valid ||
- !si->si_slots[i].sl_node_num) {
+ if (!si->si_slots[i].sl_valid) {
ret = i;
break;
}
@@ -456,30 +454,24 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
- if (ocfs2_mount_local(osb))
- /* use slot 0 directly in local mode */
- slot = 0;
- else {
- /* search for ourselves first and take the slot if it already
- * exists. Perhaps we need to mark this in a variable for our
- * own journal recovery? Possibly not, though we certainly
- * need to warn to the user */
- slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+ /* search for ourselves first and take the slot if it already
+ * exists. Perhaps we need to mark this in a variable for our
+ * own journal recovery? Possibly not, though we certainly
+ * need to warn the user */
+ slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+ if (slot < 0) {
+ /* if no slot yet, then just take 1st available
+ * one. */
+ slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
if (slot < 0) {
- /* if no slot yet, then just take 1st available
- * one. */
- slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
- if (slot < 0) {
- spin_unlock(&osb->osb_lock);
- mlog(ML_ERROR, "no free slots available!\n");
- status = -EINVAL;
- goto bail;
- }
- } else
- printk(KERN_INFO "ocfs2: Slot %d on device (%s) was "
- "already allocated to this node!\n",
- slot, osb->dev_str);
- }
+ spin_unlock(&osb->osb_lock);
+ mlog(ML_ERROR, "no free slots available!\n");
+ status = -EINVAL;
+ goto bail;
+ }
+ } else
+ printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
+ "allocated to this node!\n", slot, osb->dev_str);
ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index a75e2b7d67f5..64e6ddcfe329 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -991,7 +991,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
lc->oc_type = NO_CONTROLD;
rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
- DLM_LSFL_FS | DLM_LSFL_NEWEXCL, DLM_LVB_LEN,
+ DLM_LSFL_NEWEXCL, DLM_LVB_LEN,
&ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
if (rc) {
if (rc == -EEXIST || rc == -EPROTO)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 477cdf94122e..e2cc9eec287c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -172,7 +172,6 @@ enum {
Opt_dir_resv_level,
Opt_journal_async_commit,
Opt_err_cont,
- Opt_nocluster,
Opt_err,
};
@@ -206,7 +205,6 @@ static const match_table_t tokens = {
{Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_journal_async_commit, "journal_async_commit"},
{Opt_err_cont, "errors=continue"},
- {Opt_nocluster, "nocluster"},
{Opt_err, NULL}
};
@@ -618,13 +616,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
goto out;
}
- tmp = OCFS2_MOUNT_NOCLUSTER;
- if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
- ret = -EINVAL;
- mlog(ML_ERROR, "Cannot change nocluster option on remount\n");
- goto out;
- }
-
tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
OCFS2_MOUNT_HB_NONE;
if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
@@ -865,7 +856,6 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
}
if (ocfs2_userspace_stack(osb) &&
- !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) &&
strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
OCFS2_STACK_LABEL_LEN)) {
mlog(ML_ERROR,
@@ -989,28 +979,27 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
status = -EINVAL;
- goto read_super_error;
+ goto out;
}
/* probe for superblock */
status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats);
if (status < 0) {
mlog(ML_ERROR, "superblock probe failed!\n");
- goto read_super_error;
+ goto out;
}
status = ocfs2_initialize_super(sb, bh, sector_size, &stats);
- osb = OCFS2_SB(sb);
- if (status < 0) {
- mlog_errno(status);
- goto read_super_error;
- }
brelse(bh);
bh = NULL;
+ if (status < 0)
+ goto out;
+
+ osb = OCFS2_SB(sb);
if (!ocfs2_check_set_options(sb, &parsed_options)) {
status = -EINVAL;
- goto read_super_error;
+ goto out_super;
}
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
@@ -1027,7 +1016,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = ocfs2_verify_userspace_stack(osb, &parsed_options);
if (status)
- goto read_super_error;
+ goto out_super;
sb->s_magic = OCFS2_SUPER_MAGIC;
@@ -1041,7 +1030,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = -EACCES;
mlog(ML_ERROR, "Readonly device detected but readonly "
"mount was not specified.\n");
- goto read_super_error;
+ goto out_super;
}
/* You should not be able to start a local heartbeat
@@ -1050,7 +1039,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = -EROFS;
mlog(ML_ERROR, "Local heartbeat specified on readonly "
"device.\n");
- goto read_super_error;
+ goto out_super;
}
status = ocfs2_check_journals_nolocks(osb);
@@ -1059,9 +1048,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
mlog(ML_ERROR, "Recovery required on readonly "
"file system, but write access is "
"unavailable.\n");
- else
- mlog_errno(status);
- goto read_super_error;
+ goto out_super;
}
ocfs2_set_ro_flag(osb, 1);
@@ -1077,10 +1064,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
}
status = ocfs2_verify_heartbeat(osb);
- if (status < 0) {
- mlog_errno(status);
- goto read_super_error;
- }
+ if (status < 0)
+ goto out_super;
osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
ocfs2_debugfs_root);
@@ -1094,15 +1079,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = ocfs2_mount_volume(sb);
if (status < 0)
- goto read_super_error;
+ goto out_debugfs;
if (osb->root_inode)
inode = igrab(osb->root_inode);
if (!inode) {
status = -EIO;
- mlog_errno(status);
- goto read_super_error;
+ goto out_dismount;
}
osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL,
@@ -1110,7 +1094,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
if (!osb->osb_dev_kset) {
status = -ENOMEM;
mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id);
- goto read_super_error;
+ goto out_dismount;
}
/* Create filecheck sysfs related directories/files at
@@ -1119,14 +1103,13 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = -ENOMEM;
mlog(ML_ERROR, "Unable to create filecheck sysfs directory at "
"/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id);
- goto read_super_error;
+ goto out_dismount;
}
root = d_make_root(inode);
if (!root) {
status = -ENOMEM;
- mlog_errno(status);
- goto read_super_error;
+ goto out_dismount;
}
sb->s_root = root;
@@ -1144,11 +1127,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
"ordered");
- if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) &&
- !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT))
- printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted "
- "without cluster aware mode.\n", osb->dev_str);
-
atomic_set(&osb->vol_state, VOLUME_MOUNTED);
wake_up(&osb->osb_mount_event);
@@ -1178,17 +1156,21 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
return status;
-read_super_error:
- brelse(bh);
-
- if (status)
- mlog_errno(status);
+out_dismount:
+ atomic_set(&osb->vol_state, VOLUME_DISABLED);
+ wake_up(&osb->osb_mount_event);
+ ocfs2_dismount_volume(sb, 1);
+ goto out;
- if (osb) {
- atomic_set(&osb->vol_state, VOLUME_DISABLED);
- wake_up(&osb->osb_mount_event);
- ocfs2_dismount_volume(sb, 1);
- }
+out_debugfs:
+ debugfs_remove_recursive(osb->osb_debug_root);
+out_super:
+ ocfs2_release_system_inodes(osb);
+ kfree(osb->recovery_map);
+ ocfs2_delete_osb(osb);
+ kfree(osb);
+out:
+ mlog_errno(status);
return status;
}
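This fill_super rework swaps a single catch-all read_super_error label — which had to guess how far setup had progressed (note the old "if (osb)" test) — for the conventional goto ladder: each failure jumps to a label that unwinds exactly what has been established so far, with the labels stacked in reverse order of acquisition. The idiom in the abstract, with hypothetical step/undo helpers:

extern int step_a(void), step_b(void), step_c(void);
extern void undo_a(void), undo_b(void);

static int my_setup(void)
{
        int err;

        err = step_a();
        if (err)
                goto out;
        err = step_b();
        if (err)
                goto out_a;
        err = step_c();
        if (err)
                goto out_b;
        return 0;

out_b:
        undo_b();
out_a:
        undo_a();
out:
        return err;
}

ocfs2_mount_volume() and ocfs2_initialize_super() below get the same treatment.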
@@ -1455,9 +1437,6 @@ static int ocfs2_parse_options(struct super_block *sb,
case Opt_journal_async_commit:
mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT;
break;
- case Opt_nocluster:
- mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER;
- break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -1569,9 +1548,6 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
seq_printf(s, ",journal_async_commit");
- if (opts & OCFS2_MOUNT_NOCLUSTER)
- seq_printf(s, ",nocluster");
-
return 0;
}
@@ -1788,7 +1764,7 @@ static int ocfs2_get_sector(struct super_block *sb,
if (!buffer_dirty(*bh))
clear_buffer_uptodate(*bh);
unlock_buffer(*bh);
- ll_rw_block(REQ_OP_READ, 0, 1, bh);
+ ll_rw_block(REQ_OP_READ, 1, bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
mlog_errno(-EIO);
@@ -1803,11 +1779,10 @@ static int ocfs2_get_sector(struct super_block *sb,
static int ocfs2_mount_volume(struct super_block *sb)
{
int status = 0;
- int unlock_super = 0;
struct ocfs2_super *osb = OCFS2_SB(sb);
if (ocfs2_is_hard_readonly(osb))
- goto leave;
+ goto out;
mutex_init(&osb->obs_trim_fs_mutex);
@@ -1817,44 +1792,56 @@ static int ocfs2_mount_volume(struct super_block *sb)
if (status == -EBADR && ocfs2_userspace_stack(osb))
mlog(ML_ERROR, "couldn't mount because cluster name on"
" disk does not match the running cluster name.\n");
- goto leave;
+ goto out;
}
status = ocfs2_super_lock(osb, 1);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_dlm;
}
- unlock_super = 1;
/* This will load up the node map and add ourselves to it. */
status = ocfs2_find_slot(osb);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_super_lock;
}
/* load all node-local system inodes */
status = ocfs2_init_local_system_inodes(osb);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_super_lock;
}
status = ocfs2_check_volume(osb);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_system_inodes;
}
status = ocfs2_truncate_log_init(osb);
- if (status < 0)
+ if (status < 0) {
mlog_errno(status);
+ goto out_system_inodes;
+ }
-leave:
- if (unlock_super)
- ocfs2_super_unlock(osb, 1);
+ ocfs2_super_unlock(osb, 1);
+ return 0;
+out_system_inodes:
+ if (osb->local_alloc_state == OCFS2_LA_ENABLED)
+ ocfs2_shutdown_local_alloc(osb);
+ ocfs2_release_system_inodes(osb);
+ /* before journal shutdown, we should release slot_info */
+ ocfs2_free_slot_info(osb);
+ ocfs2_journal_shutdown(osb);
+out_super_lock:
+ ocfs2_super_unlock(osb, 1);
+out_dlm:
+ ocfs2_dlm_shutdown(osb, 0);
+out:
return status;
}
@@ -1927,8 +1914,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
!ocfs2_is_hard_readonly(osb))
hangup_needed = 1;
- if (osb->cconn)
- ocfs2_dlm_shutdown(osb, hangup_needed);
+ ocfs2_dlm_shutdown(osb, hangup_needed);
ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
debugfs_remove_recursive(osb->osb_debug_root);
@@ -2022,7 +2008,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!osb) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out;
}
sb->s_fs_info = osb;
@@ -2083,7 +2069,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
osb->max_slots);
status = -EINVAL;
- goto bail;
+ goto out;
}
ocfs2_orphan_scan_init(osb);
@@ -2092,7 +2078,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (status) {
mlog(ML_ERROR, "Unable to initialize recovery state\n");
mlog_errno(status);
- goto bail;
+ goto out;
}
init_waitqueue_head(&osb->checkpoint_event);
@@ -2110,17 +2096,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
init_waitqueue_head(&osb->osb_mount_event);
- status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_resmap_init(osb, &osb->osb_la_resmap);
osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
if (!osb->vol_label) {
mlog(ML_ERROR, "unable to alloc vol label\n");
status = -ENOMEM;
- goto bail;
+ goto out_recovery_map;
}
osb->slot_recovery_generations =
@@ -2129,7 +2111,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!osb->slot_recovery_generations) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_vol_label;
}
init_waitqueue_head(&osb->osb_wipe_event);
@@ -2139,7 +2121,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!osb->osb_orphan_wipes) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_slot_recovery_gen;
}
osb->osb_rf_lock_tree = RB_ROOT;
@@ -2155,13 +2137,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog(ML_ERROR, "couldn't mount because of unsupported "
"optional features (%x).\n", i);
status = -EINVAL;
- goto bail;
+ goto out_orphan_wipes;
}
if (!sb_rdonly(osb->sb) && (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
mlog(ML_ERROR, "couldn't mount RDWR because of "
"unsupported optional features (%x).\n", i);
status = -EINVAL;
- goto bail;
+ goto out_orphan_wipes;
}
if (ocfs2_clusterinfo_valid(osb)) {
@@ -2182,7 +2164,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
"cluster stack label (%s) \n",
osb->osb_cluster_stack);
status = -EINVAL;
- goto bail;
+ goto out_orphan_wipes;
}
memcpy(osb->osb_cluster_name,
OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
@@ -2195,6 +2177,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
get_random_bytes(&osb->s_next_generation, sizeof(u32));
+ /*
+ * FIXME
+ * This should be done in ocfs2_journal_init(), but any inode
+ * writes back operation will cause the filesystem to crash.
+ */
+ status = ocfs2_journal_alloc(osb);
+ if (status < 0)
+ goto out_orphan_wipes;
+
INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
init_llist_head(&osb->dquot_drop_list);
@@ -2208,7 +2199,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n",
osb->s_clustersize);
status = -EINVAL;
- goto bail;
+ goto out_journal;
}
total_blocks = ocfs2_clusters_to_blocks(osb->sb,
@@ -2220,14 +2211,14 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog(ML_ERROR, "Volume too large "
"to mount safely on this system");
status = -EFBIG;
- goto bail;
+ goto out_journal;
}
if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid,
sizeof(di->id2.i_super.s_uuid))) {
mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n");
status = -ENOMEM;
- goto bail;
+ goto out_journal;
}
strlcpy(osb->vol_label, di->id2.i_super.s_label,
@@ -2247,7 +2238,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!osb->osb_dlm_debug) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ goto out_uuid_str;
}
atomic_set(&osb->vol_state, VOLUME_INIT);
@@ -2256,7 +2247,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
status = ocfs2_init_global_system_inodes(osb);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto out_dlm_out;
}
/*
@@ -2267,7 +2258,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (!inode) {
status = -EINVAL;
mlog_errno(status);
- goto bail;
+ goto out_system_inodes;
}
osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
@@ -2280,16 +2271,39 @@ static int ocfs2_initialize_super(struct super_block *sb,
status = ocfs2_init_slot_info(osb);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto out_system_inodes;
}
osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM);
if (!osb->ocfs2_wq) {
status = -ENOMEM;
mlog_errno(status);
+ goto out_slot_info;
}
-bail:
+ return status;
+
+out_slot_info:
+ ocfs2_free_slot_info(osb);
+out_system_inodes:
+ ocfs2_release_system_inodes(osb);
+out_dlm_out:
+ ocfs2_put_dlm_debug(osb->osb_dlm_debug);
+out_uuid_str:
+ kfree(osb->uuid_str);
+out_journal:
+ kfree(osb->journal);
+out_orphan_wipes:
+ kfree(osb->osb_orphan_wipes);
+out_slot_recovery_gen:
+ kfree(osb->slot_recovery_generations);
+out_vol_label:
+ kfree(osb->vol_label);
+out_recovery_map:
+ kfree(osb->recovery_map);
+out:
+ kfree(osb);
+ sb->s_fs_info = NULL;
return status;
}
@@ -2483,6 +2497,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
kfree(osb->osb_orphan_wipes);
kfree(osb->slot_recovery_generations);
+ /* FIXME
+ * This belongs in journal shutdown, but because we have to
+ * allocate osb->journal at the middle of ocfs2_initialize_super(),
+ * we free it here.
+ */
+ kfree(osb->journal);
kfree(osb->local_alloc_copy);
kfree(osb->uuid_str);
kfree(osb->vol_label);
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index f755a4985821..d4c5fdcfa1e4 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -52,8 +52,9 @@
#include "buffer_head_io.h"
-static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
+static int ocfs2_fast_symlink_read_folio(struct file *f, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct buffer_head *bh = NULL;
int status = ocfs2_read_inode_block(inode, &bh);
@@ -81,7 +82,7 @@ static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
}
const struct address_space_operations ocfs2_fast_symlink_aops = {
- .readpage = ocfs2_fast_symlink_readpage,
+ .read_folio = ocfs2_fast_symlink_read_folio,
};
const struct inode_operations ocfs2_symlink_inode_operations = {
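Here and in the omfs and orangefs hunks that follow, ->readpage becomes ->read_folio. Filesystems that do not yet handle large folios convert mechanically: operate on folio->page, or lean on helpers such as block_read_full_folio() and the folio accessors (folio_pos(), folio_size(), folio_unlock()). A minimal converted hook for a block-based filesystem, assuming a hypothetical my_get_block helper:

#include <linux/buffer_head.h>
#include <linux/fs.h>

static int my_get_block(struct inode *inode, sector_t block,
                        struct buffer_head *bh, int create); /* hypothetical */

static int my_read_folio(struct file *file, struct folio *folio)
{
        return block_read_full_folio(folio, my_get_block);
}

static const struct address_space_operations my_aops = {
        .read_folio     = my_read_folio,
};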
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 3f297b541713..fa7fe2393ff6 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -284,9 +284,9 @@ out:
return ret;
}
-static int omfs_readpage(struct file *file, struct page *page)
+static int omfs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page, omfs_get_block);
+ return block_read_full_folio(folio, omfs_get_block);
}
static void omfs_readahead(struct readahead_control *rac)
@@ -316,13 +316,12 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to)
}
static int omfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep,
- omfs_get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, omfs_get_block);
if (unlikely(ret))
omfs_write_failed(mapping, pos + len);
@@ -374,7 +373,7 @@ const struct inode_operations omfs_file_inops = {
const struct address_space_operations omfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = omfs_readpage,
+ .read_folio = omfs_read_folio,
.readahead = omfs_readahead,
.writepage = omfs_writepage,
.writepages = omfs_writepages,
diff --git a/fs/open.c b/fs/open.c
index 1315253e0247..a81319b6177f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -224,6 +224,21 @@ SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
}
#endif /* BITS_PER_LONG == 32 */
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64)
+COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname,
+ compat_arg_u64_dual(length))
+{
+ return ksys_truncate(pathname, compat_arg_u64_glue(length));
+}
+#endif
+
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64)
+COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd,
+ compat_arg_u64_dual(length))
+{
+ return ksys_ftruncate(fd, compat_arg_u64_glue(length));
+}
+#endif
int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
@@ -339,6 +354,15 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
return ksys_fallocate(fd, mode, offset, len);
}
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE)
+COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset),
+ compat_arg_u64_dual(len))
+{
+ return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset),
+ compat_arg_u64_glue(len));
+}
+#endif
+
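These compat entry points exist because a 32-bit ABI passes a 64-bit syscall argument in two registers, in an order (and with register padding) that varies by architecture; compat_arg_u64_dual() declares the two 32-bit halves and compat_arg_u64_glue() reassembles them. The reassembly amounts to, roughly (the half order shown is illustrative only — the real macros in include/linux/compat.h handle the per-arch details):

#include <linux/types.h>

static inline u64 my_glue_u64(u32 lo, u32 hi)
{
        return ((u64)hi << 32) | lo;
}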
/*
* access() needs to use the real uid/gid, not the effective uid/gid.
* We do this by temporarily clearing all FS-related capabilities and
@@ -639,6 +663,42 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
return do_fchmodat(AT_FDCWD, filename, mode);
}
+/**
+ * setattr_vfsuid - check and set ia_vfsuid attribute
+ * @kuid: new inode owner
+ *
+ * Check whether @kuid is valid and if so generate and set vfsuid_t in
+ * ia_vfsuid.
+ *
+ * Return: true if @kuid is valid, false if not.
+ */
+static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid)
+{
+ if (!uid_valid(kuid))
+ return false;
+ attr->ia_valid |= ATTR_UID;
+ attr->ia_vfsuid = VFSUIDT_INIT(kuid);
+ return true;
+}
+
+/**
+ * setattr_vfsgid - check and set ia_vfsgid attribute
+ * @kgid: new inode owner
+ *
+ * Check whether @kgid is valid and if so generate and set vfsgid_t in
+ * ia_vfsgid.
+ *
+ * Return: true if @kgid is valid, false if not.
+ */
+static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid)
+{
+ if (!gid_valid(kgid))
+ return false;
+ attr->ia_valid |= ATTR_GID;
+ attr->ia_vfsgid = VFSGIDT_INIT(kgid);
+ return true;
+}
+
int chown_common(const struct path *path, uid_t user, gid_t group)
{
struct user_namespace *mnt_userns, *fs_userns;
@@ -654,28 +714,24 @@ int chown_common(const struct path *path, uid_t user, gid_t group)
mnt_userns = mnt_user_ns(path->mnt);
fs_userns = i_user_ns(inode);
- uid = mapped_kuid_user(mnt_userns, fs_userns, uid);
- gid = mapped_kgid_user(mnt_userns, fs_userns, gid);
retry_deleg:
+ newattrs.ia_vfsuid = INVALID_VFSUID;
+ newattrs.ia_vfsgid = INVALID_VFSGID;
newattrs.ia_valid = ATTR_CTIME;
- if (user != (uid_t) -1) {
- if (!uid_valid(uid))
- return -EINVAL;
- newattrs.ia_valid |= ATTR_UID;
- newattrs.ia_uid = uid;
- }
- if (group != (gid_t) -1) {
- if (!gid_valid(gid))
- return -EINVAL;
- newattrs.ia_valid |= ATTR_GID;
- newattrs.ia_gid = gid;
- }
+ if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid))
+ return -EINVAL;
+ if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
+ return -EINVAL;
if (!S_ISDIR(inode->i_mode))
newattrs.ia_valid |=
ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
inode_lock(inode);
- error = security_path_chown(path, uid, gid);
+ /* Continue to send actual fs values, not the mount values. */
+ error = security_path_chown(
+ path,
+ from_vfsuid(mnt_userns, fs_userns, newattrs.ia_vfsuid),
+ from_vfsgid(mnt_userns, fs_userns, newattrs.ia_vfsgid));
if (!error)
error = notify_change(mnt_userns, path->dentry, &newattrs,
&delegated_inode);
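chown_common() no longer maps the caller's ids into ia_uid/ia_gid up front; it records them as vfsuid_t/vfsgid_t in the new ia_vfsuid/ia_vfsgid fields and leaves the translation through the mount idmapping (mnt_userns) and filesystem idmapping (fs_userns) to each consumer — as the security_path_chown() call now shows with from_vfsuid()/from_vfsgid(). A sketch of the producer side, written against the post-series struct iattr:

#include <linux/fs.h>
#include <linux/mnt_idmapping.h>

/* record the raw id; mapping is deferred to the point of use */
static void my_prepare_chown_uid(struct iattr *attr, kuid_t uid)
{
        attr->ia_valid |= ATTR_UID;
        attr->ia_vfsuid = VFSUIDT_INIT(uid);
}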
@@ -786,7 +842,9 @@ static int do_dentry_open(struct file *f,
return 0;
}
- if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
+ if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
+ i_readcount_inc(inode);
+ } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
error = get_write_access(inode);
if (unlikely(error))
goto cleanup_file;
@@ -826,24 +884,24 @@ static int do_dentry_open(struct file *f,
goto cleanup_all;
}
f->f_mode |= FMODE_OPENED;
- if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
- i_readcount_inc(inode);
if ((f->f_mode & FMODE_READ) &&
likely(f->f_op->read || f->f_op->read_iter))
f->f_mode |= FMODE_CAN_READ;
if ((f->f_mode & FMODE_WRITE) &&
likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
+ if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek)
+ f->f_mode &= ~FMODE_LSEEK;
+ if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
+ f->f_mode |= FMODE_CAN_ODIRECT;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+ f->f_iocb_flags = iocb_flags(f);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
- /* NB: we're sure to have correct a_ops only after f_op->open */
- if (f->f_flags & O_DIRECT) {
- if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
- return -EINVAL;
- }
+ if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
/*
* XXX: Huge page cache doesn't support writing yet. Drop all page
@@ -879,10 +937,7 @@ cleanup_all:
if (WARN_ON_ONCE(error > 0))
error = -EINVAL;
fops_put(f->f_op);
- if (f->f_mode & FMODE_WRITER) {
- put_write_access(inode);
- __mnt_drop_write(f->f_path.mnt);
- }
+ put_file_access(f);
cleanup_file:
path_put(&f->f_path);
f->f_path.mnt = NULL;
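Two capability bits are now computed once at open time: FMODE_LSEEK is cleared when the file operations lack ->llseek (which is why the overlayfs hunk below can drop its explicit ->llseek test), and FMODE_CAN_ODIRECT caches whether the mapping supports direct I/O, replacing the open-coded a_ops->direct_IO probe behind O_DIRECT. Later checks then reduce to a flag test — for example, a hypothetical F_SETFL-style re-validation:

#include <linux/fs.h>

static int my_check_o_direct(struct file *f, unsigned int new_flags)
{
        /* no a_ops dereference needed; the capability was cached at open */
        if ((new_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
                return -EINVAL;
        return 0;
}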
@@ -981,6 +1036,48 @@ struct file *dentry_open(const struct path *path, int flags,
}
EXPORT_SYMBOL(dentry_open);
+/**
+ * dentry_create - Create and open a file
+ * @path: path to create
+ * @flags: O_ flags
+ * @mode: mode bits for new file
+ * @cred: credentials to use
+ *
+ * Caller must hold the parent directory's lock, and have prepared
+ * a negative dentry, placed in @path->dentry, for the new file.
+ *
+ * Caller sets @path->mnt to the vfsmount of the filesystem where
+ * the new file is to be created. The parent directory and the
+ * negative dentry must reside on the same filesystem instance.
+ *
+ * On success, returns a "struct file *". Otherwise an ERR_PTR
+ * is returned.
+ */
+struct file *dentry_create(const struct path *path, int flags, umode_t mode,
+ const struct cred *cred)
+{
+ struct file *f;
+ int error;
+
+ validate_creds(cred);
+ f = alloc_empty_file(flags, cred);
+ if (IS_ERR(f))
+ return f;
+
+ error = vfs_create(mnt_user_ns(path->mnt),
+ d_inode(path->dentry->d_parent),
+ path->dentry, mode, true);
+ if (!error)
+ error = vfs_open(path, f);
+
+ if (unlikely(error)) {
+ fput(f);
+ return ERR_PTR(error);
+ }
+ return f;
+}
+EXPORT_SYMBOL(dentry_create);
+
struct file *open_with_fake_path(const struct path *path, int flags,
struct inode *inode, const struct cred *cred)
{
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 86810e5d7914..732661aa2680 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -417,9 +417,7 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
* readahead cache (if any); this forces an expensive refresh of
* data for the next caller of mmap (or 'get_block' accesses)
*/
- if (file_inode(file) &&
- file_inode(file)->i_mapping &&
- mapping_nrpages(&file_inode(file)->i_data)) {
+ if (mapping_nrpages(file->f_mapping)) {
if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) {
gossip_debug(GOSSIP_INODE_DEBUG,
"calling flush_racache on %pU\n",
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 79c1025d18ea..7a8c0c6e698d 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -288,47 +288,43 @@ static void orangefs_readahead(struct readahead_control *rac)
}
}
-static int orangefs_readpage(struct file *file, struct page *page)
+static int orangefs_read_folio(struct file *file, struct folio *folio)
{
- struct folio *folio = page_folio(page);
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct iov_iter iter;
struct bio_vec bv;
ssize_t ret;
- loff_t off; /* offset into this page */
+ loff_t off; /* offset of this folio in the file */
if (folio_test_dirty(folio))
orangefs_launder_folio(folio);
- off = page_offset(page);
- bv.bv_page = page;
- bv.bv_len = PAGE_SIZE;
+ off = folio_pos(folio);
+ bv.bv_page = &folio->page;
+ bv.bv_len = folio_size(folio);
bv.bv_offset = 0;
- iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);
+ iov_iter_bvec(&iter, READ, &bv, 1, folio_size(folio));
ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
- PAGE_SIZE, inode->i_size, NULL, NULL, file);
- /* this will only zero remaining unread portions of the page data */
+ folio_size(folio), inode->i_size, NULL, NULL, file);
+ /* this will only zero remaining unread portions of the folio data */
iov_iter_zero(~0U, &iter);
/* takes care of potential aliasing */
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
if (ret < 0) {
- SetPageError(page);
+ folio_set_error(folio);
} else {
- SetPageUptodate(page);
- if (PageError(page))
- ClearPageError(page);
+ folio_mark_uptodate(folio);
ret = 0;
}
- /* unlock the page after the ->readpage() routine completes */
- unlock_page(page);
+ /* unlock the folio after the ->read_folio() routine completes */
+ folio_unlock(folio);
return ret;
}
static int orangefs_write_begin(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags, struct page **pagep,
- void **fsdata)
+ struct address_space *mapping, loff_t pos, unsigned len,
+ struct page **pagep, void **fsdata)
{
struct orangefs_write_range *wr;
struct folio *folio;
@@ -338,7 +334,7 @@ static int orangefs_write_begin(struct file *file,
index = pos >> PAGE_SHIFT;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
@@ -487,14 +483,14 @@ static void orangefs_invalidate_folio(struct folio *folio,
orangefs_launder_folio(folio);
}
-static int orangefs_releasepage(struct page *page, gfp_t foo)
+static bool orangefs_release_folio(struct folio *folio, gfp_t foo)
{
- return !PagePrivate(page);
+ return !folio_test_private(folio);
}
-static void orangefs_freepage(struct page *page)
+static void orangefs_free_folio(struct folio *folio)
{
- kfree(detach_page_private(page));
+ kfree(folio_detach_private(folio));
}
static int orangefs_launder_folio(struct folio *folio)
@@ -632,14 +628,14 @@ out:
static const struct address_space_operations orangefs_address_operations = {
.writepage = orangefs_writepage,
.readahead = orangefs_readahead,
- .readpage = orangefs_readpage,
+ .read_folio = orangefs_read_folio,
.writepages = orangefs_writepages,
.dirty_folio = filemap_dirty_folio,
.write_begin = orangefs_write_begin,
.write_end = orangefs_write_end,
.invalidate_folio = orangefs_invalidate_folio,
- .releasepage = orangefs_releasepage,
- .freepage = orangefs_freepage,
+ .release_folio = orangefs_release_folio,
+ .free_folio = orangefs_free_folio,
.launder_folio = orangefs_launder_folio,
.direct_IO = orangefs_direct_IO,
};
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index e040970408d4..3ffea291c410 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -44,9 +44,9 @@ static bool ovl_must_copy_xattr(const char *name)
!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
}
-int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
- struct dentry *new)
+int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct dentry *new)
{
+ struct dentry *old = oldpath->dentry;
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
int error = 0;
@@ -94,9 +94,9 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
continue; /* Discard */
}
retry:
- size = vfs_getxattr(&init_user_ns, old, name, value, value_size);
+ size = ovl_do_getxattr(oldpath, name, value, value_size);
if (size == -ERANGE)
- size = vfs_getxattr(&init_user_ns, old, name, NULL, 0);
+ size = ovl_do_getxattr(oldpath, name, NULL, 0);
if (size < 0) {
error = size;
@@ -117,7 +117,7 @@ retry:
goto retry;
}
- error = vfs_setxattr(&init_user_ns, new, name, value, size, 0);
+ error = ovl_do_setxattr(OVL_FS(sb), new, name, value, size, 0);
if (error) {
if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name))
break;
@@ -132,8 +132,8 @@ out:
return error;
}
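/*
 * Editorial note: ovl_copy_xattr() above relies on the standard xattr
 * size-probe-and-retry idiom; a minimal sketch (allocation and error
 * handling trimmed, names as in the patch):
 *
 *	size = ovl_do_getxattr(oldpath, name, value, value_size);
 *	if (size == -ERANGE)	// value outgrew our buffer
 *		size = ovl_do_getxattr(oldpath, name, NULL, 0);	// probe size only
 *	// reallocate 'value' to 'size' bytes, then goto retry
 */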
-static int ovl_copy_fileattr(struct inode *inode, struct path *old,
- struct path *new)
+static int ovl_copy_fileattr(struct inode *inode, const struct path *old,
+ const struct path *new)
{
struct fileattr oldfa = { .flags_valid = true };
struct fileattr newfa = { .flags_valid = true };
@@ -193,8 +193,8 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old,
return ovl_real_fileattr_set(new, &newfa);
}
-static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old,
- struct path *new, loff_t len)
+static int ovl_copy_up_data(struct ovl_fs *ofs, const struct path *old,
+ const struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
@@ -226,8 +226,7 @@ static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old,
/* Couldn't clone, so now we try to copy the data */
/* Check if lower fs supports seek operation */
- if (old_file->f_mode & FMODE_LSEEK &&
- old_file->f_op->llseek)
+ if (old_file->f_mode & FMODE_LSEEK)
skip_hole = true;
while (len) {
@@ -292,17 +291,19 @@ out_fput:
return error;
}
-static int ovl_set_size(struct dentry *upperdentry, struct kstat *stat)
+static int ovl_set_size(struct ovl_fs *ofs,
+ struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid = ATTR_SIZE,
.ia_size = stat->size,
};
- return notify_change(&init_user_ns, upperdentry, &attr, NULL);
+ return ovl_do_notify_change(ofs, upperdentry, &attr);
}
-static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
+static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry,
+ struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
@@ -311,10 +312,11 @@ static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
.ia_mtime = stat->mtime,
};
- return notify_change(&init_user_ns, upperdentry, &attr, NULL);
+ return ovl_do_notify_change(ofs, upperdentry, &attr);
}
-int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
+int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry,
+ struct kstat *stat)
{
int err = 0;
@@ -323,18 +325,18 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
- err = notify_change(&init_user_ns, upperdentry, &attr, NULL);
+ err = ovl_do_notify_change(ofs, upperdentry, &attr);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
- .ia_uid = stat->uid,
- .ia_gid = stat->gid,
+ .ia_vfsuid = VFSUIDT_INIT(stat->uid),
+ .ia_vfsgid = VFSGIDT_INIT(stat->gid),
};
- err = notify_change(&init_user_ns, upperdentry, &attr, NULL);
+ err = ovl_do_notify_change(ofs, upperdentry, &attr);
}
if (!err)
- ovl_set_timestamps(upperdentry, stat);
+ ovl_set_timestamps(ofs, upperdentry, stat);
return err;
}
@@ -433,7 +435,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
if (IS_ERR(fh))
return PTR_ERR(fh);
- err = ovl_do_setxattr(ofs, index, OVL_XATTR_UPPER, fh->buf, fh->fb.len);
+ err = ovl_setxattr(ofs, index, OVL_XATTR_UPPER, fh->buf, fh->fb.len);
kfree(fh);
return err;
@@ -474,7 +476,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
if (err)
return err;
- temp = ovl_create_temp(indexdir, OVL_CATTR(S_IFDIR | 0));
+ temp = ovl_create_temp(ofs, indexdir, OVL_CATTR(S_IFDIR | 0));
err = PTR_ERR(temp);
if (IS_ERR(temp))
goto free_name;
@@ -483,16 +485,16 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
if (err)
goto out;
- index = lookup_one_len(name.name, indexdir, name.len);
+ index = ovl_lookup_upper(ofs, name.name, indexdir, name.len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
} else {
- err = ovl_do_rename(dir, temp, dir, index, 0);
+ err = ovl_do_rename(ofs, dir, temp, dir, index, 0);
dput(index);
}
out:
if (err)
- ovl_cleanup(dir, temp);
+ ovl_cleanup(ofs, dir, temp);
dput(temp);
free_name:
kfree(name.name);
@@ -519,6 +521,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
int err;
struct dentry *upper;
struct dentry *upperdir = ovl_dentry_upper(c->parent);
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *udir = d_inode(upperdir);
/* Mark parent "impure" because it may now contain non-pure upper */
@@ -531,16 +534,16 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
return err;
inode_lock_nested(udir, I_MUTEX_PARENT);
- upper = lookup_one_len(c->dentry->d_name.name, upperdir,
- c->dentry->d_name.len);
+ upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir,
+ c->dentry->d_name.len);
err = PTR_ERR(upper);
if (!IS_ERR(upper)) {
- err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper);
+ err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper);
dput(upper);
if (!err) {
/* Restore timestamps on parent (best effort) */
- ovl_set_timestamps(upperdir, &c->pstat);
+ ovl_set_timestamps(ofs, upperdir, &c->pstat);
ovl_dentry_set_upper_alias(c->dentry);
}
}
@@ -578,7 +581,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
return err;
}
- err = ovl_copy_xattr(c->dentry->d_sb, c->lowerpath.dentry, temp);
+ err = ovl_copy_xattr(c->dentry->d_sb, &c->lowerpath, temp);
if (err)
return err;
@@ -614,9 +617,9 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
inode_lock(temp->d_inode);
if (S_ISREG(c->stat.mode))
- err = ovl_set_size(temp, &c->stat);
+ err = ovl_set_size(ofs, temp, &c->stat);
if (!err)
- err = ovl_set_attr(temp, &c->stat);
+ err = ovl_set_attr(ofs, temp, &c->stat);
inode_unlock(temp->d_inode);
return err;
@@ -656,6 +659,7 @@ static void ovl_revert_cu_creds(struct ovl_cu_creds *cc)
*/
static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
{
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *inode;
struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
struct dentry *temp, *upper;
@@ -677,7 +681,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
if (err)
goto unlock;
- temp = ovl_create_temp(c->workdir, &cattr);
+ temp = ovl_create_temp(ofs, c->workdir, &cattr);
ovl_revert_cu_creds(&cc);
err = PTR_ERR(temp);
@@ -694,12 +698,13 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
goto cleanup;
}
- upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
+ upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir,
+ c->destname.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto cleanup;
- err = ovl_do_rename(wdir, temp, udir, upper, 0);
+ err = ovl_do_rename(ofs, wdir, temp, udir, upper, 0);
dput(upper);
if (err)
goto cleanup;
@@ -716,7 +721,7 @@ unlock:
return err;
cleanup:
- ovl_cleanup(wdir, temp);
+ ovl_cleanup(ofs, wdir, temp);
dput(temp);
goto unlock;
}
@@ -724,6 +729,7 @@ cleanup:
/* Copyup using O_TMPFILE which does not require cross dir locking */
static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
{
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *udir = d_inode(c->destdir);
struct dentry *temp, *upper;
struct ovl_cu_creds cc;
@@ -733,7 +739,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
if (err)
return err;
- temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
+ temp = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
ovl_revert_cu_creds(&cc);
if (IS_ERR(temp))
@@ -745,10 +751,11 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
inode_lock_nested(udir, I_MUTEX_PARENT);
- upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
+ upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir,
+ c->destname.len);
err = PTR_ERR(upper);
if (!IS_ERR(upper)) {
- err = ovl_do_link(temp, udir, upper);
+ err = ovl_do_link(ofs, temp, udir, upper);
dput(upper);
}
inode_unlock(udir);
@@ -836,7 +843,7 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
/* Restore timestamps on parent (best effort) */
inode_lock(udir);
- ovl_set_timestamps(c->destdir, &c->pstat);
+ ovl_set_timestamps(ofs, c->destdir, &c->pstat);
inode_unlock(udir);
ovl_dentry_set_upper_alias(c->dentry);
@@ -865,12 +872,12 @@ static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode,
return true;
}
-static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value)
+static ssize_t ovl_getxattr_value(const struct path *path, char *name, char **value)
{
ssize_t res;
char *buf;
- res = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0);
+ res = ovl_do_getxattr(path, name, NULL, 0);
if (res == -ENODATA || res == -EOPNOTSUPP)
res = 0;
@@ -879,7 +886,7 @@ static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value)
if (!buf)
return -ENOMEM;
- res = vfs_getxattr(&init_user_ns, dentry, name, buf, res);
+ res = ovl_do_getxattr(path, name, buf, res);
if (res < 0)
kfree(buf);
else
@@ -906,8 +913,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
return -EIO;
if (c->stat.size) {
- err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS,
- &capability);
+ err = cap_size = ovl_getxattr_value(&upperpath, XATTR_NAME_CAPS,
+ &capability);
if (cap_size < 0)
goto out;
}
@@ -921,14 +928,14 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
* don't want that to happen for normal copy-up operation.
*/
if (capability) {
- err = vfs_setxattr(&init_user_ns, upperpath.dentry,
- XATTR_NAME_CAPS, capability, cap_size, 0);
+ err = ovl_do_setxattr(ofs, upperpath.dentry, XATTR_NAME_CAPS,
+ capability, cap_size, 0);
if (err)
goto out_free;
}
- err = ovl_do_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY);
+ err = ovl_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY);
if (err)
goto out_free;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index f18490813170..6b03457f72bb 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -23,15 +23,15 @@ MODULE_PARM_DESC(redirect_max,
static int ovl_set_redirect(struct dentry *dentry, bool samedir);
-int ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
+int ovl_cleanup(struct ovl_fs *ofs, struct inode *wdir, struct dentry *wdentry)
{
int err;
dget(wdentry);
if (d_is_dir(wdentry))
- err = ovl_do_rmdir(wdir, wdentry);
+ err = ovl_do_rmdir(ofs, wdir, wdentry);
else
- err = ovl_do_unlink(wdir, wdentry);
+ err = ovl_do_unlink(ofs, wdir, wdentry);
dput(wdentry);
if (err) {
@@ -42,7 +42,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
return err;
}
-struct dentry *ovl_lookup_temp(struct dentry *workdir)
+struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir)
{
struct dentry *temp;
char name[20];
@@ -51,7 +51,7 @@ struct dentry *ovl_lookup_temp(struct dentry *workdir)
/* counter is allowed to wrap, since temp dentries are ephemeral */
snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id));
- temp = lookup_one_len(name, workdir, strlen(name));
+ temp = ovl_lookup_upper(ofs, name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
pr_err("workdir/%s already exists\n", name);
dput(temp);
@@ -70,11 +70,11 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs)
struct inode *wdir = workdir->d_inode;
if (!ofs->whiteout) {
- whiteout = ovl_lookup_temp(workdir);
+ whiteout = ovl_lookup_temp(ofs, workdir);
if (IS_ERR(whiteout))
goto out;
- err = ovl_do_whiteout(wdir, whiteout);
+ err = ovl_do_whiteout(ofs, wdir, whiteout);
if (err) {
dput(whiteout);
whiteout = ERR_PTR(err);
@@ -84,11 +84,11 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs)
}
if (ofs->share_whiteout) {
- whiteout = ovl_lookup_temp(workdir);
+ whiteout = ovl_lookup_temp(ofs, workdir);
if (IS_ERR(whiteout))
goto out;
- err = ovl_do_link(ofs->whiteout, wdir, whiteout);
+ err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout);
if (!err)
goto out;
@@ -122,27 +122,28 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
if (d_is_dir(dentry))
flags = RENAME_EXCHANGE;
- err = ovl_do_rename(wdir, whiteout, dir, dentry, flags);
+ err = ovl_do_rename(ofs, wdir, whiteout, dir, dentry, flags);
if (err)
goto kill_whiteout;
if (flags)
- ovl_cleanup(wdir, dentry);
+ ovl_cleanup(ofs, wdir, dentry);
out:
dput(whiteout);
return err;
kill_whiteout:
- ovl_cleanup(wdir, whiteout);
+ ovl_cleanup(ofs, wdir, whiteout);
goto out;
}
-int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode)
+int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir,
+ struct dentry **newdentry, umode_t mode)
{
int err;
struct dentry *d, *dentry = *newdentry;
- err = ovl_do_mkdir(dir, dentry, mode);
+ err = ovl_do_mkdir(ofs, dir, dentry, mode);
if (err)
return err;
@@ -154,8 +155,8 @@ int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode)
* to it unhashed and negative. If that happens, try to
 * look up a new hashed and positive dentry.
*/
- d = lookup_one_len(dentry->d_name.name, dentry->d_parent,
- dentry->d_name.len);
+ d = ovl_lookup_upper(ofs, dentry->d_name.name, dentry->d_parent,
+ dentry->d_name.len);
if (IS_ERR(d)) {
pr_warn("failed lookup after mkdir (%pd2, err=%i).\n",
dentry, err);
@@ -167,8 +168,8 @@ int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode)
return 0;
}
-struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
- struct ovl_cattr *attr)
+struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir,
+ struct dentry *newdentry, struct ovl_cattr *attr)
{
int err;
@@ -180,28 +181,28 @@ struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
goto out;
if (attr->hardlink) {
- err = ovl_do_link(attr->hardlink, dir, newdentry);
+ err = ovl_do_link(ofs, attr->hardlink, dir, newdentry);
} else {
switch (attr->mode & S_IFMT) {
case S_IFREG:
- err = ovl_do_create(dir, newdentry, attr->mode);
+ err = ovl_do_create(ofs, dir, newdentry, attr->mode);
break;
case S_IFDIR:
/* mkdir is special... */
- err = ovl_mkdir_real(dir, &newdentry, attr->mode);
+ err = ovl_mkdir_real(ofs, dir, &newdentry, attr->mode);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
- err = ovl_do_mknod(dir, newdentry, attr->mode,
+ err = ovl_do_mknod(ofs, dir, newdentry, attr->mode,
attr->rdev);
break;
case S_IFLNK:
- err = ovl_do_symlink(dir, newdentry, attr->link);
+ err = ovl_do_symlink(ofs, dir, newdentry, attr->link);
break;
default:
@@ -223,10 +224,11 @@ out:
return newdentry;
}
-struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr)
+struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
+ struct ovl_cattr *attr)
{
- return ovl_create_real(d_inode(workdir), ovl_lookup_temp(workdir),
- attr);
+ return ovl_create_real(ofs, d_inode(workdir),
+ ovl_lookup_temp(ofs, workdir), attr);
}
static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper,
@@ -330,10 +332,9 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
attr->mode &= ~current_umask();
inode_lock_nested(udir, I_MUTEX_PARENT);
- newdentry = ovl_create_real(udir,
- lookup_one_len(dentry->d_name.name,
- upperdir,
- dentry->d_name.len),
+ newdentry = ovl_create_real(ofs, udir,
+ ovl_lookup_upper(ofs, dentry->d_name.name,
+ upperdir, dentry->d_name.len),
attr);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
@@ -353,7 +354,7 @@ out_unlock:
return err;
out_cleanup:
- ovl_cleanup(udir, newdentry);
+ ovl_cleanup(ofs, udir, newdentry);
dput(newdentry);
goto out_unlock;
}
@@ -361,6 +362,7 @@ out_cleanup:
static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct list_head *list)
{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
@@ -391,12 +393,12 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (upper->d_parent->d_inode != udir)
goto out_unlock;
- opaquedir = ovl_create_temp(workdir, OVL_CATTR(stat.mode));
+ opaquedir = ovl_create_temp(ofs, workdir, OVL_CATTR(stat.mode));
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
- err = ovl_copy_xattr(dentry->d_sb, upper, opaquedir);
+ err = ovl_copy_xattr(dentry->d_sb, &upperpath, opaquedir);
if (err)
goto out_cleanup;
@@ -405,17 +407,17 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
goto out_cleanup;
inode_lock(opaquedir->d_inode);
- err = ovl_set_attr(opaquedir, &stat);
+ err = ovl_set_attr(ofs, opaquedir, &stat);
inode_unlock(opaquedir->d_inode);
if (err)
goto out_cleanup;
- err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
+ err = ovl_do_rename(ofs, wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
if (err)
goto out_cleanup;
- ovl_cleanup_whiteouts(upper, list);
- ovl_cleanup(wdir, upper);
+ ovl_cleanup_whiteouts(ofs, upper, list);
+ ovl_cleanup(ofs, wdir, upper);
unlock_rename(workdir, upperdir);
/* dentry's upper doesn't match now, get rid of it */
@@ -424,7 +426,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
return opaquedir;
out_cleanup:
- ovl_cleanup(wdir, opaquedir);
+ ovl_cleanup(ofs, wdir, opaquedir);
dput(opaquedir);
out_unlock:
unlock_rename(workdir, upperdir);
@@ -432,8 +434,8 @@ out:
return ERR_PTR(err);
}
-static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name,
- const struct posix_acl *acl)
+static int ovl_set_upper_acl(struct ovl_fs *ofs, struct dentry *upperdentry,
+ const char *name, const struct posix_acl *acl)
{
void *buffer;
size_t size;
@@ -451,7 +453,7 @@ static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name,
if (err < 0)
goto out_free;
- err = vfs_setxattr(&init_user_ns, upperdentry, name, buffer, size, XATTR_CREATE);
+ err = ovl_do_setxattr(ofs, upperdentry, name, buffer, size, XATTR_CREATE);
out_free:
kfree(buffer);
return err;
@@ -460,6 +462,7 @@ out_free:
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct ovl_cattr *cattr)
{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
@@ -484,8 +487,8 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out;
- upper = lookup_one_len(dentry->d_name.name, upperdir,
- dentry->d_name.len);
+ upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
+ dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
@@ -494,7 +497,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper)))
goto out_dput;
- newdentry = ovl_create_temp(workdir, cattr);
+ newdentry = ovl_create_temp(ofs, workdir, cattr);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput;
@@ -510,19 +513,19 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
.ia_mode = cattr->mode,
};
inode_lock(newdentry->d_inode);
- err = notify_change(&init_user_ns, newdentry, &attr, NULL);
+ err = ovl_do_notify_change(ofs, newdentry, &attr);
inode_unlock(newdentry->d_inode);
if (err)
goto out_cleanup;
}
if (!hardlink) {
- err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_ACCESS,
- acl);
+ err = ovl_set_upper_acl(ofs, newdentry,
+ XATTR_NAME_POSIX_ACL_ACCESS, acl);
if (err)
goto out_cleanup;
- err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_DEFAULT,
- default_acl);
+ err = ovl_set_upper_acl(ofs, newdentry,
+ XATTR_NAME_POSIX_ACL_DEFAULT, default_acl);
if (err)
goto out_cleanup;
}
@@ -532,20 +535,20 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_cleanup;
- err = ovl_do_rename(wdir, newdentry, udir, upper,
+ err = ovl_do_rename(ofs, wdir, newdentry, udir, upper,
RENAME_EXCHANGE);
if (err)
goto out_cleanup;
- ovl_cleanup(wdir, upper);
+ ovl_cleanup(ofs, wdir, upper);
} else {
- err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
+ err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
}
err = ovl_instantiate(dentry, inode, newdentry, hardlink);
if (err) {
- ovl_cleanup(udir, newdentry);
+ ovl_cleanup(ofs, udir, newdentry);
dput(newdentry);
}
out_dput:
@@ -560,7 +563,7 @@ out:
return err;
out_cleanup:
- ovl_cleanup(wdir, newdentry);
+ ovl_cleanup(ofs, wdir, newdentry);
dput(newdentry);
goto out_dput;
}
@@ -767,8 +770,8 @@ static int ovl_remove_and_whiteout(struct dentry *dentry,
if (err)
goto out_dput;
- upper = lookup_one_len(dentry->d_name.name, upperdir,
- dentry->d_name.len);
+ upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
+ dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
@@ -800,6 +803,7 @@ out:
static int ovl_remove_upper(struct dentry *dentry, bool is_dir,
struct list_head *list)
{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper;
@@ -814,8 +818,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir,
}
inode_lock_nested(dir, I_MUTEX_PARENT);
- upper = lookup_one_len(dentry->d_name.name, upperdir,
- dentry->d_name.len);
+ upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
+ dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
@@ -826,9 +830,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir,
goto out_dput_upper;
if (is_dir)
- err = vfs_rmdir(&init_user_ns, dir, upper);
+ err = ovl_do_rmdir(ofs, dir, upper);
else
- err = vfs_unlink(&init_user_ns, dir, upper, NULL);
+ err = ovl_do_unlink(ofs, dir, upper);
ovl_dir_modified(dentry->d_parent, ovl_type_origin(dentry));
/*
@@ -880,7 +884,6 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
int err;
const struct cred *old_cred;
- struct dentry *upperdentry;
bool lower_positive = ovl_lower_positive(dentry);
LIST_HEAD(list);
@@ -923,9 +926,8 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
* Note: we fail to update ctime if there was no copy-up, only a
* whiteout
*/
- upperdentry = ovl_dentry_upper(dentry);
- if (upperdentry)
- ovl_copyattr(d_inode(upperdentry), d_inode(dentry));
+ if (ovl_dentry_upper(dentry))
+ ovl_copyattr(d_inode(dentry));
out_drop_write:
ovl_drop_write(dentry);
@@ -1095,6 +1097,7 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
bool samedir = olddir == newdir;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
+ struct ovl_fs *ofs = OVL_FS(old->d_sb);
LIST_HEAD(list);
err = -EINVAL;
@@ -1189,8 +1192,8 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
trap = lock_rename(new_upperdir, old_upperdir);
- olddentry = lookup_one_len(old->d_name.name, old_upperdir,
- old->d_name.len);
+ olddentry = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir,
+ old->d_name.len);
err = PTR_ERR(olddentry);
if (IS_ERR(olddentry))
goto out_unlock;
@@ -1199,8 +1202,8 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
if (!ovl_matches_upper(old, olddentry))
goto out_dput_old;
- newdentry = lookup_one_len(new->d_name.name, new_upperdir,
- new->d_name.len);
+ newdentry = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir,
+ new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput_old;
@@ -1251,13 +1254,13 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
if (err)
goto out_dput;
- err = ovl_do_rename(old_upperdir->d_inode, olddentry,
+ err = ovl_do_rename(ofs, old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry, flags);
if (err)
goto out_dput;
if (cleanup_whiteout)
- ovl_cleanup(old_upperdir->d_inode, newdentry);
+ ovl_cleanup(ofs, old_upperdir->d_inode, newdentry);
if (overwrite && d_inode(new)) {
if (new_is_dir)
@@ -1272,9 +1275,9 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
(d_inode(new) && ovl_type_origin(new)));
/* copy ctime: */
- ovl_copyattr(d_inode(olddentry), d_inode(old));
+ ovl_copyattr(d_inode(old));
if (d_inode(new) && ovl_dentry_upper(new))
- ovl_copyattr(d_inode(newdentry), d_inode(new));
+ ovl_copyattr(d_inode(new));
out_dput:
dput(newdentry);
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index ebde05c9cf62..e065a5b9a442 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -259,7 +259,7 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
return FILEID_INVALID;
dentry = d_find_any_alias(inode);
- if (WARN_ON(!dentry))
+ if (!dentry)
return FILEID_INVALID;
bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen);
@@ -391,6 +391,11 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected,
* pointer because we hold no lock on the real dentry.
*/
take_dentry_name_snapshot(&name, real);
+ /*
+	 * No mnt_userns handling here: it's an internal lookup. We could skip
+	 * permission checking altogether, but for now just use ids that are
+	 * not transformed through a mnt_userns.
+ */
this = lookup_one_len(name.name.name, connected, name.name.len);
release_dentry_name_snapshot(&name);
err = PTR_ERR(this);
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index fa125feed0ff..a1a22f58ba18 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -38,9 +38,11 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY)
static struct file *ovl_open_realfile(const struct file *file,
- struct inode *realinode)
+ const struct path *realpath)
{
+ struct inode *realinode = d_inode(realpath->dentry);
struct inode *inode = file_inode(file);
+ struct user_namespace *real_mnt_userns;
struct file *realfile;
const struct cred *old_cred;
int flags = file->f_flags | OVL_OPEN_FLAGS;
@@ -51,11 +53,12 @@ static struct file *ovl_open_realfile(const struct file *file,
acc_mode |= MAY_APPEND;
old_cred = ovl_override_creds(inode->i_sb);
- err = inode_permission(&init_user_ns, realinode, MAY_OPEN | acc_mode);
+ real_mnt_userns = mnt_user_ns(realpath->mnt);
+ err = inode_permission(real_mnt_userns, realinode, MAY_OPEN | acc_mode);
if (err) {
realfile = ERR_PTR(err);
} else {
- if (!inode_owner_or_capable(&init_user_ns, realinode))
+ if (!inode_owner_or_capable(real_mnt_userns, realinode))
flags &= ~O_NOATIME;
realfile = open_with_fake_path(&file->f_path, flags, realinode,
@@ -82,11 +85,8 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode))
return -EPERM;
- if (flags & O_DIRECT) {
- if (!file->f_mapping->a_ops ||
- !file->f_mapping->a_ops->direct_IO)
- return -EINVAL;
- }
+ if ((flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
if (file->f_op->check_flags) {
err = file->f_op->check_flags(flags);
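/*
 * Editorial note: FMODE_CAN_ODIRECT is set at open time for files capable of
 * O_DIRECT I/O, so the single flag test above replaces the older heuristic
 * of probing file->f_mapping->a_ops->direct_IO (and lets filesystems drop
 * noop ->direct_IO stubs). The same substitution appears in ovl_read_iter()
 * and ovl_write_iter() below.
 */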
@@ -104,21 +104,21 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
bool allow_meta)
{
- struct inode *inode = file_inode(file);
- struct inode *realinode;
+ struct dentry *dentry = file_dentry(file);
+ struct path realpath;
real->flags = 0;
real->file = file->private_data;
if (allow_meta)
- realinode = ovl_inode_real(inode);
+ ovl_path_real(dentry, &realpath);
else
- realinode = ovl_inode_realdata(inode);
+ ovl_path_realdata(dentry, &realpath);
/* Has it been copied up since we'd opened it? */
- if (unlikely(file_inode(real->file) != realinode)) {
+ if (unlikely(file_inode(real->file) != d_inode(realpath.dentry))) {
real->flags = FDPUT_FPUT;
- real->file = ovl_open_realfile(file, realinode);
+ real->file = ovl_open_realfile(file, &realpath);
return PTR_ERR_OR_ZERO(real->file);
}
@@ -144,17 +144,20 @@ static int ovl_real_fdget(const struct file *file, struct fd *real)
static int ovl_open(struct inode *inode, struct file *file)
{
+ struct dentry *dentry = file_dentry(file);
struct file *realfile;
+ struct path realpath;
int err;
- err = ovl_maybe_copy_up(file_dentry(file), file->f_flags);
+ err = ovl_maybe_copy_up(dentry, file->f_flags);
if (err)
return err;
/* No longer need these flags, so don't pass them on to underlying fs */
file->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
- realfile = ovl_open_realfile(file, ovl_inode_realdata(inode));
+ ovl_path_realdata(dentry, &realpath);
+ realfile = ovl_open_realfile(file, &realpath);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
@@ -273,7 +276,7 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
__sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb,
SB_FREEZE_WRITE);
file_end_write(iocb->ki_filp);
- ovl_copyattr(ovl_inode_real(inode), inode);
+ ovl_copyattr(inode);
}
orig_iocb->ki_pos = iocb->ki_pos;
@@ -306,8 +309,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ret = -EINVAL;
if (iocb->ki_flags & IOCB_DIRECT &&
- (!real.file->f_mapping->a_ops ||
- !real.file->f_mapping->a_ops->direct_IO))
+ !(real.file->f_mode & FMODE_CAN_ODIRECT))
goto out_fdput;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
@@ -356,7 +358,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
inode_lock(inode);
/* Update mode */
- ovl_copyattr(ovl_inode_real(inode), inode);
+ ovl_copyattr(inode);
ret = file_remove_privs(file);
if (ret)
goto out_unlock;
@@ -367,8 +369,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
ret = -EINVAL;
if (iocb->ki_flags & IOCB_DIRECT &&
- (!real.file->f_mapping->a_ops ||
- !real.file->f_mapping->a_ops->direct_IO))
+ !(real.file->f_mode & FMODE_CAN_ODIRECT))
goto out_fdput;
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
@@ -381,7 +382,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
ovl_iocb_to_rwf(ifl));
file_end_write(real.file);
/* Update size */
- ovl_copyattr(ovl_inode_real(inode), inode);
+ ovl_copyattr(inode);
} else {
struct ovl_aio_req *aio_req;
@@ -431,12 +432,11 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
struct fd real;
const struct cred *old_cred;
struct inode *inode = file_inode(out);
- struct inode *realinode = ovl_inode_real(inode);
ssize_t ret;
inode_lock(inode);
/* Update mode */
- ovl_copyattr(realinode, inode);
+ ovl_copyattr(inode);
ret = file_remove_privs(out);
if (ret)
goto out_unlock;
@@ -452,7 +452,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
file_end_write(real.file);
/* Update size */
- ovl_copyattr(realinode, inode);
+ ovl_copyattr(inode);
revert_creds(old_cred);
fdput(real);
@@ -526,7 +526,7 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
revert_creds(old_cred);
/* Update size */
- ovl_copyattr(ovl_inode_real(inode), inode);
+ ovl_copyattr(inode);
fdput(real);
@@ -598,7 +598,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
revert_creds(old_cred);
/* Update size */
- ovl_copyattr(ovl_inode_real(inode_out), inode_out);
+ ovl_copyattr(inode_out);
fdput(real_in);
fdput(real_out);
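/*
 * Editorial sketch: the ovl_copyattr() call sites above lose their explicit
 * real-inode argument because the helper now derives it internally. An
 * assumed shape (the real helper lives in fs/overlayfs/util.c; the attribute
 * copying details are elided):
 *
 *	void ovl_copyattr(struct inode *inode)
 *	{
 *		struct path realpath;
 *		struct inode *realinode;
 *
 *		ovl_i_path_real(inode, &realpath);
 *		realinode = d_inode(realpath.dentry);
 *		// copy uid/gid mapped through mnt_user_ns(realpath.mnt),
 *		// plus mode, timestamps and size, from realinode to inode
 *	}
 */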
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 1f36158c7dbe..9e61511de7a7 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -21,6 +21,7 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr)
{
int err;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
bool full_copy_up = false;
struct dentry *upperdentry;
const struct cred *old_cred;
@@ -77,10 +78,10 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
inode_lock(upperdentry->d_inode);
old_cred = ovl_override_creds(dentry->d_sb);
- err = notify_change(&init_user_ns, upperdentry, attr, NULL);
+ err = ovl_do_notify_change(ofs, upperdentry, attr);
revert_creds(old_cred);
if (!err)
- ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
+ ovl_copyattr(dentry->d_inode);
inode_unlock(upperdentry->d_inode);
if (winode)
@@ -279,12 +280,14 @@ int ovl_permission(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
struct inode *upperinode = ovl_inode_upper(inode);
- struct inode *realinode = upperinode ?: ovl_inode_lower(inode);
+ struct inode *realinode;
+ struct path realpath;
const struct cred *old_cred;
int err;
/* Careful in RCU walk mode */
- if (!realinode) {
+ ovl_i_path_real(inode, &realpath);
+ if (!realpath.dentry) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
return -ECHILD;
}
@@ -297,6 +300,7 @@ int ovl_permission(struct user_namespace *mnt_userns,
if (err)
return err;
+ realinode = d_inode(realpath.dentry);
old_cred = ovl_override_creds(inode->i_sb);
if (!upperinode &&
!special_file(realinode->i_mode) && mask & MAY_WRITE) {
@@ -304,7 +308,7 @@ int ovl_permission(struct user_namespace *mnt_userns,
/* Make sure mounter can read file for copy up later */
mask |= MAY_READ;
}
- err = inode_permission(&init_user_ns, realinode, mask);
+ err = inode_permission(mnt_user_ns(realpath.mnt), realinode, mask);
revert_creds(old_cred);
return err;
@@ -342,8 +346,10 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
int err;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *upperdentry = ovl_i_dentry_upper(inode);
struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
+ struct path realpath;
const struct cred *old_cred;
err = ovl_want_write(dentry);
@@ -351,8 +357,9 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
goto out;
if (!value && !upperdentry) {
+ ovl_path_lower(dentry, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_getxattr(&init_user_ns, realdentry, name, NULL, 0);
+ err = vfs_getxattr(mnt_user_ns(realpath.mnt), realdentry, name, NULL, 0);
revert_creds(old_cred);
if (err < 0)
goto out_drop_write;
@@ -367,17 +374,17 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
}
old_cred = ovl_override_creds(dentry->d_sb);
- if (value)
- err = vfs_setxattr(&init_user_ns, realdentry, name, value, size,
- flags);
- else {
+ if (value) {
+ err = ovl_do_setxattr(ofs, realdentry, name, value, size,
+ flags);
+ } else {
WARN_ON(flags != XATTR_REPLACE);
- err = vfs_removexattr(&init_user_ns, realdentry, name);
+ err = ovl_do_removexattr(ofs, realdentry, name);
}
revert_creds(old_cred);
/* copy c/mtime */
- ovl_copyattr(d_inode(realdentry), inode);
+ ovl_copyattr(inode);
out_drop_write:
ovl_drop_write(dentry);
@@ -390,11 +397,11 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
{
ssize_t res;
const struct cred *old_cred;
- struct dentry *realdentry =
- ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry);
+ struct path realpath;
+ ovl_i_path_real(inode, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
- res = vfs_getxattr(&init_user_ns, realdentry, name, value, size);
+ res = vfs_getxattr(mnt_user_ns(realpath.mnt), realpath.dentry, name, value, size);
revert_creds(old_cred);
return res;
}
@@ -447,24 +454,100 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
return res;
}
+#ifdef CONFIG_FS_POSIX_ACL
+/*
+ * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone
+ * of the POSIX ACLs retrieved from the lower layer so that this function
+ * does not alter the POSIX ACLs of the underlying filesystem.
+ */
+static void ovl_idmap_posix_acl(struct inode *realinode,
+ struct user_namespace *mnt_userns,
+ struct posix_acl *acl)
+{
+ struct user_namespace *fs_userns = i_user_ns(realinode);
+
+ for (unsigned int i = 0; i < acl->a_count; i++) {
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
+
+ struct posix_acl_entry *e = &acl->a_entries[i];
+ switch (e->e_tag) {
+ case ACL_USER:
+ vfsuid = make_vfsuid(mnt_userns, fs_userns, e->e_uid);
+ e->e_uid = vfsuid_into_kuid(vfsuid);
+ break;
+ case ACL_GROUP:
+ vfsgid = make_vfsgid(mnt_userns, fs_userns, e->e_gid);
+ e->e_gid = vfsgid_into_kgid(vfsgid);
+ break;
+ }
+ }
+}
+
+/*
+ * When the relevant layer is an idmapped mount, we need to take the idmapping
+ * of the layer into account and translate any ACL_{GROUP,USER} values
+ * according to the idmapped mount.
+ *
+ * We cannot alter the ACLs returned from the relevant layer as that would
+ * alter the cached values filesystem-wide for the lower filesystem. Instead we
+ * can clone the ACLs and then apply the relevant idmapping of the layer.
+ *
+ * This is obviously only relevant when idmapped layers are used.
+ */
struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
{
struct inode *realinode = ovl_inode_real(inode);
- const struct cred *old_cred;
- struct posix_acl *acl;
+ struct posix_acl *acl, *clone;
+ struct path realpath;
- if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
+ if (!IS_POSIXACL(realinode))
return NULL;
- if (rcu)
- return get_cached_acl_rcu(realinode, type);
+ /* Careful in RCU walk mode */
+ ovl_i_path_real(inode, &realpath);
+ if (!realpath.dentry) {
+ WARN_ON(!rcu);
+ return ERR_PTR(-ECHILD);
+ }
- old_cred = ovl_override_creds(inode->i_sb);
- acl = get_acl(realinode, type);
- revert_creds(old_cred);
+ if (rcu) {
+ acl = get_cached_acl_rcu(realinode, type);
+ } else {
+ const struct cred *old_cred;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ acl = get_acl(realinode, type);
+ revert_creds(old_cred);
+ }
+ /*
+ * If there are no POSIX ACLs, or we encountered an error,
+	 * or the layer isn't idmapped, we don't need to do anything.
+ */
+ if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl))
+ return acl;
- return acl;
+ /*
+	 * We only get here if the layer is idmapped, so drop out of RCU path
+	 * walk in order to clone the ACLs. There's no need to release them,
+	 * since get_cached_acl_rcu() doesn't take a reference on the ACLs.
+ */
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ clone = posix_acl_clone(acl, GFP_KERNEL);
+ if (!clone)
+ clone = ERR_PTR(-ENOMEM);
+ else
+ ovl_idmap_posix_acl(realinode, mnt_user_ns(realpath.mnt), clone);
+ /*
+ * Since we're not in RCU path walk we always need to release the
+ * original ACLs.
+ */
+ posix_acl_release(acl);
+ return clone;
}
+#endif
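/*
 * Editorial example for ovl_get_acl()/ovl_idmap_posix_acl() above: with a
 * layer idmapped as "1001 -> 1000", an on-disk ACL_USER entry for id 1001
 * must surface as id 1000 through the overlay. Only the clone is rewritten,
 * so the filesystem-wide cached ACL stays untouched:
 *
 *	vfsuid = make_vfsuid(mnt_userns, fs_userns, e->e_uid);	// 1001 -> 1000
 *	e->e_uid = vfsuid_into_kuid(vfsuid);	// mapped id stored in the clone
 */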
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags)
{
@@ -505,7 +588,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* Introducing security_inode_fileattr_get/set() hooks would solve this issue
* properly.
*/
-static int ovl_security_fileattr(struct path *realpath, struct fileattr *fa,
+static int ovl_security_fileattr(const struct path *realpath, struct fileattr *fa,
bool set)
{
struct file *file;
@@ -527,7 +610,7 @@ static int ovl_security_fileattr(struct path *realpath, struct fileattr *fa,
return err;
}
-int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa)
+int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa)
{
int err;
@@ -535,7 +618,7 @@ int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa)
if (err)
return err;
- return vfs_fileattr_set(&init_user_ns, realpath->dentry, fa);
+ return vfs_fileattr_set(mnt_user_ns(realpath->mnt), realpath->dentry, fa);
}
int ovl_fileattr_set(struct user_namespace *mnt_userns,
@@ -579,7 +662,7 @@ int ovl_fileattr_set(struct user_namespace *mnt_userns,
inode_set_flags(inode, flags, OVL_COPY_I_FLAGS_MASK);
/* Update ctime */
- ovl_copyattr(ovl_inode_real(inode), inode);
+ ovl_copyattr(inode);
}
ovl_drop_write(dentry);
out:
@@ -602,7 +685,7 @@ static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa)
}
}
-int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa)
+int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa)
{
int err;
@@ -777,16 +860,19 @@ void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
unsigned long ino, int fsid)
{
struct inode *realinode;
+ struct ovl_inode *oi = OVL_I(inode);
if (oip->upperdentry)
- OVL_I(inode)->__upperdentry = oip->upperdentry;
- if (oip->lowerpath && oip->lowerpath->dentry)
- OVL_I(inode)->lower = igrab(d_inode(oip->lowerpath->dentry));
+ oi->__upperdentry = oip->upperdentry;
+ if (oip->lowerpath && oip->lowerpath->dentry) {
+ oi->lowerpath.dentry = dget(oip->lowerpath->dentry);
+ oi->lowerpath.layer = oip->lowerpath->layer;
+ }
if (oip->lowerdata)
- OVL_I(inode)->lowerdata = igrab(d_inode(oip->lowerdata));
+ oi->lowerdata = igrab(d_inode(oip->lowerdata));
realinode = ovl_inode_real(inode);
- ovl_copyattr(realinode, inode);
+ ovl_copyattr(inode);
ovl_copyflags(realinode, inode);
ovl_map_ino(inode, ino, fsid);
}
@@ -871,8 +957,8 @@ static int ovl_set_nlink_common(struct dentry *dentry,
if (WARN_ON(len >= sizeof(buf)))
return -EIO;
- return ovl_do_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry),
- OVL_XATTR_NLINK, buf, len);
+ return ovl_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry),
+ OVL_XATTR_NLINK, buf, len);
}
int ovl_set_nlink_upper(struct dentry *dentry)
@@ -897,8 +983,8 @@ unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1)
return fallback;
- err = ovl_do_getxattr(ofs, upperdentry, OVL_XATTR_NLINK,
- &buf, sizeof(buf) - 1);
+ err = ovl_getxattr_upper(ofs, upperdentry, OVL_XATTR_NLINK,
+ &buf, sizeof(buf) - 1);
if (err < 0)
goto fail;
@@ -1102,6 +1188,10 @@ struct inode *ovl_get_inode(struct super_block *sb,
struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
struct inode *inode;
struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
+ struct path realpath = {
+ .dentry = upperdentry ?: lowerdentry,
+ .mnt = upperdentry ? ovl_upper_mnt(ofs) : lowerpath->layer->mnt,
+ };
bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
oip->index);
int fsid = bylower ? lowerpath->layer->fsid : 0;
@@ -1175,7 +1265,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
/* Check for non-merge dir that may have whiteouts */
if (is_dir) {
if (((upperdentry && lowerdentry) || oip->numlower > 1) ||
- ovl_check_origin_xattr(ofs, upperdentry ?: lowerdentry)) {
+ ovl_path_check_origin_xattr(ofs, &realpath)) {
ovl_set_flag(OVL_WHITEOUTS, inode);
}
}
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 1a9b515fc45d..0fd1d5fdfc72 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -16,6 +16,7 @@
struct ovl_lookup_data {
struct super_block *sb;
+ struct vfsmount *mnt;
struct qstr name;
bool is_dir;
bool opaque;
@@ -25,14 +26,14 @@ struct ovl_lookup_data {
bool metacopy;
};
-static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d,
+static int ovl_check_redirect(const struct path *path, struct ovl_lookup_data *d,
size_t prelen, const char *post)
{
int res;
char *buf;
struct ovl_fs *ofs = OVL_FS(d->sb);
- buf = ovl_get_redirect_xattr(ofs, dentry, prelen + strlen(post));
+ buf = ovl_get_redirect_xattr(ofs, path, prelen + strlen(post));
if (IS_ERR_OR_NULL(buf))
return PTR_ERR(buf);
@@ -41,7 +42,7 @@ static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d,
* One of the ancestor path elements in an absolute path
* lookup in ovl_lookup_layer() could have been opaque and
* that will stop further lookup in lower layers (d->stop=true)
- * But we have found an absolute redirect in decendant path
+ * But we have found an absolute redirect in descendant path
* element and that should force continue lookup in lower
* layers (reset d->stop).
*/
@@ -105,13 +106,13 @@ int ovl_check_fb_len(struct ovl_fb *fb, int fb_len)
return 0;
}
-static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *dentry,
+static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *upperdentry,
enum ovl_xattr ox)
{
int res, err;
struct ovl_fh *fh = NULL;
- res = ovl_do_getxattr(ofs, dentry, ox, NULL, 0);
+ res = ovl_getxattr_upper(ofs, upperdentry, ox, NULL, 0);
if (res < 0) {
if (res == -ENODATA || res == -EOPNOTSUPP)
return NULL;
@@ -125,7 +126,7 @@ static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *dentry,
if (!fh)
return ERR_PTR(-ENOMEM);
- res = ovl_do_getxattr(ofs, dentry, ox, fh->buf, res);
+ res = ovl_getxattr_upper(ofs, upperdentry, ox, fh->buf, res);
if (res < 0)
goto fail;
@@ -193,16 +194,17 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
return real;
}
-static bool ovl_is_opaquedir(struct super_block *sb, struct dentry *dentry)
+static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path)
{
- return ovl_check_dir_xattr(sb, dentry, OVL_XATTR_OPAQUE);
+ return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE);
}
-static struct dentry *ovl_lookup_positive_unlocked(const char *name,
+static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d,
+ const char *name,
struct dentry *base, int len,
bool drop_negative)
{
- struct dentry *ret = lookup_one_len_unlocked(name, base, len);
+ struct dentry *ret = lookup_one_unlocked(mnt_user_ns(d->mnt), name, base, len);
if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
if (drop_negative && ret->d_lockref.count == 1) {
@@ -224,10 +226,11 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
struct dentry **ret, bool drop_negative)
{
struct dentry *this;
+ struct path path;
int err;
bool last_element = !post[0];
- this = ovl_lookup_positive_unlocked(name, base, namelen, drop_negative);
+ this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative);
if (IS_ERR(this)) {
err = PTR_ERR(this);
this = NULL;
@@ -253,12 +256,15 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
d->stop = true;
goto put_and_out;
}
+
+ path.dentry = this;
+ path.mnt = d->mnt;
if (!d_can_lookup(this)) {
if (d->is_dir || !last_element) {
d->stop = true;
goto put_and_out;
}
- err = ovl_check_metacopy_xattr(OVL_FS(d->sb), this);
+ err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path);
if (err < 0)
goto out_err;
@@ -278,14 +284,14 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
if (d->last)
goto out;
- if (ovl_is_opaquedir(d->sb, this)) {
+ if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) {
d->stop = true;
if (last_element)
d->opaque = true;
goto out;
}
}
- err = ovl_check_redirect(this, d, prelen, post);
+ err = ovl_check_redirect(&path, d, prelen, post);
if (err)
goto out_err;
out:
@@ -464,7 +470,7 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
err = ovl_verify_fh(ofs, dentry, ox, fh);
if (set && err == -ENODATA)
- err = ovl_do_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len);
+ err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len);
if (err)
goto fail;
@@ -642,7 +648,7 @@ static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name)
* If the index dentry for a copy up origin inode is positive, but points
* to an inode different than the upper inode, then either the upper inode
* has been copied up and not indexed or it was indexed, but since then
- * index dir was cleared. Either way, that index cannot be used to indentify
+ * index dir was cleared. Either way, that index cannot be used to identify
* the overlay inode.
*/
int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
@@ -704,7 +710,8 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
if (err)
return ERR_PTR(err);
- index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len);
+ index = lookup_one_positive_unlocked(ovl_upper_mnt_userns(ofs), name.name,
+ ofs->indexdir, name.len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
if (err == -ENOENT) {
@@ -856,6 +863,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
old_cred = ovl_override_creds(dentry->d_sb);
upperdir = ovl_dentry_upper(dentry->d_parent);
if (upperdir) {
+ d.mnt = ovl_upper_mnt(ofs);
err = ovl_lookup_layer(upperdir, &d, &upperdentry, true);
if (err)
goto out;
@@ -911,6 +919,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
else
d.last = lower.layer->idx == roe->numlower;
+ d.mnt = lower.layer->mnt;
err = ovl_lookup_layer(lower.dentry, &d, &this, false);
if (err)
goto out_put;
@@ -1071,14 +1080,18 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (upperdentry)
ovl_dentry_set_upper_alias(dentry);
else if (index) {
- upperdentry = dget(index);
- upperredirect = ovl_get_redirect_xattr(ofs, upperdentry, 0);
+ struct path upperpath = {
+ .dentry = upperdentry = dget(index),
+ .mnt = ovl_upper_mnt(ofs),
+ };
+
+ upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0);
if (IS_ERR(upperredirect)) {
err = PTR_ERR(upperredirect);
upperredirect = NULL;
goto out_free_oe;
}
- err = ovl_check_metacopy_xattr(ofs, upperdentry);
+ err = ovl_check_metacopy_xattr(ofs, &upperpath);
if (err < 0)
goto out_free_oe;
uppermetacopy = err;
@@ -1163,8 +1176,8 @@ bool ovl_lower_positive(struct dentry *dentry)
struct dentry *this;
struct dentry *lowerdir = poe->lowerstack[i].dentry;
- this = lookup_positive_unlocked(name->name, lowerdir,
- name->len);
+ this = lookup_one_positive_unlocked(mnt_user_ns(poe->lowerstack[i].layer->mnt),
+ name->name, lowerdir, name->len);
if (IS_ERR(this)) {
switch (PTR_ERR(this)) {
case -ENOENT:
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 2cd5741c873b..59624521eeb2 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -7,6 +7,7 @@
#include <linux/kernel.h>
#include <linux/uuid.h>
#include <linux/fs.h>
+#include <linux/namei.h>
#include "ovl_entry.h"
#undef pr_fmt
@@ -122,109 +123,171 @@ static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox)
return ovl_xattr_table[ox][ofs->config.userxattr];
}
-static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
+/*
+ * When changing ownership of an upper object, map the intended ownership
+ * according to the upper layer's idmapping. When an upper mount idmaps files
+ * that are stored on-disk as owned by id 1001 to id 1000, stat on such an
+ * object reports it as being owned by id 1000 when called via the upper
+ * mount.
+ * In order to change ownership of an object so that stat reports id 1000
+ * when called on an idmapped upper mount, the value written to disk - i.e.,
+ * the value stored in ia_*id - must be 1001. The mount mapping helper will
+ * thus take
+ * The mnt idmapping helpers are nops if the upper layer isn't idmapped.
+ */
+static inline int ovl_do_notify_change(struct ovl_fs *ofs,
+ struct dentry *upperdentry,
+ struct iattr *attr)
+{
+ return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL);
+}
+
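/*
 * Editorial example for the comment above (values illustrative): with an
 * upper mount idmapping "1001 -> 1000", chown'ing an overlay file to id 1000
 * boils down to roughly
 *
 *	struct iattr attr = { .ia_valid = ATTR_UID, ... };	// target: id 1000
 *	ovl_do_notify_change(ofs, upperdentry, &attr);
 *
 * where notify_change(ovl_upper_mnt_userns(ofs), ...) maps the requested id
 * 1000 back to the on-disk value 1001 before the upper filesystem sees it.
 */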
+static inline int ovl_do_rmdir(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry)
{
- int err = vfs_rmdir(&init_user_ns, dir, dentry);
+ int err = vfs_rmdir(ovl_upper_mnt_userns(ofs), dir, dentry);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
}
-static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
+static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir,
+ struct dentry *dentry)
{
- int err = vfs_unlink(&init_user_ns, dir, dentry, NULL);
+ int err = vfs_unlink(ovl_upper_mnt_userns(ofs), dir, dentry, NULL);
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
}
-static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
- struct dentry *new_dentry)
+static inline int ovl_do_link(struct ovl_fs *ofs, struct dentry *old_dentry,
+ struct inode *dir, struct dentry *new_dentry)
{
- int err = vfs_link(old_dentry, &init_user_ns, dir, new_dentry, NULL);
+ int err = vfs_link(old_dentry, ovl_upper_mnt_userns(ofs), dir, new_dentry, NULL);
pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err);
return err;
}
-static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
+static inline int ovl_do_create(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry,
umode_t mode)
{
- int err = vfs_create(&init_user_ns, dir, dentry, mode, true);
+ int err = vfs_create(ovl_upper_mnt_userns(ofs), dir, dentry, mode, true);
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
-static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
+static inline int ovl_do_mkdir(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry,
umode_t mode)
{
- int err = vfs_mkdir(&init_user_ns, dir, dentry, mode);
+ int err = vfs_mkdir(ovl_upper_mnt_userns(ofs), dir, dentry, mode);
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
-static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
+static inline int ovl_do_mknod(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev)
{
- int err = vfs_mknod(&init_user_ns, dir, dentry, mode, dev);
+ int err = vfs_mknod(ovl_upper_mnt_userns(ofs), dir, dentry, mode, dev);
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err);
return err;
}
-static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
+static inline int ovl_do_symlink(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry,
const char *oldname)
{
- int err = vfs_symlink(&init_user_ns, dir, dentry, oldname);
+ int err = vfs_symlink(ovl_upper_mnt_userns(ofs), dir, dentry, oldname);
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
}
-static inline ssize_t ovl_do_getxattr(struct ovl_fs *ofs, struct dentry *dentry,
- enum ovl_xattr ox, void *value,
- size_t size)
+static inline ssize_t ovl_do_getxattr(const struct path *path, const char *name,
+ void *value, size_t size)
{
- const char *name = ovl_xattr(ofs, ox);
- int err = vfs_getxattr(&init_user_ns, dentry, name, value, size);
- int len = (value && err > 0) ? err : 0;
+ int err, len;
+
+ WARN_ON(path->dentry->d_sb != path->mnt->mnt_sb);
+
+ err = vfs_getxattr(mnt_user_ns(path->mnt), path->dentry,
+ name, value, size);
+ len = (value && err > 0) ? err : 0;
pr_debug("getxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n",
- dentry, name, min(len, 48), value, size, err);
+ path->dentry, name, min(len, 48), value, size, err);
return err;
}
+static inline ssize_t ovl_getxattr_upper(struct ovl_fs *ofs,
+ struct dentry *upperdentry,
+ enum ovl_xattr ox, void *value,
+ size_t size)
+{
+ struct path upperpath = {
+ .dentry = upperdentry,
+ .mnt = ovl_upper_mnt(ofs),
+ };
+
+ return ovl_do_getxattr(&upperpath, ovl_xattr(ofs, ox), value, size);
+}
+
+static inline ssize_t ovl_path_getxattr(struct ovl_fs *ofs,
+ const struct path *path,
+ enum ovl_xattr ox, void *value,
+ size_t size)
+{
+ return ovl_do_getxattr(path, ovl_xattr(ofs, ox), value, size);
+}
+
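/*
 * Editorial usage sketch for the split above: raw vfs_getxattr() access is
 * funneled through the path-based ovl_do_getxattr(), with the two wrappers
 * covering the common cases (calls illustrative):
 *
 *	// overlay-private xattr on a known-upper dentry:
 *	res = ovl_getxattr_upper(ofs, upperdentry, OVL_XATTR_NLINK, buf, len);
 *	// overlay-private xattr on whichever layer a path points to:
 *	res = ovl_path_getxattr(ofs, &realpath, OVL_XATTR_OPAQUE, NULL, 0);
 */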
static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
- enum ovl_xattr ox, const void *value,
- size_t size)
+ const char *name, const void *value,
+ size_t size, int flags)
{
- const char *name = ovl_xattr(ofs, ox);
- int err = vfs_setxattr(&init_user_ns, dentry, name, value, size, 0);
- pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n",
- dentry, name, min((int)size, 48), value, size, err);
+ int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name,
+ value, size, flags);
+
+ pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n",
+ dentry, name, min((int)size, 48), value, size, flags, err);
return err;
}
+static inline int ovl_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox, const void *value,
+ size_t size)
+{
+ return ovl_do_setxattr(ofs, dentry, ovl_xattr(ofs, ox), value, size, 0);
+}
+
static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
- enum ovl_xattr ox)
+ const char *name)
{
- const char *name = ovl_xattr(ofs, ox);
- int err = vfs_removexattr(&init_user_ns, dentry, name);
+ int err = vfs_removexattr(ovl_upper_mnt_userns(ofs), dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
}
-static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
- struct inode *newdir, struct dentry *newdentry,
- unsigned int flags)
+static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
+ enum ovl_xattr ox)
+{
+ return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox));
+}
+
+static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir,
+ struct dentry *olddentry, struct inode *newdir,
+ struct dentry *newdentry, unsigned int flags)
{
int err;
struct renamedata rd = {
- .old_mnt_userns = &init_user_ns,
+ .old_mnt_userns = ovl_upper_mnt_userns(ofs),
.old_dir = olddir,
.old_dentry = olddentry,
- .new_mnt_userns = &init_user_ns,
+ .new_mnt_userns = ovl_upper_mnt_userns(ofs),
.new_dir = newdir,
.new_dentry = newdentry,
.flags = flags,
@@ -239,22 +302,31 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
return err;
}
-static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
+static inline int ovl_do_whiteout(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *dentry)
{
- int err = vfs_whiteout(&init_user_ns, dir, dentry);
+ int err = vfs_whiteout(ovl_upper_mnt_userns(ofs), dir, dentry);
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
return err;
}
-static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode)
+static inline struct dentry *ovl_do_tmpfile(struct ovl_fs *ofs,
+ struct dentry *dentry, umode_t mode)
{
- struct dentry *ret = vfs_tmpfile(&init_user_ns, dentry, mode, 0);
+ struct dentry *ret = vfs_tmpfile(ovl_upper_mnt_userns(ofs), dentry, mode, 0);
int err = PTR_ERR_OR_ZERO(ret);
pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
return ret;
}
+static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs,
+ const char *name,
+ struct dentry *base, int len)
+{
+ return lookup_one(ovl_upper_mnt_userns(ofs), name, base, len);
+}
+
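A minimal sketch of the assumed calling convention for ovl_lookup_upper(), mirroring lookup_one_len(): the caller must hold the parent inode lock, and only the idmapping argument is new. The wrapper below is hypothetical, for illustration only.

static struct dentry *ovl_lookup_upper_locked(struct ovl_fs *ofs,
					      struct dentry *upperdir,
					      const char *name)
{
	struct dentry *d;

	inode_lock_nested(d_inode(upperdir), I_MUTEX_PARENT);
	d = ovl_lookup_upper(ofs, name, upperdir, strlen(name));
	inode_unlock(d_inode(upperdir));
	return d;
}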
static inline bool ovl_open_flags_need_copy_up(int flags)
{
if (!flags)
@@ -293,10 +365,13 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
void ovl_path_lowerdata(struct dentry *dentry, struct path *path);
+void ovl_i_path_real(struct inode *inode, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
+enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_lowerdata(struct dentry *dentry);
+const struct ovl_layer *ovl_i_layer_lower(struct inode *inode);
const struct ovl_layer *ovl_layer_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_i_dentry_upper(struct inode *inode);
@@ -326,13 +401,24 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
void ovl_dir_modified(struct dentry *dentry, bool impurity);
u64 ovl_dentry_version_get(struct dentry *dentry);
bool ovl_is_whiteout(struct dentry *dentry);
-struct file *ovl_path_open(struct path *path, int flags);
+struct file *ovl_path_open(const struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
void ovl_copy_up_end(struct dentry *dentry);
bool ovl_already_copied_up(struct dentry *dentry, int flags);
-bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry);
-bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry,
- enum ovl_xattr ox);
+bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
+ enum ovl_xattr ox);
+bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path);
+
+static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs,
+ struct dentry *upperdentry)
+{
+ struct path upperpath = {
+ .dentry = upperdentry,
+ .mnt = ovl_upper_mnt(ofs),
+ };
+ return ovl_path_check_origin_xattr(ofs, &upperpath);
+}
+
int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
enum ovl_xattr ox, const void *value, size_t size,
int xerr);
@@ -344,10 +430,9 @@ bool ovl_need_index(struct dentry *dentry);
int ovl_nlink_start(struct dentry *dentry);
void ovl_nlink_end(struct dentry *dentry);
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
-int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry);
+int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_is_metacopy_dentry(struct dentry *dentry);
-char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry,
- int padding);
+char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding);
int ovl_sync_status(struct ovl_fs *ofs);
static inline void ovl_set_flag(unsigned long flag, struct inode *inode)
@@ -366,9 +451,15 @@ static inline bool ovl_test_flag(unsigned long flag, struct inode *inode)
}
static inline bool ovl_is_impuredir(struct super_block *sb,
- struct dentry *dentry)
+ struct dentry *upperdentry)
{
- return ovl_check_dir_xattr(sb, dentry, OVL_XATTR_IMPURE);
+ struct ovl_fs *ofs = OVL_FS(sb);
+ struct path upperpath = {
+ .dentry = upperdentry,
+ .mnt = ovl_upper_mnt(ofs),
+ };
+
+ return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE);
}
/*
@@ -461,12 +552,13 @@ static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index,
extern const struct file_operations ovl_dir_operations;
struct file *ovl_dir_real_file(const struct file *file, bool want_upper);
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
-void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
+void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
+ struct list_head *list);
void ovl_cache_free(struct list_head *list);
void ovl_dir_cache_free(struct inode *inode);
-int ovl_check_d_type_supported(struct path *realpath);
-int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
- struct dentry *dentry, int level);
+int ovl_check_d_type_supported(const struct path *realpath);
+int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
+ struct vfsmount *mnt, struct dentry *dentry, int level);
int ovl_indexdir_cleanup(struct ovl_fs *ofs);
/*
@@ -498,7 +590,13 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
+
+#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu);
+#else
+#define ovl_get_acl NULL
+#endif
+
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags);
bool ovl_is_private_xattr(struct super_block *sb, const char *name);
@@ -520,16 +618,7 @@ bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir);
struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir);
struct inode *ovl_get_inode(struct super_block *sb,
struct ovl_inode_params *oip);
-static inline void ovl_copyattr(struct inode *from, struct inode *to)
-{
- to->i_uid = from->i_uid;
- to->i_gid = from->i_gid;
- to->i_mode = from->i_mode;
- to->i_atime = from->i_atime;
- to->i_mtime = from->i_mtime;
- to->i_ctime = from->i_ctime;
- i_size_write(to, i_size_read(from));
-}
+void ovl_copyattr(struct inode *to);
/* vfs inode flags copied from real to ovl inode */
#define OVL_COPY_I_FLAGS_MASK (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE)
@@ -570,19 +659,22 @@ struct ovl_cattr {
#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
-int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode);
-struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
+int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir,
+ struct dentry **newdentry, umode_t mode);
+struct dentry *ovl_create_real(struct ovl_fs *ofs,
+ struct inode *dir, struct dentry *newdentry,
+ struct ovl_cattr *attr);
+int ovl_cleanup(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry);
+struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir);
+struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
struct ovl_cattr *attr);
-int ovl_cleanup(struct inode *dir, struct dentry *dentry);
-struct dentry *ovl_lookup_temp(struct dentry *workdir);
-struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr);
/* file.c */
extern const struct file_operations ovl_file_operations;
int __init ovl_aio_request_cache_init(void);
void ovl_aio_request_cache_destroy(void);
-int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa);
-int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa);
+int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa);
+int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa);
int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int ovl_fileattr_set(struct user_namespace *mnt_userns,
struct dentry *dentry, struct fileattr *fa);
@@ -591,9 +683,8 @@ int ovl_fileattr_set(struct user_namespace *mnt_userns,
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_with_data(struct dentry *dentry);
int ovl_maybe_copy_up(struct dentry *dentry, int flags);
-int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
- struct dentry *new);
-int ovl_set_attr(struct dentry *upper, struct kstat *stat);
+int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new);
+int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat);
struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
bool is_upper);
int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower,
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 63efee554f69..e1af8f660698 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -90,6 +90,11 @@ static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs)
return ofs->layers[0].mnt;
}
+static inline struct user_namespace *ovl_upper_mnt_userns(struct ovl_fs *ofs)
+{
+ return mnt_user_ns(ovl_upper_mnt(ofs));
+}
+
static inline struct ovl_fs *OVL_FS(struct super_block *sb)
{
return (struct ovl_fs *)sb->s_fs_info;
@@ -129,7 +134,7 @@ struct ovl_inode {
unsigned long flags;
struct inode vfs_inode;
struct dentry *__upperdentry;
- struct inode *lower;
+ struct ovl_path lowerpath;
/* synchronize copy up and more */
struct mutex lock;
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 150fdf3bc68d..2b210640036c 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -170,7 +170,7 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
return p;
}
-static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
+static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
@@ -179,22 +179,22 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
struct ovl_cache_entry *p;
if (ovl_cache_entry_find_link(name, len, &newp, &parent))
- return 0;
+ return true;
p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
if (p == NULL) {
rdd->err = -ENOMEM;
- return -ENOMEM;
+ return false;
}
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, rdd->root);
- return 0;
+ return true;
}
-static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
+static bool ovl_fill_lowest(struct ovl_readdir_data *rdd,
const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
@@ -211,7 +211,7 @@ static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
list_add_tail(&p->l_node, &rdd->middle);
}
- return rdd->err;
+ return rdd->err == 0;
}
void ovl_cache_free(struct list_head *list)
@@ -250,7 +250,7 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
}
}
-static int ovl_fill_merge(struct dir_context *ctx, const char *name,
+static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -264,11 +264,11 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name,
return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
}
-static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
+static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
{
int err;
struct ovl_cache_entry *p;
- struct dentry *dentry;
+ struct dentry *dentry, *dir = path->dentry;
const struct cred *old_cred;
old_cred = ovl_override_creds(rdd->dentry->d_sb);
@@ -278,7 +278,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
while (rdd->first_maybe_whiteout) {
p = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
- dentry = lookup_one_len(p->name, dir, p->len);
+ dentry = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
@@ -291,7 +291,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
return err;
}
-static inline int ovl_dir_read(struct path *realpath,
+static inline int ovl_dir_read(const struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
@@ -312,7 +312,7 @@ static inline int ovl_dir_read(struct path *realpath,
} while (!err && rdd->count);
if (!err && rdd->first_maybe_whiteout && rdd->dentry)
- err = ovl_check_whiteouts(realpath->dentry, rdd);
+ err = ovl_check_whiteouts(realpath, rdd);
fput(realfile);
@@ -455,7 +455,7 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
* copy up origin, call vfs_getattr() on the overlay entry to make
* sure that d_ino will be consistent with st_ino from stat(2).
*/
-static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p)
+static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry *p)
{
struct dentry *dir = path->dentry;
@@ -479,7 +479,7 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p)
goto get;
}
}
- this = lookup_one_len(p->name, dir, p->len);
+ this = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len);
if (IS_ERR_OR_NULL(this) || !this->d_inode) {
/* Mark a stale entry */
p->is_whiteout = true;
@@ -528,7 +528,7 @@ fail:
goto out;
}
-static int ovl_fill_plain(struct dir_context *ctx, const char *name,
+static bool ovl_fill_plain(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -540,14 +540,14 @@ static int ovl_fill_plain(struct dir_context *ctx, const char *name,
p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
if (p == NULL) {
rdd->err = -ENOMEM;
- return -ENOMEM;
+ return false;
}
list_add_tail(&p->l_node, rdd->list);
- return 0;
+ return true;
}
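The ovl_fill_* and ovl_check_d_type conversions in this file track the tree-wide switch of the dir_context actor from int to bool. Under that convention (assumed here), the actor returns true to continue iteration and false to stop, and any error is reported through the embedding structure rather than the return value. A minimal standalone actor (ovl_count_ctx and ovl_count_actor are hypothetical) looks like this:

struct ovl_count_ctx {
	struct dir_context ctx;
	unsigned int count;
};

static bool ovl_count_actor(struct dir_context *ctx, const char *name,
			    int namelen, loff_t offset, u64 ino,
			    unsigned int d_type)
{
	struct ovl_count_ctx *c = container_of(ctx, struct ovl_count_ctx, ctx);

	c->count++;
	return true;	/* keep iterating */
}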
-static int ovl_dir_read_impure(struct path *path, struct list_head *list,
+static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
struct rb_root *root)
{
int err;
@@ -592,7 +592,7 @@ static int ovl_dir_read_impure(struct path *path, struct list_head *list,
return 0;
}
-static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path)
+static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path)
{
int res;
struct dentry *dentry = path->dentry;
@@ -623,8 +623,8 @@ static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path)
* Removing the "impure" xattr is best effort.
*/
if (!ovl_want_write(dentry)) {
- ovl_do_removexattr(ofs, ovl_dentry_upper(dentry),
- OVL_XATTR_IMPURE);
+ ovl_removexattr(ofs, ovl_dentry_upper(dentry),
+ OVL_XATTR_IMPURE);
ovl_drop_write(dentry);
}
ovl_clear_flag(OVL_IMPURE, d_inode(dentry));
@@ -648,7 +648,7 @@ struct ovl_readdir_translate {
bool xinowarn;
};
-static int ovl_fill_real(struct dir_context *ctx, const char *name,
+static bool ovl_fill_real(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -834,7 +834,7 @@ out_unlock:
}
static struct file *ovl_dir_open_realfile(const struct file *file,
- struct path *realpath)
+ const struct path *realpath)
{
struct file *res;
const struct cred *old_cred;
@@ -1001,7 +1001,8 @@ del_entry:
return err;
}
-void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
+void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
+ struct list_head *list)
{
struct ovl_cache_entry *p;
@@ -1012,7 +1013,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
if (WARN_ON(!p->is_whiteout || !p->is_upper))
continue;
- dentry = lookup_one_len(p->name, upper, p->len);
+ dentry = ovl_lookup_upper(ofs, p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
@@ -1020,13 +1021,13 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
continue;
}
if (dentry->d_inode)
- ovl_cleanup(upper->d_inode, dentry);
+ ovl_cleanup(ofs, upper->d_inode, dentry);
dput(dentry);
}
inode_unlock(upper->d_inode);
}
-static int ovl_check_d_type(struct dir_context *ctx, const char *name,
+static bool ovl_check_d_type(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -1035,19 +1036,19 @@ static int ovl_check_d_type(struct dir_context *ctx, const char *name,
/* Even if d_type is not supported, DT_DIR is returned for . and .. */
if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen))
- return 0;
+ return true;
if (d_type != DT_UNKNOWN)
rdd->d_type_supported = true;
- return 0;
+ return true;
}
/*
* Returns 1 if d_type is supported, 0 not supported/unknown. Negative values
* if error is encountered.
*/
-int ovl_check_d_type_supported(struct path *realpath)
+int ovl_check_d_type_supported(const struct path *realpath)
{
int err;
struct ovl_readdir_data rdd = {
@@ -1064,7 +1065,8 @@ int ovl_check_d_type_supported(struct path *realpath)
#define OVL_INCOMPATDIR_NAME "incompat"
-static int ovl_workdir_cleanup_recurse(struct path *path, int level)
+static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *path,
+ int level)
{
int err;
struct inode *dir = path->dentry->d_inode;
@@ -1111,11 +1113,11 @@ static int ovl_workdir_cleanup_recurse(struct path *path, int level)
err = -EINVAL;
break;
}
- dentry = lookup_one_len(p->name, path->dentry, p->len);
+ dentry = ovl_lookup_upper(ofs, p->name, path->dentry, p->len);
if (IS_ERR(dentry))
continue;
if (dentry->d_inode)
- err = ovl_workdir_cleanup(dir, path->mnt, dentry, level);
+ err = ovl_workdir_cleanup(ofs, dir, path->mnt, dentry, level);
dput(dentry);
if (err)
break;
@@ -1126,24 +1128,24 @@ out:
return err;
}
-int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
- struct dentry *dentry, int level)
+int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
+ struct vfsmount *mnt, struct dentry *dentry, int level)
{
int err;
if (!d_is_dir(dentry) || level > 1) {
- return ovl_cleanup(dir, dentry);
+ return ovl_cleanup(ofs, dir, dentry);
}
- err = ovl_do_rmdir(dir, dentry);
+ err = ovl_do_rmdir(ofs, dir, dentry);
if (err) {
struct path path = { .mnt = mnt, .dentry = dentry };
inode_unlock(dir);
- err = ovl_workdir_cleanup_recurse(&path, level + 1);
+ err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1);
inode_lock_nested(dir, I_MUTEX_PARENT);
if (!err)
- err = ovl_cleanup(dir, dentry);
+ err = ovl_cleanup(ofs, dir, dentry);
}
return err;
@@ -1179,7 +1181,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
if (p->len == 2 && p->name[1] == '.')
continue;
}
- index = lookup_one_len(p->name, indexdir, p->len);
+ index = ovl_lookup_upper(ofs, p->name, indexdir, p->len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
index = NULL;
@@ -1187,7 +1189,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
}
/* Cleanup leftover from index create/cleanup attempt */
if (index->d_name.name[0] == '#') {
- err = ovl_workdir_cleanup(dir, path.mnt, index, 1);
+ err = ovl_workdir_cleanup(ofs, dir, path.mnt, index, 1);
if (err)
break;
goto next;
@@ -1197,7 +1199,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
goto next;
} else if (err == -ESTALE) {
/* Cleanup stale index entries */
- err = ovl_cleanup(dir, index);
+ err = ovl_cleanup(ofs, dir, index);
} else if (err != -ENOENT) {
/*
* Abort mount to avoid corrupting the index if
@@ -1213,7 +1215,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
err = ovl_cleanup_and_whiteout(ofs, dir, index);
} else {
/* Cleanup orphan index entries */
- err = ovl_cleanup(dir, index);
+ err = ovl_cleanup(ofs, dir, index);
}
if (err)
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 001cdbb8f015..9ca98bea8e18 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -184,7 +184,8 @@ static struct inode *ovl_alloc_inode(struct super_block *sb)
oi->version = 0;
oi->flags = 0;
oi->__upperdentry = NULL;
- oi->lower = NULL;
+ oi->lowerpath.dentry = NULL;
+ oi->lowerpath.layer = NULL;
oi->lowerdata = NULL;
mutex_init(&oi->lock);
@@ -205,7 +206,7 @@ static void ovl_destroy_inode(struct inode *inode)
struct ovl_inode *oi = OVL_I(inode);
dput(oi->__upperdentry);
- iput(oi->lower);
+ dput(oi->lowerpath.dentry);
if (S_ISDIR(inode->i_mode))
ovl_dir_cache_free(inode);
else
@@ -300,7 +301,7 @@ static int ovl_sync_fs(struct super_block *sb, int wait)
/**
* ovl_statfs
- * @sb: The overlayfs super block
+ * @dentry: The dentry to query
* @buf: The struct kstatfs to fill in with stats
*
* Get the filesystem statistics. As writes always target the upper layer
@@ -348,6 +349,8 @@ static inline int ovl_xino_def(void)
/**
* ovl_show_options
+ * @m: the seq_file handle
+ * @dentry: The dentry to query
*
* Prints the mount options for a given superblock.
* Returns zero; does not fail.
@@ -761,7 +764,7 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs,
inode_lock_nested(dir, I_MUTEX_PARENT);
retry:
- work = lookup_one_len(name, ofs->workbasedir, strlen(name));
+ work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name));
if (!IS_ERR(work)) {
struct iattr attr = {
@@ -778,7 +781,7 @@ retry:
goto out_unlock;
retried = true;
- err = ovl_workdir_cleanup(dir, mnt, work, 0);
+ err = ovl_workdir_cleanup(ofs, dir, mnt, work, 0);
dput(work);
if (err == -EINVAL) {
work = ERR_PTR(err);
@@ -787,7 +790,7 @@ retry:
goto retry;
}
- err = ovl_mkdir_real(dir, &work, attr.ia_mode);
+ err = ovl_mkdir_real(ofs, dir, &work, attr.ia_mode);
if (err)
goto out_dput;
@@ -809,19 +812,19 @@ retry:
* allowed as upper are limited to "normal" ones, where checking
* for the above two errors is sufficient.
*/
- err = vfs_removexattr(&init_user_ns, work,
- XATTR_NAME_POSIX_ACL_DEFAULT);
+ err = ovl_do_removexattr(ofs, work,
+ XATTR_NAME_POSIX_ACL_DEFAULT);
if (err && err != -ENODATA && err != -EOPNOTSUPP)
goto out_dput;
- err = vfs_removexattr(&init_user_ns, work,
- XATTR_NAME_POSIX_ACL_ACCESS);
+ err = ovl_do_removexattr(ofs, work,
+ XATTR_NAME_POSIX_ACL_ACCESS);
if (err && err != -ENODATA && err != -EOPNOTSUPP)
goto out_dput;
/* Clear any inherited mode bits */
inode_lock(work->d_inode);
- err = notify_change(&init_user_ns, work, &attr, NULL);
+ err = ovl_do_notify_change(ofs, work, &attr);
inode_unlock(work->d_inode);
if (err)
goto out_dput;
@@ -873,10 +876,6 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path)
pr_err("filesystem on '%s' not supported\n", name);
goto out_put;
}
- if (is_idmapped_mnt(path->mnt)) {
- pr_err("idmapped layers are currently not supported\n");
- goto out_put;
- }
if (!d_is_dir(path->dentry)) {
pr_err("'%s' not a directory\n", name);
goto out_put;
@@ -909,7 +908,7 @@ static int ovl_mount_dir(const char *name, struct path *path)
return err;
}
-static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs,
+static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs,
const char *name)
{
struct kstatfs statfs;
@@ -1023,7 +1022,20 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
/* Check that everything is OK before copy-up */
if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ /* The above comment can be understood in two ways:
+ *
+ * 1. We just want to check whether the basic POSIX ACL format
+ * is ok. For example, if the header is correct and the size
+ * is sane.
+ * 2. We want to know whether the ACL_{GROUP,USER} entries can
+ * be mapped according to the underlying filesystem.
+ *
+ * Currently, we only check 1. If we wanted to check 2. we
+ * would need to pass the mnt_userns and the fs_userns of the
+ * underlying filesystem. But frankly, I think checking 1. is
+ * enough to start the copy-up.
+ */
+ acl = vfs_set_acl_prepare(&init_user_ns, &init_user_ns, value, size);
if (IS_ERR(acl))
return PTR_ERR(acl);
}
@@ -1256,8 +1268,9 @@ out:
* Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and
* negative values if error is encountered.
*/
-static int ovl_check_rename_whiteout(struct dentry *workdir)
+static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
{
+ struct dentry *workdir = ofs->workdir;
struct inode *dir = d_inode(workdir);
struct dentry *temp;
struct dentry *dest;
@@ -1267,12 +1280,12 @@ static int ovl_check_rename_whiteout(struct dentry *workdir)
inode_lock_nested(dir, I_MUTEX_PARENT);
- temp = ovl_create_temp(workdir, OVL_CATTR(S_IFREG | 0));
+ temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0));
err = PTR_ERR(temp);
if (IS_ERR(temp))
goto out_unlock;
- dest = ovl_lookup_temp(workdir);
+ dest = ovl_lookup_temp(ofs, workdir);
err = PTR_ERR(dest);
if (IS_ERR(dest)) {
dput(temp);
@@ -1281,14 +1294,14 @@ static int ovl_check_rename_whiteout(struct dentry *workdir)
/* Name is inline and stable - using snapshot as a copy helper */
take_dentry_name_snapshot(&name, temp);
- err = ovl_do_rename(dir, temp, dir, dest, RENAME_WHITEOUT);
+ err = ovl_do_rename(ofs, dir, temp, dir, dest, RENAME_WHITEOUT);
if (err) {
if (err == -EINVAL)
err = 0;
goto cleanup_temp;
}
- whiteout = lookup_one_len(name.name.name, workdir, name.name.len);
+ whiteout = ovl_lookup_upper(ofs, name.name.name, workdir, name.name.len);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto cleanup_temp;
@@ -1297,11 +1310,11 @@ static int ovl_check_rename_whiteout(struct dentry *workdir)
/* Best effort cleanup of whiteout and temp file */
if (err)
- ovl_cleanup(dir, whiteout);
+ ovl_cleanup(ofs, dir, whiteout);
dput(whiteout);
cleanup_temp:
- ovl_cleanup(dir, temp);
+ ovl_cleanup(ofs, dir, temp);
release_dentry_name_snapshot(&name);
dput(temp);
dput(dest);
@@ -1312,16 +1325,17 @@ out_unlock:
return err;
}
-static struct dentry *ovl_lookup_or_create(struct dentry *parent,
+static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs,
+ struct dentry *parent,
const char *name, umode_t mode)
{
size_t len = strlen(name);
struct dentry *child;
inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
- child = lookup_one_len(name, parent, len);
+ child = ovl_lookup_upper(ofs, name, parent, len);
if (!IS_ERR(child) && !child->d_inode)
- child = ovl_create_real(parent->d_inode, child,
+ child = ovl_create_real(ofs, parent->d_inode, child,
OVL_CATTR(mode));
inode_unlock(parent->d_inode);
dput(parent);
@@ -1343,7 +1357,7 @@ static int ovl_create_volatile_dirty(struct ovl_fs *ofs)
const char *const *name = volatile_path;
for (ctr = ARRAY_SIZE(volatile_path); ctr; ctr--, name++) {
- d = ovl_lookup_or_create(d, *name, ctr > 1 ? S_IFDIR : S_IFREG);
+ d = ovl_lookup_or_create(ofs, d, *name, ctr > 1 ? S_IFDIR : S_IFREG);
if (IS_ERR(d))
return PTR_ERR(d);
}
@@ -1352,7 +1366,7 @@ static int ovl_create_volatile_dirty(struct ovl_fs *ofs)
}
static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
- struct path *workpath)
+ const struct path *workpath)
{
struct vfsmount *mnt = ovl_upper_mnt(ofs);
struct dentry *temp, *workdir;
@@ -1391,7 +1405,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
pr_warn("upper fs needs to support d_type.\n");
/* Check if upper/work fs supports O_TMPFILE */
- temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0);
+ temp = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0);
ofs->tmpfile = !IS_ERR(temp);
if (ofs->tmpfile)
dput(temp);
@@ -1400,7 +1414,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
/* Check if upper/work fs supports RENAME_WHITEOUT */
- err = ovl_check_rename_whiteout(ofs->workdir);
+ err = ovl_check_rename_whiteout(ofs);
if (err < 0)
goto out;
@@ -1411,13 +1425,14 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
/*
* Check if upper/work fs supports (trusted|user).overlay.* xattr
*/
- err = ovl_do_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1);
+ err = ovl_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1);
if (err) {
+ pr_warn("failed to set xattr on upper\n");
ofs->noxattr = true;
if (ofs->config.index || ofs->config.metacopy) {
ofs->config.index = false;
ofs->config.metacopy = false;
- pr_warn("upper fs does not support xattr, falling back to index=off,metacopy=off.\n");
+ pr_warn("...falling back to index=off,metacopy=off.\n");
}
/*
* xattr support is required for persistent st_ino.
@@ -1425,11 +1440,13 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
*/
if (ofs->config.xino == OVL_XINO_AUTO) {
ofs->config.xino = OVL_XINO_OFF;
- pr_warn("upper fs does not support xattr, falling back to xino=off.\n");
+ pr_warn("...falling back to xino=off.\n");
}
+ if (err == -EPERM && !ofs->config.userxattr)
+ pr_info("try mounting with 'userxattr' option\n");
err = 0;
} else {
- ovl_do_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE);
+ ovl_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE);
}
/*
@@ -1478,7 +1495,7 @@ out:
}
static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs,
- struct path *upperpath)
+ const struct path *upperpath)
{
int err;
struct path workpath = { };
@@ -1521,7 +1538,7 @@ out:
}
static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
- struct ovl_entry *oe, struct path *upperpath)
+ struct ovl_entry *oe, const struct path *upperpath)
{
struct vfsmount *mnt = ovl_upper_mnt(ofs);
struct dentry *indexdir;
@@ -2033,7 +2050,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_stack_depth = 0;
sb->s_maxbytes = MAX_LFS_FILESIZE;
atomic_long_set(&ofs->last_ino, 1);
- /* Assume underlaying fs uses 32bit inodes unless proven otherwise */
+ /* Assume underlying fs uses 32bit inodes unless proven otherwise */
if (ofs->config.xino != OVL_XINO_OFF) {
ofs->xino_mode = BITS_PER_LONG - 32;
if (!ofs->xino_mode) {
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index f48284a2a896..81a57a8d80d9 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -194,6 +194,20 @@ enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
return type;
}
+enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path)
+{
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ WARN_ON_ONCE(d_is_dir(dentry));
+
+ if (!OVL_TYPE_UPPER(type) || OVL_TYPE_MERGE(type))
+ ovl_path_lowerdata(dentry, path);
+ else
+ ovl_path_upper(dentry, path);
+
+ return type;
+}
+
struct dentry *ovl_dentry_upper(struct dentry *dentry)
{
return ovl_upperdentry_dereference(OVL_I(d_inode(dentry)));
@@ -236,6 +250,17 @@ struct dentry *ovl_i_dentry_upper(struct inode *inode)
return ovl_upperdentry_dereference(OVL_I(inode));
}
+void ovl_i_path_real(struct inode *inode, struct path *path)
+{
+ path->dentry = ovl_i_dentry_upper(inode);
+ if (!path->dentry) {
+ path->dentry = OVL_I(inode)->lowerpath.dentry;
+ path->mnt = OVL_I(inode)->lowerpath.layer->mnt;
+ } else {
+ path->mnt = ovl_upper_mnt(OVL_FS(inode->i_sb));
+ }
+}
+
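As an illustrative sketch (not part of this patch), the typical consumer pattern for ovl_i_path_real() is deriving the real layer's idmapping from an overlay inode, as ovl_copyattr() at the end of this file does:

static struct user_namespace *ovl_inode_real_userns(struct inode *inode)
{
	struct path realpath;

	/* picks the upper mount if an upper dentry exists,
	 * otherwise the lower layer's mount */
	ovl_i_path_real(inode, &realpath);
	return mnt_user_ns(realpath.mnt);
}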
struct inode *ovl_inode_upper(struct inode *inode)
{
struct dentry *upperdentry = ovl_i_dentry_upper(inode);
@@ -245,7 +270,9 @@ struct inode *ovl_inode_upper(struct inode *inode)
struct inode *ovl_inode_lower(struct inode *inode)
{
- return OVL_I(inode)->lower;
+ struct dentry *lowerdentry = OVL_I(inode)->lowerpath.dentry;
+
+ return lowerdentry ? d_inode(lowerdentry) : NULL;
}
struct inode *ovl_inode_real(struct inode *inode)
@@ -443,7 +470,7 @@ static void ovl_dir_version_inc(struct dentry *dentry, bool impurity)
void ovl_dir_modified(struct dentry *dentry, bool impurity)
{
/* Copy mtime/ctime */
- ovl_copyattr(d_inode(ovl_dentry_upper(dentry)), d_inode(dentry));
+ ovl_copyattr(d_inode(dentry));
ovl_dir_version_inc(dentry, impurity);
}
@@ -463,9 +490,10 @@ bool ovl_is_whiteout(struct dentry *dentry)
return inode && IS_WHITEOUT(inode);
}
-struct file *ovl_path_open(struct path *path, int flags)
+struct file *ovl_path_open(const struct path *path, int flags)
{
struct inode *inode = d_inode(path->dentry);
+ struct user_namespace *real_mnt_userns = mnt_user_ns(path->mnt);
int err, acc_mode;
if (flags & ~(O_ACCMODE | O_LARGEFILE))
@@ -482,12 +510,12 @@ struct file *ovl_path_open(struct path *path, int flags)
BUG();
}
- err = inode_permission(&init_user_ns, inode, acc_mode | MAY_OPEN);
+ err = inode_permission(real_mnt_userns, inode, acc_mode | MAY_OPEN);
if (err)
return ERR_PTR(err);
/* O_NOATIME is an optimization, don't fail if not permitted */
- if (inode_owner_or_capable(&init_user_ns, inode))
+ if (inode_owner_or_capable(real_mnt_userns, inode))
flags |= O_NOATIME;
return dentry_open(path, flags, current_cred());
@@ -550,11 +578,11 @@ void ovl_copy_up_end(struct dentry *dentry)
ovl_inode_unlock(d_inode(dentry));
}
-bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry)
+bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path)
{
int res;
- res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_ORIGIN, NULL, 0);
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_ORIGIN, NULL, 0);
/* Zero size value means "copied up but origin unknown" */
if (res >= 0)
@@ -563,16 +591,16 @@ bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry)
return false;
}
-bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry,
- enum ovl_xattr ox)
+bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
+ enum ovl_xattr ox)
{
int res;
char val;
- if (!d_is_dir(dentry))
+ if (!d_is_dir(path->dentry))
return false;
- res = ovl_do_getxattr(OVL_FS(sb), dentry, ox, &val, 1);
+ res = ovl_path_getxattr(ofs, path, ox, &val, 1);
if (res == 1 && val == 'y')
return true;
@@ -612,7 +640,7 @@ int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
if (ofs->noxattr)
return xerr;
- err = ovl_do_setxattr(ofs, upperdentry, ox, value, size);
+ err = ovl_setxattr(ofs, upperdentry, ox, value, size);
if (err == -EOPNOTSUPP) {
pr_warn("cannot set %s xattr on upper\n", ovl_xattr(ofs, ox));
@@ -652,8 +680,8 @@ void ovl_check_protattr(struct inode *inode, struct dentry *upper)
char buf[OVL_PROTATTR_MAX+1];
int res, n;
- res = ovl_do_getxattr(ofs, upper, OVL_XATTR_PROTATTR, buf,
- OVL_PROTATTR_MAX);
+ res = ovl_getxattr_upper(ofs, upper, OVL_XATTR_PROTATTR, buf,
+ OVL_PROTATTR_MAX);
if (res < 0)
return;
@@ -708,7 +736,7 @@ int ovl_set_protattr(struct inode *inode, struct dentry *upper,
err = ovl_check_setxattr(ofs, upper, OVL_XATTR_PROTATTR,
buf, len, -EPERM);
} else if (inode->i_flags & OVL_PROT_I_FLAGS_MASK) {
- err = ovl_do_removexattr(ofs, upper, OVL_XATTR_PROTATTR);
+ err = ovl_removexattr(ofs, upper, OVL_XATTR_PROTATTR);
if (err == -EOPNOTSUPP || err == -ENODATA)
err = 0;
}
@@ -824,7 +852,7 @@ static void ovl_cleanup_index(struct dentry *dentry)
}
inode_lock_nested(dir, I_MUTEX_PARENT);
- index = lookup_one_len(name.name, indexdir, name.len);
+ index = ovl_lookup_upper(ofs, name.name, indexdir, name.len);
err = PTR_ERR(index);
if (IS_ERR(index)) {
index = NULL;
@@ -834,7 +862,7 @@ static void ovl_cleanup_index(struct dentry *dentry)
dir, index);
} else {
/* Cleanup orphan index entries */
- err = ovl_cleanup(dir, index);
+ err = ovl_cleanup(ofs, dir, index);
}
inode_unlock(dir);
@@ -943,15 +971,15 @@ err:
}
/* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */
-int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry)
+int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path)
{
int res;
/* Only regular files can have metacopy xattr */
- if (!S_ISREG(d_inode(dentry)->i_mode))
+ if (!S_ISREG(d_inode(path->dentry)->i_mode))
return 0;
- res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_METACOPY, NULL, 0);
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_METACOPY, NULL, 0);
if (res < 0) {
if (res == -ENODATA || res == -EOPNOTSUPP)
return 0;
@@ -987,13 +1015,12 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry)
return (oe->numlower > 1);
}
-char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry,
- int padding)
+char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding)
{
int res;
char *s, *next, *buf = NULL;
- res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_REDIRECT, NULL, 0);
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, NULL, 0);
if (res == -ENODATA || res == -EOPNOTSUPP)
return NULL;
if (res < 0)
@@ -1005,7 +1032,7 @@ char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry,
if (!buf)
return ERR_PTR(-ENOMEM);
- res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_REDIRECT, buf, res);
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, buf, res);
if (res < 0)
goto fail;
if (res == 0)
@@ -1060,3 +1087,33 @@ int ovl_sync_status(struct ovl_fs *ofs)
return errseq_check(&mnt->mnt_sb->s_wb_err, ofs->errseq);
}
+
+/*
+ * ovl_copyattr() - copy inode attributes from layer to ovl inode
+ *
+ * When overlay copies inode information from an upper or lower layer to the
+ * relevant overlay inode, it applies the idmapping of that layer, ensuring
+ * that the ovl inode ownership correctly reflects the ownership of the
+ * idmapped upper or lower layer. For example, an
+ * idmapped upper or lower layer mapping id 1001 to id 1000 will take care to
+ * map any lower or upper inode owned by id 1001 to id 1000. These mapping
+ * helpers are nops when the relevant layer isn't idmapped.
+ */
+void ovl_copyattr(struct inode *inode)
+{
+ struct path realpath;
+ struct inode *realinode;
+ struct user_namespace *real_mnt_userns;
+
+ ovl_i_path_real(inode, &realpath);
+ realinode = d_inode(realpath.dentry);
+ real_mnt_userns = mnt_user_ns(realpath.mnt);
+
+ inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode);
+ inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode);
+ inode->i_mode = realinode->i_mode;
+ inode->i_atime = realinode->i_atime;
+ inode->i_mtime = realinode->i_mtime;
+ inode->i_ctime = realinode->i_ctime;
+ i_size_write(inode, i_size_read(realinode));
+}
diff --git a/fs/pipe.c b/fs/pipe.c
index 9648ac15164a..42c7ff41c2db 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -653,7 +653,7 @@ pipe_poll(struct file *filp, poll_table *wait)
unsigned int head, tail;
/* Epoll has some historical nasty semantics, this enables them */
- pipe->poll_usage = 1;
+ WRITE_ONCE(pipe->poll_usage, true);
/*
* Reading pipe state only -- no need for acquiring the semaphore.
@@ -804,7 +804,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
goto out_revert_acct;
- pipe->bufs = kvcalloc(pipe_bufs, sizeof(struct pipe_buffer),
+ pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
GFP_KERNEL_ACCOUNT);
if (pipe->bufs) {
@@ -849,7 +849,7 @@ void free_pipe_info(struct pipe_inode_info *pipe)
#endif
if (pipe->tmp_page)
__free_page(pipe->tmp_page);
- kvfree(pipe->bufs);
+ kfree(pipe->bufs);
kfree(pipe);
}
@@ -860,7 +860,7 @@ static struct vfsmount *pipe_mnt __read_mostly;
*/
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
- return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
+ return dynamic_dname(buffer, buflen, "pipe:[%lu]",
d_inode(dentry)->i_ino);
}
@@ -1245,28 +1245,32 @@ unsigned int round_pipe_size(unsigned long size)
/*
* Resize the pipe ring to a number of slots.
+ *
+ * Note that the pipe can be shrunk, but only if the current occupancy
+ * doesn't exceed nr_slots; if it does, -EBUSY is returned instead.
*/
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
struct pipe_buffer *bufs;
unsigned int head, tail, mask, n;
- /*
- * We can shrink the pipe, if arg is greater than the ring occupancy.
- * Since we don't expect a lot of shrink+grow operations, just free and
- * allocate again like we would do for growing. If the pipe currently
- * contains more buffers than arg, then return busy.
- */
+ bufs = kcalloc(nr_slots, sizeof(*bufs),
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (unlikely(!bufs))
+ return -ENOMEM;
+
+ spin_lock_irq(&pipe->rd_wait.lock);
mask = pipe->ring_size - 1;
head = pipe->head;
tail = pipe->tail;
- n = pipe_occupancy(pipe->head, pipe->tail);
- if (nr_slots < n)
- return -EBUSY;
- bufs = kvcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT);
- if (unlikely(!bufs))
- return -ENOMEM;
+ n = pipe_occupancy(head, tail);
+ if (nr_slots < n) {
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ kfree(bufs);
+ return -EBUSY;
+ }
/*
* The pipe array wraps around, so just start the new one at zero
@@ -1291,7 +1295,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
head = n;
tail = 0;
- kvfree(pipe->bufs);
+ kfree(pipe->bufs);
pipe->bufs = bufs;
pipe->ring_size = nr_slots;
if (pipe->max_usage > nr_slots)
@@ -1299,6 +1303,8 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
pipe->tail = tail;
pipe->head = head;
+ spin_unlock_irq(&pipe->rd_wait.lock);
+
/* This might have made more room for writers */
wake_up_interruptible(&pipe->wr_wait);
return 0;
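Reduced to a sketch with hypothetical names, the pipe_resize_ring() change follows a common pattern: allocate the replacement buffer before taking rd_wait.lock (with __GFP_NOWARN since failure is handled), then re-check the occupancy condition under the lock before committing, so the sleeping allocation never happens with the lock held.

static int resize_under_lock(spinlock_t *lock, void **slot, size_t bytes,
			     bool (*still_fits)(void *cookie), void *cookie)
{
	void *new = kzalloc(bytes, GFP_KERNEL_ACCOUNT | __GFP_NOWARN);

	if (unlikely(!new))
		return -ENOMEM;

	spin_lock_irq(lock);
	if (!still_fits(cookie)) {	/* re-check under the lock */
		spin_unlock_irq(lock);
		kfree(new);
		return -EBUSY;
	}
	kfree(*slot);			/* kfree() is safe in atomic context */
	*slot = new;
	spin_unlock_irq(lock);
	return 0;
}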
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 80acb6885cf9..b4f109875e79 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -199,7 +199,7 @@ EXPORT_SYMBOL(posix_acl_alloc);
/*
* Clone an ACL.
*/
-static struct posix_acl *
+struct posix_acl *
posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
{
struct posix_acl *clone = NULL;
@@ -213,6 +213,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
}
return clone;
}
+EXPORT_SYMBOL_GPL(posix_acl_clone);
/*
* Check if an acl is valid. Returns 0 if it is, or -E... otherwise.
@@ -360,9 +361,10 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
const struct posix_acl *acl, int want)
{
const struct posix_acl_entry *pa, *pe, *mask_obj;
+ struct user_namespace *fs_userns = i_user_ns(inode);
int found = 0;
- kuid_t uid;
- kgid_t gid;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
want &= MAY_READ | MAY_WRITE | MAY_EXEC;
@@ -370,30 +372,28 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
switch(pa->e_tag) {
case ACL_USER_OBJ:
/* (May have been checked already) */
- uid = i_uid_into_mnt(mnt_userns, inode);
- if (uid_eq(uid, current_fsuid()))
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
goto check_perm;
break;
case ACL_USER:
- uid = mapped_kuid_fs(mnt_userns,
- i_user_ns(inode),
+ vfsuid = make_vfsuid(mnt_userns, fs_userns,
pa->e_uid);
- if (uid_eq(uid, current_fsuid()))
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
goto mask;
break;
case ACL_GROUP_OBJ:
- gid = i_gid_into_mnt(mnt_userns, inode);
- if (in_group_p(gid)) {
+ vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+ if (vfsgid_in_group_p(vfsgid)) {
found = 1;
if ((pa->e_perm & want) == want)
goto mask;
}
break;
case ACL_GROUP:
- gid = mapped_kgid_fs(mnt_userns,
- i_user_ns(inode),
+ vfsgid = make_vfsgid(mnt_userns, fs_userns,
pa->e_gid);
- if (in_group_p(gid)) {
+ if (vfsgid_in_group_p(vfsgid)) {
found = 1;
if ((pa->e_perm & want) == want)
goto mask;
@@ -699,7 +699,7 @@ int posix_acl_update_mode(struct user_namespace *mnt_userns,
return error;
if (error == 0)
*acl = NULL;
- if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
+ if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
*mode_p = mode;
@@ -710,46 +710,89 @@ EXPORT_SYMBOL(posix_acl_update_mode);
/*
* Fix up the uids and gids in posix acl extended attributes in place.
*/
-static void posix_acl_fix_xattr_userns(
- struct user_namespace *to, struct user_namespace *from,
- struct user_namespace *mnt_userns,
- void *value, size_t size, bool from_user)
+static int posix_acl_fix_xattr_common(const void *value, size_t size)
+{
+ const struct posix_acl_xattr_header *header = value;
+ int count;
+
+ if (!header)
+ return -EINVAL;
+ if (size < sizeof(struct posix_acl_xattr_header))
+ return -EINVAL;
+ if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+ return -EOPNOTSUPP;
+
+ count = posix_acl_xattr_count(size);
+ if (count < 0)
+ return -EINVAL;
+ if (count == 0)
+ return 0;
+
+ return count;
+}
+
+void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
+ const struct inode *inode,
+ void *value, size_t size)
{
struct posix_acl_xattr_header *header = value;
struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
+ struct user_namespace *fs_userns = i_user_ns(inode);
int count;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
kuid_t uid;
kgid_t gid;
- if (!value)
- return;
- if (size < sizeof(struct posix_acl_xattr_header))
- return;
- if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+ if (no_idmapping(mnt_userns, i_user_ns(inode)))
return;
- count = posix_acl_xattr_count(size);
- if (count < 0)
+ count = posix_acl_fix_xattr_common(value, size);
+ if (count <= 0)
return;
- if (count == 0)
+
+ for (end = entry + count; entry != end; entry++) {
+ switch (le16_to_cpu(entry->e_tag)) {
+ case ACL_USER:
+ uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
+ vfsuid = make_vfsuid(mnt_userns, fs_userns, uid);
+ entry->e_id = cpu_to_le32(from_kuid(&init_user_ns,
+ vfsuid_into_kuid(vfsuid)));
+ break;
+ case ACL_GROUP:
+ gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
+ vfsgid = make_vfsgid(mnt_userns, fs_userns, gid);
+ entry->e_id = cpu_to_le32(from_kgid(&init_user_ns,
+ vfsgid_into_kgid(vfsgid)));
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+static void posix_acl_fix_xattr_userns(
+ struct user_namespace *to, struct user_namespace *from,
+ void *value, size_t size)
+{
+ struct posix_acl_xattr_header *header = value;
+ struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
+ int count;
+ kuid_t uid;
+ kgid_t gid;
+
+ count = posix_acl_fix_xattr_common(value, size);
+ if (count <= 0)
return;
for (end = entry + count; entry != end; entry++) {
switch(le16_to_cpu(entry->e_tag)) {
case ACL_USER:
uid = make_kuid(from, le32_to_cpu(entry->e_id));
- if (from_user)
- uid = mapped_kuid_user(mnt_userns, &init_user_ns, uid);
- else
- uid = mapped_kuid_fs(mnt_userns, &init_user_ns, uid);
entry->e_id = cpu_to_le32(from_kuid(to, uid));
break;
case ACL_GROUP:
gid = make_kgid(from, le32_to_cpu(entry->e_id));
- if (from_user)
- gid = mapped_kgid_user(mnt_userns, &init_user_ns, gid);
- else
- gid = mapped_kgid_fs(mnt_userns, &init_user_ns, gid);
entry->e_id = cpu_to_le32(from_kgid(to, gid));
break;
default:
@@ -758,32 +801,48 @@ static void posix_acl_fix_xattr_userns(
}
}
-void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
- void *value, size_t size)
+void posix_acl_fix_xattr_from_user(void *value, size_t size)
{
struct user_namespace *user_ns = current_user_ns();
- if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
+ if (user_ns == &init_user_ns)
return;
- posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value,
- size, true);
+ posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
}
-void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
- void *value, size_t size)
+void posix_acl_fix_xattr_to_user(void *value, size_t size)
{
struct user_namespace *user_ns = current_user_ns();
- if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
+ if (user_ns == &init_user_ns)
return;
- posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value,
- size, false);
+ posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
}
-/*
- * Convert from extended attribute to in-memory representation.
+/**
+ * make_posix_acl - convert POSIX ACLs from uapi to VFS format using the
+ * provided callbacks to map ACL_{GROUP,USER} entries into the
+ * appropriate format
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @value: the uapi representation of POSIX ACLs
+ * @size: the size of @value
+ * @uid_cb: callback to use for mapping the uid stored in ACL_USER entries
+ * @gid_cb: callback to use for mapping the gid stored in ACL_GROUP entries
+ *
+ * The make_posix_acl() helper is an abstraction to translate from uapi format
+ * into the VFS format allowing the caller to specific callbacks to map
+ * ACL_{GROUP,USER} entries into the expected format. This is used in
+ * posix_acl_from_xattr() and vfs_set_acl_prepare() and avoids pointless code
+ * duplication.
+ *
+ * Return: Allocated struct posix_acl on success, NULL for a valid header but
+ * without actual POSIX ACL entries, or ERR_PTR() encoded error code.
*/
-struct posix_acl *
-posix_acl_from_xattr(struct user_namespace *user_ns,
- const void *value, size_t size)
+static struct posix_acl *make_posix_acl(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns, const void *value, size_t size,
+ kuid_t (*uid_cb)(struct user_namespace *, struct user_namespace *,
+ const struct posix_acl_xattr_entry *),
+ kgid_t (*gid_cb)(struct user_namespace *, struct user_namespace *,
+ const struct posix_acl_xattr_entry *))
{
const struct posix_acl_xattr_header *header = value;
const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end;
@@ -791,16 +850,9 @@ posix_acl_from_xattr(struct user_namespace *user_ns,
struct posix_acl *acl;
struct posix_acl_entry *acl_e;
- if (!value)
- return NULL;
- if (size < sizeof(struct posix_acl_xattr_header))
- return ERR_PTR(-EINVAL);
- if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
- return ERR_PTR(-EOPNOTSUPP);
-
- count = posix_acl_xattr_count(size);
+ count = posix_acl_fix_xattr_common(value, size);
if (count < 0)
- return ERR_PTR(-EINVAL);
+ return ERR_PTR(count);
if (count == 0)
return NULL;
@@ -821,16 +873,12 @@ posix_acl_from_xattr(struct user_namespace *user_ns,
break;
case ACL_USER:
- acl_e->e_uid =
- make_kuid(user_ns,
- le32_to_cpu(entry->e_id));
+ acl_e->e_uid = uid_cb(mnt_userns, fs_userns, entry);
if (!uid_valid(acl_e->e_uid))
goto fail;
break;
case ACL_GROUP:
- acl_e->e_gid =
- make_kgid(user_ns,
- le32_to_cpu(entry->e_id));
+ acl_e->e_gid = gid_cb(mnt_userns, fs_userns, entry);
if (!gid_valid(acl_e->e_gid))
goto fail;
break;
@@ -845,6 +893,181 @@ fail:
posix_acl_release(acl);
return ERR_PTR(-EINVAL);
}
+
+/**
+ * vfs_set_acl_prepare_kuid - map ACL_USER uid according to mount- and
+ * filesystem idmapping
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @e: an ACL_USER entry in POSIX ACL uapi format
+ *
+ * The uid stored as ACL_USER entry in @e is a kuid_t stored as a raw {g,u}id
+ * value. The vfs_set_acl_prepare_kuid() will recover the kuid_t through
+ * KUIDT_INIT() and then map it according to the idmapped mount. The resulting
+ * kuid_t is the value which the filesystem can map up into a raw backing store
+ * id in the filesystem's idmapping.
+ *
+ * This is used in vfs_set_acl_prepare() to generate the proper VFS
+ * representation of POSIX ACLs with ACL_USER entries during setxattr().
+ *
+ * Return: A kuid in @fs_userns for the uid stored in @e.
+ */
+static inline kuid_t
+vfs_set_acl_prepare_kuid(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ const struct posix_acl_xattr_entry *e)
+{
+ kuid_t kuid = KUIDT_INIT(le32_to_cpu(e->e_id));
+ return from_vfsuid(mnt_userns, fs_userns, VFSUIDT_INIT(kuid));
+}
+
+/**
+ * vfs_set_acl_prepare_kgid - map ACL_GROUP gid according to mount- and
+ * filesystem idmapping
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @e: an ACL_GROUP entry in POSIX ACL uapi format
+ *
+ * The gid stored as ACL_GROUP entry in @e is a kgid_t stored as a raw {g,u}id
+ * value. The vfs_set_acl_prepare_kgid() will recover the kgid_t through
+ * KGIDT_INIT() and then map it according to the idmapped mount. The resulting
+ * kgid_t is the value which the filesystem can map up into a raw backing store
+ * id in the filesystem's idmapping.
+ *
+ * This is used in vfs_set_acl_prepare() to generate the proper VFS
+ * representation of POSIX ACLs with ACL_GROUP entries during setxattr().
+ *
+ * Return: A kgid in @fs_userns for the gid stored in @e.
+ */
+static inline kgid_t
+vfs_set_acl_prepare_kgid(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ const struct posix_acl_xattr_entry *e)
+{
+ kgid_t kgid = KGIDT_INIT(le32_to_cpu(e->e_id));
+ return from_vfsgid(mnt_userns, fs_userns, VFSGIDT_INIT(kgid));
+}
+
+/**
+ * vfs_set_acl_prepare - convert POSIX ACLs from uapi to VFS format taking
+ * mount and filesystem idmappings into account
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @value: the uapi representation of POSIX ACLs
+ * @size: the size of @value
+ *
+ * When setting POSIX ACLs with ACL_{GROUP,USER} entries they need to be
+ * mapped according to the relevant mount- and filesystem idmapping. It is
+ * important that the ACL_{GROUP,USER} entries in struct posix_acl will be
+ * mapped into k{g,u}id_t that are supposed to be mapped up in the filesystem
+ * idmapping. This is crucial since the resulting struct posix_acl might be
+ * cached filesystem wide. The vfs_set_acl_prepare() function will take care to
+ * perform all necessary idmappings.
+ *
+ * Note that, since basically forever, the {g,u}id values encoded as
+ * ACL_{GROUP,USER} entries in the uapi POSIX ACLs passed via @value contain
+ * values that have been mapped according to the caller's idmapping. In other
+ * words, POSIX ACLs passed in uapi format as @value during setxattr() contain
+ * {g,u}id values in their ACL_{GROUP,USER} entries that should actually have
+ * been stored as k{g,u}id_t.
+ *
+ * This means vfs_set_acl_prepare() needs to first recover the k{g,u}id_t by
+ * calling K{G,U}IDT_INIT(). Afterwards they can be interpreted as vfs{g,u}id_t
+ * through from_vfs{g,u}id() to account for any idmapped mounts. The
+ * vfs_set_acl_prepare_k{g,u}id() helpers will take care to generate the
+ * correct k{g,u}id_t.
+ *
+ * The filesystem will then receive the POSIX ACLs ready to be cached
+ * filesystem wide and ready to be written to the backing store taking the
+ * filesystem's idmapping into account.
+ *
+ * Return: Allocated struct posix_acl on success, NULL for a valid header but
+ * without actual POSIX ACL entries, or ERR_PTR() encoded error code.
+ */
+struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ const void *value, size_t size)
+{
+ return make_posix_acl(mnt_userns, fs_userns, value, size,
+ vfs_set_acl_prepare_kuid,
+ vfs_set_acl_prepare_kgid);
+}
+EXPORT_SYMBOL(vfs_set_acl_prepare);
+
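A hedged sketch of the intended call-site shape for vfs_set_acl_prepare() (the filesystem and handler names are hypothetical; compare posix_acl_xattr_set() further down):

static int foo_set_acl_xattr(struct user_namespace *mnt_userns,
			     struct inode *inode,
			     const void *value, size_t size)
{
	struct posix_acl *acl = NULL;

	if (value) {
		acl = vfs_set_acl_prepare(mnt_userns, i_user_ns(inode),
					  value, size);
		if (IS_ERR(acl))
			return PTR_ERR(acl);
	}
	/* ... hand @acl to the filesystem; it is now safe to cache ... */
	posix_acl_release(acl);		/* NULL-safe */
	return 0;
}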
+/**
+ * posix_acl_from_xattr_kuid - map ACL_USER uid into filesystem idmapping
+ * @mnt_userns: unused
+ * @fs_userns: the filesystem's idmapping
+ * @e: an ACL_USER entry in POSIX ACL uapi format
+ *
+ * Map the uid stored as ACL_USER entry in @e into the filesystem's idmapping.
+ * This is used in posix_acl_from_xattr() to generate the proper VFS
+ * representation of POSIX ACLs with ACL_USER entries.
+ *
+ * Return: A kuid in @fs_userns for the uid stored in @e.
+ */
+static inline kuid_t
+posix_acl_from_xattr_kuid(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ const struct posix_acl_xattr_entry *e)
+{
+ return make_kuid(fs_userns, le32_to_cpu(e->e_id));
+}
+
+/**
+ * posix_acl_from_xattr_kgid - map ACL_GROUP gid into filesystem idmapping
+ * @mnt_userns: unused
+ * @fs_userns: the filesystem's idmapping
+ * @e: an ACL_GROUP entry in POSIX ACL uapi format
+ *
+ * Map the gid stored as ACL_GROUP entry in @e into the filesystem's idmapping.
+ * This is used in posix_acl_from_xattr() to generate the proper VFS
+ * representation of POSIX ACLs with ACL_GROUP entries.
+ *
+ * Return: A kgid in @fs_userns for the gid stored in @e.
+ */
+static inline kgid_t
+posix_acl_from_xattr_kgid(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ const struct posix_acl_xattr_entry *e)
+{
+ return make_kgid(fs_userns, le32_to_cpu(e->e_id));
+}
+
+/**
+ * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format
+ * @fs_userns: the filesystem's idmapping
+ * @value: the uapi representation of POSIX ACLs
+ * @size: the size of @value
+ *
+ * Filesystems that store POSIX ACLs in the unaltered uapi format should use
+ * posix_acl_from_xattr() when reading them from the backing store and
+ * converting them into the struct posix_acl VFS format. The helper is
+ * specifically intended to be called from the ->get_acl() inode operation.
+ *
+ * The posix_acl_from_xattr() function will map the raw {g,u}id values stored
+ * in ACL_{GROUP,USER} entries into the filesystem idmapping in @fs_userns. The
+ * posix_acl_from_xattr_k{g,u}id() helpers will take care to generate the
+ * correct k{g,u}id_t. The returned struct posix_acl can be cached.
+ *
+ * Note that posix_acl_from_xattr() does not take idmapped mounts into account.
+ * If it did, then calling it from the ->get_acl() inode operation would return
+ * POSIX ACLs mapped according to an idmapped mount, which would mean that the
+ * value couldn't be cached for the filesystem. Idmapped mounts are taken into
+ * account on the fly during permission checking or right at the VFS-userspace
+ * boundary before reporting them to the user.
+ *
+ * Return: Allocated struct posix_acl on success, NULL for a valid header but
+ * without actual POSIX ACL entries, or ERR_PTR() encoded error code.
+ */
+struct posix_acl *
+posix_acl_from_xattr(struct user_namespace *fs_userns,
+ const void *value, size_t size)
+{
+ return make_posix_acl(&init_user_ns, fs_userns, value, size,
+ posix_acl_from_xattr_kuid,
+ posix_acl_from_xattr_kgid);
+}
EXPORT_SYMBOL (posix_acl_from_xattr);
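A minimal ->get_acl() sketch showing the intended read-side pairing
(hypothetical; example_fs_read_acl_xattr() stands in for the filesystem's own
backing-store read returning the raw uapi blob):

	static struct posix_acl *example_fs_get_acl(struct inode *inode,
						    int type, bool rcu)
	{
		struct posix_acl *acl;
		void *value;
		ssize_t size;

		if (rcu)
			return ERR_PTR(-ECHILD);

		size = example_fs_read_acl_xattr(inode, type, &value);
		if (size < 0)
			return size == -ENODATA ? NULL : ERR_PTR(size);

		/*
		 * Map the raw {g,u}ids into the filesystem's idmapping.
		 * Idmapped mounts are deliberately not applied here so the
		 * result stays cacheable.
		 */
		acl = posix_acl_from_xattr(i_user_ns(inode), value, size);
		kfree(value);
		return acl;
	}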
/*
@@ -948,7 +1171,17 @@ posix_acl_xattr_set(const struct xattr_handler *handler,
int ret;
if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ /*
+ * By the time we end up here the {g,u}ids stored in
+ * ACL_{GROUP,USER} have already been mapped according to the
+ * caller's idmapping. The vfs_set_acl_prepare() helper will
+ * recover them and take idmapped mounts into account. The
+ * filesystem will receive the POSIX ACLs in the correct
+ * format ready to be cached or written to the backing store
+ * taking the filesystem idmapping into account.
+ */
+ acl = vfs_set_acl_prepare(mnt_userns, i_user_ns(inode),
+ value, size);
if (IS_ERR(acl))
return PTR_ERR(acl);
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7d1c3114d496..49283b8103c7 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -69,7 +69,6 @@
#include <linux/sched/cputime.h>
#include <linux/proc_fs.h>
#include <linux/ioport.h>
-#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
@@ -100,6 +99,10 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
{
char tcomm[64];
+ /*
+ * Test before PF_KTHREAD because all workqueue worker threads are
+ * kernel threads.
+ */
if (p->flags & PF_WQ_WORKER)
wq_worker_comm(tcomm, sizeof(tcomm), p);
else if (p->flags & PF_KTHREAD)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c1031843cc6a..2d9429bf51fa 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1761,7 +1761,7 @@ out:
return ERR_PTR(error);
}
-static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
+static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen)
{
char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
char *pathname;
@@ -1885,7 +1885,7 @@ void proc_pid_evict_inode(struct proc_inode *ei)
put_pid(pid);
}
-struct inode *proc_pid_make_inode(struct super_block * sb,
+struct inode *proc_pid_make_inode(struct super_block *sb,
struct task_struct *task, umode_t mode)
{
struct inode * inode;
@@ -1914,11 +1914,6 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
/* Let the pid remember us for quick removal */
ei->pid = pid;
- if (S_ISDIR(mode)) {
- spin_lock(&pid->lock);
- hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
- spin_unlock(&pid->lock);
- }
task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
security_task_to_inode(task, inode);
@@ -1931,6 +1926,39 @@ out_unlock:
return NULL;
}
+/*
+ * Generate an inode and add it into @pid->inodes, so that the task will
+ * invalidate the inode's dentry before it is released.
+ *
+ * This helper is used for creating dir-type entries under '/proc' and
+ * '/proc/<tgid>/task'. Other entries (e.g. fd, stat) under '/proc/<tgid>'
+ * can be released by invalidating the '/proc/<tgid>' dentry.
+ * In theory, dentries under '/proc/<tgid>/task' could also be released by
+ * invalidating the '/proc/<tgid>' dentry, but we keep this path to handle
+ * the single-thread-exit case: each thread should invalidate its
+ * '/proc/<tgid>/task/<pid>' dentry before it is released.
+ */
+static struct inode *proc_pid_make_base_inode(struct super_block *sb,
+ struct task_struct *task, umode_t mode)
+{
+ struct inode *inode;
+ struct proc_inode *ei;
+ struct pid *pid;
+
+ inode = proc_pid_make_inode(sb, task, mode);
+ if (!inode)
+ return NULL;
+
+ /* Let proc_flush_pid find this directory inode */
+ ei = PROC_I(inode);
+ pid = ei->pid;
+ spin_lock(&pid->lock);
+ hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+ spin_unlock(&pid->lock);
+
+ return inode;
+}
+
int pid_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
@@ -2700,7 +2728,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
return -ESRCH;
length = security_getprocattr(task, PROC_I(inode)->op.lsm,
- (char*)file->f_path.dentry->d_name.name,
+ file->f_path.dentry->d_name.name,
&p);
put_task_struct(task);
if (length > 0)
@@ -3154,6 +3182,22 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
}
#endif /* CONFIG_LIVEPATCH */
+#ifdef CONFIG_KSM
+static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct mm_struct *mm;
+
+ mm = get_task_mm(task);
+ if (mm) {
+ seq_printf(m, "%lu\n", mm->ksm_merging_pages);
+ mmput(mm);
+ }
+
+ return 0;
+}
+#endif /* CONFIG_KSM */
+
#ifdef CONFIG_STACKLEAK_METRICS
static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
@@ -3285,6 +3329,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
+#ifdef CONFIG_KSM
+ ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3350,7 +3397,8 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
{
struct inode *inode;
- inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ inode = proc_pid_make_base_inode(dentry->d_sb, task,
+ S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
return ERR_PTR(-ENOENT);
@@ -3618,6 +3666,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
+#ifdef CONFIG_KSM
+ ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
+#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3649,7 +3700,8 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry,
struct task_struct *task, const void *ptr)
{
struct inode *inode;
- inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ inode = proc_pid_make_base_inode(dentry->d_sb, task,
+ S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
return ERR_PTR(-ENOENT);
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 419760fd77bd..f38bda5b83ec 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -5,14 +5,10 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-__weak void arch_freq_prepare_all(void)
-{
-}
-
extern const struct seq_operations cpuinfo_op;
+
static int cpuinfo_open(struct inode *inode, struct file *file)
{
- arch_freq_prepare_all();
return seq_open(file, &cpuinfo_op);
}
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 172c86270b31..913bef0d2a36 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -72,7 +72,7 @@ out:
return 0;
}
-static int seq_fdinfo_open(struct inode *inode, struct file *file)
+static int proc_fdinfo_access_allowed(struct inode *inode)
{
bool allowed = false;
struct task_struct *task = get_proc_task(inode);
@@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file)
if (!allowed)
return -EACCES;
+ return 0;
+}
+
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
return single_open(file, seq_show, inode);
}
@@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
proc_fdinfo_instantiate);
}
+static int proc_open_fdinfo(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
const struct inode_operations proc_fdinfo_inode_operations = {
.lookup = proc_lookupfdinfo,
.setattr = proc_setattr,
};
const struct file_operations proc_fdinfo_operations = {
+ .open = proc_open_fdinfo,
.read = generic_read_dir,
.iterate_shared = proc_readfdinfo,
.llseek = generic_file_llseek,
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index f2132407e133..587b91d9d998 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -448,6 +448,9 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
ent->proc_dops = &proc_misc_dentry_ops;
+ /* Revalidate everything under /proc/${pid}/net */
+ if ((*parent)->proc_dops == &proc_net_dentry_ops)
+ pde_force_lookup(ent);
out:
return ent;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 73aeb4e6d32e..f495fdb39151 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -26,8 +26,6 @@
#include <linux/mount.h>
#include <linux/bug.h>
-#include <linux/uaccess.h>
-
#include "internal.h"
static void proc_evict_inode(struct inode *inode)
@@ -214,7 +212,15 @@ static void unuse_pde(struct proc_dir_entry *pde)
complete(pde->pde_unload_completion);
}
-/* pde is locked on entry, unlocked on exit */
+/*
+ * At most 2 contexts can enter this function: the one doing the last
+ * close on the descriptor and whoever is deleting PDE itself.
+ *
+ * First to enter calls ->proc_release hook and signals its completion
+ * to the second one which waits and then does nothing.
+ *
+ * PDE is locked on entry, unlocked on exit.
+ */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
__releases(&pde->pde_unload_lock)
{
@@ -224,9 +230,6 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
*
* rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
* "struct file" needs to be available at the right moment.
- *
- * Therefore, first process to enter this function does ->release() and
- * signals its completion to the other process which does nothing.
*/
if (pdeo->closing) {
/* somebody else is doing that, just wait */
@@ -240,10 +243,12 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
pdeo->closing = true;
spin_unlock(&pde->pde_unload_lock);
+
file = pdeo->file;
pde->proc_ops->proc_release(file_inode(file), file);
+
spin_lock(&pde->pde_unload_lock);
- /* After ->release. */
+ /* Strictly after ->proc_release, see above. */
list_del(&pdeo->lh);
c = pdeo->c;
spin_unlock(&pde->pde_unload_lock);
@@ -489,6 +494,9 @@ static int proc_reg_open(struct inode *inode, struct file *file)
typeof_member(struct proc_ops, proc_release) release;
struct pde_opener *pdeo;
+ if (!pde->proc_ops->proc_lseek)
+ file->f_mode &= ~FMODE_LSEEK;
+
if (pde_is_permanent(pde)) {
open = pde->proc_ops->proc_open;
if (open)
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 982e694aae77..dff921f7ca33 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -479,10 +479,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
* the previous entry, search for a matching entry.
*/
if (!m || start < m->addr || start >= m->addr + m->size) {
- list_for_each_entry(m, &kclist_head, list) {
- if (start >= m->addr &&
- start < m->addr + m->size)
+ struct kcore_list *iter;
+
+ m = NULL;
+ list_for_each_entry(iter, &kclist_head, list) {
+ if (start >= iter->addr &&
+ start < iter->addr + iter->size) {
+ m = iter;
break;
+ }
}
}
@@ -492,12 +497,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
page_offline_freeze();
}
- if (&m->list == &kclist_head) {
+ if (!m) {
if (clear_user(buffer, tsz)) {
ret = -EFAULT;
goto out;
}
- m = NULL; /* skip the list anchor */
goto skip;
}
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index b38ad552887f..592e6dc7c110 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -15,7 +15,6 @@
#include <linux/fs.h>
#include <linux/syslog.h>
-#include <linux/uaccess.h>
#include <asm/io.h>
extern wait_queue_head_t log_wait;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 6fa761c9cc78..208efd4fa52c 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -86,6 +86,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SwapTotal: ", i.totalswap);
show_val_kb(m, "SwapFree: ", i.freeswap);
+#ifdef CONFIG_ZSWAP
+ seq_printf(m, "Zswap: %8lu kB\n",
+ (unsigned long)(zswap_pool_total_size >> 10));
+ seq_printf(m, "Zswapped: %8lu kB\n",
+ (unsigned long)atomic_read(&zswap_stored_pages) <<
+ (PAGE_SHIFT - 10));
+#endif
show_val_kb(m, "Dirty: ",
global_node_page_state(NR_FILE_DIRTY));
show_val_kb(m, "Writeback: ",
@@ -108,6 +115,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#endif
show_val_kb(m, "PageTables: ",
global_node_page_state(NR_PAGETABLE));
+ show_val_kb(m, "SecPageTables: ",
+ global_node_page_state(NR_SECONDARY_PAGETABLE));
show_val_kb(m, "NFS_Unstable: ", 0);
show_val_kb(m, "Bounce: ",
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 13452b32e2bd..4d3493579458 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
-#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/div64.h>
#include "internal.h"
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index e1cfeda397f3..856839b8ae8b 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -8,9 +8,6 @@
*
* proc net directory handling functions
*/
-
-#include <linux/uaccess.h>
-
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
@@ -353,6 +350,12 @@ static __net_init int proc_net_ns_init(struct net *net)
kgid_t gid;
int err;
+ /*
+ * This PDE acts only as an anchor for the /proc/${pid}/net hierarchy.
+ * The corresponding inode (PDE(inode) == net->proc_net) is never
+ * instantiated, therefore blanket zeroing is fine.
+ * net->proc_net_stat inode is instantiated normally.
+ */
err = -ENOMEM;
netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
if (!netd)
@@ -376,6 +379,9 @@ static __net_init int proc_net_ns_init(struct net *net)
proc_set_user(netd, uid, gid);
+ /* Seed dentry revalidation for /proc/${pid}/net */
+ pde_force_lookup(netd);
+
err = -EEXIST;
net_statd = proc_net_mkdir(net, "stat", netd);
if (!net_statd)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 7d9cfc730bd4..021e83fe831f 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -19,6 +19,9 @@
#include <linux/kmemleak.h>
#include "internal.h"
+#define list_for_each_table_entry(entry, table) \
+ for ((entry) = (table); (entry)->procname; (entry)++)
+
static const struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
static const struct inode_operations proc_sys_inode_operations;
@@ -26,7 +29,7 @@ static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
/* shared constants to be used in various sysctls */
-const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX, 65535 };
+const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
EXPORT_SYMBOL(sysctl_vals);
const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX };
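For context, the SYSCTL_* convenience pointers in include/linux/sysctl.h index
into sysctl_vals by position, so a typical bounded integer sysctl looks like
the sketch below (hypothetical table, assuming the macros track the reordered
layout):

	static int example_knob;

	static struct ctl_table example_table[] = {
		{
			.procname	= "example_knob",
			.data		= &example_knob,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_minmax,
			.extra1		= SYSCTL_ZERO,		/* &sysctl_vals[0] */
			.extra2		= SYSCTL_ONE_HUNDRED,
		},
		{ }	/* empty procname terminates list_for_each_table_entry() */
	};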
@@ -215,15 +218,19 @@ static void init_header(struct ctl_table_header *head,
INIT_HLIST_HEAD(&head->inodes);
if (node) {
struct ctl_table *entry;
- for (entry = table; entry->procname; entry++, node++)
+
+ list_for_each_table_entry(entry, table) {
node->header = head;
+ node++;
+ }
}
}
static void erase_header(struct ctl_table_header *head)
{
struct ctl_table *entry;
- for (entry = head->ctl_table; entry->procname; entry++)
+
+ list_for_each_table_entry(entry, head->ctl_table)
erase_entry(head, entry);
}
@@ -248,7 +255,7 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
err = insert_links(header);
if (err)
goto fail_links;
- for (entry = header->ctl_table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header->ctl_table) {
err = insert_entry(header, entry);
if (err)
goto fail;
@@ -978,7 +985,6 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set,
table = (struct ctl_table *)(node + 1);
new_name = (char *)(table + 2);
memcpy(new_name, name, namelen);
- new_name[namelen] = '\0';
table[0].procname = new_name;
table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
init_header(&new->header, set->dir.header.root, set, node, table);
@@ -1130,35 +1136,36 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
static int sysctl_check_table(const char *path, struct ctl_table *table)
{
+ struct ctl_table *entry;
int err = 0;
- for (; table->procname; table++) {
- if (table->child)
- err |= sysctl_err(path, table, "Not a file");
-
- if ((table->proc_handler == proc_dostring) ||
- (table->proc_handler == proc_dointvec) ||
- (table->proc_handler == proc_douintvec) ||
- (table->proc_handler == proc_douintvec_minmax) ||
- (table->proc_handler == proc_dointvec_minmax) ||
- (table->proc_handler == proc_dou8vec_minmax) ||
- (table->proc_handler == proc_dointvec_jiffies) ||
- (table->proc_handler == proc_dointvec_userhz_jiffies) ||
- (table->proc_handler == proc_dointvec_ms_jiffies) ||
- (table->proc_handler == proc_doulongvec_minmax) ||
- (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
- if (!table->data)
- err |= sysctl_err(path, table, "No data");
- if (!table->maxlen)
- err |= sysctl_err(path, table, "No maxlen");
+ list_for_each_table_entry(entry, table) {
+ if (entry->child)
+ err |= sysctl_err(path, entry, "Not a file");
+
+ if ((entry->proc_handler == proc_dostring) ||
+ (entry->proc_handler == proc_dointvec) ||
+ (entry->proc_handler == proc_douintvec) ||
+ (entry->proc_handler == proc_douintvec_minmax) ||
+ (entry->proc_handler == proc_dointvec_minmax) ||
+ (entry->proc_handler == proc_dou8vec_minmax) ||
+ (entry->proc_handler == proc_dointvec_jiffies) ||
+ (entry->proc_handler == proc_dointvec_userhz_jiffies) ||
+ (entry->proc_handler == proc_dointvec_ms_jiffies) ||
+ (entry->proc_handler == proc_doulongvec_minmax) ||
+ (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
+ if (!entry->data)
+ err |= sysctl_err(path, entry, "No data");
+ if (!entry->maxlen)
+ err |= sysctl_err(path, entry, "No maxlen");
else
- err |= sysctl_check_table_array(path, table);
+ err |= sysctl_check_table_array(path, entry);
}
- if (!table->proc_handler)
- err |= sysctl_err(path, table, "No proc_handler");
+ if (!entry->proc_handler)
+ err |= sysctl_err(path, entry, "No proc_handler");
- if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
- err |= sysctl_err(path, table, "bogus .mode 0%o",
- table->mode);
+ if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode)
+ err |= sysctl_err(path, entry, "bogus .mode 0%o",
+ entry->mode);
}
return err;
}
@@ -1174,7 +1181,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table
name_bytes = 0;
nr_entries = 0;
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
nr_entries++;
name_bytes += strlen(entry->procname) + 1;
}
@@ -1191,14 +1198,16 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table
node = (struct ctl_node *)(links + 1);
link_table = (struct ctl_table *)(node + nr_entries);
link_name = (char *)&link_table[nr_entries + 1];
+ link = link_table;
- for (link = link_table, entry = table; entry->procname; link++, entry++) {
+ list_for_each_table_entry(entry, table) {
int len = strlen(entry->procname) + 1;
memcpy(link_name, entry->procname, len);
link->procname = link_name;
link->mode = S_IFLNK|S_IRWXUGO;
link->data = link_root;
link_name += len;
+ link++;
}
init_header(links, dir->header.root, dir->header.set, node, link_table);
links->nreg = nr_entries;
@@ -1213,7 +1222,7 @@ static bool get_links(struct ctl_dir *dir,
struct ctl_table *entry, *link;
/* Are there links available for every entry in table? */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
const char *procname = entry->procname;
link = find_entry(&head, dir, procname, strlen(procname));
if (!link)
@@ -1226,7 +1235,7 @@ static bool get_links(struct ctl_dir *dir,
}
/* The checks passed. Increase the registration count on the links */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
const char *procname = entry->procname;
link = find_entry(&head, dir, procname, strlen(procname));
head->nreg++;
@@ -1329,11 +1338,11 @@ struct ctl_table_header *__register_sysctl_table(
struct ctl_node *node;
int nr_entries = 0;
- for (entry = table; entry->procname; entry++)
+ list_for_each_table_entry(entry, table)
nr_entries++;
header = kzalloc(sizeof(struct ctl_table_header) +
- sizeof(struct ctl_node)*nr_entries, GFP_KERNEL);
+ sizeof(struct ctl_node)*nr_entries, GFP_KERNEL_ACCOUNT);
if (!header)
return NULL;
@@ -1456,7 +1465,7 @@ static int count_subheaders(struct ctl_table *table)
if (!table || !table->procname)
return 1;
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
if (entry->child)
nr_subheaders += count_subheaders(entry->child);
else
@@ -1475,7 +1484,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos,
int nr_dirs = 0;
int err = -ENOMEM;
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
if (entry->child)
nr_dirs++;
else
@@ -1492,7 +1501,9 @@ static int register_leaf_sysctl_tables(const char *path, char *pos,
goto out;
ctl_table_arg = files;
- for (new = files, entry = table; entry->procname; entry++) {
+ new = files;
+
+ list_for_each_table_entry(entry, table) {
if (entry->child)
continue;
*new = *entry;
@@ -1516,7 +1527,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos,
}
/* Recurse into the subdirectories. */
- for (entry = table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, table) {
char *child_pos;
if (!entry->child)
@@ -1670,7 +1681,7 @@ static void put_links(struct ctl_table_header *header)
if (IS_ERR(core_parent))
return;
- for (entry = header->ctl_table; entry->procname; entry++) {
+ list_for_each_table_entry(entry, header->ctl_table) {
struct ctl_table_header *link_head;
struct ctl_table *link;
const char *name = entry->procname;
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index c69ff191e5d8..5c6a5ceab2f1 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -4,8 +4,6 @@
*
* Copyright 1997, Theodore Ts'o
*/
-
-#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c7e3b1350ef8..3c2ee3eb1138 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -6,9 +6,6 @@
*
* proc root directory handling functions
*/
-
-#include <linux/uaccess.h>
-
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
@@ -305,6 +302,11 @@ void __init proc_root_init(void)
proc_mkdir("bus", NULL);
proc_sys_init();
+ /*
+	 * Last things last. It is not as if userspace processes eager
+	 * to open /proc files exist at this point, but register last
+	 * anyway.
+ */
register_filesystem(&proc_fs_type);
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f46060eb91b5..4e0023643f8b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -406,6 +406,7 @@ struct mem_size_stats {
u64 pss_anon;
u64 pss_file;
u64 pss_shmem;
+ u64 pss_dirty;
u64 pss_locked;
u64 swap_pss;
};
@@ -427,6 +428,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
mss->pss_locked += pss;
if (dirty || PageDirty(page)) {
+ mss->pss_dirty += pss;
if (private)
mss->private_dirty += size;
else
@@ -525,10 +527,12 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
- bool migration = false;
+ bool migration = false, young = false, dirty = false;
if (pte_present(*pte)) {
page = vm_normal_page(vma, addr, *pte);
+ young = pte_young(*pte);
+ dirty = pte_dirty(*pte);
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
@@ -558,8 +562,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
- locked, migration);
+ smaps_account(mss, page, false, young, dirty, locked, migration);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -808,6 +811,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
{
SEQ_PUT_DEC("Rss: ", mss->resident);
SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
+ SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT);
if (rollup_mode) {
/*
* These are meaningful only for smaps_rollup, otherwise two of
@@ -860,7 +864,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %d\n",
- transparent_hugepage_active(vma));
+ hugepage_vma_check(vma, vma->vm_flags, true, false));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -1421,6 +1425,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
migration = is_migration_entry(entry);
if (is_pfn_swap_entry(entry))
page = pfn_swap_entry_to_page(entry);
+ if (pte_marker_entry_uffd_wp(entry))
+ flags |= PM_UFFD_WP;
}
if (page && !PageAnon(page))
@@ -1556,10 +1562,15 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
if (page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
+ if (huge_pte_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
+
flags |= PM_PRESENT;
if (pm->show_pfn)
frame = pte_pfn(pte) +
((addr & ~hmask) >> PAGE_SHIFT);
+ } else if (pte_swp_uffd_wp_any(pte)) {
+ flags |= PM_UFFD_WP;
}
for (; addr != end; addr += PAGE_SIZE) {
@@ -1785,7 +1796,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
return NULL;
page = vm_normal_page(vma, addr, pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
return NULL;
if (PageReserved(page))
@@ -1873,8 +1884,6 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
return 0;
page = pte_page(huge_pte);
- if (!page)
- return 0;
md = walk->private;
gather_stats(page, md, pte_dirty(huge_pte), 1);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 6f1b8ddc6f7a..f2aa86c421f2 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -25,7 +25,7 @@
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <linux/cc_platform.h>
#include <asm/io.h>
#include "internal.h"
@@ -128,9 +128,8 @@ static int open_vmcore(struct inode *inode, struct file *file)
}
/* Reads a page from the oldmem device from given offset. */
-ssize_t read_from_oldmem(char *buf, size_t count,
- u64 *ppos, int userbuf,
- bool encrypted)
+ssize_t read_from_oldmem(struct iov_iter *iter, size_t count,
+ u64 *ppos, bool encrypted)
{
unsigned long pfn, offset;
size_t nr_bytes;
@@ -152,29 +151,23 @@ ssize_t read_from_oldmem(char *buf, size_t count,
/* If pfn is not ram, return zeros for sparse dump files */
if (!pfn_is_ram(pfn)) {
- tmp = 0;
- if (!userbuf)
- memset(buf, 0, nr_bytes);
- else if (clear_user(buf, nr_bytes))
- tmp = -EFAULT;
+ tmp = iov_iter_zero(nr_bytes, iter);
} else {
if (encrypted)
- tmp = copy_oldmem_page_encrypted(pfn, buf,
+ tmp = copy_oldmem_page_encrypted(iter, pfn,
nr_bytes,
- offset,
- userbuf);
+ offset);
else
- tmp = copy_oldmem_page(pfn, buf, nr_bytes,
- offset, userbuf);
+ tmp = copy_oldmem_page(iter, pfn, nr_bytes,
+ offset);
}
- if (tmp < 0) {
+ if (tmp < nr_bytes) {
srcu_read_unlock(&vmcore_cb_srcu, idx);
- return tmp;
+ return -EFAULT;
}
*ppos += nr_bytes;
count -= nr_bytes;
- buf += nr_bytes;
read += nr_bytes;
++pfn;
offset = 0;
@@ -203,7 +196,12 @@ void __weak elfcorehdr_free(unsigned long long addr)
*/
ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0, false);
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, READ, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos, false);
}
/*
@@ -211,7 +209,13 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
*/
ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, READ, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos,
+ cc_platform_has(CC_ATTR_MEM_ENCRYPT));
}
/*
@@ -228,29 +232,14 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
/*
* Architectures which support memory encryption override this.
*/
-ssize_t __weak
-copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
- unsigned long offset, int userbuf)
+ssize_t __weak copy_oldmem_page_encrypted(struct iov_iter *iter,
+ unsigned long pfn, size_t csize, unsigned long offset)
{
- return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
-}
-
-/*
- * Copy to either kernel or user space
- */
-static int copy_to(void *target, void *src, size_t size, int userbuf)
-{
- if (userbuf) {
- if (copy_to_user((char __user *) target, src, size))
- return -EFAULT;
- } else {
- memcpy(target, src, size);
- }
- return 0;
+ return copy_oldmem_page(iter, pfn, csize, offset);
}
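For reference, an architecture override of the iov_iter-based hook typically
just remaps the old-memory pfn and copies into the iterator; a sketch along
the lines of the x86 implementation (details hedged, not part of this patch):

	ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn,
				 size_t csize, unsigned long offset)
	{
		void *vaddr;
		size_t copied;

		if (!csize)
			return 0;

		vaddr = memremap(PFN_PHYS(pfn), PAGE_SIZE, MEMREMAP_WB);
		if (!vaddr)
			return -ENOMEM;

		/*
		 * A short copy surfaces as a short count, which
		 * read_from_oldmem() above now turns into -EFAULT.
		 */
		copied = copy_to_iter(vaddr + offset, csize, iter);
		memunmap(vaddr);
		return copied;
	}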
#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
-static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf)
+static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size)
{
struct vmcoredd_node *dump;
u64 offset = 0;
@@ -263,14 +252,13 @@ static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf)
if (start < offset + dump->size) {
tsz = min(offset + (u64)dump->size - start, (u64)size);
buf = dump->buf + start - offset;
- if (copy_to(dst, buf, tsz, userbuf)) {
+ if (copy_to_iter(buf, tsz, iter) < tsz) {
ret = -EFAULT;
goto out_unlock;
}
size -= tsz;
start += tsz;
- dst += tsz;
/* Leave now if buffer filled already */
if (!size)
@@ -326,33 +314,28 @@ out_unlock:
/* Read from the ELF header and then the crash dump. On error, a negative value
 * is returned; otherwise, the number of bytes read is returned.
*/
-static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
- int userbuf)
+static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos)
{
ssize_t acc = 0, tmp;
size_t tsz;
u64 start;
struct vmcore *m = NULL;
- if (buflen == 0 || *fpos >= vmcore_size)
+ if (!iov_iter_count(iter) || *fpos >= vmcore_size)
return 0;
- /* trim buflen to not go beyond EOF */
- if (buflen > vmcore_size - *fpos)
- buflen = vmcore_size - *fpos;
+ iov_iter_truncate(iter, vmcore_size - *fpos);
/* Read ELF core header */
if (*fpos < elfcorebuf_sz) {
- tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
- if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf))
+ tsz = min(elfcorebuf_sz - (size_t)*fpos, iov_iter_count(iter));
+ if (copy_to_iter(elfcorebuf + *fpos, tsz, iter) < tsz)
return -EFAULT;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
}
@@ -373,35 +356,32 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
/* Read device dumps */
if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) {
tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
- (size_t)*fpos, buflen);
+ (size_t)*fpos, iov_iter_count(iter));
start = *fpos - elfcorebuf_sz;
- if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf))
+ if (vmcoredd_copy_dumps(iter, start, tsz))
return -EFAULT;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (!buflen)
+ if (!iov_iter_count(iter))
return acc;
}
#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
/* Read remaining elf notes */
- tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
+ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos,
+ iov_iter_count(iter));
kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz;
- if (copy_to(buffer, kaddr, tsz, userbuf))
+ if (copy_to_iter(kaddr, tsz, iter) < tsz)
return -EFAULT;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
}
@@ -409,19 +389,17 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
if (*fpos < m->offset + m->size) {
tsz = (size_t)min_t(unsigned long long,
m->offset + m->size - *fpos,
- buflen);
+ iov_iter_count(iter));
start = m->paddr + *fpos - m->offset;
- tmp = read_from_oldmem(buffer, tsz, &start,
- userbuf, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
+ tmp = read_from_oldmem(iter, tsz, &start,
+ cc_platform_has(CC_ATTR_MEM_ENCRYPT));
if (tmp < 0)
return tmp;
- buflen -= tsz;
*fpos += tsz;
- buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
- if (buflen == 0)
+ if (!iov_iter_count(iter))
return acc;
}
}
@@ -429,15 +407,14 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
return acc;
}
-static ssize_t read_vmcore(struct file *file, char __user *buffer,
- size_t buflen, loff_t *fpos)
+static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter)
{
- return __read_vmcore((__force char *) buffer, buflen, fpos, 1);
+ return __read_vmcore(iter, &iocb->ki_pos);
}
/*
* The vmcore fault handler uses the page cache and fills data using the
- * standard __vmcore_read() function.
+ * standard __read_vmcore() function.
*
* On s390 the fault handler is used for memory regions that can't be mapped
* directly with remap_pfn_range().
@@ -447,9 +424,10 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
#ifdef CONFIG_S390
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
pgoff_t index = vmf->pgoff;
+ struct iov_iter iter;
+ struct kvec kvec;
struct page *page;
loff_t offset;
- char *buf;
int rc;
page = find_or_create_page(mapping, index, GFP_KERNEL);
@@ -457,8 +435,11 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
return VM_FAULT_OOM;
if (!PageUptodate(page)) {
offset = (loff_t) index << PAGE_SHIFT;
- buf = __va((page_to_pfn(page) << PAGE_SHIFT));
- rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0);
+ kvec.iov_base = page_address(page);
+ kvec.iov_len = PAGE_SIZE;
+ iov_iter_kvec(&iter, READ, &kvec, 1, PAGE_SIZE);
+
+ rc = __read_vmcore(&iter, &offset);
if (rc < 0) {
unlock_page(page);
put_page(page);
@@ -708,7 +689,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
static const struct proc_ops vmcore_proc_ops = {
.proc_open = open_vmcore,
- .proc_read = read_vmcore,
+ .proc_read_iter = read_vmcore,
.proc_lseek = default_llseek,
.proc_mmap = mmap_vmcore,
};
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 49650e54d2f8..846f9455ae22 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -86,7 +86,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
static inline void mangle(struct seq_file *m, const char *s)
{
- seq_escape(m, s, " \t\n\\");
+ seq_escape(m, s, " \t\n\\#");
}
static void show_type(struct seq_file *m, struct super_block *sb)
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 14658b009f1b..ffbadb8b3032 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -55,6 +55,7 @@ static void free_pstore_private(struct pstore_private *private)
return;
if (private->record) {
kfree(private->record->buf);
+ kfree(private->record->priv);
kfree(private->record);
}
kfree(private);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index e26162f102ff..0c034ea39954 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -769,6 +769,7 @@ void pstore_get_backend_records(struct pstore_info *psi,
if (rc) {
/* pstore_mkfile() did not take record, so free it. */
kfree(record->buf);
+ kfree(record->priv);
kfree(record);
if (rc != -EEXIST || !quiet)
failed++;
diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
index 7c8f8feac6c3..017d0d4ad329 100644
--- a/fs/pstore/zone.c
+++ b/fs/pstore/zone.c
@@ -363,7 +363,7 @@ static int psz_kmsg_recover_data(struct psz_context *cxt)
rcnt = info->read((char *)buf, zone->buffer_size + sizeof(*buf),
zone->off);
if (rcnt != zone->buffer_size + sizeof(*buf))
- return (int)rcnt < 0 ? (int)rcnt : -EIO;
+ return rcnt < 0 ? rcnt : -EIO;
}
return 0;
}
@@ -372,7 +372,7 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt)
{
struct pstore_zone_info *info = cxt->pstore_zone_info;
struct pstore_zone *zone;
- size_t rcnt, len;
+ ssize_t rcnt, len;
struct psz_buffer *buf;
struct psz_kmsg_header *hdr;
struct timespec64 time = { };
@@ -400,7 +400,7 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt)
continue;
} else if (rcnt != len) {
pr_err("read %s with id %lu failed\n", zone->name, i);
- return (int)rcnt < 0 ? (int)rcnt : -EIO;
+ return rcnt < 0 ? rcnt : -EIO;
}
if (buf->sig != zone->buffer->sig) {
@@ -502,7 +502,7 @@ static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone)
rcnt = info->read((char *)&tmpbuf, len, zone->off);
if (rcnt != len) {
pr_debug("read zone %s failed\n", zone->name);
- return (int)rcnt < 0 ? (int)rcnt : -EIO;
+ return rcnt < 0 ? rcnt : -EIO;
}
if (tmpbuf.sig != zone->buffer->sig) {
@@ -544,7 +544,7 @@ static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone)
rcnt = info->read(buf, len - start, off + start);
if (rcnt != len - start) {
pr_err("read zone %s failed\n", zone->name);
- ret = (int)rcnt < 0 ? (int)rcnt : -EIO;
+ ret = rcnt < 0 ? rcnt : -EIO;
goto free_oldbuf;
}
@@ -552,7 +552,7 @@ static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone)
rcnt = info->read(buf + len - start, start, off);
if (rcnt != start) {
pr_err("read zone %s failed\n", zone->name);
- ret = (int)rcnt < 0 ? (int)rcnt : -EIO;
+ ret = rcnt < 0 ? rcnt : -EIO;
goto free_oldbuf;
}
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index a635bb6615e9..391ea402920d 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -245,17 +245,18 @@ static void qnx4_kill_sb(struct super_block *sb)
}
}
-static int qnx4_readpage(struct file *file, struct page *page)
+static int qnx4_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page,qnx4_get_block);
+ return block_read_full_folio(folio, qnx4_get_block);
}
static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,qnx4_get_block);
}
+
static const struct address_space_operations qnx4_aops = {
- .readpage = qnx4_readpage,
+ .read_folio = qnx4_read_folio,
.bmap = qnx4_bmap
};
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 9d8e7e9788a1..b9895afca9d1 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -94,9 +94,9 @@ static int qnx6_check_blockptr(__fs32 ptr)
return 1;
}
-static int qnx6_readpage(struct file *file, struct page *page)
+static int qnx6_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, qnx6_get_block);
+ return mpage_read_folio(folio, qnx6_get_block);
}
static void qnx6_readahead(struct readahead_control *rac)
@@ -496,7 +496,7 @@ static sector_t qnx6_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, qnx6_get_block);
}
static const struct address_space_operations qnx6_aops = {
- .readpage = qnx6_readpage,
+ .read_folio = qnx6_read_folio,
.readahead = qnx6_readahead,
.bmap = qnx6_bmap
};
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a74aef99bd3d..0427b44bfee5 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -79,6 +79,7 @@
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
+#include <linux/sched/mm.h>
#include "../internal.h" /* ugh */
#include <linux/uaccess.h>
@@ -425,9 +426,11 @@ EXPORT_SYMBOL(mark_info_dirty);
int dquot_acquire(struct dquot *dquot)
{
int ret = 0, ret2 = 0;
+ unsigned int memalloc;
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
mutex_lock(&dquot->dq_lock);
+ memalloc = memalloc_nofs_save();
if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
if (ret < 0)
@@ -458,6 +461,7 @@ int dquot_acquire(struct dquot *dquot)
smp_mb__before_atomic();
set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_iolock:
+ memalloc_nofs_restore(memalloc);
mutex_unlock(&dquot->dq_lock);
return ret;
}
@@ -469,9 +473,11 @@ EXPORT_SYMBOL(dquot_acquire);
int dquot_commit(struct dquot *dquot)
{
int ret = 0;
+ unsigned int memalloc;
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
mutex_lock(&dquot->dq_lock);
+ memalloc = memalloc_nofs_save();
if (!clear_dquot_dirty(dquot))
goto out_lock;
/* A dquot can be inactive only if there was an error during read/init
@@ -481,6 +487,7 @@ int dquot_commit(struct dquot *dquot)
else
ret = -EIO;
out_lock:
+ memalloc_nofs_restore(memalloc);
mutex_unlock(&dquot->dq_lock);
return ret;
}
@@ -492,9 +499,11 @@ EXPORT_SYMBOL(dquot_commit);
int dquot_release(struct dquot *dquot)
{
int ret = 0, ret2 = 0;
+ unsigned int memalloc;
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
mutex_lock(&dquot->dq_lock);
+ memalloc = memalloc_nofs_save();
/* Check whether we are not racing with some other dqget() */
if (dquot_is_busy(dquot))
goto out_dqlock;
@@ -510,6 +519,7 @@ int dquot_release(struct dquot *dquot)
}
clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_dqlock:
+ memalloc_nofs_restore(memalloc);
mutex_unlock(&dquot->dq_lock);
return ret;
}
@@ -2075,7 +2085,8 @@ EXPORT_SYMBOL(__dquot_transfer);
/* Wrapper for transferring ownership of an inode for uid/gid only
* Called from FSXXX_setattr()
*/
-int dquot_transfer(struct inode *inode, struct iattr *iattr)
+int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode,
+ struct iattr *iattr)
{
struct dquot *transfer_to[MAXQUOTAS] = {};
struct dquot *dquot;
@@ -2085,8 +2096,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
if (!dquot_active(inode))
return 0;
- if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){
- dquot = dqget(sb, make_kqid_uid(iattr->ia_uid));
+ if (i_uid_needs_update(mnt_userns, iattr, inode)) {
+ kuid_t kuid = from_vfsuid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsuid);
+
+ dquot = dqget(sb, make_kqid_uid(kuid));
if (IS_ERR(dquot)) {
if (PTR_ERR(dquot) != -ESRCH) {
ret = PTR_ERR(dquot);
@@ -2096,8 +2110,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
}
transfer_to[USRQUOTA] = dquot;
}
- if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)){
- dquot = dqget(sb, make_kqid_gid(iattr->ia_gid));
+ if (i_gid_needs_update(mnt_userns, iattr, inode)) {
+ kgid_t kgid = from_vfsgid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsgid);
+
+ dquot = dqget(sb, make_kqid_gid(kgid));
if (IS_ERR(dquot)) {
if (PTR_ERR(dquot) != -ESRCH) {
ret = PTR_ERR(dquot);
@@ -2985,7 +3002,7 @@ static int __init dquot_init(void)
pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
" %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));
- if (register_shrinker(&dqcache_shrinker))
+ if (register_shrinker(&dqcache_shrinker, "dquota-cache"))
panic("Cannot register dquot shrinker");
return 0;
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 5f2405994280..0f1493e0f6d0 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -71,6 +71,40 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
return ret;
}
+static inline int do_check_range(struct super_block *sb, const char *val_name,
+ uint val, uint min_val, uint max_val)
+{
+ if (val < min_val || val > max_val) {
+ quota_error(sb, "Getting %s %u out of range %u-%u",
+ val_name, val, min_val, max_val);
+ return -EUCLEAN;
+ }
+
+ return 0;
+}
+
+static int check_dquot_block_header(struct qtree_mem_dqinfo *info,
+ struct qt_disk_dqdbheader *dh)
+{
+ int err = 0;
+
+ err = do_check_range(info->dqi_sb, "dqdh_next_free",
+ le32_to_cpu(dh->dqdh_next_free), 0,
+ info->dqi_blocks - 1);
+ if (err)
+ return err;
+ err = do_check_range(info->dqi_sb, "dqdh_prev_free",
+ le32_to_cpu(dh->dqdh_prev_free), 0,
+ info->dqi_blocks - 1);
+ if (err)
+ return err;
+ err = do_check_range(info->dqi_sb, "dqdh_entries",
+ le16_to_cpu(dh->dqdh_entries), 0,
+ qtree_dqstr_in_blk(info));
+
+ return err;
+}
+
/* Remove empty block from list and return it */
static int get_free_dqblk(struct qtree_mem_dqinfo *info)
{
@@ -85,6 +119,9 @@ static int get_free_dqblk(struct qtree_mem_dqinfo *info)
ret = read_blk(info, blk, buf);
if (ret < 0)
goto out_buf;
+ ret = check_dquot_block_header(info, dh);
+ if (ret)
+ goto out_buf;
info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
}
else {
@@ -232,6 +269,9 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
*err = read_blk(info, blk, buf);
if (*err < 0)
goto out_buf;
+ *err = check_dquot_block_header(info, dh);
+ if (*err)
+ goto out_buf;
} else {
blk = get_free_dqblk(info);
if ((int)blk < 0) {
@@ -313,6 +353,10 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
}
ref = (__le32 *)buf;
newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+ ret = do_check_range(dquot->dq_sb, "block", newblk, 0,
+ info->dqi_blocks - 1);
+ if (ret)
+ goto out_buf;
if (!newblk)
newson = 1;
if (depth == info->dqi_qtree_depth - 1) {
@@ -424,6 +468,9 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
goto out_buf;
}
dh = (struct qt_disk_dqdbheader *)buf;
+ ret = check_dquot_block_header(info, dh);
+ if (ret)
+ goto out_buf;
le16_add_cpu(&dh->dqdh_entries, -1);
if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
ret = remove_free_dqentry(info, buf, blk);
@@ -480,12 +527,10 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
goto out_buf;
}
newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
- if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) {
- quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
- newblk, info->dqi_blocks);
- ret = -EUCLEAN;
+ ret = do_check_range(dquot->dq_sb, "block", newblk, QT_TREEOFF,
+ info->dqi_blocks - 1);
+ if (ret)
goto out_buf;
- }
if (depth == info->dqi_qtree_depth - 1) {
ret = free_dqentry(info, dquot, newblk);
@@ -586,12 +631,10 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
if (!blk) /* No reference? */
goto out_buf;
- if (blk < QT_TREEOFF || blk >= info->dqi_blocks) {
- quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
- blk, info->dqi_blocks);
- ret = -EUCLEAN;
+ ret = do_check_range(dquot->dq_sb, "block", blk, QT_TREEOFF,
+ info->dqi_blocks - 1);
+ if (ret)
goto out_buf;
- }
if (depth < info->dqi_qtree_depth - 1)
ret = find_tree_dqentry(info, dquot, blk, depth+1);
@@ -705,15 +748,21 @@ static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id,
goto out_buf;
}
for (i = __get_index(info, *id, depth); i < epb; i++) {
- if (ref[i] == cpu_to_le32(0)) {
+ uint blk_no = le32_to_cpu(ref[i]);
+
+ if (blk_no == 0) {
*id += level_inc;
continue;
}
+ ret = do_check_range(info->dqi_sb, "block", blk_no, 0,
+ info->dqi_blocks - 1);
+ if (ret)
+ goto out_buf;
if (depth == info->dqi_qtree_depth - 1) {
ret = 0;
goto out_buf;
}
- ret = find_next_id(info, id, le32_to_cpu(ref[i]), depth + 1);
+ ret = find_next_id(info, id, blk_no, depth + 1);
if (ret != -ENOENT)
break;
}
diff --git a/fs/read_write.c b/fs/read_write.c
index e643aec2b0ef..328ce8cf9a85 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -227,12 +227,6 @@ loff_t noop_llseek(struct file *file, loff_t offset, int whence)
}
EXPORT_SYMBOL(noop_llseek);
-loff_t no_llseek(struct file *file, loff_t offset, int whence)
-{
- return -ESPIPE;
-}
-EXPORT_SYMBOL(no_llseek);
-
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file_inode(file);
@@ -290,14 +284,9 @@ EXPORT_SYMBOL(default_llseek);
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
- loff_t (*fn)(struct file *, loff_t, int);
-
- fn = no_llseek;
- if (file->f_mode & FMODE_LSEEK) {
- if (file->f_op->llseek)
- fn = file->f_op->llseek;
- }
- return fn(file, offset, whence);
+ if (!(file->f_mode & FMODE_LSEEK))
+ return -ESPIPE;
+ return file->f_op->llseek(file, offset, whence);
}
EXPORT_SYMBOL(vfs_llseek);
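Together with the fs/proc change earlier in this series that clears
FMODE_LSEEK when ->proc_lseek is absent, seekability is now expressed purely
through the file mode bit; a non-seekable entry simply omits the hook
(hypothetical ops table, illustrative names):

	static const struct proc_ops example_stream_ops = {
		.proc_open	= example_open,
		.proc_read	= example_read,
		.proc_release	= example_release,
		/*
		 * No .proc_lseek: proc_reg_open() clears FMODE_LSEEK, so
		 * vfs_llseek() fails with -ESPIPE instead of calling a
		 * NULL ->llseek.
		 */
	};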
@@ -389,14 +378,13 @@ EXPORT_SYMBOL(rw_verify_area);
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
- struct iovec iov = { .iov_base = buf, .iov_len = len };
struct kiocb kiocb;
struct iov_iter iter;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = (ppos ? *ppos : 0);
- iov_iter_init(&iter, READ, &iov, 1, len);
+ iov_iter_ubuf(&iter, READ, buf, len);
ret = call_read_iter(filp, &kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
@@ -492,14 +480,13 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
- struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
struct kiocb kiocb;
struct iov_iter iter;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = (ppos ? *ppos : 0);
- iov_iter_init(&iter, WRITE, &iov, 1, len);
+ iov_iter_ubuf(&iter, WRITE, (void __user *)buf, len);
ret = call_write_iter(filp, &kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
@@ -509,14 +496,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
}
/* caller is responsible for file_start_write/file_end_write */
-ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
+ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
{
- struct kvec iov = {
- .iov_base = (void *)buf,
- .iov_len = min_t(size_t, count, MAX_RW_COUNT),
- };
struct kiocb kiocb;
- struct iov_iter iter;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
@@ -532,8 +514,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = pos ? *pos : 0;
- iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
- ret = file->f_op->write_iter(&kiocb, &iter);
+ ret = file->f_op->write_iter(&kiocb, from);
if (ret > 0) {
if (pos)
*pos = kiocb.ki_pos;
@@ -543,6 +524,18 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
inc_syscw(current);
return ret;
}
+
+/* caller is responsible for file_start_write/file_end_write */
+ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
+{
+ struct kvec iov = {
+ .iov_base = (void *)buf,
+ .iov_len = min_t(size_t, count, MAX_RW_COUNT),
+ };
+ struct iov_iter iter;
+ iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
+ return __kernel_write_iter(file, &iter, pos);
+}
/*
* This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
* but autofs is one of the few internal kernel users that actually
@@ -682,6 +675,14 @@ SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
return ksys_pread64(fd, buf, count, pos);
}
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64)
+COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
+ size_t, count, compat_arg_u64_dual(pos))
+{
+ return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos));
+}
+#endif
+
ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
size_t count, loff_t pos)
{
@@ -708,6 +709,14 @@ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
return ksys_pwrite64(fd, buf, count, pos);
}
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64)
+COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf,
+ size_t, count, compat_arg_u64_dual(pos))
+{
+ return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos));
+}
+#endif
+
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
loff_t *ppos, int type, rwf_t flags)
{
@@ -1247,6 +1256,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
count, fl);
file_end_write(out.file);
} else {
+ if (out.file->f_flags & O_NONBLOCK)
+ fl |= SPLICE_F_NONBLOCK;
+
retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
}
@@ -1381,28 +1393,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
}
EXPORT_SYMBOL(generic_copy_file_range);
-static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t len, unsigned int flags)
-{
- /*
- * Although we now allow filesystems to handle cross sb copy, passing
- * a file of the wrong filesystem type to filesystem driver can result
- * in an attempt to dereference the wrong type of ->private_data, so
- * avoid doing that until we really have a good reason. NFS defines
- * several different file_system_type structures, but they all end up
- * using the same ->copy_file_range() function pointer.
- */
- if (file_out->f_op->copy_file_range &&
- file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
- return file_out->f_op->copy_file_range(file_in, pos_in,
- file_out, pos_out,
- len, flags);
-
- return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
- flags);
-}
-
/*
* Performs necessary checks before doing a file copy
*
@@ -1424,6 +1414,24 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
if (ret)
return ret;
+ /*
+ * We allow some filesystems to handle cross sb copy, but passing
+	 * a file of the wrong filesystem type to a filesystem driver can result
+ * in an attempt to dereference the wrong type of ->private_data, so
+ * avoid doing that until we really have a good reason.
+ *
+ * nfs and cifs define several different file_system_type structures
+ * and several different sets of file_operations, but they all end up
+ * using the same ->copy_file_range() function pointer.
+ */
+ if (file_out->f_op->copy_file_range) {
+ if (file_in->f_op->copy_file_range !=
+ file_out->f_op->copy_file_range)
+ return -EXDEV;
+ } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
+ return -EXDEV;
+ }
+
/* Don't touch certain kinds of inodes */
if (IS_IMMUTABLE(inode_out))
return -EPERM;
@@ -1489,26 +1497,41 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
file_start_write(file_out);
/*
- * Try cloning first, this is supported by more file systems, and
- * more efficient if both clone and copy are supported (e.g. NFS).
+ * Cloning is supported by more file systems, so we implement copy on
+ * same sb using clone, but for filesystems where both clone and copy
+	 * are supported (e.g. nfs, cifs), we only call the copy method.
*/
+ if (file_out->f_op->copy_file_range) {
+ ret = file_out->f_op->copy_file_range(file_in, pos_in,
+ file_out, pos_out,
+ len, flags);
+ goto done;
+ }
+
if (file_in->f_op->remap_file_range &&
file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
- loff_t cloned;
-
- cloned = file_in->f_op->remap_file_range(file_in, pos_in,
+ ret = file_in->f_op->remap_file_range(file_in, pos_in,
file_out, pos_out,
min_t(loff_t, MAX_RW_COUNT, len),
REMAP_FILE_CAN_SHORTEN);
- if (cloned > 0) {
- ret = cloned;
+ if (ret > 0)
goto done;
- }
}
- ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
- flags);
- WARN_ON_ONCE(ret == -EOPNOTSUPP);
+ /*
+	 * We can get here for a same-sb copy on filesystems that do not implement
+	 * ->copy_file_range(), either because the filesystem does not support
+	 * clone or because it supports clone but rejected the clone request
+	 * (e.g. because it was not block aligned).
+	 *
+	 * In both cases, fall back to kernel copy so we are able to maintain a
+	 * consistent story about which filesystems support copy_file_range()
+	 * and which filesystems do not, which will allow userspace tools to
+	 * make consistent decisions w.r.t. using copy_file_range().
+ */
+ ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
+ flags);
+
done:
if (ret > 0) {
fsnotify_access(file_in);
@@ -1633,7 +1656,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
- if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ if ((iocb->ki_flags & IOCB_NOWAIT) &&
+ !((iocb->ki_flags & IOCB_DIRECT) ||
+ (file->f_mode & FMODE_BUF_WASYNC)))
return -EINVAL;
return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
diff --git a/fs/readdir.c b/fs/readdir.c
index 09e8ed7d4161..9c53edb60c03 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -140,7 +140,7 @@ struct readdir_callback {
int result;
};
-static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
+static bool fillonedir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct readdir_callback *buf =
@@ -149,14 +149,14 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
unsigned long d_ino;
if (buf->result)
- return -EINVAL;
+ return false;
buf->result = verify_dirent_name(name, namlen);
- if (buf->result < 0)
- return buf->result;
+ if (buf->result)
+ return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->result = -EOVERFLOW;
- return -EOVERFLOW;
+ return false;
}
buf->result++;
dirent = buf->dirent;
@@ -169,12 +169,12 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
unsafe_put_user(namlen, &dirent->d_namlen, efault_end);
unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
user_write_access_end();
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->result = -EFAULT;
- return -EFAULT;
+ return false;
}
SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
@@ -219,7 +219,7 @@ struct getdents_callback {
int error;
};
-static int filldir(struct dir_context *ctx, const char *name, int namlen,
+static bool filldir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct linux_dirent __user *dirent, *prev;
@@ -232,18 +232,18 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
- return buf->error;
+ return false;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
- return -EINVAL;
+ return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->error = -EOVERFLOW;
- return -EOVERFLOW;
+ return false;
}
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
- return -EINTR;
+ return false;
dirent = buf->current_dir;
prev = (void __user *) dirent - prev_reclen;
if (!user_write_access_begin(prev, reclen + prev_reclen))
@@ -260,12 +260,12 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
buf->current_dir = (void __user *)dirent + reclen;
buf->prev_reclen = reclen;
buf->count -= reclen;
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->error = -EFAULT;
- return -EFAULT;
+ return false;
}
SYSCALL_DEFINE3(getdents, unsigned int, fd,
@@ -307,7 +307,7 @@ struct getdents_callback64 {
int error;
};
-static int filldir64(struct dir_context *ctx, const char *name, int namlen,
+static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct linux_dirent64 __user *dirent, *prev;
@@ -319,13 +319,13 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
- return buf->error;
+ return false;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
- return -EINVAL;
+ return false;
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
- return -EINTR;
+ return false;
dirent = buf->current_dir;
prev = (void __user *)dirent - prev_reclen;
if (!user_write_access_begin(prev, reclen + prev_reclen))
@@ -342,13 +342,13 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
buf->count -= reclen;
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->error = -EFAULT;
- return -EFAULT;
+ return false;
}
SYSCALL_DEFINE3(getdents64, unsigned int, fd,
@@ -397,7 +397,7 @@ struct compat_readdir_callback {
int result;
};
-static int compat_fillonedir(struct dir_context *ctx, const char *name,
+static bool compat_fillonedir(struct dir_context *ctx, const char *name,
int namlen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -407,14 +407,14 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name,
compat_ulong_t d_ino;
if (buf->result)
- return -EINVAL;
+ return false;
buf->result = verify_dirent_name(name, namlen);
- if (buf->result < 0)
- return buf->result;
+ if (buf->result)
+ return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->result = -EOVERFLOW;
- return -EOVERFLOW;
+ return false;
}
buf->result++;
dirent = buf->dirent;
@@ -427,12 +427,12 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name,
unsafe_put_user(namlen, &dirent->d_namlen, efault_end);
unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
user_write_access_end();
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->result = -EFAULT;
- return -EFAULT;
+ return false;
}
COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
@@ -471,7 +471,7 @@ struct compat_getdents_callback {
int error;
};
-static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
+static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct compat_linux_dirent __user *dirent, *prev;
@@ -484,18 +484,18 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
- return buf->error;
+ return false;
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
- return -EINVAL;
+ return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->error = -EOVERFLOW;
- return -EOVERFLOW;
+ return false;
}
prev_reclen = buf->prev_reclen;
if (prev_reclen && signal_pending(current))
- return -EINTR;
+ return false;
dirent = buf->current_dir;
prev = (void __user *) dirent - prev_reclen;
if (!user_write_access_begin(prev, reclen + prev_reclen))
@@ -511,12 +511,12 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
buf->count -= reclen;
- return 0;
+ return true;
efault_end:
user_write_access_end();
efault:
buf->error = -EFAULT;
- return -EFAULT;
+ return false;
}
COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
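Under the new contract, a ->actor callback returns bool — false stops iteration — and the real error code is parked in the caller's private buffer, as the filldir variants above now do. A minimal kernel-style sketch of a callback written against the new convention (the struct and names here are illustrative, not from this patch):

struct count_ctx {
	struct dir_context ctx;		/* embedded, found via container_of */
	int count;
	int error;
};

static bool count_one(struct dir_context *ctx, const char *name, int namlen,
		      loff_t offset, u64 ino, unsigned int d_type)
{
	struct count_ctx *buf = container_of(ctx, struct count_ctx, ctx);

	if (buf->count == INT_MAX) {
		buf->error = -EOVERFLOW;	/* error lives in the buffer */
		return false;			/* stop iteration */
	}
	buf->count++;
	return true;				/* keep iterating */
}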
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 203a47232707..6e228bfbe7ef 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -227,7 +227,7 @@ drop_write_lock:
}
/*
* If this is a partial write which happened to make all buffers
- * uptodate then we can optimize away a bogus readpage() for
+ * uptodate then we can optimize away a bogus read_folio() for
* the next read(). Here we 'discover' whether the page went
* uptodate as a result of this (potentially partial) write.
*/
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 36c59b25486c..b9580a6515ee 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -167,10 +167,10 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
* cutting the code is fine, since it really isn't in use yet and is easy
* to add back in. But, Vladimir has a really good idea here. Think
* about what happens for reading a file. For each page,
- * The VFS layer calls reiserfs_readpage, who searches the tree to find
+ * The VFS layer calls reiserfs_read_folio, which searches the tree to find
* an indirect item. This indirect item has X number of pointers, where
* X is a big number if we've done the block allocation right. But,
- * we only use one or two of these pointers during each call to readpage,
+ * we only use one or two of these pointers during each call to read_folio,
* needlessly researching again later on.
*
* The size of the cache could be dynamic based on the size of the file.
@@ -290,7 +290,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
struct buffer_head *bh;
struct item_head *ih, tmp_ih;
b_blocknr_t blocknr;
- char *p = NULL;
+ char *p;
int chars;
int ret;
int result;
@@ -305,8 +305,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
result = search_for_position_by_key(inode->i_sb, &key, &path);
if (result != POSITION_FOUND) {
pathrelse(&path);
- if (p)
- kunmap(bh_result->b_page);
if (result == IO_ERROR)
return -EIO;
/*
@@ -352,8 +350,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
}
pathrelse(&path);
- if (p)
- kunmap(bh_result->b_page);
return ret;
}
/* requested data are in direct item(s) */
@@ -363,8 +359,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
* when it is stored in direct item(s)
*/
pathrelse(&path);
- if (p)
- kunmap(bh_result->b_page);
return -ENOENT;
}
@@ -396,9 +390,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
* sure we need to. But, this means the item might move if
* kmap schedules
*/
- if (!p)
- p = (char *)kmap(bh_result->b_page);
-
+ p = (char *)kmap(bh_result->b_page);
p += offset;
memset(p, 0, inode->i_sb->s_blocksize);
do {
@@ -966,7 +958,7 @@ research:
* it is important the set_buffer_uptodate is done
* after the direct2indirect. The buffer might
* contain valid data newer than the data on disk
- * (read by readpage, changed, and then sent here by
+ * (read by read_folio, changed, and then sent here by
* writepage). direct2indirect needs to know if unbh
* was already up to date, so it can decide if the
* data in unbh needs to be replaced with data from
@@ -2664,7 +2656,7 @@ static int reiserfs_write_full_page(struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh(REQ_OP_WRITE, 0, bh);
+ submit_bh(REQ_OP_WRITE, bh);
nr++;
}
put_bh(bh);
@@ -2724,7 +2716,7 @@ fail:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh(REQ_OP_WRITE, 0, bh);
+ submit_bh(REQ_OP_WRITE, bh);
nr++;
}
put_bh(bh);
@@ -2733,9 +2725,9 @@ fail:
goto done;
}
-static int reiserfs_readpage(struct file *f, struct page *page)
+static int reiserfs_read_folio(struct file *f, struct folio *folio)
{
- return block_read_full_page(page, reiserfs_get_block);
+ return block_read_full_folio(folio, reiserfs_get_block);
}
static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -2753,7 +2745,7 @@ static void reiserfs_truncate_failed_write(struct inode *inode)
static int reiserfs_write_begin(struct file *file,
struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode;
@@ -2764,7 +2756,7 @@ static int reiserfs_write_begin(struct file *file,
inode = mapping->host;
index = pos >> PAGE_SHIFT;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -3202,39 +3194,39 @@ static bool reiserfs_dirty_folio(struct address_space *mapping,
}
/*
- * Returns 1 if the page's buffers were dropped. The page is locked.
+ * Returns true if the folio's buffers were dropped. The folio is locked.
*
* Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
- * in the buffers at page_buffers(page).
+ * in the buffers at folio_buffers(folio).
*
* even in -o notail mode, we can't be sure an old mount without -o notail
* didn't create files with tails.
*/
-static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
+static bool reiserfs_release_folio(struct folio *folio, gfp_t unused_gfp_flags)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
struct buffer_head *head;
struct buffer_head *bh;
- int ret = 1;
+ bool ret = true;
- WARN_ON(PageChecked(page));
+ WARN_ON(folio_test_checked(folio));
spin_lock(&j->j_dirty_buffers_lock);
- head = page_buffers(page);
+ head = folio_buffers(folio);
bh = head;
do {
if (bh->b_private) {
if (!buffer_dirty(bh) && !buffer_locked(bh)) {
reiserfs_free_jh(bh);
} else {
- ret = 0;
+ ret = false;
break;
}
}
bh = bh->b_this_page;
} while (bh != head);
if (ret)
- ret = try_to_free_buffers(page);
+ ret = try_to_free_buffers(folio);
spin_unlock(&j->j_dirty_buffers_lock);
return ret;
}
@@ -3284,7 +3276,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
/* must be turned off for recursive notify_change calls */
ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
- if (is_quota_modification(inode, attr)) {
+ if (is_quota_modification(mnt_userns, inode, attr)) {
error = dquot_initialize(inode);
if (error)
return error;
@@ -3367,7 +3359,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
reiserfs_write_unlock(inode->i_sb);
if (error)
goto out;
- error = dquot_transfer(inode, attr);
+ error = dquot_transfer(mnt_userns, inode, attr);
reiserfs_write_lock(inode->i_sb);
if (error) {
journal_end(&th);
@@ -3421,9 +3413,9 @@ out:
const struct address_space_operations reiserfs_address_space_operations = {
.writepage = reiserfs_writepage,
- .readpage = reiserfs_readpage,
+ .read_folio = reiserfs_read_folio,
.readahead = reiserfs_readahead,
- .releasepage = reiserfs_releasepage,
+ .release_folio = reiserfs_release_folio,
.invalidate_folio = reiserfs_invalidate_folio,
.write_begin = reiserfs_write_begin,
.write_end = reiserfs_write_end,
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index b5b6f6201bed..94addfcefede 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -601,14 +601,14 @@ static int journal_list_still_alive(struct super_block *s,
*/
static void release_buffer_page(struct buffer_head *bh)
{
- struct page *page = bh->b_page;
- if (!page->mapping && trylock_page(page)) {
- get_page(page);
+ struct folio *folio = page_folio(bh->b_page);
+ if (!folio->mapping && folio_trylock(folio)) {
+ folio_get(folio);
put_bh(bh);
- if (!page->mapping)
- try_to_free_buffers(page);
- unlock_page(page);
- put_page(page);
+ if (!folio->mapping)
+ try_to_free_buffers(folio);
+ folio_unlock(folio);
+ folio_put(folio);
} else {
put_bh(bh);
}
@@ -650,7 +650,7 @@ static void submit_logged_buffer(struct buffer_head *bh)
BUG();
if (!buffer_uptodate(bh))
BUG();
- submit_bh(REQ_OP_WRITE, 0, bh);
+ submit_bh(REQ_OP_WRITE, bh);
}
static void submit_ordered_buffer(struct buffer_head *bh)
@@ -660,7 +660,7 @@ static void submit_ordered_buffer(struct buffer_head *bh)
clear_buffer_dirty(bh);
if (!buffer_uptodate(bh))
BUG();
- submit_bh(REQ_OP_WRITE, 0, bh);
+ submit_bh(REQ_OP_WRITE, bh);
}
#define CHUNK_SIZE 32
@@ -868,7 +868,7 @@ loop_next:
*/
if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
spin_unlock(lock);
- ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
+ ll_rw_block(REQ_OP_WRITE, 1, &bh);
spin_lock(lock);
}
put_bh(bh);
@@ -1054,7 +1054,7 @@ static int flush_commit_list(struct super_block *s,
if (tbh) {
if (buffer_dirty(tbh)) {
depth = reiserfs_write_unlock_nested(s);
- ll_rw_block(REQ_OP_WRITE, 0, 1, &tbh);
+ ll_rw_block(REQ_OP_WRITE, 1, &tbh);
reiserfs_write_lock_nested(s, depth);
}
put_bh(tbh) ;
@@ -2240,7 +2240,7 @@ abort_replay:
}
}
/* read in the log blocks, memcpy to the corresponding real block */
- ll_rw_block(REQ_OP_READ, 0, get_desc_trans_len(desc), log_blocks);
+ ll_rw_block(REQ_OP_READ, get_desc_trans_len(desc), log_blocks);
for (i = 0; i < get_desc_trans_len(desc); i++) {
wait_on_buffer(log_blocks[i]);
@@ -2342,7 +2342,7 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
} else
bhlist[j++] = bh;
}
- ll_rw_block(REQ_OP_READ, 0, j, bhlist);
+ ll_rw_block(REQ_OP_READ, j, bhlist);
for (i = 1; i < j; i++)
brelse(bhlist[i]);
bh = bhlist[0];
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 30319dc33c18..84a194b77f19 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -456,7 +456,7 @@ static int print_internal(struct buffer_head *bh, int first, int last)
to = B_NR_ITEMS(bh);
} else {
from = first;
- to = last < B_NR_ITEMS(bh) ? last : B_NR_ITEMS(bh);
+ to = min_t(int, last, B_NR_ITEMS(bh));
}
reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh);
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 8096c74c38ac..7b498a0d060b 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -97,7 +97,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
* using the copy_size var below allows this code to work for
* both shrinking and expanding the FS.
*/
- copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr;
+ copy_size = min(bmap_nr_new, bmap_nr);
copy_size =
copy_size * sizeof(struct reiserfs_list_bitmap_node *);
for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index ef42729216d1..9a293609a022 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -579,7 +579,7 @@ static int search_by_key_reada(struct super_block *s,
if (!buffer_uptodate(bh[j])) {
if (depth == -1)
depth = reiserfs_write_unlock_nested(s);
- ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, bh + j);
+ ll_rw_block(REQ_OP_READ | REQ_RAHEAD, 1, bh + j);
}
brelse(bh[j]);
}
@@ -685,7 +685,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,
if (!buffer_uptodate(bh) && depth == -1)
depth = reiserfs_write_unlock_nested(sb);
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 1, &bh);
wait_on_buffer(bh);
if (depth != -1)
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index cfb7c44c7366..da1e72494e30 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1702,7 +1702,7 @@ static int read_super_block(struct super_block *s, int offset)
/* after journal replay, reread all bitmap and super blocks */
static int reread_meta_blocks(struct super_block *s)
{
- ll_rw_block(REQ_OP_READ, 0, 1, &SB_BUFFER_WITH_SB(s));
+ ll_rw_block(REQ_OP_READ, 1, &SB_BUFFER_WITH_SB(s));
wait_on_buffer(SB_BUFFER_WITH_SB(s));
if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
reiserfs_warning(s, "reiserfs-2504", "error reading the super");
@@ -2504,9 +2504,7 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
len = i_size - off;
toread = len;
while (toread > 0) {
- tocopy =
- sb->s_blocksize - offset <
- toread ? sb->s_blocksize - offset : toread;
+ tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
tmp_bh.b_state = 0;
/*
* Quota files are without tails so we can safely
@@ -2554,8 +2552,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
return -EIO;
}
while (towrite > 0) {
- tocopy = sb->s_blocksize - offset < towrite ?
- sb->s_blocksize - offset : towrite;
+ tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite);
tmp_bh.b_state = 0;
reiserfs_write_lock(sb);
err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index bd073836e141..8b2d52443f41 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -189,7 +189,7 @@ struct reiserfs_dentry_buf {
struct dentry *dentries[8];
};
-static int
+static bool
fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
@@ -200,16 +200,16 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir)));
if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
- return -ENOSPC;
+ return false;
if (name[0] == '.' && (namelen < 2 ||
(namelen == 2 && name[1] == '.')))
- return 0;
+ return true;
dentry = lookup_one_len(name, dbuf->xadir, namelen);
if (IS_ERR(dentry)) {
dbuf->err = PTR_ERR(dentry);
- return PTR_ERR(dentry);
+ return false;
} else if (d_really_is_negative(dentry)) {
/* A directory entry exists, but no file? */
reiserfs_error(dentry->d_sb, "xattr-20003",
@@ -218,11 +218,11 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
dentry, dbuf->xadir);
dput(dentry);
dbuf->err = -EIO;
- return -EIO;
+ return false;
}
dbuf->dentries[dbuf->count++] = dentry;
- return 0;
+ return true;
}
static void
@@ -440,16 +440,9 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n)
*/
mapping_set_gfp_mask(mapping, GFP_NOFS);
page = read_mapping_page(mapping, n >> PAGE_SHIFT, NULL);
- if (!IS_ERR(page)) {
+ if (!IS_ERR(page))
kmap(page);
- if (PageError(page))
- goto fail;
- }
return page;
-
-fail:
- reiserfs_put_page(page);
- return ERR_PTR(-EIO);
}
static inline __u32 xattr_hash(const char *msg, int len)
@@ -804,7 +797,7 @@ struct listxattr_buf {
struct dentry *dentry;
};
-static int listxattr_filler(struct dir_context *ctx, const char *name,
+static bool listxattr_filler(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
@@ -820,19 +813,19 @@ static int listxattr_filler(struct dir_context *ctx, const char *name,
name);
if (!handler /* Unsupported xattr name */ ||
(handler->list && !handler->list(b->dentry)))
- return 0;
+ return true;
size = namelen + 1;
if (b->buf) {
if (b->pos + size > b->size) {
b->pos = -ERANGE;
- return -ERANGE;
+ return false;
}
memcpy(b->buf + b->pos, name, namelen);
b->buf[b->pos + namelen] = 0;
}
b->pos += size;
}
- return 0;
+ return true;
}
/*
diff --git a/fs/remap_range.c b/fs/remap_range.c
index e112b5424cdb..654912d06862 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -14,6 +14,7 @@
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
+#include <linux/dax.h>
#include "internal.h"
#include <linux/uaccess.h>
@@ -71,7 +72,8 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
* Otherwise, make sure the count is also block-aligned, having
* already confirmed the starting offsets' block alignment.
*/
- if (pos_in + count == size_in) {
+ if (pos_in + count == size_in &&
+ (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) {
bcount = ALIGN(size_in, bs) - pos_in;
} else {
if (!IS_ALIGNED(count, bs))
@@ -148,16 +150,7 @@ static int generic_remap_check_len(struct inode *inode_in,
/* Read a page's worth of file data into the page cache. */
static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos)
{
- struct folio *folio;
-
- folio = read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file);
- if (IS_ERR(folio))
- return folio;
- if (!folio_test_uptodate(folio)) {
- folio_put(folio);
- return ERR_PTR(-EIO);
- }
- return folio;
+ return read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file);
}
/*
@@ -271,9 +264,11 @@ out_error:
* If there's an error, then the usual negative error code is returned.
* Otherwise returns 0 with *len set to the request length.
*/
-int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
+int
+__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags,
+ const struct iomap_ops *dax_read_ops)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
@@ -333,8 +328,18 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false;
- ret = vfs_dedupe_file_range_compare(file_in, pos_in,
- file_out, pos_out, *len, &is_same);
+ if (*len == 0)
+ return 0;
+
+ if (!IS_DAX(inode_in))
+ ret = vfs_dedupe_file_range_compare(file_in, pos_in,
+ file_out, pos_out, *len, &is_same);
+ else if (dax_read_ops)
+ ret = dax_dedupe_file_range_compare(inode_in, pos_in,
+ inode_out, pos_out, *len, &is_same,
+ dax_read_ops);
+ else
+ return -EINVAL;
if (ret)
return ret;
if (!is_same)
@@ -352,6 +357,14 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
return ret;
}
+
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *len, unsigned int remap_flags)
+{
+ return __generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, NULL);
+}
EXPORT_SYMBOL(generic_remap_file_range_prep);
loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
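The double-underscore variant exists so that DAX-capable filesystems can pass their read iomap ops for the dedupe comparison, while everyone else keeps the old entry point (which passes NULL). A sketch of a caller (demofs_read_iomap_ops is an illustrative name, not from this patch):

static int demofs_remap_prep(struct file *file_in, loff_t pos_in,
			     struct file *file_out, loff_t pos_out,
			     loff_t *len, unsigned int remap_flags)
{
	const struct iomap_ops *dax_ops =
		IS_DAX(file_inode(file_in)) ? &demofs_read_iomap_ops : NULL;

	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
					       pos_out, len, remap_flags,
					       dax_ops);
}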
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 9e6bbb4219de..c59b230d55b4 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -18,7 +18,7 @@
* Changed for 2.1.19 modules
* Jan 1997 Initial release
* Jun 1997 2.1.43+ changes
- * Proper page locking in readpage
+ * Proper page locking in read_folio
* Changed to work with 2.1.45+ fs
* Jul 1997 Fixed follow_link
* 2.1.47
@@ -41,7 +41,7 @@
* dentries in lookup
* clean up page flags setting
* (error, uptodate, locking) in
- * in readpage
+ * in read_folio
* use init_special_inode for
* fifos/sockets (and streamline) in
* read_inode, fix _ops table order
@@ -99,8 +99,9 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
/*
* read a page worth of data from the image
*/
-static int romfs_readpage(struct file *file, struct page *page)
+static int romfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
loff_t offset, size;
unsigned long fillsize, pos;
@@ -142,7 +143,7 @@ static int romfs_readpage(struct file *file, struct page *page)
}
static const struct address_space_operations romfs_aops = {
- .readpage = romfs_readpage
+ .read_folio = romfs_read_folio
};
/*
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 7ab8a58c29b6..9456a2032224 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -931,6 +931,38 @@ struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
}
EXPORT_SYMBOL(seq_list_next);
+struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos)
+{
+ struct list_head *lh;
+
+ list_for_each_rcu(lh, head)
+ if (pos-- == 0)
+ return lh;
+
+ return NULL;
+}
+EXPORT_SYMBOL(seq_list_start_rcu);
+
+struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos)
+{
+ if (!pos)
+ return head;
+
+ return seq_list_start_rcu(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_list_start_head_rcu);
+
+struct list_head *seq_list_next_rcu(void *v, struct list_head *head,
+ loff_t *ppos)
+{
+ struct list_head *lh;
+
+ lh = list_next_rcu((struct list_head *)v);
+ ++*ppos;
+ return lh == head ? NULL : lh;
+}
+EXPORT_SYMBOL(seq_list_next_rcu);
+
/**
* seq_hlist_start - start an iteration of a hlist
* @head: the head of the hlist
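The new *_rcu variants mirror seq_list_start()/seq_list_next() but walk the list with RCU-safe accessors; the caller is responsible for holding rcu_read_lock() across the iteration. A sketch of the intended usage in a seq_file (demo_list is a hypothetical RCU-protected list):

static void *demo_seq_start(struct seq_file *m, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return seq_list_start_rcu(&demo_list, *pos);
}

static void *demo_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_list_next_rcu(v, &demo_list, pos);
}

static void demo_seq_stop(struct seq_file *m, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}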
diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h
index 0507aecfc669..2cab413fffee 100644
--- a/fs/smbfs_common/smb2pdu.h
+++ b/fs/smbfs_common/smb2pdu.h
@@ -1244,6 +1244,106 @@ struct file_zero_data_information {
__le64 BeyondFinalZero;
} __packed;
+/* See MS-FSCC 2.3.7 */
+struct duplicate_extents_to_file {
+ __u64 PersistentFileHandle; /* source file handle, opaque endianness */
+ __u64 VolatileFileHandle;
+ __le64 SourceFileOffset;
+ __le64 TargetFileOffset;
+ __le64 ByteCount; /* Bytes to be copied */
+} __packed;
+
+/* See MS-FSCC 2.3.8 */
+#define DUPLICATE_EXTENTS_DATA_EX_SOURCE_ATOMIC 0x00000001
+struct duplicate_extents_to_file_ex {
+ __u64 PersistentFileHandle; /* source file handle, opaque endianness */
+ __u64 VolatileFileHandle;
+ __le64 SourceFileOffset;
+ __le64 TargetFileOffset;
+ __le64 ByteCount; /* Bytes to be copied */
+ __le32 Flags;
+ __le32 Reserved;
+} __packed;
+
+
+/* See MS-FSCC 2.3.20 */
+struct fsctl_get_integrity_information_rsp {
+ __le16 ChecksumAlgorithm;
+ __le16 Reserved;
+ __le32 Flags;
+ __le32 ChecksumChunkSizeInBytes;
+ __le32 ClusterSizeInBytes;
+} __packed;
+
+/* See MS-FSCC 2.3.55 */
+struct fsctl_query_file_regions_req {
+ __le64 FileOffset;
+ __le64 Length;
+ __le32 DesiredUsage;
+ __le32 Reserved;
+} __packed;
+
+/* DesiredUsage flags see MS-FSCC 2.3.56.1 */
+#define FILE_USAGE_INVALID_RANGE 0x00000000
+#define FILE_USAGE_VALID_CACHED_DATA 0x00000001
+#define FILE_USAGE_NONCACHED_DATA 0x00000002
+
+struct file_region_info {
+ __le64 FileOffset;
+ __le64 Length;
+ __le32 DesiredUsage;
+ __le32 Reserved;
+} __packed;
+
+/* See MS-FSCC 2.3.56 */
+struct fsctl_query_file_region_rsp {
+ __le32 Flags;
+ __le32 TotalRegionEntryCount;
+ __le32 RegionEntryCount;
+ __u32 Reserved;
+ struct file_region_info Regions[];
+} __packed;
+
+/* See MS-FSCC 2.3.58 */
+struct fsctl_query_on_disk_vol_info_rsp {
+ __le64 DirectoryCount;
+ __le64 FileCount;
+ __le16 FsFormatMajVersion;
+ __le16 FsFormatMinVersion;
+ __u8 FsFormatName[24];
+ __le64 FormatTime;
+ __le64 LastUpdateTime;
+ __u8 CopyrightInfo[68];
+ __u8 AbstractInfo[68];
+ __u8 FormatImplInfo[68];
+ __u8 LastModifyImplInfo[68];
+} __packed;
+
+/* See MS-FSCC 2.3.73 */
+struct fsctl_set_integrity_information_req {
+ __le16 ChecksumAlgorithm;
+ __le16 Reserved;
+ __le32 Flags;
+} __packed;
+
+/* See MS-FSCC 2.3.75 */
+struct fsctl_set_integrity_info_ex_req {
+ __u8 EnableIntegrity;
+ __u8 KeepState;
+ __u16 Reserved;
+ __le32 Flags;
+ __u8 Version;
+ __u8 Reserved2[7];
+} __packed;
+
+/* Integrity ChecksumAlgorithm choices for above */
+#define CHECKSUM_TYPE_NONE 0x0000
+#define CHECKSUM_TYPE_CRC64 0x0002
+#define CHECKSUM_TYPE_UNCHANGED 0xFFFF /* set only */
+
+/* Integrity flags for above */
+#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001
+
/* Reparse structures - see MS-FSCC 2.1.2 */
/* struct fsctl_reparse_info_req is empty, only response structs (see below) */
@@ -1304,13 +1404,6 @@ struct validate_negotiate_info_rsp {
__le16 Dialect; /* Dialect in use for the connection */
} __packed;
-struct duplicate_extents_to_file {
- __u64 PersistentFileHandle; /* source file handle, opaque endianness */
- __u64 VolatileFileHandle;
- __le64 SourceFileOffset;
- __le64 TargetFileOffset;
- __le64 ByteCount; /* Bytes to be copied */
-} __packed;
/* Possible InfoType values */
#define SMB2_O_INFO_FILE 0x01
@@ -1419,6 +1512,7 @@ struct smb2_query_info_rsp {
* PDU query infolevel structure definitions
*/
+/* See MS-FSCC 2.3.52 */
struct file_allocated_range_buffer {
__le64 file_offset;
__le64 length;
diff --git a/fs/smbfs_common/smbfsctl.h b/fs/smbfs_common/smbfsctl.h
index d51939c43ad7..edd7fc2a7921 100644
--- a/fs/smbfs_common/smbfsctl.h
+++ b/fs/smbfs_common/smbfsctl.h
@@ -88,21 +88,27 @@
#define FSCTL_READ_RAW_ENCRYPTED 0x000900E3 /* BB add struct */
#define FSCTL_READ_FILE_USN_DATA 0x000900EB /* BB add struct */
#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */
+#define FSCTL_MARK_HANDLE 0x000900FC /* BB add struct */
#define FSCTL_SIS_COPYFILE 0x00090100 /* BB add struct */
#define FSCTL_RECALL_FILE 0x00090117 /* BB add struct */
#define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */
+#define FSCTL_QUERY_ON_DISK_VOLUME_INFO 0x0009013C
#define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */
#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
#define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C
+#define FSCTL_QUERY_FILE_REGIONS 0x00090284
#define FSCTL_GET_REFS_VOLUME_DATA 0x000902D8 /* See MS-FSCC 2.3.24 */
#define FSCTL_SET_INTEGRITY_INFORMATION_EXT 0x00090380
#define FSCTL_GET_RETRIEVAL_POINTERS_AND_REFCOUNT 0x000903d3
#define FSCTL_GET_RETRIEVAL_POINTER_COUNT 0x0009042b
#define FSCTL_REFS_STREAM_SNAPSHOT_MANAGEMENT 0x00090440
#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF
+#define FSCTL_OFFLOAD_READ 0x00094264 /* BB add struct */
+#define FSCTL_OFFLOAD_WRITE 0x00098268 /* BB add struct */
#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
#define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */
#define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344
+#define FSCTL_DUPLICATE_EXTENTS_TO_FILE_EX 0x000983E8
#define FSCTL_SIS_LINK_FILES 0x0009C104
#define FSCTL_SET_INTEGRITY_INFORMATION 0x0009C280
#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */
diff --git a/fs/splice.c b/fs/splice.c
index 047b79db8eb5..0878b852b355 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -301,11 +301,9 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
{
struct iov_iter to;
struct kiocb kiocb;
- unsigned int i_head;
int ret;
iov_iter_pipe(&to, READ, pipe, len);
- i_head = to.head;
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
ret = call_read_iter(in, &kiocb, &to);
@@ -313,9 +311,8 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
*ppos = kiocb.ki_pos;
file_accessed(in);
} else if (ret < 0) {
- to.head = i_head;
- to.iov_offset = 0;
- iov_iter_advance(&to, 0); /* to free what was emitted */
+ /* free what was emitted */
+ pipe_discard_from(pipe, to.start_head);
/*
* callers of ->splice_read() expect -EAGAIN on
* "can't put anything in there", rather than -EFAULT.
@@ -814,17 +811,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
{
struct pipe_inode_info *pipe;
long ret, bytes;
- umode_t i_mode;
size_t len;
int i, flags, more;
/*
- * We require the input being a regular file, as we don't want to
- * randomly drop data for eg socket -> socket splicing. Use the
- * piped splicing for that!
+ * We require the input to be seekable, as we don't want to randomly
+ * drop data for e.g. socket -> socket splicing. Use the piped splicing
+ * for that!
*/
- i_mode = file_inode(in)->i_mode;
- if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
+ if (unlikely(!(in->f_mode & FMODE_LSEEK)))
return -EINVAL;
/*
@@ -1163,39 +1158,40 @@ static int iter_to_pipe(struct iov_iter *from,
};
size_t total = 0;
int ret = 0;
- bool failed = false;
- while (iov_iter_count(from) && !failed) {
+ while (iov_iter_count(from)) {
struct page *pages[16];
- ssize_t copied;
+ ssize_t left;
size_t start;
- int n;
+ int i, n;
- copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
- if (copied <= 0) {
- ret = copied;
+ left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
+ if (left <= 0) {
+ ret = left;
break;
}
- for (n = 0; copied; n++, start = 0) {
- int size = min_t(int, copied, PAGE_SIZE - start);
- if (!failed) {
- buf.page = pages[n];
- buf.offset = start;
- buf.len = size;
- ret = add_to_pipe(pipe, &buf);
- if (unlikely(ret < 0)) {
- failed = true;
- } else {
- iov_iter_advance(from, ret);
- total += ret;
- }
- } else {
- put_page(pages[n]);
+ n = DIV_ROUND_UP(left + start, PAGE_SIZE);
+ for (i = 0; i < n; i++) {
+ int size = min_t(int, left, PAGE_SIZE - start);
+
+ buf.page = pages[i];
+ buf.offset = start;
+ buf.len = size;
+ ret = add_to_pipe(pipe, &buf);
+ if (unlikely(ret < 0)) {
+ iov_iter_revert(from, left);
+ // this one got dropped by add_to_pipe()
+ while (++i < n)
+ put_page(pages[i]);
+ goto out;
}
- copied -= size;
+ total += ret;
+ left -= size;
+ start = 0;
}
}
+out:
return total ? total : ret;
}
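The key API difference driving this rewrite: iov_iter_get_pages2() advances the iterator itself, so the success path no longer calls iov_iter_advance(), and a mid-batch failure hands the unconsumed bytes back with iov_iter_revert(). In miniature (consume_pages() is a hypothetical stand-in for the add_to_pipe() loop above):

ssize_t got = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);

if (got > 0) {
	/* the iterator has already been advanced by 'got' bytes */
	ssize_t used = consume_pages(pages, got, start);	/* hypothetical */

	if (used < got)
		iov_iter_revert(from, got - used);	/* give back the tail */
}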
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 7bd9b8b856d0..477c89a519ee 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,9 +5,9 @@
obj-$(CONFIG_SQUASHFS) += squashfs.o
squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o decompressor.o
+squashfs-y += namei.o super.o symlink.o decompressor.o page_actor.o
squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o
-squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o
+squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o
squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 622c844f6d11..833aca92301f 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -34,12 +34,15 @@ static int copy_bio_to_actor(struct bio *bio,
struct squashfs_page_actor *actor,
int offset, int req_length)
{
- void *actor_addr = squashfs_first_page(actor);
+ void *actor_addr;
struct bvec_iter_all iter_all = {};
struct bio_vec *bvec = bvec_init_iter_all(&iter_all);
int copied_bytes = 0;
int actor_offset = 0;
+ squashfs_actor_nobuff(actor);
+ actor_addr = squashfs_first_page(actor);
+
if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all)))
return 0;
@@ -49,8 +52,9 @@ static int copy_bio_to_actor(struct bio *bio,
bytes_to_copy = min_t(int, bytes_to_copy,
req_length - copied_bytes);
- memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset,
- bytes_to_copy);
+ if (!IS_ERR(actor_addr))
+ memcpy(actor_addr + actor_offset, bvec_virt(bvec) +
+ offset, bytes_to_copy);
actor_offset += bytes_to_copy;
copied_bytes += bytes_to_copy;
@@ -86,17 +90,10 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
int error, i;
struct bio *bio;
- if (page_count <= BIO_MAX_VECS) {
- bio = bio_alloc(sb->s_bdev, page_count, REQ_OP_READ, GFP_NOIO);
- } else {
- bio = bio_kmalloc(GFP_NOIO, page_count);
- bio_set_dev(bio, sb->s_bdev);
- bio->bi_opf = REQ_OP_READ;
- }
-
+ bio = bio_kmalloc(page_count, GFP_NOIO);
if (!bio)
return -ENOMEM;
-
+ bio_init(bio, sb->s_bdev, bio->bi_inline_vecs, page_count, REQ_OP_READ);
bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT);
for (i = 0; i < page_count; ++i) {
@@ -126,7 +123,8 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length,
out_free_bio:
bio_free_pages(bio);
- bio_put(bio);
+ bio_uninit(bio);
+ kfree(bio);
return error;
}
@@ -190,7 +188,8 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
length |= data[0] << 8;
}
bio_free_pages(bio);
- bio_put(bio);
+ bio_uninit(bio);
+ kfree(bio);
compressed = SQUASHFS_COMPRESSED(length);
length = SQUASHFS_COMPRESSED_SIZE(length);
@@ -224,7 +223,8 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
out_free_bio:
bio_free_pages(bio);
- bio_put(bio);
+ bio_uninit(bio);
+ kfree(bio);
out:
if (res < 0) {
ERROR("Failed to read block 0x%llx: %d\n", index, res);
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 1b9ccfd0aa51..19ab60834389 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -20,6 +20,7 @@ struct squashfs_decompressor {
struct bio *, int, int, struct squashfs_page_actor *);
int id;
char *name;
+ int alloc_buffer;
int supported;
};
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 89d492916dea..e56510964b22 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -39,6 +39,7 @@
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
+#include "page_actor.h"
/*
* Locate cache slot in range [offset, index] for specified inode. If
@@ -444,8 +445,9 @@ static int squashfs_readpage_sparse(struct page *page, int expected)
return 0;
}
-static int squashfs_readpage(struct file *file, struct page *page)
+static int squashfs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
int index = page->index >> (msblk->block_log - PAGE_SHIFT);
@@ -453,7 +455,7 @@ static int squashfs_readpage(struct file *file, struct page *page)
int expected = index == file_end ?
(i_size_read(inode) & (msblk->block_size - 1)) :
msblk->block_size;
- int res;
+ int res = 0;
void *pageaddr;
TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
@@ -466,14 +468,15 @@ static int squashfs_readpage(struct file *file, struct page *page)
if (index < file_end || squashfs_i(inode)->fragment_block ==
SQUASHFS_INVALID_BLK) {
u64 block = 0;
- int bsize = read_blocklist(inode, index, &block);
- if (bsize < 0)
+
+ res = read_blocklist(inode, index, &block);
+ if (res < 0)
goto error_out;
- if (bsize == 0)
+ if (res == 0)
res = squashfs_readpage_sparse(page, expected);
else
- res = squashfs_readpage_block(page, block, bsize, expected);
+ res = squashfs_readpage_block(page, block, res, expected);
} else
res = squashfs_readpage_fragment(page, expected);
@@ -487,14 +490,144 @@ out:
memset(pageaddr, 0, PAGE_SIZE);
kunmap_atomic(pageaddr);
flush_dcache_page(page);
- if (!PageError(page))
+ if (res == 0)
SetPageUptodate(page);
unlock_page(page);
- return 0;
+ return res;
}
+static int squashfs_readahead_fragment(struct page **page,
+ unsigned int pages, unsigned int expected)
+{
+ struct inode *inode = page[0]->mapping->host;
+ struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb,
+ squashfs_i(inode)->fragment_block,
+ squashfs_i(inode)->fragment_size);
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
+
+ if (buffer->error)
+ goto out;
+
+ expected += squashfs_i(inode)->fragment_offset;
+
+ for (n = 0; n < pages; n++) {
+ unsigned int base = (page[n]->index & mask) << PAGE_SHIFT;
+ unsigned int offset = base + squashfs_i(inode)->fragment_offset;
+
+ if (expected > offset) {
+ unsigned int avail = min_t(unsigned int, expected -
+ offset, PAGE_SIZE);
+
+ squashfs_fill_page(page[n], buffer, offset, avail);
+ }
+
+ unlock_page(page[n]);
+ put_page(page[n]);
+ }
+
+out:
+ squashfs_cache_put(buffer);
+ return buffer->error;
+}
+
+static void squashfs_readahead(struct readahead_control *ractl)
+{
+ struct inode *inode = ractl->mapping->host;
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ size_t mask = (1UL << msblk->block_log) - 1;
+ unsigned short shift = msblk->block_log - PAGE_SHIFT;
+ loff_t start = readahead_pos(ractl) & ~mask;
+ size_t len = readahead_length(ractl) + readahead_pos(ractl) - start;
+ struct squashfs_page_actor *actor;
+ unsigned int nr_pages = 0;
+ struct page **pages;
+ int i, file_end = i_size_read(inode) >> msblk->block_log;
+ unsigned int max_pages = 1UL << shift;
+
+ readahead_expand(ractl, start, (len | mask) + 1);
+
+ pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL);
+ if (!pages)
+ return;
+
+ for (;;) {
+ pgoff_t index;
+ int res, bsize;
+ u64 block = 0;
+ unsigned int expected;
+
+ nr_pages = __readahead_batch(ractl, pages, max_pages);
+ if (!nr_pages)
+ break;
+
+ if (readahead_pos(ractl) >= i_size_read(inode))
+ goto skip_pages;
+
+ index = pages[0]->index >> shift;
+ if ((pages[nr_pages - 1]->index >> shift) != index)
+ goto skip_pages;
+
+ expected = index == file_end ?
+ (i_size_read(inode) & (msblk->block_size - 1)) :
+ msblk->block_size;
+
+ if (index == file_end && squashfs_i(inode)->fragment_block !=
+ SQUASHFS_INVALID_BLK) {
+ res = squashfs_readahead_fragment(pages, nr_pages,
+ expected);
+ if (res)
+ goto skip_pages;
+ continue;
+ }
+
+ bsize = read_blocklist(inode, index, &block);
+ if (bsize == 0)
+ goto skip_pages;
+
+ actor = squashfs_page_actor_init_special(msblk, pages, nr_pages,
+ expected);
+ if (!actor)
+ goto skip_pages;
+
+ res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
+
+ squashfs_page_actor_free(actor);
+
+ if (res == expected) {
+ int bytes;
+
+ /* Last page (if present) may have trailing bytes not filled */
+ bytes = res % PAGE_SIZE;
+ if (pages[nr_pages - 1]->index == file_end && bytes)
+ memzero_page(pages[nr_pages - 1], bytes,
+ PAGE_SIZE - bytes);
+
+ for (i = 0; i < nr_pages; i++) {
+ flush_dcache_page(pages[i]);
+ SetPageUptodate(pages[i]);
+ }
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+ }
+
+ kfree(pages);
+ return;
+
+skip_pages:
+ for (i = 0; i < nr_pages; i++) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+ kfree(pages);
+}
const struct address_space_operations squashfs_aops = {
- .readpage = squashfs_readpage
+ .read_folio = squashfs_read_folio,
+ .readahead = squashfs_readahead
};
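A worked example of the alignment math in squashfs_readahead() for the default 128K block size (the constants below follow directly from block_log = 17 and PAGE_SHIFT = 12):

size_t mask = (1UL << 17) - 1;			/* 0x1ffff */
unsigned short shift = 17 - 12;			/* 5 */
unsigned int max_pages = 1U << shift;		/* 32 pages per block */

/* A 16K readahead request at file offset 0x23000 ... */
loff_t start = 0x23000 & ~mask;			/* 0x20000: block start */
size_t len = 0x4000 + 0x23000 - start;		/* 0x7000 */
/* ... is expanded to cover the enclosing block exactly: */
/* readahead_expand(ractl, 0x20000, (0x7000 | 0x1ffff) + 1 == 0x20000) */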
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index a4894cc59447..f1ccad519e28 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -18,9 +18,6 @@
#include "squashfs.h"
#include "page_actor.h"
-static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
- int pages, struct page **page, int bytes);
-
/* Read separately compressed datablock directly into page cache */
int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
int expected)
@@ -33,7 +30,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
int start_index = target_page->index & ~mask;
int end_index = start_index | mask;
- int i, n, pages, missing_pages, bytes, res = -ENOMEM;
+ int i, n, pages, bytes, res = -ENOMEM;
struct page **page;
struct squashfs_page_actor *actor;
void *pageaddr;
@@ -47,50 +44,38 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
if (page == NULL)
return res;
- /*
- * Create a "page actor" which will kmap and kunmap the
- * page cache pages appropriately within the decompressor
- */
- actor = squashfs_page_actor_init_special(page, pages, 0);
- if (actor == NULL)
- goto out;
-
/* Try to grab all the pages covered by the Squashfs block */
- for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
+ for (i = 0, n = start_index; n <= end_index; n++) {
page[i] = (n == target_page->index) ? target_page :
grab_cache_page_nowait(target_page->mapping, n);
- if (page[i] == NULL) {
- missing_pages++;
+ if (page[i] == NULL)
continue;
- }
if (PageUptodate(page[i])) {
unlock_page(page[i]);
put_page(page[i]);
- page[i] = NULL;
- missing_pages++;
+ continue;
}
+
+ i++;
}
- if (missing_pages) {
- /*
- * Couldn't get one or more pages, this page has either
- * been VM reclaimed, but others are still in the page cache
- * and uptodate, or we're racing with another thread in
- * squashfs_readpage also trying to grab them. Fall back to
- * using an intermediate buffer.
- */
- res = squashfs_read_cache(target_page, block, bsize, pages,
- page, expected);
- if (res < 0)
- goto mark_errored;
+ pages = i;
+ /*
+ * Create a "page actor" which will kmap and kunmap the
+ * page cache pages appropriately within the decompressor
+ */
+ actor = squashfs_page_actor_init_special(msblk, page, pages, expected);
+ if (actor == NULL)
goto out;
- }
/* Decompress directly into the page cache buffers */
res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
+
+ squashfs_page_actor_free(actor);
+
if (res < 0)
goto mark_errored;
@@ -99,12 +84,12 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
goto mark_errored;
}
- /* Last page may have trailing bytes not filled */
+ /* Last page (if present) may have trailing bytes not filled */
bytes = res % PAGE_SIZE;
- if (bytes) {
- pageaddr = kmap_atomic(page[pages - 1]);
+ if (page[pages - 1]->index == end_index && bytes) {
+ pageaddr = kmap_local_page(page[pages - 1]);
memset(pageaddr + bytes, 0, PAGE_SIZE - bytes);
- kunmap_atomic(pageaddr);
+ kunmap_local(pageaddr);
}
/* Mark pages as uptodate, unlock and release */
@@ -116,7 +101,6 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
put_page(page[i]);
}
- kfree(actor);
kfree(page);
return 0;
@@ -135,40 +119,6 @@ mark_errored:
}
out:
- kfree(actor);
kfree(page);
return res;
}
-
-
-static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
- int pages, struct page **page, int bytes)
-{
- struct inode *i = target_page->mapping->host;
- struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
- block, bsize);
- int res = buffer->error, n, offset = 0;
-
- if (res) {
- ERROR("Unable to read page, block %llx, size %x\n", block,
- bsize);
- goto out;
- }
-
- for (n = 0; n < pages && bytes > 0; n++,
- bytes -= PAGE_SIZE, offset += PAGE_SIZE) {
- int avail = min_t(int, bytes, PAGE_SIZE);
-
- if (page[n] == NULL)
- continue;
-
- squashfs_fill_page(page[n], buffer, offset, avail);
- unlock_page(page[n]);
- if (page[n] != target_page)
- put_page(page[n]);
- }
-
-out:
- squashfs_cache_put(buffer);
- return res;
-}
diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c
index b685b6238316..49797729f143 100644
--- a/fs/squashfs/lz4_wrapper.c
+++ b/fs/squashfs/lz4_wrapper.c
@@ -119,10 +119,12 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
buff = stream->output;
while (data) {
if (bytes <= PAGE_SIZE) {
- memcpy(data, buff, bytes);
+ if (!IS_ERR(data))
+ memcpy(data, buff, bytes);
break;
}
- memcpy(data, buff, PAGE_SIZE);
+ if (!IS_ERR(data))
+ memcpy(data, buff, PAGE_SIZE);
buff += PAGE_SIZE;
bytes -= PAGE_SIZE;
data = squashfs_next_page(output);
@@ -139,5 +141,6 @@ const struct squashfs_decompressor squashfs_lz4_comp_ops = {
.decompress = lz4_uncompress,
.id = LZ4_COMPRESSION,
.name = "lz4",
+ .alloc_buffer = 0,
.supported = 1
};
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index cb510a631968..d216aeefa865 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -93,10 +93,12 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
buff = stream->output;
while (data) {
if (bytes <= PAGE_SIZE) {
- memcpy(data, buff, bytes);
+ if (!IS_ERR(data))
+ memcpy(data, buff, bytes);
break;
} else {
- memcpy(data, buff, PAGE_SIZE);
+ if (!IS_ERR(data))
+ memcpy(data, buff, PAGE_SIZE);
buff += PAGE_SIZE;
bytes -= PAGE_SIZE;
data = squashfs_next_page(output);
@@ -116,5 +118,6 @@ const struct squashfs_decompressor squashfs_lzo_comp_ops = {
.decompress = lzo_uncompress,
.id = LZO_COMPRESSION,
.name = "lzo",
+ .alloc_buffer = 0,
.supported = 1
};
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
index 520d323a99ce..54b93bf4a25c 100644
--- a/fs/squashfs/page_actor.c
+++ b/fs/squashfs/page_actor.c
@@ -7,6 +7,8 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
#include "page_actor.h"
/*
@@ -50,6 +52,7 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
actor->buffer = buffer;
actor->pages = pages;
actor->next_page = 0;
+ actor->tmp_buffer = NULL;
actor->squashfs_first_page = cache_first_page;
actor->squashfs_next_page = cache_next_page;
actor->squashfs_finish_page = cache_finish_page;
@@ -57,40 +60,72 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
}
/* Implementation of page_actor for decompressing directly into page cache. */
+static void *handle_next_page(struct squashfs_page_actor *actor)
+{
+ int max_pages = (actor->length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ if (actor->returned_pages == max_pages)
+ return NULL;
+
+ if ((actor->next_page == actor->pages) ||
+ (actor->next_index != actor->page[actor->next_page]->index)) {
+ actor->next_index++;
+ actor->returned_pages++;
+ return actor->alloc_buffer ? actor->tmp_buffer : ERR_PTR(-ENOMEM);
+ }
+
+ actor->next_index++;
+ actor->returned_pages++;
+ return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]);
+}
+
static void *direct_first_page(struct squashfs_page_actor *actor)
{
- actor->next_page = 1;
- return actor->pageaddr = kmap_atomic(actor->page[0]);
+ return handle_next_page(actor);
}
static void *direct_next_page(struct squashfs_page_actor *actor)
{
- if (actor->pageaddr)
- kunmap_atomic(actor->pageaddr);
+ if (actor->pageaddr) {
+ kunmap_local(actor->pageaddr);
+ actor->pageaddr = NULL;
+ }
- return actor->pageaddr = actor->next_page == actor->pages ? NULL :
- kmap_atomic(actor->page[actor->next_page++]);
+ return handle_next_page(actor);
}
static void direct_finish_page(struct squashfs_page_actor *actor)
{
if (actor->pageaddr)
- kunmap_atomic(actor->pageaddr);
+ kunmap_local(actor->pageaddr);
}
-struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
- int pages, int length)
+struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_info *msblk,
+ struct page **page, int pages, int length)
{
struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
if (actor == NULL)
return NULL;
+ if (msblk->decompressor->alloc_buffer) {
+ actor->tmp_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
+
+ if (actor->tmp_buffer == NULL) {
+ kfree(actor);
+ return NULL;
+ }
+ } else
+ actor->tmp_buffer = NULL;
+
actor->length = length ? : pages * PAGE_SIZE;
actor->page = page;
actor->pages = pages;
actor->next_page = 0;
+ actor->returned_pages = 0;
+ actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1);
actor->pageaddr = NULL;
+ actor->alloc_buffer = msblk->decompressor->alloc_buffer;
actor->squashfs_first_page = direct_first_page;
actor->squashfs_next_page = direct_next_page;
actor->squashfs_finish_page = direct_finish_page;
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 2e3073ace009..95ffbb543d91 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -6,63 +6,34 @@
* Phillip Lougher <phillip@squashfs.org.uk>
*/
-#ifndef CONFIG_SQUASHFS_FILE_DIRECT
-struct squashfs_page_actor {
- void **page;
- int pages;
- int length;
- int next_page;
-};
-
-static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page,
- int pages, int length)
-{
- struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
-
- if (actor == NULL)
- return NULL;
-
- actor->length = length ? : pages * PAGE_SIZE;
- actor->page = page;
- actor->pages = pages;
- actor->next_page = 0;
- return actor;
-}
-
-static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
-{
- actor->next_page = 1;
- return actor->page[0];
-}
-
-static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
-{
- return actor->next_page == actor->pages ? NULL :
- actor->page[actor->next_page++];
-}
-
-static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
-{
- /* empty */
-}
-#else
struct squashfs_page_actor {
union {
void **buffer;
struct page **page;
};
void *pageaddr;
+ void *tmp_buffer;
void *(*squashfs_first_page)(struct squashfs_page_actor *);
void *(*squashfs_next_page)(struct squashfs_page_actor *);
void (*squashfs_finish_page)(struct squashfs_page_actor *);
int pages;
int length;
int next_page;
+ int alloc_buffer;
+ int returned_pages;
+ pgoff_t next_index;
};
-extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int);
-extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page
- **, int, int);
+extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
+ int pages, int length);
+extern struct squashfs_page_actor *squashfs_page_actor_init_special(
+ struct squashfs_sb_info *msblk,
+ struct page **page, int pages, int length);
+static inline void squashfs_page_actor_free(struct squashfs_page_actor *actor)
+{
+ kfree(actor->tmp_buffer);
+ kfree(actor);
+}
static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
{
return actor->squashfs_first_page(actor);
@@ -75,5 +46,8 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
{
actor->squashfs_finish_page(actor);
}
-#endif
+static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor)
+{
+ actor->alloc_buffer = 0;
+}
#endif
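With the CONFIG_SQUASHFS_FILE_DIRECT split removed, every decompressor wrapper now iterates output pages through the actor and must handle ERR_PTR returns, which signal a page-cache page that could not be grabbed with no bounce buffer to stand in for it. The canonical loop shape, matching the xz, zlib and zstd wrapper changes elsewhere in this series (sketch, not verbatim from any wrapper):

void *out = squashfs_first_page(actor);
int error = 0;

for (; out != NULL; out = squashfs_next_page(actor)) {
	if (IS_ERR(out)) {
		error = PTR_ERR(out);	/* -ENOMEM: missing page, no buffer */
		break;
	}
	/* ... emit up to PAGE_SIZE of decompressed data into out ... */
}
squashfs_finish_page(actor);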
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 4f74abbc1a54..32565dafa7f3 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -29,7 +29,6 @@
#include <linux/module.h>
#include <linux/magic.h>
#include <linux/xattr.h>
-#include <linux/backing-dev.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
@@ -113,24 +112,6 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem(
return decompressor;
}
-static int squashfs_bdi_init(struct super_block *sb)
-{
- int err;
- unsigned int major = MAJOR(sb->s_dev);
- unsigned int minor = MINOR(sb->s_dev);
-
- bdi_put(sb->s_bdi);
- sb->s_bdi = &noop_backing_dev_info;
-
- err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor);
- if (err)
- return err;
-
- sb->s_bdi->ra_pages = 0;
- sb->s_bdi->io_pages = 0;
-
- return 0;
-}
static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
@@ -146,20 +127,6 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
TRACE("Entered squashfs_fill_superblock\n");
- /*
- * squashfs provides 'backing_dev_info' in order to disable read-ahead. For
- * squashfs, I/O is not deferred, it is done immediately in readpage,
- * which means the user would always have to wait their own I/O. So the effect
- * of readahead is very weak for squashfs. squashfs_bdi_init will set
- * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and close readahead for
- * squashfs.
- */
- err = squashfs_bdi_init(sb);
- if (err) {
- errorf(fc, "squashfs init bdi failed");
- return err;
- }
-
sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
if (sb->s_fs_info == NULL) {
ERROR("Failed to allocate squashfs_sb_info\n");
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 1430613183e6..2bf977a52c2c 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -30,8 +30,9 @@
#include "squashfs.h"
#include "xattr.h"
-static int squashfs_symlink_readpage(struct file *file, struct page *page)
+static int squashfs_symlink_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
struct squashfs_sb_info *msblk = sb->s_fs_info;
@@ -101,7 +102,7 @@ error_out:
const struct address_space_operations squashfs_symlink_aops = {
- .readpage = squashfs_symlink_readpage
+ .read_folio = squashfs_symlink_read_folio
};
const struct inode_operations squashfs_symlink_inode_ops = {
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 68f6d09bb3a2..6c49481a2f8c 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -131,6 +131,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
stream->buf.out_pos = 0;
stream->buf.out_size = PAGE_SIZE;
stream->buf.out = squashfs_first_page(output);
+ if (IS_ERR(stream->buf.out)) {
+ error = PTR_ERR(stream->buf.out);
+ goto finish;
+ }
for (;;) {
enum xz_ret xz_err;
@@ -156,7 +160,10 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
if (stream->buf.out_pos == stream->buf.out_size) {
stream->buf.out = squashfs_next_page(output);
- if (stream->buf.out != NULL) {
+ if (IS_ERR(stream->buf.out)) {
+ error = PTR_ERR(stream->buf.out);
+ break;
+ } else if (stream->buf.out != NULL) {
stream->buf.out_pos = 0;
total += PAGE_SIZE;
}
@@ -171,6 +178,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
}
+finish:
squashfs_finish_page(output);
return error ? error : total + stream->buf.out_pos;
@@ -183,5 +191,6 @@ const struct squashfs_decompressor squashfs_xz_comp_ops = {
.decompress = squashfs_xz_uncompress,
.id = XZ_COMPRESSION,
.name = "xz",
+ .alloc_buffer = 1,
.supported = 1
};
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index a20e9042146b..cbb7afe7bc46 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -62,6 +62,11 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
stream->next_out = squashfs_first_page(output);
stream->avail_in = 0;
+ if (IS_ERR(stream->next_out)) {
+ error = PTR_ERR(stream->next_out);
+ goto finish;
+ }
+
for (;;) {
int zlib_err;
@@ -85,7 +90,10 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
if (stream->avail_out == 0) {
stream->next_out = squashfs_next_page(output);
- if (stream->next_out != NULL)
+ if (IS_ERR(stream->next_out)) {
+ error = PTR_ERR(stream->next_out);
+ break;
+ } else if (stream->next_out != NULL)
stream->avail_out = PAGE_SIZE;
}
@@ -107,6 +115,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
}
+finish:
squashfs_finish_page(output);
if (!error)
@@ -122,6 +131,7 @@ const struct squashfs_decompressor squashfs_zlib_comp_ops = {
.decompress = zlib_uncompress,
.id = ZLIB_COMPRESSION,
.name = "zlib",
+ .alloc_buffer = 1,
.supported = 1
};
diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c
index c40445dbf38c..0e407c4d8b3b 100644
--- a/fs/squashfs/zstd_wrapper.c
+++ b/fs/squashfs/zstd_wrapper.c
@@ -80,6 +80,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
out_buf.size = PAGE_SIZE;
out_buf.dst = squashfs_first_page(output);
+ if (IS_ERR(out_buf.dst)) {
+ error = PTR_ERR(out_buf.dst);
+ goto finish;
+ }
for (;;) {
size_t zstd_err;
@@ -104,7 +108,10 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
if (out_buf.pos == out_buf.size) {
out_buf.dst = squashfs_next_page(output);
- if (out_buf.dst == NULL) {
+ if (IS_ERR(out_buf.dst)) {
+ error = PTR_ERR(out_buf.dst);
+ break;
+ } else if (out_buf.dst == NULL) {
/* Shouldn't run out of pages
* before stream is done.
*/
@@ -129,6 +136,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm,
}
}
+finish:
+
squashfs_finish_page(output);
return error ? error : total_out;
@@ -140,5 +149,6 @@ const struct squashfs_decompressor squashfs_zstd_comp_ops = {
.decompress = zstd_uncompress,
.id = ZSTD_COMPRESSION,
.name = "zstd",
+ .alloc_buffer = 1,
.supported = 1
};
diff --git a/fs/stat.c b/fs/stat.c
index 7f734be0e57e..ef50573c72a2 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -5,6 +5,7 @@
* Copyright (C) 1991, 1992 Linus Torvalds
*/
+#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/errno.h>
@@ -230,11 +231,22 @@ retry:
goto out;
error = vfs_getattr(&path, stat, request_mask, flags);
+
stat->mnt_id = real_mount(path.mnt)->mnt_id;
stat->result_mask |= STATX_MNT_ID;
+
if (path.mnt->mnt_root == path.dentry)
stat->attributes |= STATX_ATTR_MOUNT_ROOT;
stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
+
+ /* Handle STATX_DIOALIGN for block devices. */
+ if (request_mask & STATX_DIOALIGN) {
+ struct inode *inode = d_backing_inode(path.dentry);
+
+ if (S_ISBLK(inode->i_mode))
+ bdev_statx_dioalign(inode, stat);
+ }
+
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
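For reference, the two new statx fields are consumed from userspace roughly as follows; a hedged sketch that assumes kernel and libc headers new enough to define STATX_DIOALIGN:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc < 2 || statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx)) {
		perror("statx");
		return 1;
	}
	/* The kernel only fills these in when it can report DIO alignment */
	if (stx.stx_mask & STATX_DIOALIGN)
		printf("dio mem align %u, dio offset align %u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	return 0;
}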
@@ -348,9 +360,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
# define choose_32_64(a,b) b
#endif
-#define valid_dev(x) choose_32_64(old_valid_dev(x),true)
-#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
-
#ifndef INIT_STRUCT_STAT_PADDING
# define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
#endif
@@ -359,7 +368,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
{
struct stat tmp;
- if (!valid_dev(stat->dev) || !valid_dev(stat->rdev))
+ if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
+ return -EOVERFLOW;
+ if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
return -EOVERFLOW;
#if BITS_PER_LONG == 32
if (stat->size > MAX_NON_LFS)
@@ -367,7 +378,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
#endif
INIT_STRUCT_STAT_PADDING(tmp);
- tmp.st_dev = encode_dev(stat->dev);
+ tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
@@ -377,7 +388,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
return -EOVERFLOW;
SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
- tmp.st_rdev = encode_dev(stat->rdev);
+ tmp.st_rdev = new_encode_dev(stat->rdev);
tmp.st_size = stat->size;
tmp.st_atime = stat->atime.tv_sec;
tmp.st_mtime = stat->mtime.tv_sec;
@@ -612,6 +623,8 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_dev_major = MAJOR(stat->dev);
tmp.stx_dev_minor = MINOR(stat->dev);
tmp.stx_mnt_id = stat->mnt_id;
+ tmp.stx_dio_mem_align = stat->dio_mem_align;
+ tmp.stx_dio_offset_align = stat->dio_offset_align;
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
@@ -660,16 +673,18 @@ SYSCALL_DEFINE5(statx,
return ret;
}
-#ifdef CONFIG_COMPAT
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_STAT)
static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
{
struct compat_stat tmp;
- if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+ if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
+ return -EOVERFLOW;
+ if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
return -EOVERFLOW;
memset(&tmp, 0, sizeof(tmp));
- tmp.st_dev = old_encode_dev(stat->dev);
+ tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
@@ -679,7 +694,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
return -EOVERFLOW;
SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
- tmp.st_rdev = old_encode_dev(stat->rdev);
+ tmp.st_rdev = new_encode_dev(stat->rdev);
if ((u64) stat->size > MAX_NON_LFS)
return -EOVERFLOW;
tmp.st_size = stat->size;
diff --git a/fs/super.c b/fs/super.c
index f1d4a193602d..6a82660e1adb 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -265,7 +265,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
s->s_shrink.count_objects = super_cache_count;
s->s_shrink.batch = 1024;
s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
- if (prealloc_shrinker(&s->s_shrink))
+ if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name))
goto fail;
if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
goto fail;
@@ -291,7 +291,6 @@ static void __put_super(struct super_block *s)
WARN_ON(s->s_inode_lru.node);
WARN_ON(!list_empty(&s->s_mounts));
security_sb_free(s);
- fscrypt_sb_free(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
call_rcu(&s->rcu, destroy_super_rcu);
@@ -423,6 +422,35 @@ bool trylock_super(struct super_block *sb)
}
/**
+ * retire_super - prevents superblock from being reused
+ * @sb: superblock to retire
+ *
+ * The function marks the superblock to be ignored in the superblock test,
+ * which prevents it from being reused for any new mounts. If the superblock
+ * has a private bdi, it also unregisters it, but does not reduce the
+ * superblock's refcount, to avoid potential races; the refcount is reduced
+ * by generic_shutdown_super(). The function cannot be called concurrently
+ * with generic_shutdown_super(). It is safe to call the function multiple
+ * times; subsequent calls have no effect.
+ *
+ * The marker will affect the re-use only for block-device-based
+ * superblocks. Other superblocks will still get marked if this function
+ * is used, but that will not affect their reusability.
+ */
+void retire_super(struct super_block *sb)
+{
+ WARN_ON(!sb->s_bdev);
+ down_write(&sb->s_umount);
+ if (sb->s_iflags & SB_I_PERSB_BDI) {
+ bdi_unregister(sb->s_bdi);
+ sb->s_iflags &= ~SB_I_PERSB_BDI;
+ }
+ sb->s_iflags |= SB_I_RETIRED;
+ up_write(&sb->s_umount);
+}
+EXPORT_SYMBOL(retire_super);
+
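A hedged sketch of a caller: a filesystem's shutdown-style path could retire its superblock so that test_bdev_super()/test_bdev_super_fc() below stop matching it (example_shutdown is a hypothetical name):

/* Hypothetical caller: mark the superblock dead so the same block device
 * gets a fresh superblock on the next mount attempt.
 */
static void example_shutdown(struct super_block *sb)
{
	retire_super(sb);
}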
+/**
* generic_shutdown_super - common helper for ->kill_sb()
* @sb: superblock to kill
*
@@ -451,6 +479,7 @@ void generic_shutdown_super(struct super_block *sb)
evict_inodes(sb);
/* only nonzero refcount inodes can have marks */
fsnotify_sb_delete(sb);
+ fscrypt_sb_delete(sb);
security_sb_delete(sb);
if (sb->s_dio_done_wq) {
@@ -1204,7 +1233,7 @@ static int set_bdev_super(struct super_block *s, void *data)
s->s_dev = s->s_bdev->bd_dev;
s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi);
- if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue))
+ if (bdev_stable_writes(s->s_bdev))
s->s_iflags |= SB_I_STABLE_WRITES;
return 0;
}
@@ -1216,7 +1245,7 @@ static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
{
- return s->s_bdev == fc->sget_key;
+ return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key;
}
/**
@@ -1288,6 +1317,8 @@ int get_tree_bdev(struct fs_context *fc,
} else {
s->s_mode = mode;
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
+ fc->fs_type->name, s->s_id);
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, fc);
if (error) {
@@ -1307,7 +1338,7 @@ EXPORT_SYMBOL(get_tree_bdev);
static int test_bdev_super(struct super_block *s, void *data)
{
- return (void *)s->s_bdev == data;
+ return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data;
}
struct dentry *mount_bdev(struct file_system_type *fs_type,
@@ -1363,6 +1394,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
} else {
s->s_mode = mode;
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
+ fs_type->name, s->s_id);
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
if (error) {
diff --git a/fs/sync.c b/fs/sync.c
index c7690016453e..dc725914e1ed 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -373,6 +373,15 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
return ksys_sync_file_range(fd, offset, nbytes, flags);
}
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_SYNC_FILE_RANGE)
+COMPAT_SYSCALL_DEFINE6(sync_file_range, int, fd, compat_arg_u64_dual(offset),
+ compat_arg_u64_dual(nbytes), unsigned int, flags)
+{
+ return ksys_sync_file_range(fd, compat_arg_u64_glue(offset),
+ compat_arg_u64_glue(nbytes), flags);
+}
+#endif
+
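Conceptually, compat_arg_u64_dual() expands one u64 argument into two u32 syscall arguments and compat_arg_u64_glue() reassembles them. A simplified sketch of the little-endian case only (the real macros in <linux/compat.h> also handle big-endian ordering):

/* Simplified sketch, little-endian ordering only */
#define compat_arg_u64(name)      u32 name##_lo, u32 name##_hi
#define compat_arg_u64_glue(name) (((u64)name##_lo) | ((u64)name##_hi << 32))

/* The handler above therefore effectively receives
 *   (int fd, u32 offset_lo, u32 offset_hi, u32 nbytes_lo, u32 nbytes_hi,
 *    unsigned int flags)
 * and rebuilds offset and nbytes before calling ksys_sync_file_range().
 */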
/* It would be nice if people remembered that not all the world's an i386
when they introduce new system calls */
SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 42dcf96881b6..a12ac0356c69 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -703,19 +703,6 @@ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid)
ktype = get_ktype(kobj);
if (ktype) {
- struct attribute **kattr;
-
- /*
- * Change owner of the default attributes associated with the
- * ktype of @kobj.
- */
- for (kattr = ktype->default_attrs; kattr && *kattr; kattr++) {
- error = sysfs_file_change_owner(kobj, (*kattr)->name,
- kuid, kgid);
- if (error)
- return error;
- }
-
/*
* Change owner of the default groups associated with the
* ktype of @kobj.
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 409ab5e17803..d4ec9bb97de9 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -456,9 +456,9 @@ static int sysv_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page,get_block,wbc);
}
-static int sysv_readpage(struct file *file, struct page *page)
+static int sysv_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page,get_block);
+ return block_read_full_folio(folio, get_block);
}
int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
@@ -477,12 +477,12 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to)
}
static int sysv_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep, get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, get_block);
if (unlikely(ret))
sysv_write_failed(mapping, pos + len);
@@ -497,7 +497,7 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations sysv_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = sysv_readpage,
+ .read_folio = sysv_read_folio,
.writepage = sysv_writepage,
.write_begin = sysv_write_begin,
.write_end = generic_write_end,
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index d1def0771a40..3365a30dc1e0 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -312,7 +312,9 @@ static int complete_read_super(struct super_block *sb, int silent, int size)
sbi->s_firstinodezone = 2;
flavour_setup[sbi->s_type](sbi, &sb->s_max_links);
-
+ if (sbi->s_firstdatazone < sbi->s_firstinodezone)
+ return 0;
+
sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone;
sbi->s_inodes_per_block = bsize >> 6;
sbi->s_inodes_per_block_1 = (bsize >> 6)-1;
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index de7252715b12..da85b3979195 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -141,6 +141,8 @@ struct tracefs_mount_opts {
kuid_t uid;
kgid_t gid;
umode_t mode;
+ /* Opt_* bitfield. */
+ unsigned int opts;
};
enum {
@@ -241,6 +243,7 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
kgid_t gid;
char *p;
+ opts->opts = 0;
opts->mode = TRACEFS_DEFAULT_MODE;
while ((p = strsep(&data, ",")) != NULL) {
@@ -275,24 +278,36 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
* but traditionally tracefs has ignored all mount options
*/
}
+
+ opts->opts |= BIT(token);
}
return 0;
}
-static int tracefs_apply_options(struct super_block *sb)
+static int tracefs_apply_options(struct super_block *sb, bool remount)
{
struct tracefs_fs_info *fsi = sb->s_fs_info;
struct inode *inode = d_inode(sb->s_root);
struct tracefs_mount_opts *opts = &fsi->mount_opts;
- inode->i_mode &= ~S_IALLUGO;
- inode->i_mode |= opts->mode;
+ /*
+ * On remount, only reset mode/uid/gid if they were provided as mount
+ * options.
+ */
+
+ if (!remount || opts->opts & BIT(Opt_mode)) {
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= opts->mode;
+ }
- inode->i_uid = opts->uid;
+ if (!remount || opts->opts & BIT(Opt_uid))
+ inode->i_uid = opts->uid;
- /* Set all the group ids to the mount option */
- set_gid(sb->s_root, opts->gid);
+ if (!remount || opts->opts & BIT(Opt_gid)) {
+ /* Set all the group ids to the mount option */
+ set_gid(sb->s_root, opts->gid);
+ }
return 0;
}
@@ -307,7 +322,7 @@ static int tracefs_remount(struct super_block *sb, int *flags, char *data)
if (err)
goto fail;
- tracefs_apply_options(sb);
+ tracefs_apply_options(sb, true);
fail:
return err;
@@ -359,7 +374,7 @@ static int trace_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &tracefs_super_operations;
- tracefs_apply_options(sb);
+ tracefs_apply_options(sb, false);
return 0;
@@ -553,7 +568,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
*
* Only one instances directory is allowed.
*
- * The instances directory is special as it allows for mkdir and rmdir to
+ * The instances directory is special as it allows for mkdir and rmdir
* to be done by userspace. When a mkdir or rmdir is performed, the inode
* locks are released and the methods passed in (@mkdir and @rmdir) are
* called without locks and with the name of the directory being created
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index c0b84e960b20..e8b9b756f0ac 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -65,7 +65,7 @@ static void shrink_liability(struct ubifs_info *c, int nr_to_write)
*/
static int run_gc(struct ubifs_info *c)
{
- int err, lnum;
+ int lnum;
/* Make some free space by garbage-collecting dirty space */
down_read(&c->commit_sem);
@@ -76,10 +76,7 @@ static int run_gc(struct ubifs_info *c)
/* GC freed one LEB, return it to lprops */
dbg_budg("GC freed LEB %d", lnum);
- err = ubifs_return_leb(c, lnum);
- if (err)
- return err;
- return 0;
+ return ubifs_return_leb(c, lnum);
}
/**
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0383fbdc95ff..f2353dd676ef 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -31,9 +31,9 @@
* in the "sys_write -> alloc_pages -> direct reclaim path". So, in
* 'ubifs_writepage()' we are only guaranteed that the page is locked.
*
- * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
+ * Similarly, @i_mutex is not always locked in 'ubifs_read_folio()', e.g., the
* read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
- * ondemand_readahead -> readpage"). In case of readahead, @I_SYNC flag is not
+ * ondemand_readahead -> read_folio"). In case of readahead, @I_SYNC flag is not
* set either. However, UBIFS disables readahead.
*/
@@ -215,8 +215,7 @@ static void release_existing_page_budget(struct ubifs_info *c)
}
static int write_begin_slow(struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep,
- unsigned flags)
+ loff_t pos, unsigned len, struct page **pagep)
{
struct inode *inode = mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -244,7 +243,7 @@ static int write_begin_slow(struct address_space *mapping,
if (unlikely(err))
return err;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (unlikely(!page)) {
ubifs_release_budget(c, &req);
return -ENOMEM;
@@ -419,7 +418,7 @@ static int allocate_budget(struct ubifs_info *c, struct page *page,
* without forcing write-back. The slow path does not make this assumption.
*/
static int ubifs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
@@ -437,7 +436,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
return -EROFS;
/* Try out the fast-path part first */
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (unlikely(!page))
return -ENOMEM;
@@ -493,7 +492,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
unlock_page(page);
put_page(page);
- return write_begin_slow(mapping, pos, len, pagep, flags);
+ return write_begin_slow(mapping, pos, len, pagep);
}
/*
@@ -890,12 +889,14 @@ out_unlock:
return err;
}
-static int ubifs_readpage(struct file *file, struct page *page)
+static int ubifs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
+
if (ubifs_bulk_read(page))
return 0;
do_readpage(page);
- unlock_page(page);
+ folio_unlock(folio);
return 0;
}
@@ -1460,45 +1461,22 @@ static bool ubifs_dirty_folio(struct address_space *mapping,
return ret;
}
-#ifdef CONFIG_MIGRATION
-static int ubifs_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
-{
- int rc;
-
- rc = migrate_page_move_mapping(mapping, newpage, page, 0);
- if (rc != MIGRATEPAGE_SUCCESS)
- return rc;
-
- if (PagePrivate(page)) {
- detach_page_private(page);
- attach_page_private(newpage, (void *)1);
- }
-
- if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
- else
- migrate_page_states(newpage, page);
- return MIGRATEPAGE_SUCCESS;
-}
-#endif
-
-static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
+static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
/*
* An attempt to release a dirty page without budgeting for it - should
* not happen.
*/
- if (PageWriteback(page))
- return 0;
- ubifs_assert(c, PagePrivate(page));
+ if (folio_test_writeback(folio))
+ return false;
+ ubifs_assert(c, folio_test_private(folio));
ubifs_assert(c, 0);
- detach_page_private(page);
- ClearPageChecked(page);
- return 1;
+ folio_detach_private(folio);
+ folio_clear_checked(folio);
+ return true;
}
/*
@@ -1642,16 +1620,14 @@ static int ubifs_symlink_getattr(struct user_namespace *mnt_userns,
}
const struct address_space_operations ubifs_file_address_operations = {
- .readpage = ubifs_readpage,
+ .read_folio = ubifs_read_folio,
.writepage = ubifs_writepage,
.write_begin = ubifs_write_begin,
.write_end = ubifs_write_end,
.invalidate_folio = ubifs_invalidate_folio,
.dirty_folio = ubifs_dirty_folio,
-#ifdef CONFIG_MIGRATION
- .migratepage = ubifs_migrate_page,
-#endif
- .releasepage = ubifs_releasepage,
+ .migrate_folio = filemap_migrate_folio,
+ .release_folio = ubifs_release_folio,
};
const struct inode_operations ubifs_file_inode_operations = {
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index bad67455215f..d0c9a09988bc 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2191,7 +2191,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
/*
* UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
- * UBIFS, I/O is not deferred, it is done immediately in readpage,
+ * UBIFS, I/O is not deferred; it is done immediately in read_folio,
* which means the user would have to wait not just for their own I/O
* but for the read-ahead I/O as well, i.e. it is completely pointless.
*
@@ -2430,7 +2430,7 @@ static int __init ubifs_init(void)
if (!ubifs_inode_slab)
return -ENOMEM;
- err = register_shrinker(&ubifs_shrinker_info);
+ err = register_shrinker(&ubifs_shrinker_info, "ubifs-slab");
if (err)
goto out_slab;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 008fa46ef61e..7d6d2f152e03 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -132,7 +132,7 @@
#define WORST_COMPR_FACTOR 2
#ifdef CONFIG_FS_ENCRYPTION
-#define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE
+#define UBIFS_CIPHER_BLOCK_SIZE FSCRYPT_CONTENTS_ALIGNMENT
#else
#define UBIFS_CIPHER_BLOCK_SIZE 0
#endif
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index e4f193eae4b2..e4c4761aff7f 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -677,7 +677,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode,
int err;
err = security_inode_init_security(inode, dentry, qstr,
- &init_xattrs, 0);
+ &init_xattrs, NULL);
if (err) {
struct ubifs_info *c = dentry->i_sb->s_fs_info;
ubifs_err(c, "cannot initialize security for inode %lu, error %d",
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 42e3e551fa4c..cad3772f9dbe 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -130,7 +130,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
brelse(tmp);
}
if (num) {
- ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha);
+ ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha);
for (i = 0; i < num; i++)
brelse(bha[i]);
}
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 73720320f0ab..a2adf6293093 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -89,7 +89,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
brelse(tmp);
}
if (num) {
- ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha);
+ ll_rw_block(REQ_OP_READ | REQ_RAHEAD, num, bha);
for (i = 0; i < num; i++)
brelse(bha[i]);
}
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 0f6bf2504437..5c659e23e578 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -57,11 +57,11 @@ static void __udf_adinicb_readpage(struct page *page)
kunmap_atomic(kaddr);
}
-static int udf_adinicb_readpage(struct file *file, struct page *page)
+static int udf_adinicb_read_folio(struct file *file, struct folio *folio)
{
- BUG_ON(!PageLocked(page));
- __udf_adinicb_readpage(page);
- unlock_page(page);
+ BUG_ON(!folio_test_locked(folio));
+ __udf_adinicb_readpage(&folio->page);
+ folio_unlock(folio);
return 0;
}
@@ -87,14 +87,14 @@ static int udf_adinicb_writepage(struct page *page,
static int udf_adinicb_write_begin(struct file *file,
struct address_space *mapping, loff_t pos,
- unsigned len, unsigned flags, struct page **pagep,
+ unsigned len, struct page **pagep,
void **fsdata)
{
struct page *page;
if (WARN_ON_ONCE(pos >= PAGE_SIZE))
return -EIO;
- page = grab_cache_page_write_begin(mapping, 0, flags);
+ page = grab_cache_page_write_begin(mapping, 0);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -127,7 +127,7 @@ static int udf_adinicb_write_end(struct file *file, struct address_space *mappin
const struct address_space_operations udf_adinicb_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = udf_adinicb_readpage,
+ .read_folio = udf_adinicb_read_folio,
.writepage = udf_adinicb_writepage,
.write_begin = udf_adinicb_write_begin,
.write_end = udf_adinicb_write_end,
@@ -252,6 +252,7 @@ const struct file_operations udf_file_operations = {
.release = udf_release_file,
.fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index ca4fa710e562..8d06daed549f 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -193,9 +193,9 @@ static int udf_writepages(struct address_space *mapping,
return mpage_writepages(mapping, wbc, udf_get_block);
}
-static int udf_readpage(struct file *file, struct page *page)
+static int udf_read_folio(struct file *file, struct folio *folio)
{
- return mpage_readpage(page, udf_get_block);
+ return mpage_read_folio(folio, udf_get_block);
}
static void udf_readahead(struct readahead_control *rac)
@@ -204,12 +204,12 @@ static void udf_readahead(struct readahead_control *rac)
}
static int udf_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
+ ret = block_write_begin(mapping, pos, len, pagep, udf_get_block);
if (unlikely(ret))
udf_write_failed(mapping, pos + len);
return ret;
@@ -237,7 +237,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations udf_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = udf_readpage,
+ .read_folio = udf_read_folio,
.readahead = udf_readahead,
.writepage = udf_writepage,
.writepages = udf_writepages,
@@ -1214,7 +1214,7 @@ struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block,
if (buffer_uptodate(bh))
return bh;
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 1, &bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 0ed4861b038f..b3d5f97f16cd 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -75,11 +75,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
if (fileident) {
if (adinicb || (offset + lfi < 0)) {
- memcpy(udf_get_fi_ident(sfi), fileident, lfi);
+ memcpy(sfi->impUse + liu, fileident, lfi);
} else if (offset >= 0) {
memcpy(fibh->ebh->b_data + offset, fileident, lfi);
} else {
- memcpy(udf_get_fi_ident(sfi), fileident, -offset);
+ memcpy(sfi->impUse + liu, fileident, -offset);
memcpy(fibh->ebh->b_data, fileident - offset,
lfi + offset);
}
@@ -88,11 +88,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
offset += lfi;
if (adinicb || (offset + padlen < 0)) {
- memset(udf_get_fi_ident(sfi) + lfi, 0x00, padlen);
+ memset(sfi->impUse + liu + lfi, 0x00, padlen);
} else if (offset >= 0) {
memset(fibh->ebh->b_data + offset, 0x00, padlen);
} else {
- memset(udf_get_fi_ident(sfi) + lfi, 0x00, -offset);
+ memset(sfi->impUse + liu + lfi, 0x00, -offset);
memset(fibh->ebh->b_data, 0x00, padlen + offset);
}
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 9b223421a3c5..f3642f9c23f8 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -101,8 +101,9 @@ static int udf_pc_to_char(struct super_block *sb, unsigned char *from,
return 0;
}
-static int udf_symlink_filler(struct file *file, struct page *page)
+static int udf_symlink_filler(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
struct buffer_head *bh = NULL;
unsigned char *symlink;
@@ -183,7 +184,7 @@ static int udf_symlink_getattr(struct user_namespace *mnt_userns,
* symlinks can't do much...
*/
const struct address_space_operations udf_symlink_aops = {
- .readpage = udf_symlink_filler,
+ .read_folio = udf_symlink_filler,
};
const struct inode_operations udf_symlink_inode_operations = {
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 075d3d9114c8..bd810d8239f2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -296,7 +296,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
if (!buffer_mapped(bh))
map_bh(bh, inode->i_sb, oldb + pos);
if (!buffer_uptodate(bh)) {
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+ ll_rw_block(REQ_OP_READ, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ufs_error(inode->i_sb, __func__,
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index b721d0bda5e5..391efaf1d528 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -193,7 +193,7 @@ static struct page *ufs_get_page(struct inode *dir, unsigned long n)
if (!IS_ERR(page)) {
kmap(page);
if (unlikely(!PageChecked(page))) {
- if (PageError(page) || !ufs_check_page(page))
+ if (!ufs_check_page(page))
goto fail;
}
}
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index d0dda01620f0..a873de7dec1c 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -390,7 +390,7 @@ out:
/**
* ufs_getfrag_block() - `get_block_t' function, interface between UFS and
- * readpage, writepage and so on
+ * read_folio, writepage and so on
*/
static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
@@ -472,9 +472,9 @@ static int ufs_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page,ufs_getfrag_block,wbc);
}
-static int ufs_readpage(struct file *file, struct page *page)
+static int ufs_read_folio(struct file *file, struct folio *folio)
{
- return block_read_full_page(page,ufs_getfrag_block);
+ return block_read_full_folio(folio, ufs_getfrag_block);
}
int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
@@ -495,13 +495,12 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to)
}
static int ufs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, flags, pagep,
- ufs_getfrag_block);
+ ret = block_write_begin(mapping, pos, len, pagep, ufs_getfrag_block);
if (unlikely(ret))
ufs_write_failed(mapping, pos + len);
@@ -528,7 +527,7 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
const struct address_space_operations ufs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .readpage = ufs_readpage,
+ .read_folio = ufs_read_folio,
.writepage = ufs_writepage,
.write_begin = ufs_write_begin,
.write_end = ufs_write_end,
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 4fa633f84274..08ddf41eaaad 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -264,17 +264,6 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
put_page(page);
return NULL;
}
-
- if (!PageUptodate(page) || PageError(page)) {
- unlock_page(page);
- put_page(page);
-
- printk(KERN_ERR "ufs_change_blocknr: "
- "can not read page: ino %lu, index: %lu\n",
- inode->i_ino, index);
-
- return ERR_PTR(-EIO);
- }
}
if (!page_has_buffers(page))
create_empty_buffers(page, 1 << inode->i_blkbits, 0);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index aa0c47cb0d16..0c1d33c4f74c 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -29,6 +29,7 @@
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
+#include <linux/swapops.h>
int sysctl_unprivileged_userfaultfd __read_mostly;
@@ -191,17 +192,19 @@ static inline void msg_init(struct uffd_msg *msg)
}
static inline struct uffd_msg userfault_msg(unsigned long address,
+ unsigned long real_address,
unsigned int flags,
unsigned long reason,
unsigned int features)
{
struct uffd_msg msg;
+
msg_init(&msg);
msg.event = UFFD_EVENT_PAGEFAULT;
- if (!(features & UFFD_FEATURE_EXACT_ADDRESS))
- address &= PAGE_MASK;
- msg.arg.pagefault.address = address;
+ msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
+ real_address : address;
+
/*
* These flags indicate why the userfault occurred:
* - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
@@ -249,9 +252,10 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
/*
* Lockless access: we're in a wait_event so it's ok if it
- * changes under us.
+ * changes under us. PTE markers should be handled the same as none
+ * ptes here.
*/
- if (huge_pte_none(pte))
+ if (huge_pte_none_mostly(pte))
ret = true;
if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
ret = true;
@@ -330,9 +334,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
pte = pte_offset_map(pmd, address);
/*
* Lockless access: we're in a wait_event so it's ok if it
- * changes under us.
+ * changes under us. PTE markers should be handled the same as none
+ * ptes here.
*/
- if (pte_none(*pte))
+ if (pte_none_mostly(*pte))
ret = true;
if (!pte_write(*pte) && (reason & VM_UFFD_WP))
ret = true;
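Here "none mostly" means a genuinely none pte or a PTE marker. Conceptually the predicate looks like this; a sketch consistent with the helpers in <linux/swapops.h>:

/* Sketch: treat PTE markers the same as none ptes for wait decisions */
static inline int pte_none_mostly(pte_t pte)
{
	return pte_none(pte) || is_pte_marker(pte);
}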
@@ -485,8 +490,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason,
- ctx->features);
+ uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
+ reason, ctx->features);
uwq.ctx = ctx;
uwq.waken = false;
@@ -986,7 +991,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new,
int fd;
fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
- O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
+ O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
if (fd < 0)
return fd;
@@ -1255,24 +1260,6 @@ static __always_inline int validate_range(struct mm_struct *mm,
return 0;
}
-static inline bool vma_can_userfault(struct vm_area_struct *vma,
- unsigned long vm_flags)
-{
- /* FIXME: add WP support to hugetlbfs and shmem */
- if (vm_flags & VM_UFFD_WP) {
- if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma))
- return false;
- }
-
- if (vm_flags & VM_UFFD_MINOR) {
- if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
- return false;
- }
-
- return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
- vma_is_shmem(vma);
-}
-
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
@@ -1614,6 +1601,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
}
+ /* Reset ptes for the whole vma range if wr-protected */
+ if (userfaultfd_wp(vma))
+ uffd_wp_range(mm, vma, start, vma_end - start, false);
+
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
@@ -1938,10 +1929,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = -EFAULT;
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
- features = uffdio_api.features;
- ret = -EINVAL;
- if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
- goto err_out;
+ /* Ignore unsupported features (userspace built against newer kernel) */
+ features = uffdio_api.features & UFFD_API_FEATURES;
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
goto err_out;
@@ -1954,6 +1943,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
+#ifndef CONFIG_PTE_MARKER_UFFD_WP
+ uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
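With this change the userspace handshake is unchanged, but feature bits the kernel does not know are silently dropped instead of failing the ioctl. A minimal sketch of the negotiation:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_uffd(void)
{
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_EXACT_ADDRESS,	/* request what we want */
	};
	int fd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (fd < 0 || ioctl(fd, UFFDIO_API, &api) < 0)
		return -1;
	/* api.features now holds the subset the kernel actually granted */
	return fd;
}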
@@ -2102,7 +2094,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
mmgrab(ctx->mm);
fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
- O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
+ O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
if (fd < 0) {
mmdrop(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index d74e0d336995..572aa1c43b37 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -225,8 +225,9 @@ const struct inode_operations vboxsf_reg_iops = {
.setattr = vboxsf_setattr
};
-static int vboxsf_readpage(struct file *file, struct page *page)
+static int vboxsf_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct vboxsf_handle *sf_handle = file->private_data;
loff_t off = page_offset(page);
u32 nread = PAGE_SIZE;
@@ -352,7 +353,7 @@ out:
* page and it does not call SetPageUptodate for partial writes.
*/
const struct address_space_operations vboxsf_reg_aops = {
- .readpage = vboxsf_readpage,
+ .read_folio = vboxsf_read_folio,
.writepage = vboxsf_writepage,
.dirty_folio = filemap_dirty_folio,
.write_begin = simple_write_begin,
diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig
index 24d1b54de807..aad1f1d998b9 100644
--- a/fs/verity/Kconfig
+++ b/fs/verity/Kconfig
@@ -3,6 +3,7 @@
config FS_VERITY
bool "FS Verity (read-only file-based authenticity protection)"
select CRYPTO
+ select CRYPTO_HASH_INFO
# SHA-256 is implied as it's intended to be the default hash algorithm.
# To avoid bloat, other wanted algorithms must be selected explicitly.
# Note that CRYPTO_SHA256 denotes the generic C implementation, but
@@ -13,11 +14,11 @@ config FS_VERITY
help
This option enables fs-verity. fs-verity is the dm-verity
mechanism implemented at the file level. On supported
- filesystems (currently EXT4 and F2FS), userspace can use an
- ioctl to enable verity for a file, which causes the filesystem
- to build a Merkle tree for the file. The filesystem will then
- transparently verify any data read from the file against the
- Merkle tree. The file is also made read-only.
+ filesystems (currently ext4, f2fs, and btrfs), userspace can
+ use an ioctl to enable verity for a file, which causes the
+ filesystem to build a Merkle tree for the file. The filesystem
+ will then transparently verify any data read from the file
+ against the Merkle tree. The file is also made read-only.
This serves as an integrity check, but the availability of the
Merkle tree root hash also allows efficiently supporting
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 60a4372aa4d7..df6b499bf6a1 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -18,27 +18,26 @@
* Read a file data page for Merkle tree construction. Do aggressive readahead,
* since we're sequentially reading the entire file.
*/
-static struct page *read_file_data_page(struct file *filp, pgoff_t index,
+static struct page *read_file_data_page(struct file *file, pgoff_t index,
struct file_ra_state *ra,
unsigned long remaining_pages)
{
- struct page *page;
+ DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, index);
+ struct folio *folio;
- page = find_get_page_flags(filp->f_mapping, index, FGP_ACCESSED);
- if (!page || !PageUptodate(page)) {
- if (page)
- put_page(page);
+ folio = __filemap_get_folio(ractl.mapping, index, FGP_ACCESSED, 0);
+ if (!folio || !folio_test_uptodate(folio)) {
+ if (folio)
+ folio_put(folio);
else
- page_cache_sync_readahead(filp->f_mapping, ra, filp,
- index, remaining_pages);
- page = read_mapping_page(filp->f_mapping, index, NULL);
- if (IS_ERR(page))
- return page;
+ page_cache_sync_ra(&ractl, remaining_pages);
+ folio = read_cache_folio(ractl.mapping, index, NULL, file);
+ if (IS_ERR(folio))
+ return &folio->page;
}
- if (PageReadahead(page))
- page_cache_async_readahead(filp->f_mapping, ra, filp, page,
- index, remaining_pages);
- return page;
+ if (folio_test_readahead(folio))
+ page_cache_async_ra(&ractl, folio, remaining_pages);
+ return folio_file_page(folio, index);
}
static int build_merkle_tree_level(struct file *filp, unsigned int level,
@@ -202,7 +201,7 @@ static int enable_verity(struct file *filp,
const struct fsverity_operations *vops = inode->i_sb->s_vop;
struct merkle_tree_params params = { };
struct fsverity_descriptor *desc;
- size_t desc_size = sizeof(*desc) + arg->sig_size;
+ size_t desc_size = struct_size(desc, signature, arg->sig_size);
struct fsverity_info *vi;
int err;
@@ -281,7 +280,7 @@ static int enable_verity(struct file *filp,
* from disk. This is simpler, and it serves as an extra check that the
* metadata we're writing is valid before actually enabling verity.
*/
- vi = fsverity_create_info(inode, desc, desc_size);
+ vi = fsverity_create_info(inode, desc);
if (IS_ERR(vi)) {
err = PTR_ERR(vi);
goto rollback;
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index a7920434bae5..dbe1ce5b450a 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -14,7 +14,6 @@
#define pr_fmt(fmt) "fs-verity: " fmt
-#include <crypto/sha2.h>
#include <linux/fsverity.h>
#include <linux/mempool.h>
@@ -26,12 +25,6 @@ struct ahash_request;
*/
#define FS_VERITY_MAX_LEVELS 8
-/*
- * Largest digest size among all hash algorithms supported by fs-verity.
- * Currently assumed to be <= size of fsverity_descriptor::root_hash.
- */
-#define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE
-
/* A hash algorithm supported by fs-verity */
struct fsverity_hash_alg {
struct crypto_ahash *tfm; /* hash tfm, allocated on demand */
@@ -77,8 +70,6 @@ struct fsverity_info {
const struct inode *inode;
};
-/* Arbitrary limit to bound the kmalloc() size. Can be changed. */
-#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384
#define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \
sizeof(struct fsverity_descriptor))
@@ -122,16 +113,14 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
const u8 *salt, size_t salt_size);
struct fsverity_info *fsverity_create_info(const struct inode *inode,
- struct fsverity_descriptor *desc,
- size_t desc_size);
+ struct fsverity_descriptor *desc);
void fsverity_set_info(struct inode *inode, struct fsverity_info *vi);
void fsverity_free_info(struct fsverity_info *vi);
int fsverity_get_descriptor(struct inode *inode,
- struct fsverity_descriptor **desc_ret,
- size_t *desc_size_ret);
+ struct fsverity_descriptor **desc_ret);
int __init fsverity_init_info_cache(void);
void __init fsverity_exit_info_cache(void);
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index f0d7b30c62db..e99c00350c28 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -57,3 +57,46 @@ int fsverity_ioctl_measure(struct file *filp, void __user *_uarg)
return 0;
}
EXPORT_SYMBOL_GPL(fsverity_ioctl_measure);
+
+/**
+ * fsverity_get_digest() - get a verity file's digest
+ * @inode: inode to get digest of
+ * @digest: (out) pointer to the digest
+ * @alg: (out) pointer to the hash algorithm enumeration
+ *
+ * Return the file hash algorithm and digest of an fsverity protected file.
+ * Assumption: before calling fsverity_get_digest(), the file must have been
+ * opened.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_get_digest(struct inode *inode,
+ u8 digest[FS_VERITY_MAX_DIGEST_SIZE],
+ enum hash_algo *alg)
+{
+ const struct fsverity_info *vi;
+ const struct fsverity_hash_alg *hash_alg;
+ int i;
+
+ vi = fsverity_get_info(inode);
+ if (!vi)
+ return -ENODATA; /* not a verity file */
+
+ hash_alg = vi->tree_params.hash_alg;
+ memset(digest, 0, FS_VERITY_MAX_DIGEST_SIZE);
+
+ /* convert the verity hash algorithm name to a hash_algo enum */
+ i = match_string(hash_algo_name, HASH_ALGO__LAST, hash_alg->name);
+ if (i < 0)
+ return -EINVAL;
+ *alg = i;
+
+ if (WARN_ON_ONCE(hash_alg->digest_size != hash_digest_size[*alg]))
+ return -EINVAL;
+ memcpy(digest, vi->file_digest, hash_alg->digest_size);
+
+ pr_debug("file digest %s:%*phN\n", hash_algo_name[*alg],
+ hash_digest_size[*alg], digest);
+
+ return 0;
+}
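A hedged sketch of a kernel-side consumer of the new helper (example_log_verity_digest is a hypothetical name, not part of this patch):

/* Hypothetical caller: log the fs-verity digest of an opened file. */
static int example_log_verity_digest(struct inode *inode)
{
	u8 digest[FS_VERITY_MAX_DIGEST_SIZE];
	enum hash_algo alg;
	int err;

	err = fsverity_get_digest(inode, digest, &alg);
	if (err == -ENODATA)
		return 0;	/* not a verity file; nothing to log */
	if (err)
		return err;

	pr_info("verity digest %s:%*phN\n", hash_algo_name[alg],
		hash_digest_size[alg], digest);
	return 0;
}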
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 92df87f5fa38..81ff94442f7b 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -147,8 +147,7 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg,
* fsverity_descriptor must have already undergone basic validation.
*/
struct fsverity_info *fsverity_create_info(const struct inode *inode,
- struct fsverity_descriptor *desc,
- size_t desc_size)
+ struct fsverity_descriptor *desc)
{
struct fsverity_info *vi;
int err;
@@ -264,8 +263,7 @@ static bool validate_fsverity_descriptor(struct inode *inode,
* the filesystem, and do basic validation of it.
*/
int fsverity_get_descriptor(struct inode *inode,
- struct fsverity_descriptor **desc_ret,
- size_t *desc_size_ret)
+ struct fsverity_descriptor **desc_ret)
{
int res;
struct fsverity_descriptor *desc;
@@ -297,7 +295,6 @@ int fsverity_get_descriptor(struct inode *inode,
}
*desc_ret = desc;
- *desc_size_ret = res;
return 0;
}
@@ -306,17 +303,16 @@ static int ensure_verity_info(struct inode *inode)
{
struct fsverity_info *vi = fsverity_get_info(inode);
struct fsverity_descriptor *desc;
- size_t desc_size;
int err;
if (vi)
return 0;
- err = fsverity_get_descriptor(inode, &desc, &desc_size);
+ err = fsverity_get_descriptor(inode, &desc);
if (err)
return err;
- vi = fsverity_create_info(inode, desc, desc_size);
+ vi = fsverity_create_info(inode, desc);
if (IS_ERR(vi)) {
err = PTR_ERR(vi);
goto out_free_desc;
diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
index 7e2d0c7bdf0d..2aefc5565152 100644
--- a/fs/verity/read_metadata.c
+++ b/fs/verity/read_metadata.c
@@ -53,14 +53,14 @@ static int fsverity_read_merkle_tree(struct inode *inode,
break;
}
- virt = kmap(page);
+ virt = kmap_local_page(page);
if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) {
- kunmap(page);
+ kunmap_local(virt);
put_page(page);
err = -EFAULT;
break;
}
- kunmap(page);
+ kunmap_local(virt);
put_page(page);
retval += bytes_to_copy;
@@ -101,7 +101,7 @@ static int fsverity_read_descriptor(struct inode *inode,
size_t desc_size;
int res;
- res = fsverity_get_descriptor(inode, &desc, &desc_size);
+ res = fsverity_get_descriptor(inode, &desc);
if (res)
return res;
@@ -119,10 +119,9 @@ static int fsverity_read_signature(struct inode *inode,
void __user *buf, u64 offset, int length)
{
struct fsverity_descriptor *desc;
- size_t desc_size;
int res;
- res = fsverity_get_descriptor(inode, &desc, &desc_size);
+ res = fsverity_get_descriptor(inode, &desc);
if (res)
return res;
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 14e2fb49cff5..bde8c9b7d25f 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -39,16 +39,6 @@ static void hash_at_level(const struct merkle_tree_params *params,
(params->log_blocksize - params->log_arity);
}
-/* Extract a hash from a hash page */
-static void extract_hash(struct page *hpage, unsigned int hoffset,
- unsigned int hsize, u8 *out)
-{
- void *virt = kmap_atomic(hpage);
-
- memcpy(out, virt + hoffset, hsize);
- kunmap_atomic(virt);
-}
-
static inline int cmp_hashes(const struct fsverity_info *vi,
const u8 *want_hash, const u8 *real_hash,
pgoff_t index, int level)
@@ -129,7 +119,7 @@ static bool verify_page(struct inode *inode, const struct fsverity_info *vi,
}
if (PageChecked(hpage)) {
- extract_hash(hpage, hoffset, hsize, _want_hash);
+ memcpy_from_page(_want_hash, hpage, hoffset, hsize);
want_hash = _want_hash;
put_page(hpage);
pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n",
@@ -158,7 +148,7 @@ descend:
if (err)
goto out;
SetPageChecked(hpage);
- extract_hash(hpage, hoffset, hsize, _want_hash);
+ memcpy_from_page(_want_hash, hpage, hoffset, hsize);
want_hash = _want_hash;
put_page(hpage);
pr_debug("Verified hash page at level %d, now want %s:%*phN\n",
diff --git a/fs/xattr.c b/fs/xattr.c
index 5c8c5175b385..61107b6bbed2 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -25,6 +25,8 @@
#include <linux/uaccess.h>
+#include "internal.h"
+
static const char *
strcmp_prefix(const char *a, const char *a_prefix)
{
@@ -280,6 +282,12 @@ out:
}
EXPORT_SYMBOL_GPL(__vfs_setxattr_locked);
+static inline bool is_posix_acl_xattr(const char *name)
+{
+ return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
+ (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0);
+}
+
int
vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
const char *name, const void *value, size_t size, int flags)
@@ -429,7 +437,10 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return ret;
}
nolsm:
- return __vfs_getxattr(dentry, inode, name, value, size);
+ error = __vfs_getxattr(dentry, inode, name, value, size);
+ if (error > 0 && is_posix_acl_xattr(name))
+ posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size);
+ return error;
}
EXPORT_SYMBOL_GPL(vfs_getxattr);
@@ -539,43 +550,73 @@ EXPORT_SYMBOL_GPL(vfs_removexattr);
/*
* Extended attribute SET operations
*/
-static long
-setxattr(struct user_namespace *mnt_userns, struct dentry *d,
- const char __user *name, const void __user *value, size_t size,
- int flags)
+
+int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
{
int error;
- void *kvalue = NULL;
- char kname[XATTR_NAME_MAX + 1];
- if (flags & ~(XATTR_CREATE|XATTR_REPLACE))
+ if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE))
return -EINVAL;
- error = strncpy_from_user(kname, name, sizeof(kname));
- if (error == 0 || error == sizeof(kname))
- error = -ERANGE;
+ error = strncpy_from_user(ctx->kname->name, name,
+ sizeof(ctx->kname->name));
+ if (error == 0 || error == sizeof(ctx->kname->name))
+ return -ERANGE;
if (error < 0)
return error;
- if (size) {
- if (size > XATTR_SIZE_MAX)
+ error = 0;
+ if (ctx->size) {
+ if (ctx->size > XATTR_SIZE_MAX)
return -E2BIG;
- kvalue = kvmalloc(size, GFP_KERNEL);
- if (!kvalue)
- return -ENOMEM;
- if (copy_from_user(kvalue, value, size)) {
- error = -EFAULT;
- goto out;
+
+ ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size);
+ if (IS_ERR(ctx->kvalue)) {
+ error = PTR_ERR(ctx->kvalue);
+ ctx->kvalue = NULL;
}
- if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
- (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
- posix_acl_fix_xattr_from_user(mnt_userns, kvalue, size);
}
- error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags);
-out:
- kvfree(kvalue);
+ return error;
+}
+
+static void setxattr_convert(struct user_namespace *mnt_userns,
+ struct dentry *d, struct xattr_ctx *ctx)
+{
+ if (ctx->size && is_posix_acl_xattr(ctx->kname->name))
+ posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size);
+}
+int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct xattr_ctx *ctx)
+{
+ setxattr_convert(mnt_userns, dentry, ctx);
+ return vfs_setxattr(mnt_userns, dentry, ctx->kname->name,
+ ctx->kvalue, ctx->size, ctx->flags);
+}
+
+static long
+setxattr(struct user_namespace *mnt_userns, struct dentry *d,
+ const char __user *name, const void __user *value, size_t size,
+ int flags)
+{
+ struct xattr_name kname;
+ struct xattr_ctx ctx = {
+ .cvalue = value,
+ .kvalue = NULL,
+ .size = size,
+ .kname = &kname,
+ .flags = flags,
+ };
+ int error;
+
+ error = setxattr_copy(name, &ctx);
+ if (error)
+ return error;
+
+ error = do_setxattr(mnt_userns, d, &ctx);
+
+ kvfree(ctx.kvalue);
return error;
}
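The setxattr_copy()/do_setxattr() split lets a caller copy the userspace buffers once and perform the actual set later, possibly from a different context. A hedged sketch of such a two-phase caller (the function name is hypothetical):

/* Hypothetical two-phase caller: stage the xattr, apply it later. */
static int example_deferred_setxattr(struct user_namespace *mnt_userns,
				     struct dentry *dentry,
				     const char __user *name,
				     const void __user *value, size_t size)
{
	struct xattr_name kname;
	struct xattr_ctx ctx = {
		.cvalue = value,
		.size = size,
		.kname = &kname,
		.flags = 0,
	};
	int error;

	error = setxattr_copy(name, &ctx);	/* touches userspace memory */
	if (error)
		return error;

	/* ...possibly from another context (e.g. an async worker)... */
	error = do_setxattr(mnt_userns, dentry, &ctx);
	kvfree(ctx.kvalue);
	return error;
}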
@@ -641,43 +682,59 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
/*
* Extended attribute GET operations
*/
-static ssize_t
-getxattr(struct user_namespace *mnt_userns, struct dentry *d,
- const char __user *name, void __user *value, size_t size)
+ssize_t
+do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
+ struct xattr_ctx *ctx)
{
ssize_t error;
- void *kvalue = NULL;
- char kname[XATTR_NAME_MAX + 1];
+ char *kname = ctx->kname->name;
- error = strncpy_from_user(kname, name, sizeof(kname));
- if (error == 0 || error == sizeof(kname))
- error = -ERANGE;
- if (error < 0)
- return error;
-
- if (size) {
- if (size > XATTR_SIZE_MAX)
- size = XATTR_SIZE_MAX;
- kvalue = kvzalloc(size, GFP_KERNEL);
- if (!kvalue)
+ if (ctx->size) {
+ if (ctx->size > XATTR_SIZE_MAX)
+ ctx->size = XATTR_SIZE_MAX;
+ ctx->kvalue = kvzalloc(ctx->size, GFP_KERNEL);
+ if (!ctx->kvalue)
return -ENOMEM;
}
- error = vfs_getxattr(mnt_userns, d, kname, kvalue, size);
+ error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size);
if (error > 0) {
- if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
- (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
- posix_acl_fix_xattr_to_user(mnt_userns, kvalue, error);
- if (size && copy_to_user(value, kvalue, error))
+ if (is_posix_acl_xattr(kname))
+ posix_acl_fix_xattr_to_user(ctx->kvalue, error);
+ if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error))
error = -EFAULT;
- } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
+ } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) {
/* The file system tried to return a value bigger
than XATTR_SIZE_MAX bytes. Not possible. */
error = -E2BIG;
}
- kvfree(kvalue);
+ return error;
+}
+
+static ssize_t
+getxattr(struct user_namespace *mnt_userns, struct dentry *d,
+ const char __user *name, void __user *value, size_t size)
+{
+ ssize_t error;
+ struct xattr_name kname;
+ struct xattr_ctx ctx = {
+ .value = value,
+ .kvalue = NULL,
+ .size = size,
+ .kname = &kname,
+ .flags = 0,
+ };
+
+ error = strncpy_from_user(kname.name, name, sizeof(kname.name));
+ if (error == 0 || error == sizeof(kname.name))
+ error = -ERANGE;
+ if (error < 0)
+ return error;
+
+ error = do_getxattr(mnt_userns, d, &ctx);
+ kvfree(ctx.kvalue);
return error;
}
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 04611a1068b4..03135a1c31b6 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -102,9 +102,11 @@ xfs-y += xfs_log.o \
xfs_buf_item_recover.o \
xfs_dquot_item_recover.o \
xfs_extfree_item.o \
+ xfs_attr_item.o \
xfs_icreate_item.o \
xfs_inode_item.o \
xfs_inode_item_recover.o \
+ xfs_iunlink_item.o \
xfs_refcount_item.o \
xfs_rmap_item.o \
xfs_log_recover.o \
@@ -128,6 +130,11 @@ xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o
+# notify failure
+ifeq ($(CONFIG_MEMORY_FAILURE),y)
+xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o
+endif
+
# online scrub/repair
ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 1e4ee042d52f..bb0c700afe3c 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -120,18 +120,18 @@ xfs_initialize_perag_data(
for (index = 0; index < agcount; index++) {
/*
- * read the agf, then the agi. This gets us
- * all the information we need and populates the
- * per-ag structures for us.
+ * Read the AGF and AGI buffers to populate the per-ag
+ * structures for us.
*/
- error = xfs_alloc_pagf_init(mp, NULL, index, 0);
- if (error)
+ pag = xfs_perag_get(mp, index);
+ error = xfs_alloc_read_agf(pag, NULL, 0, NULL);
+ if (!error)
+ error = xfs_ialloc_read_agi(pag, NULL, NULL);
+ if (error) {
+ xfs_perag_put(pag);
return error;
+ }
- error = xfs_ialloc_pagi_init(mp, NULL, index);
- if (error)
- return error;
- pag = xfs_perag_get(mp, index);
ifree += pag->pagi_freecount;
ialloc += pag->pagi_count;
bfree += pag->pagf_freeblks;
@@ -173,7 +173,6 @@ __xfs_free_perag(
struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
- ASSERT(atomic_read(&pag->pag_ref) == 0);
kmem_free(pag);
}
@@ -192,20 +191,79 @@ xfs_free_perag(
pag = radix_tree_delete(&mp->m_perag_tree, agno);
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
- ASSERT(atomic_read(&pag->pag_ref) == 0);
+ XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
cancel_delayed_work_sync(&pag->pag_blockgc_work);
- xfs_iunlink_destroy(pag);
xfs_buf_hash_destroy(pag);
call_rcu(&pag->rcu_head, __xfs_free_perag);
}
}
+/* Find the size of the AG, in blocks. */
+static xfs_agblock_t
+__xfs_ag_block_count(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agnumber_t agcount,
+ xfs_rfsblock_t dblocks)
+{
+ ASSERT(agno < agcount);
+
+ if (agno < agcount - 1)
+ return mp->m_sb.sb_agblocks;
+ return dblocks - (agno * mp->m_sb.sb_agblocks);
+}
+
+xfs_agblock_t
+xfs_ag_block_count(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ return __xfs_ag_block_count(mp, agno, mp->m_sb.sb_agcount,
+ mp->m_sb.sb_dblocks);
+}
+
+/* Calculate the first and last possible inode number in an AG. */
+static void
+__xfs_agino_range(
+ struct xfs_mount *mp,
+ xfs_agblock_t eoag,
+ xfs_agino_t *first,
+ xfs_agino_t *last)
+{
+ xfs_agblock_t bno;
+
+ /*
+ * Calculate the first inode, which will be in the first
+ * cluster-aligned block after the AGFL.
+ */
+ bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align);
+ *first = XFS_AGB_TO_AGINO(mp, bno);
+
+ /*
+ * Calculate the last inode, which will be at the end of the
+ * last (aligned) cluster that can be allocated in the AG.
+ */
+ bno = round_down(eoag, M_IGEO(mp)->cluster_align);
+ *last = XFS_AGB_TO_AGINO(mp, bno) - 1;
+}
+
+void
+xfs_agino_range(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t *first,
+ xfs_agino_t *last)
+{
+ return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last);
+}
+
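Illustrative example of the geometry being precomputed here: every AG except
the last spans sb_agblocks, and the last ("runt") AG gets whatever remains of
dblocks. A sketch, assuming mp->m_sb.sb_agblocks == 65536 and a 160000-block
data device, so agcount == 3:

	__xfs_ag_block_count(mp, 0, 3, 160000);	/* == 65536, full-size AG */
	__xfs_ag_block_count(mp, 1, 3, 160000);	/* == 65536, full-size AG */
	__xfs_ag_block_count(mp, 2, 3, 160000);	/* == 160000 - 2 * 65536 = 28928 */
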
int
xfs_initialize_perag(
struct xfs_mount *mp,
xfs_agnumber_t agcount,
+ xfs_rfsblock_t dblocks,
xfs_agnumber_t *maxagi)
{
struct xfs_perag *pag;
@@ -264,13 +322,18 @@ xfs_initialize_perag(
if (error)
goto out_remove_pag;
- error = xfs_iunlink_init(pag);
- if (error)
- goto out_hash_destroy;
-
/* first new pag is fully initialized */
if (first_initialised == NULLAGNUMBER)
first_initialised = index;
+
+ /*
+ * Pre-calculated geometry
+ */
+ pag->block_count = __xfs_ag_block_count(mp, index, agcount,
+ dblocks);
+ pag->min_block = XFS_AGFL_BLOCK(mp);
+ __xfs_agino_range(mp, pag->block_count, &pag->agino_min,
+ &pag->agino_max);
}
index = xfs_set_inode_alloc(mp, agcount);
@@ -281,8 +344,6 @@ xfs_initialize_perag(
mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
return 0;
-out_hash_destroy:
- xfs_buf_hash_destroy(pag);
out_remove_pag:
radix_tree_delete(&mp->m_perag_tree, index);
out_free_pag:
@@ -294,7 +355,6 @@ out_unwind_new_pags:
if (!pag)
break;
xfs_buf_hash_destroy(pag);
- xfs_iunlink_destroy(pag);
kmem_free(pag);
}
return error;
@@ -322,12 +382,6 @@ xfs_get_aghdr_buf(
return 0;
}
-static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id)
-{
- return mp->m_sb.sb_logstart > 0 &&
- id->agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
-}
-
/*
* Generic btree root block init function
*/
@@ -353,7 +407,7 @@ xfs_freesp_init_recs(
arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
- if (is_log_ag(mp, id)) {
+ if (xfs_ag_contains_log(mp, id->agno)) {
struct xfs_alloc_rec *nrec;
xfs_agblock_t start = XFS_FSB_TO_AGBNO(mp,
mp->m_sb.sb_logstart);
@@ -480,7 +534,7 @@ xfs_rmaproot_init(
}
/* account for the log space */
- if (is_log_ag(mp, id)) {
+ if (xfs_ag_contains_log(mp, id->agno)) {
rrec = XFS_RMAP_REC_ADDR(block,
be16_to_cpu(block->bb_numrecs) + 1);
rrec->rm_startblock = cpu_to_be32(
@@ -551,7 +605,7 @@ xfs_agfblock_init(
agf->agf_refcount_blocks = cpu_to_be32(1);
}
- if (is_log_ag(mp, id)) {
+ if (xfs_ag_contains_log(mp, id->agno)) {
int64_t logblocks = mp->m_sb.sb_logblocks;
be32_add_cpu(&agf->agf_freeblks, -logblocks);
@@ -762,11 +816,11 @@ xfs_ag_init_headers(
int
xfs_ag_shrink_space(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_trans **tpp,
- xfs_agnumber_t agno,
xfs_extlen_t delta)
{
+ struct xfs_mount *mp = pag->pag_mount;
struct xfs_alloc_arg args = {
.tp = *tpp,
.mp = mp,
@@ -783,14 +837,14 @@ xfs_ag_shrink_space(
xfs_agblock_t aglen;
int error, err2;
- ASSERT(agno == mp->m_sb.sb_agcount - 1);
- error = xfs_ialloc_read_agi(mp, *tpp, agno, &agibp);
+ ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1);
+ error = xfs_ialloc_read_agi(pag, *tpp, &agibp);
if (error)
return error;
agi = agibp->b_addr;
- error = xfs_alloc_read_agf(mp, *tpp, agno, 0, &agfbp);
+ error = xfs_alloc_read_agf(pag, *tpp, 0, &agfbp);
if (error)
return error;
@@ -802,13 +856,14 @@ xfs_ag_shrink_space(
if (delta >= aglen)
return -EINVAL;
- args.fsbno = XFS_AGB_TO_FSB(mp, agno, aglen - delta);
+ args.fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta);
/*
* Make sure that the last inode cluster cannot overlap with the new
* end of the AG, even if it's sparse.
*/
- error = xfs_ialloc_check_shrink(*tpp, agno, agibp, aglen - delta);
+ error = xfs_ialloc_check_shrink(*tpp, pag->pag_agno, agibp,
+ aglen - delta);
if (error)
return error;
@@ -816,7 +871,7 @@ xfs_ag_shrink_space(
* Disable perag reservations so it doesn't cause the allocation request
* to fail. We'll reestablish reservation before we return.
*/
- error = xfs_ag_resv_free(agibp->b_pag);
+ error = xfs_ag_resv_free(pag);
if (error)
return error;
@@ -845,7 +900,7 @@ xfs_ag_shrink_space(
be32_add_cpu(&agi->agi_length, -delta);
be32_add_cpu(&agf->agf_length, -delta);
- err2 = xfs_ag_resv_init(agibp->b_pag, *tpp);
+ err2 = xfs_ag_resv_init(pag, *tpp);
if (err2) {
be32_add_cpu(&agi->agi_length, delta);
be32_add_cpu(&agf->agf_length, delta);
@@ -869,8 +924,9 @@ xfs_ag_shrink_space(
xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH);
xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH);
return 0;
+
resv_init_out:
- err2 = xfs_ag_resv_init(agibp->b_pag, *tpp);
+ err2 = xfs_ag_resv_init(pag, *tpp);
if (!err2)
return error;
resv_err:
@@ -884,9 +940,8 @@ resv_err:
*/
int
xfs_ag_extend_space(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_trans *tp,
- struct aghdr_init_data *id,
xfs_extlen_t len)
{
struct xfs_buf *bp;
@@ -894,23 +949,20 @@ xfs_ag_extend_space(
struct xfs_agf *agf;
int error;
- /*
- * Change the agi length.
- */
- error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp);
+ ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1);
+
+ error = xfs_ialloc_read_agi(pag, tp, &bp);
if (error)
return error;
agi = bp->b_addr;
be32_add_cpu(&agi->agi_length, len);
- ASSERT(id->agno == mp->m_sb.sb_agcount - 1 ||
- be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
/*
* Change agf length.
*/
- error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &bp);
if (error)
return error;
@@ -925,49 +977,49 @@ xfs_ag_extend_space(
* XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that
* this doesn't actually exist in the rmap btree.
*/
- error = xfs_rmap_free(tp, bp, bp->b_pag,
- be32_to_cpu(agf->agf_length) - len,
+ error = xfs_rmap_free(tp, bp, pag, be32_to_cpu(agf->agf_length) - len,
len, &XFS_RMAP_OINFO_SKIP_UPDATE);
if (error)
return error;
- return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno,
+ error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno,
be32_to_cpu(agf->agf_length) - len),
len, &XFS_RMAP_OINFO_SKIP_UPDATE,
XFS_AG_RESV_NONE);
+ if (error)
+ return error;
+
+ /* Update perag geometry */
+ pag->block_count = be32_to_cpu(agf->agf_length);
+ __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min,
+ &pag->agino_max);
+ return 0;
}
/* Retrieve AG geometry. */
int
xfs_ag_get_geometry(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
struct xfs_ag_geometry *ageo)
{
struct xfs_buf *agi_bp;
struct xfs_buf *agf_bp;
struct xfs_agi *agi;
struct xfs_agf *agf;
- struct xfs_perag *pag;
unsigned int freeblks;
int error;
- if (agno >= mp->m_sb.sb_agcount)
- return -EINVAL;
-
/* Lock the AG headers. */
- error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
+ error = xfs_ialloc_read_agi(pag, NULL, &agi_bp);
if (error)
return error;
- error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
+ error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp);
if (error)
goto out_agi;
- pag = agi_bp->b_pag;
-
/* Fill out form. */
memset(ageo, 0, sizeof(*ageo));
- ageo->ag_number = agno;
+ ageo->ag_number = pag->pag_agno;
agi = agi_bp->b_addr;
ageo->ag_icount = be32_to_cpu(agi->agi_count);
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index e411d51c2589..517a138faa66 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -67,6 +67,12 @@ struct xfs_perag {
/* for rcu-safe freeing */
struct rcu_head rcu_head;
+ /* Precalculated geometry info */
+ xfs_agblock_t block_count;
+ xfs_agblock_t min_block;
+ xfs_agino_t agino_min;
+ xfs_agino_t agino_max;
+
#ifdef __KERNEL__
/* -- kernel only structures below this line -- */
@@ -97,17 +103,11 @@ struct xfs_perag {
/* background prealloc block trimming */
struct delayed_work pag_blockgc_work;
- /*
- * Unlinked inode information. This incore information reflects
- * data stored in the AGI, so callers must hold the AGI buffer lock
- * or have some other means to control concurrency.
- */
- struct rhashtable pagi_unlinked_hash;
#endif /* __KERNEL__ */
};
int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
- xfs_agnumber_t *maxagi);
+ xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi);
int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
void xfs_free_perag(struct xfs_mount *mp);
@@ -117,6 +117,56 @@ struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
void xfs_perag_put(struct xfs_perag *pag);
/*
+ * Per-ag geometry information and validation
+ */
+xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno);
+void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t *first, xfs_agino_t *last);
+
+static inline bool
+xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno)
+{
+ if (agbno >= pag->block_count)
+ return false;
+ if (agbno <= pag->min_block)
+ return false;
+ return true;
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata.
+ */
+static inline bool
+xfs_verify_agino(struct xfs_perag *pag, xfs_agino_t agino)
+{
+ if (agino < pag->agino_min)
+ return false;
+ if (agino > pag->agino_max)
+ return false;
+ return true;
+}
+
+/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, or is NULLAGINO.
+ */
+static inline bool
+xfs_verify_agino_or_null(struct xfs_perag *pag, xfs_agino_t agino)
+{
+ if (agino == NULLAGINO)
+ return true;
+ return xfs_verify_agino(pag, agino);
+}
+
+static inline bool
+xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno)
+{
+ return mp->m_sb.sb_logstart > 0 &&
+ agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
+}
+
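Callers convert from the old (mp, agno) interface by holding a perag reference
across the check; a sketch of the pattern, mirroring the xfs_alloc_get_rec()
conversion later in this diff:

	pag = xfs_perag_get(mp, agno);
	if (!xfs_verify_agbno(pag, agbno)) {
		xfs_perag_put(pag);
		return -EFSCORRUPTED;
	}
	xfs_perag_put(pag);
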
+/*
* Perag iteration APIs
*/
static inline struct xfs_perag *
@@ -168,11 +218,10 @@ struct aghdr_init_data {
};
int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
-int xfs_ag_shrink_space(struct xfs_mount *mp, struct xfs_trans **tpp,
- xfs_agnumber_t agno, xfs_extlen_t delta);
-int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp,
- struct aghdr_init_data *id, xfs_extlen_t len);
-int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno,
- struct xfs_ag_geometry *ageo);
+int xfs_ag_shrink_space(struct xfs_perag *pag, struct xfs_trans **tpp,
+ xfs_extlen_t delta);
+int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp,
+ xfs_extlen_t len);
+int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
#endif /* __LIBXFS_AG_H */
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index fe94058d4e9e..5af123d13a63 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -322,7 +322,7 @@ out:
* address.
*/
if (has_resv) {
- error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0);
+ error2 = xfs_alloc_read_agf(pag, tp, 0, NULL);
if (error2)
return error2;
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index b52ed339727f..e2bdf089c0a3 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -84,7 +84,7 @@ xfs_prealloc_blocks(
/*
* The number of blocks per AG that we withhold from xfs_mod_fdblocks to
* guarantee that we can refill the AGFL prior to allocating space in a nearly
- * full AG. Although the the space described by the free space btrees, the
+ * full AG. Although the space described by the free space btrees, the
* blocks used by the freesp btrees themselves, and the blocks owned by the
* AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
* free space in the AG drop so low that the free space btrees cannot refill an
@@ -248,7 +248,7 @@ xfs_alloc_get_rec(
int *stat) /* output: success/failure */
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = cur->bc_ag.pag;
union xfs_btree_rec *rec;
int error;
@@ -263,11 +263,11 @@ xfs_alloc_get_rec(
goto out_bad_rec;
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(mp, agno, *bno))
+ if (!xfs_verify_agbno(pag, *bno))
goto out_bad_rec;
if (*bno > *bno + *len)
goto out_bad_rec;
- if (!xfs_verify_agbno(mp, agno, *bno + *len - 1))
+ if (!xfs_verify_agbno(pag, *bno + *len - 1))
goto out_bad_rec;
return 0;
@@ -275,7 +275,8 @@ xfs_alloc_get_rec(
out_bad_rec:
xfs_warn(mp,
"%s Freespace BTree record corruption in AG %d detected!",
- cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno);
+ cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
+ pag->pag_agno);
xfs_warn(mp,
"start block 0x%x block count 0x%x", *bno, *len);
return -EFSCORRUPTED;
@@ -703,20 +704,19 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = {
/*
* Read in the allocation group free block array.
*/
-int /* error */
+int
xfs_alloc_read_agfl(
- xfs_mount_t *mp, /* mount point structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- struct xfs_buf **bpp) /* buffer for the ag free block array */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **bpp)
{
- struct xfs_buf *bp; /* return value */
- int error;
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_buf *bp;
+ int error;
- ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
if (error)
return error;
@@ -1075,7 +1075,8 @@ xfs_alloc_ag_vextent_small(
be32_to_cpu(agf->agf_flcount) <= args->minleft)
goto out;
- error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+ error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
+ &fbno, 0);
if (error)
goto error;
if (fbno == NULLAGBLOCK)
@@ -2511,7 +2512,7 @@ __xfs_free_extent_later(
ASSERT(bno != NULLFSBLOCK);
ASSERT(len > 0);
- ASSERT(len <= MAXEXTLEN);
+ ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
ASSERT(!isnullstartblock(bno));
agno = XFS_FSB_TO_AGNO(mp, bno);
agbno = XFS_FSB_TO_AGBNO(mp, bno);
@@ -2609,7 +2610,7 @@ xfs_alloc_fix_freelist(
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
if (!pag->pagf_init) {
- error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
if (error) {
/* Couldn't lock the AGF so skip this AG. */
if (error == -EAGAIN)
@@ -2639,7 +2640,7 @@ xfs_alloc_fix_freelist(
* Can fail if we're not blocking on locks, and it's held.
*/
if (!agbp) {
- error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
if (error) {
/* Couldn't lock the AGF so skip this AG. */
if (error == -EAGAIN)
@@ -2697,7 +2698,7 @@ xfs_alloc_fix_freelist(
else
targs.oinfo = XFS_RMAP_OINFO_AG;
while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
- error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
+ error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
if (error)
goto out_agbp_relse;
@@ -2712,7 +2713,7 @@ xfs_alloc_fix_freelist(
targs.alignment = targs.minlen = targs.prod = 1;
targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
- error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
if (error)
goto out_agbp_relse;
@@ -2741,7 +2742,7 @@ xfs_alloc_fix_freelist(
* Put each allocated block on the list.
*/
for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
- error = xfs_alloc_put_freelist(tp, agbp,
+ error = xfs_alloc_put_freelist(pag, tp, agbp,
agflbp, bno, 0);
if (error)
goto out_agflbp_relse;
@@ -2767,6 +2768,7 @@ out_no_agbp:
*/
int
xfs_alloc_get_freelist(
+ struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agbp,
xfs_agblock_t *bnop,
@@ -2777,9 +2779,8 @@ xfs_alloc_get_freelist(
xfs_agblock_t bno;
__be32 *agfl_bno;
int error;
- int logflags;
+ uint32_t logflags;
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_perag *pag;
/*
* Freelist is empty, give up.
@@ -2791,8 +2792,7 @@ xfs_alloc_get_freelist(
/*
* Read the array of free blocks.
*/
- error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
- &agflbp);
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
if (error)
return error;
@@ -2807,7 +2807,6 @@ xfs_alloc_get_freelist(
if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
agf->agf_flfirst = 0;
- pag = agbp->b_pag;
ASSERT(!pag->pagf_agflreset);
be32_add_cpu(&agf->agf_flcount, -1);
pag->pagf_flcount--;
@@ -2830,9 +2829,9 @@ xfs_alloc_get_freelist(
*/
void
xfs_alloc_log_agf(
- xfs_trans_t *tp, /* transaction pointer */
- struct xfs_buf *bp, /* buffer for a.g. freelist header */
- int fields) /* mask of fields to be logged (XFS_AGF_...) */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint32_t fields)
{
int first; /* first byte offset */
int last; /* last byte offset */
@@ -2868,29 +2867,11 @@ xfs_alloc_log_agf(
}
/*
- * Interface for inode allocation to force the pag data to be initialized.
- */
-int /* error */
-xfs_alloc_pagf_init(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags) /* XFS_ALLOC_FLAGS_... */
-{
- struct xfs_buf *bp;
- int error;
-
- error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp);
- if (!error)
- xfs_trans_brelse(tp, bp);
- return error;
-}
-
-/*
* Put the block on the freelist for the allocation group.
*/
int
xfs_alloc_put_freelist(
+ struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agbp,
struct xfs_buf *agflbp,
@@ -2899,21 +2880,22 @@ xfs_alloc_put_freelist(
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_agf *agf = agbp->b_addr;
- struct xfs_perag *pag;
__be32 *blockp;
int error;
- int logflags;
+ uint32_t logflags;
__be32 *agfl_bno;
int startoff;
- if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
- be32_to_cpu(agf->agf_seqno), &agflbp)))
- return error;
+ if (!agflbp) {
+ error = xfs_alloc_read_agfl(pag, tp, &agflbp);
+ if (error)
+ return error;
+ }
+
be32_add_cpu(&agf->agf_fllast, 1);
if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
agf->agf_fllast = 0;
- pag = agbp->b_pag;
ASSERT(!pag->pagf_agflreset);
be32_add_cpu(&agf->agf_flcount, 1);
pag->pagf_flcount++;
@@ -3070,61 +3052,57 @@ const struct xfs_buf_ops xfs_agf_buf_ops = {
/*
* Read in the allocation group header (free/alloc section).
*/
-int /* error */
+int
xfs_read_agf(
- struct xfs_mount *mp, /* mount point structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags, /* XFS_BUF_ */
- struct xfs_buf **bpp) /* buffer for the ag freelist header */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ int flags,
+ struct xfs_buf **agfbpp)
{
- int error;
+ struct xfs_mount *mp = pag->pag_mount;
+ int error;
- trace_xfs_read_agf(mp, agno);
+ trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
- ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
if (error)
return error;
- ASSERT(!(*bpp)->b_error);
- xfs_buf_set_ref(*bpp, XFS_AGF_REF);
+ xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
return 0;
}
/*
- * Read in the allocation group header (free/alloc section).
+ * Read in the allocation group header (free/alloc section) and initialise the
+ * perag structure if necessary. If the caller provides @agfbpp, then return the
+ * locked buffer to the caller, otherwise free it.
*/
-int /* error */
+int
xfs_alloc_read_agf(
- struct xfs_mount *mp, /* mount point structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags, /* XFS_ALLOC_FLAG_... */
- struct xfs_buf **bpp) /* buffer for the ag freelist header */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ int flags,
+ struct xfs_buf **agfbpp)
{
- struct xfs_agf *agf; /* ag freelist header */
- struct xfs_perag *pag; /* per allocation group data */
+ struct xfs_buf *agfbp;
+ struct xfs_agf *agf;
int error;
int allocbt_blks;
- trace_xfs_alloc_read_agf(mp, agno);
+ trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
/* We don't support trylock when freeing. */
ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
(XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
- ASSERT(agno != NULLAGNUMBER);
- error = xfs_read_agf(mp, tp, agno,
+ error = xfs_read_agf(pag, tp,
(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
- bpp);
+ &agfbp);
if (error)
return error;
- ASSERT(!(*bpp)->b_error);
- agf = (*bpp)->b_addr;
- pag = (*bpp)->b_pag;
+ agf = agfbp->b_addr;
if (!pag->pagf_init) {
pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -3138,7 +3116,7 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
pag->pagf_init = 1;
- pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
+ pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf);
/*
* Update the in-core allocbt counter. Filter out the rmapbt
@@ -3148,13 +3126,14 @@ xfs_alloc_read_agf(
* counter only tracks non-root blocks.
*/
allocbt_blks = pag->pagf_btreeblks;
- if (xfs_has_rmapbt(mp))
+ if (xfs_has_rmapbt(pag->pag_mount))
allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
if (allocbt_blks > 0)
- atomic64_add(allocbt_blks, &mp->m_allocbt_blks);
+ atomic64_add(allocbt_blks,
+ &pag->pag_mount->m_allocbt_blks);
}
#ifdef DEBUG
- else if (!xfs_is_shutdown(mp)) {
+ else if (!xfs_is_shutdown(pag->pag_mount)) {
ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
@@ -3165,6 +3144,10 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
}
#endif
+ if (agfbpp)
+ *agfbpp = agfbp;
+ else
+ xfs_trans_brelse(tp, agfbp);
return 0;
}
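
Note the NULL-@agfbpp convention introduced above: it is what allows this patch
to delete the xfs_alloc_pagf_init() wrapper below. A sketch of the caller
conversion, matching the xfs_ag_resv.c hunk earlier in this diff:

	/* Before: read the AGF only to populate the perag, then release it. */
	error = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0);

	/* After: pass a NULL buffer pointer and the AGF is released for us. */
	error = xfs_alloc_read_agf(pag, tp, 0, NULL);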
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d4c057b764f9..2c3f762dfb58 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -95,6 +95,11 @@ xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag *pag,
xfs_extlen_t need, xfs_extlen_t reserved);
unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
struct xfs_perag *pag);
+int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf *agfbp, xfs_agblock_t *bnop, int btreeblk);
+int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf *agfbp, struct xfs_buf *agflbp,
+ xfs_agblock_t bno, int btreeblk);
/*
* Compute and fill in value of m_alloc_maxlevels.
@@ -104,56 +109,13 @@ xfs_alloc_compute_maxlevels(
struct xfs_mount *mp); /* file system mount structure */
/*
- * Get a block from the freelist.
- * Returns with the buffer for the block gotten.
- */
-int /* error */
-xfs_alloc_get_freelist(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer containing the agf structure */
- xfs_agblock_t *bnop, /* block address retrieved from freelist */
- int btreeblk); /* destination is a AGF btree */
-
-/*
* Log the given fields from the agf structure.
*/
void
xfs_alloc_log_agf(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *bp, /* buffer for a.g. freelist header */
- int fields);/* mask of fields to be logged (XFS_AGF_...) */
-
-/*
- * Interface for inode allocation to force the pag data to be initialized.
- */
-int /* error */
-xfs_alloc_pagf_init(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags); /* XFS_ALLOC_FLAGS_... */
-
-/*
- * Put the block on the freelist for the allocation group.
- */
-int /* error */
-xfs_alloc_put_freelist(
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer for a.g. freelist header */
- struct xfs_buf *agflbp,/* buffer for a.g. free block array */
- xfs_agblock_t bno, /* block being freed */
- int btreeblk); /* owner was a AGF btree */
-
-/*
- * Read in the allocation group header (free/alloc section).
- */
-int /* error */
-xfs_alloc_read_agf(
- struct xfs_mount *mp, /* mount point structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags, /* XFS_ALLOC_FLAG_... */
- struct xfs_buf **bpp); /* buffer for the ag freelist header */
+ uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */
/*
* Allocate an extent (variable-size).
@@ -206,10 +168,12 @@ xfs_alloc_get_rec(
xfs_extlen_t *len, /* output: length of extent */
int *stat); /* output: success/failure */
-int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
-int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, struct xfs_buf **bpp);
+int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
+ struct xfs_buf **agfbpp);
+int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
+ struct xfs_buf **agfbpp);
+int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **bpp);
int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t,
struct xfs_buf *, struct xfs_owner_info *);
int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 8c9f73cc0bee..549a3cba0234 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -60,8 +60,8 @@ xfs_allocbt_alloc_block(
xfs_agblock_t bno;
/* Allocate the new block from the freelist. If we can't, give up. */
- error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp,
- &bno, 1);
+ error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp,
+ cur->bc_ag.agbp, &bno, 1);
if (error)
return error;
@@ -71,7 +71,7 @@ xfs_allocbt_alloc_block(
}
atomic64_inc(&cur->bc_mp->m_allocbt_blks);
- xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agbp->b_pag, bno, 1, false);
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false);
new->s = cpu_to_be32(bno);
@@ -89,7 +89,8 @@ xfs_allocbt_free_block(
int error;
bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
- error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+ error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL,
+ bno, 1);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 23523b802539..e28d93d232de 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -24,6 +24,10 @@
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
+#include "xfs_attr_item.h"
+#include "xfs_xattr.h"
+
+struct kmem_cache *xfs_attr_intent_cache;
/*
* xfs_attr.c
@@ -46,33 +50,27 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp);
-STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp);
+STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args);
/*
* Internal routines when attribute list is more than one block.
*/
STATIC int xfs_attr_node_get(xfs_da_args_t *args);
STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args);
-STATIC int xfs_attr_node_addname(struct xfs_delattr_context *dac);
-STATIC int xfs_attr_node_addname_find_attr(struct xfs_delattr_context *dac);
-STATIC int xfs_attr_node_addname_clear_incomplete(
- struct xfs_delattr_context *dac);
-STATIC int xfs_attr_node_hasname(xfs_da_args_t *args,
- struct xfs_da_state **state);
-STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
-STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
-STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac,
- struct xfs_buf **leaf_bp);
-STATIC int xfs_attr_node_removename(struct xfs_da_args *args,
- struct xfs_da_state *state);
+static int xfs_attr_node_try_addname(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_remove_attr(struct xfs_attr_intent *attr);
+STATIC int xfs_attr_node_lookup(struct xfs_da_args *args,
+ struct xfs_da_state *state);
int
xfs_inode_hasattr(
struct xfs_inode *ip)
{
- if (!XFS_IFORK_Q(ip) ||
- (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_afp->if_nextents == 0))
+ if (!xfs_inode_has_attr_fork(ip))
+ return 0;
+ if (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_af.if_nextents == 0)
return 0;
return 1;
}
@@ -85,7 +83,7 @@ bool
xfs_attr_is_leaf(
struct xfs_inode *ip)
{
- struct xfs_ifork *ifp = ip->i_afp;
+ struct xfs_ifork *ifp = &ip->i_af;
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec imap;
@@ -97,6 +95,123 @@ xfs_attr_is_leaf(
return imap.br_startoff == 0 && imap.br_blockcount == 1;
}
+/*
+ * XXX (dchinner): name path state saving and refilling is an optimisation to
+ * avoid needing to look up name entries after rolling transactions removing
+ * remote xattr blocks between the name entry lookup and name entry removal.
+ * This optimisation got sidelined when combining the set and remove state
+ * machines, but the code has been left in place because it is worthwhile to
+ * restore the optimisation once the combined state machine paths have settled.
+ *
+ * This comment is a public service announcement to remind Future Dave that he
+ * still needs to restore this code to working order.
+ */
+#if 0
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commits have released these buffers.
+ */
+static int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level;
+
+ trace_xfs_attr_fillstate(state->args);
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commits have released those
+ * buffers from our grip.
+ */
+static int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level, error;
+
+ trace_xfs_attr_refillstate(state->args);
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ return 0;
+}
+#else
+static int xfs_attr_fillstate(xfs_da_state_t *state) { return 0; }
+#endif
+
/*========================================================================
* Overall external interface routines.
*========================================================================*/
@@ -114,7 +229,7 @@ xfs_attr_get_ilocked(
if (!xfs_inode_hasattr(args->dp))
return -ENOATTR;
- if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
+ if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
return xfs_attr_shortform_getvalue(args);
if (xfs_attr_is_leaf(args->dp))
return xfs_attr_leaf_get(args);
@@ -166,7 +281,7 @@ xfs_attr_get(
/*
* Calculate how many blocks we need for the new attribute,
*/
-STATIC int
+int
xfs_attr_calc_size(
struct xfs_da_args *args,
int *local)
@@ -199,6 +314,33 @@ xfs_attr_calc_size(
return nblks;
}
+/* Initialize transaction reservation for attr operations */
+void
+xfs_init_attr_trans(
+ struct xfs_da_args *args,
+ struct xfs_trans_res *tres,
+ unsigned int *total)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+
+ if (args->value) {
+ tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres *
+ args->total;
+ tres->tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ tres->tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ *total = args->total;
+ } else {
+ *tres = M_RES(mp)->tr_attrrm;
+ *total = XFS_ATTRRM_SPACE_RES(mp);
+ }
+}
+
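A hedged sketch of how the set and remove paths are expected to consume this
helper; the pairing with xfs_trans_alloc_inode() is assumed from the existing
xfs_attr_set() flow and is not part of this hunk:

	struct xfs_trans_res	tres;
	unsigned int		total;

	/* args, dp and the quota flag rsvd come from the calling context. */
	xfs_init_attr_trans(args, &tres, &total);
	error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans);
	if (error)
		return error;
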
+/*
+ * Add an attr to a shortform fork. If there is no space,
+ * xfs_attr_shortform_addname() will convert to leaf format and return -ENOSPC.
+ */
STATIC int
xfs_attr_try_sf_addname(
struct xfs_inode *dp,
@@ -210,7 +352,7 @@ xfs_attr_try_sf_addname(
/*
* Build initial attribute list (if required).
*/
- if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS)
+ if (dp->i_af.if_format == XFS_DINODE_FMT_EXTENTS)
xfs_attr_shortform_create(args);
error = xfs_attr_shortform_addname(args);
@@ -230,411 +372,477 @@ xfs_attr_try_sf_addname(
return error;
}
-/*
- * Check to see if the attr should be upgraded from non-existent or shortform to
- * single-leaf-block attribute list.
- */
-static inline bool
-xfs_attr_is_shortform(
- struct xfs_inode *ip)
+static int
+xfs_attr_sf_addname(
+ struct xfs_attr_intent *attr)
{
- return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
- (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_afp->if_nextents == 0);
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ int error = 0;
+
+ error = xfs_attr_try_sf_addname(dp, args);
+ if (error != -ENOSPC) {
+ ASSERT(!error || error == -EEXIST);
+ attr->xattri_dela_state = XFS_DAS_DONE;
+ goto out;
+ }
+
+ /*
+ * It won't fit in the shortform, transform to a leaf block. GROT:
+ * another possible req'mt for a double-split btree op.
+ */
+ error = xfs_attr_shortform_to_leaf(args);
+ if (error)
+ return error;
+
+ attr->xattri_dela_state = XFS_DAS_LEAF_ADD;
+out:
+ trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp);
+ return error;
}
/*
- * Checks to see if a delayed attribute transaction should be rolled. If so,
- * transaction is finished or rolled as needed.
+ * Handle the state change on completion of a multi-state attr operation.
+ *
+ * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first
+ * modification in an attr replace operation and we still have to do the second
+ * state, indicated by @replace_state.
+ *
+ * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on
+ * completion of the second half of the attr replace operation we correctly
+ * signal that it is done.
*/
-STATIC int
-xfs_attr_trans_roll(
- struct xfs_delattr_context *dac)
+static enum xfs_delattr_state
+xfs_attr_complete_op(
+ struct xfs_attr_intent *attr,
+ enum xfs_delattr_state replace_state)
{
- struct xfs_da_args *args = dac->da_args;
- int error;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
+
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ if (do_replace) {
+ args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ return replace_state;
+ }
+ return XFS_DAS_DONE;
+}
+
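For example, a leaf-format replace with no remote value walks the states below
(names as defined by this series); xfs_attr_complete_op() is the pivot that
turns the end of the add into the start of the remove:

	/*
	 * XFS_DAS_LEAF_ADD               add the new attr, INCOMPLETE flag set
	 *   -> XFS_DAS_LEAF_REPLACE      returned by xfs_attr_complete_op()
	 *   -> XFS_DAS_LEAF_REMOVE_OLD   flip flags; old attr now INCOMPLETE
	 *   -> XFS_DAS_LEAF_REMOVE_RMT   skipped when there is no remote value
	 *   -> XFS_DAS_LEAF_REMOVE_ATTR  remove the old name from the leaf
	 *   -> XFS_DAS_DONE
	 */
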
+static int
+xfs_attr_leaf_addname(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ ASSERT(xfs_attr_is_leaf(args->dp));
+
+ /*
+ * Use the leaf buffer we may already hold locked as a result of
+ * a sf-to-leaf conversion.
+ */
+ error = xfs_attr_leaf_try_add(args);
+
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
+ return error;
- if (dac->flags & XFS_DAC_DEFER_FINISH) {
/*
- * The caller wants us to finish all the deferred ops so that we
- * avoid pinning the log tail with a large number of deferred
- * ops.
+ * We're not in leaf format anymore, so roll the transaction and
+ * retry the add to the newly allocated node block.
*/
- dac->flags &= ~XFS_DAC_DEFER_FINISH;
- error = xfs_defer_finish(&args->trans);
- } else
- error = xfs_trans_roll_inode(&args->trans, args->dp);
+ attr->xattri_dela_state = XFS_DAS_NODE_ADD;
+ goto out;
+ }
+ if (error)
+ return error;
+ /*
+ * We need to commit and roll if we need to allocate remote xattr blocks
+ * or perform more xattr manipulations. Otherwise there is nothing more
+ * to do and we can return success.
+ */
+ if (args->rmtblkno)
+ attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT;
+ else
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_LEAF_REPLACE);
+out:
+ trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp);
return error;
}
/*
- * Set the attribute specified in @args.
+ * Add an entry to a node format attr tree.
+ *
+ * Note that we might still have a leaf here - xfs_attr_is_leaf() cannot tell
+ * the difference between leaf + remote attr blocks and a node format tree,
+ * so we may still end up having to convert from leaf to node format here.
*/
-int
-xfs_attr_set_args(
- struct xfs_da_args *args)
+static int
+xfs_attr_node_addname(
+ struct xfs_attr_intent *attr)
{
- struct xfs_buf *leaf_bp = NULL;
- int error = 0;
- struct xfs_delattr_context dac = {
- .da_args = args,
- };
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
- do {
- error = xfs_attr_set_iter(&dac, &leaf_bp);
- if (error != -EAGAIN)
- break;
+ error = xfs_attr_node_addname_find_attr(attr);
+ if (error)
+ return error;
- error = xfs_attr_trans_roll(&dac);
- if (error) {
- if (leaf_bp)
- xfs_trans_brelse(args->trans, leaf_bp);
+ error = xfs_attr_node_try_addname(attr);
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
return error;
- }
- } while (true);
+ /*
+ * No state change, we really are in node form now
+ * but we need the transaction rolled to continue.
+ */
+ goto out;
+ }
+ if (error)
+ return error;
+ if (args->rmtblkno)
+ attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT;
+ else
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_NODE_REPLACE);
+out:
+ trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp);
return error;
}
-STATIC int
-xfs_attr_sf_addname(
- struct xfs_delattr_context *dac,
- struct xfs_buf **leaf_bp)
+static int
+xfs_attr_rmtval_alloc(
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_inode *dp = args->dp;
+ struct xfs_da_args *args = attr->xattri_da_args;
int error = 0;
/*
- * Try to add the attr to the attribute list in the inode.
+ * If there was an out-of-line value, allocate the blocks we
+ * identified for its storage and copy the value. This is done
+ * after we create the attribute so that we don't overflow the
+ * maximum size of a transaction and/or hit a deadlock.
*/
- error = xfs_attr_try_sf_addname(dp, args);
+ if (attr->xattri_blkcnt > 0) {
+ error = xfs_attr_rmtval_set_blk(attr);
+ if (error)
+ return error;
+ /* Roll the transaction only if there is more to allocate. */
+ if (attr->xattri_blkcnt > 0)
+ goto out;
+ }
- /* Should only be 0, -EEXIST or -ENOSPC */
- if (error != -ENOSPC)
+ error = xfs_attr_rmtval_set_value(args);
+ if (error)
return error;
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ ++attr->xattri_dela_state);
/*
- * It won't fit in the shortform, transform to a leaf block. GROT:
- * another possible req'mt for a double-split btree op.
+ * If we are not doing a rename, we've finished the operation but still
+ * have to clear the incomplete flag protecting the new attr from
+ * exposing partially initialised state if we crash during creation.
*/
- error = xfs_attr_shortform_to_leaf(args, leaf_bp);
- if (error)
- return error;
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
+ error = xfs_attr3_leaf_clearflag(args);
+out:
+ trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+/*
+ * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
+ * for later deletion of the entry.
+ */
+static int
+xfs_attr_leaf_mark_incomplete(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ int error;
/*
- * Prevent the leaf buffer from being unlocked so that a concurrent AIL
- * push cannot grab the half-baked leaf buffer and run into problems
- * with the write verifier.
+ * Fill in disk block numbers in the state structure
+ * so that we can get the buffers back after we commit
+ * several transactions in the following calls.
*/
- xfs_trans_bhold(args->trans, *leaf_bp);
+ error = xfs_attr_fillstate(state);
+ if (error)
+ return error;
/*
- * We're still in XFS_DAS_UNINIT state here. We've converted
- * the attr fork to leaf format and will restart with the leaf
- * add.
+ * Mark the attribute as INCOMPLETE
*/
- trace_xfs_attr_sf_addname_return(XFS_DAS_UNINIT, args->dp);
- dac->flags |= XFS_DAC_DEFER_FINISH;
- return -EAGAIN;
+ return xfs_attr3_leaf_setflag(args);
+}
+
+/* Ensure the da state of an xattr deferred work item is ready to go. */
+static inline void
+xfs_attr_item_init_da_state(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+
+ if (!attr->xattri_da_state)
+ attr->xattri_da_state = xfs_da_state_alloc(args);
+ else
+ xfs_da_state_reset(attr->xattri_da_state, args);
}
/*
- * Set the attribute specified in @args.
- * This routine is meant to function as a delayed operation, and may return
- * -EAGAIN when the transaction needs to be rolled. Calling functions will need
- * to handle this, and recall the function until a successful error code is
- * returned.
+ * Initial setup for xfs_attr_node_removename. Make sure the attr is there and
+ * the blocks are valid. Attr keys with remote blocks will be marked
+ * incomplete.
*/
-int
-xfs_attr_set_iter(
- struct xfs_delattr_context *dac,
- struct xfs_buf **leaf_bp)
+static int
+xfs_attr_node_removename_setup(
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_inode *dp = args->dp;
- struct xfs_buf *bp = NULL;
- int forkoff, error = 0;
-
- /* State machine switch */
- switch (dac->dela_state) {
- case XFS_DAS_UNINIT:
- /*
- * If the fork is shortform, attempt to add the attr. If there
- * is no space, this converts to leaf format and returns
- * -EAGAIN with the leaf buffer held across the roll. The caller
- * will deal with a transaction roll error, but otherwise
- * release the hold once we return with a clean transaction.
- */
- if (xfs_attr_is_shortform(dp))
- return xfs_attr_sf_addname(dac, leaf_bp);
- if (*leaf_bp != NULL) {
- xfs_trans_bhold_release(args->trans, *leaf_bp);
- *leaf_bp = NULL;
- }
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_da_state *state;
+ int error;
- if (xfs_attr_is_leaf(dp)) {
- error = xfs_attr_leaf_try_add(args, *leaf_bp);
- if (error == -ENOSPC) {
- error = xfs_attr3_leaf_to_node(args);
- if (error)
- return error;
-
- /*
- * Finish any deferred work items and roll the
- * transaction once more. The goal here is to
- * call node_addname with the inode and
- * transaction in the same state (inode locked
- * and joined, transaction clean) no matter how
- * we got to this step.
- *
- * At this point, we are still in
- * XFS_DAS_UNINIT, but when we come back, we'll
- * be a node, so we'll fall down into the node
- * handling code below
- */
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
- } else if (error) {
- return error;
- }
+ xfs_attr_item_init_da_state(attr);
+ error = xfs_attr_node_lookup(args, attr->xattri_da_state);
+ if (error != -EEXIST)
+ goto out;
+ error = 0;
- dac->dela_state = XFS_DAS_FOUND_LBLK;
- } else {
- error = xfs_attr_node_addname_find_attr(dac);
- if (error)
- return error;
+ state = attr->xattri_da_state;
+ ASSERT(state->path.blk[state->path.active - 1].bp != NULL);
+ ASSERT(state->path.blk[state->path.active - 1].magic ==
+ XFS_ATTR_LEAF_MAGIC);
- error = xfs_attr_node_addname(dac);
- if (error)
- return error;
+ error = xfs_attr_leaf_mark_incomplete(args, state);
+ if (error)
+ goto out;
+ if (args->rmtblkno > 0)
+ error = xfs_attr_rmtval_invalidate(args);
+out:
+ if (error) {
+ xfs_da_state_free(attr->xattri_da_state);
+ attr->xattri_da_state = NULL;
+ }
- dac->dela_state = XFS_DAS_FOUND_NBLK;
- }
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- case XFS_DAS_FOUND_LBLK:
- /*
- * If there was an out-of-line value, allocate the blocks we
- * identified for its storage and copy the value. This is done
- * after we create the attribute so that we don't overflow the
- * maximum size of a transaction and/or hit a deadlock.
- */
+ return error;
+}
- /* Open coded xfs_attr_rmtval_set without trans handling */
- if ((dac->flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) {
- dac->flags |= XFS_DAC_LEAF_ADDNAME_INIT;
- if (args->rmtblkno > 0) {
- error = xfs_attr_rmtval_find_space(dac);
- if (error)
- return error;
- }
- }
+/*
+ * Remove the original attr we have just replaced. This is dependent on the
+ * original lookup and insert placing the old attr in args->blkno/args->index
+ * and the new attr in args->blkno2/args->index2.
+ */
+static int
+xfs_attr_leaf_remove_attr(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL;
+ int forkoff;
+ int error;
- /*
- * Repeat allocating remote blocks for the attr value until
- * blkcnt drops to zero.
- */
- if (dac->blkcnt > 0) {
- error = xfs_attr_rmtval_set_blk(dac);
- if (error)
- return error;
- trace_xfs_attr_set_iter_return(dac->dela_state,
- args->dp);
- return -EAGAIN;
- }
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+ &bp);
+ if (error)
+ return error;
- error = xfs_attr_rmtval_set_value(args);
- if (error)
- return error;
+ xfs_attr3_leaf_remove(bp, args);
- /*
- * If this is not a rename, clear the incomplete flag and we're
- * done.
- */
- if (!(args->op_flags & XFS_DA_OP_RENAME)) {
- if (args->rmtblkno > 0)
- error = xfs_attr3_leaf_clearflag(args);
- return error;
- }
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff)
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
- /*
- * If this is an atomic rename operation, we must "flip" the
- * incomplete flags on the "new" and "old" attribute/value pairs
- * so that one disappears and one appears atomically. Then we
- * must remove the "old" attribute/value pair.
- *
- * In a separate transaction, set the incomplete flag on the
- * "old" attr and clear the incomplete flag on the "new" attr.
- */
- error = xfs_attr3_leaf_flipflags(args);
- if (error)
- return error;
- /*
- * Commit the flag value change and start the next trans in
- * series.
- */
- dac->dela_state = XFS_DAS_FLIP_LFLAG;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- case XFS_DAS_FLIP_LFLAG:
- /*
- * Dismantle the "old" attribute/value pair by removing a
- * "remote" value (if it exists).
- */
- xfs_attr_restore_rmt_blk(args);
- error = xfs_attr_rmtval_invalidate(args);
- if (error)
- return error;
+ return error;
+}
- fallthrough;
- case XFS_DAS_RM_LBLK:
- /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
- dac->dela_state = XFS_DAS_RM_LBLK;
- if (args->rmtblkno) {
- error = xfs_attr_rmtval_remove(dac);
- if (error == -EAGAIN)
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
- if (error)
- return error;
+/*
+ * Shrink an attribute from leaf to shortform. Used by the node format remove
+ * path when the node format collapses to a single block and so we have to check
+ * if it can be collapsed further.
+ */
+static int
+xfs_attr_leaf_shrink(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp;
+ int forkoff;
+ int error;
- dac->dela_state = XFS_DAS_RD_LEAF;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- }
+ if (!xfs_attr_is_leaf(dp))
+ return 0;
- fallthrough;
- case XFS_DAS_RD_LEAF:
- /*
- * This is the last step for leaf format. Read the block with
- * the old attr, remove the old attr, check for shortform
- * conversion and return.
- */
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
- &bp);
- if (error)
- return error;
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
- xfs_attr3_leaf_remove(bp, args);
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff) {
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+ } else {
+ xfs_trans_brelse(args->trans, bp);
+ }
- forkoff = xfs_attr_shortform_allfit(bp, dp);
- if (forkoff)
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
- /* bp is gone due to xfs_da_shrink_inode */
+ return error;
+}
- return error;
+/*
+ * Run the attribute operation specified in @attr.
+ *
+ * This routine is meant to function as a delayed operation and will set the
+ * state to XFS_DAS_DONE when the operation is complete. Calling functions will
+ * need to handle this, and re-call the function until either an error or
+ * XFS_DAS_DONE is detected.
+ */
+int
+xfs_attr_set_iter(
+ struct xfs_attr_intent *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error = 0;
- case XFS_DAS_FOUND_NBLK:
- /*
- * Find space for remote blocks and fall into the allocation
- * state.
- */
- if (args->rmtblkno > 0) {
- error = xfs_attr_rmtval_find_space(dac);
- if (error)
- return error;
+ /* State machine switch */
+next_state:
+ switch (attr->xattri_dela_state) {
+ case XFS_DAS_UNINIT:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ case XFS_DAS_SF_ADD:
+ return xfs_attr_sf_addname(attr);
+ case XFS_DAS_LEAF_ADD:
+ return xfs_attr_leaf_addname(attr);
+ case XFS_DAS_NODE_ADD:
+ return xfs_attr_node_addname(attr);
+
+ case XFS_DAS_SF_REMOVE:
+ error = xfs_attr_sf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ case XFS_DAS_LEAF_REMOVE:
+ error = xfs_attr_leaf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ case XFS_DAS_NODE_REMOVE:
+ error = xfs_attr_node_removename_setup(attr);
+ if (error == -ENOATTR &&
+ (args->op_flags & XFS_DA_OP_RECOVERY)) {
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ error = 0;
+ break;
}
+ if (error)
+ return error;
+ attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT;
+ if (args->rmtblkno == 0)
+ attr->xattri_dela_state++;
+ break;
+ case XFS_DAS_LEAF_SET_RMT:
+ case XFS_DAS_NODE_SET_RMT:
+ error = xfs_attr_rmtval_find_space(attr);
+ if (error)
+ return error;
+ attr->xattri_dela_state++;
fallthrough;
- case XFS_DAS_ALLOC_NODE:
- /*
- * If there was an out-of-line value, allocate the blocks we
- * identified for its storage and copy the value. This is done
- * after we create the attribute so that we don't overflow the
- * maximum size of a transaction and/or hit a deadlock.
- */
- dac->dela_state = XFS_DAS_ALLOC_NODE;
- if (args->rmtblkno > 0) {
- if (dac->blkcnt > 0) {
- error = xfs_attr_rmtval_set_blk(dac);
- if (error)
- return error;
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
- }
-
- error = xfs_attr_rmtval_set_value(args);
- if (error)
- return error;
- }
- /*
- * If this was not a rename, clear the incomplete flag and we're
- * done.
- */
- if (!(args->op_flags & XFS_DA_OP_RENAME)) {
- if (args->rmtblkno > 0)
- error = xfs_attr3_leaf_clearflag(args);
- goto out;
- }
+ case XFS_DAS_LEAF_ALLOC_RMT:
+ case XFS_DAS_NODE_ALLOC_RMT:
+ error = xfs_attr_rmtval_alloc(attr);
+ if (error)
+ return error;
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
+ break;
+ goto next_state;
+ case XFS_DAS_LEAF_REPLACE:
+ case XFS_DAS_NODE_REPLACE:
/*
- * If this is an atomic rename operation, we must "flip" the
- * incomplete flags on the "new" and "old" attribute/value pairs
- * so that one disappears and one appears atomically. Then we
- * must remove the "old" attribute/value pair.
- *
- * In a separate transaction, set the incomplete flag on the
- * "old" attr and clear the incomplete flag on the "new" attr.
+ * We must "flip" the incomplete flags on the "new" and "old"
+ * attribute/value pairs so that one disappears and one appears
+ * atomically.
*/
error = xfs_attr3_leaf_flipflags(args);
if (error)
- goto out;
+ return error;
/*
- * Commit the flag value change and start the next trans in
- * series
+ * We must commit the flag value change now to make it atomic
+ * and then we can start the next trans in series at REMOVE_OLD.
*/
- dac->dela_state = XFS_DAS_FLIP_NFLAG;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
+ attr->xattri_dela_state++;
+ break;
- case XFS_DAS_FLIP_NFLAG:
+ case XFS_DAS_LEAF_REMOVE_OLD:
+ case XFS_DAS_NODE_REMOVE_OLD:
/*
- * Dismantle the "old" attribute/value pair by removing a
- * "remote" value (if it exists).
+ * If we have a remote attr, start the process of removing it
+ * by invalidating any cached buffers.
+ *
+ * If we don't have a remote attr, we skip the remote block
+ * removal state altogether with a second state increment.
*/
xfs_attr_restore_rmt_blk(args);
-
- error = xfs_attr_rmtval_invalidate(args);
- if (error)
- return error;
-
- fallthrough;
- case XFS_DAS_RM_NBLK:
- /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
- dac->dela_state = XFS_DAS_RM_NBLK;
if (args->rmtblkno) {
- error = xfs_attr_rmtval_remove(dac);
- if (error == -EAGAIN)
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
-
+ error = xfs_attr_rmtval_invalidate(args);
if (error)
return error;
+ } else {
+ attr->xattri_dela_state++;
+ }
- dac->dela_state = XFS_DAS_CLR_FLAG;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
+ attr->xattri_dela_state++;
+ goto next_state;
+
+ case XFS_DAS_LEAF_REMOVE_RMT:
+ case XFS_DAS_NODE_REMOVE_RMT:
+ error = xfs_attr_rmtval_remove(attr);
+ if (error == -EAGAIN) {
+ error = 0;
+ break;
}
+ if (error)
+ return error;
- fallthrough;
- case XFS_DAS_CLR_FLAG:
/*
- * The last state for node format. Look up the old attr and
- * remove it.
+ * We've finished removing the remote attr blocks, so commit the
+ * transaction and move on to removing the attr name from the
+ * leaf/node block. Removing the attr might require a full
+ * transaction reservation for btree block freeing, so we
+ * can't do that in the same transaction where we removed the
+ * remote attr blocks.
*/
- error = xfs_attr_node_addname_clear_incomplete(dac);
+ attr->xattri_dela_state++;
+ break;
+
+ case XFS_DAS_LEAF_REMOVE_ATTR:
+ error = xfs_attr_leaf_remove_attr(attr);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+
+ case XFS_DAS_NODE_REMOVE_ATTR:
+ error = xfs_attr_node_remove_attr(attr);
+ if (!error)
+ error = xfs_attr_leaf_shrink(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
break;
default:
ASSERT(0);
break;
}
-out:
+
+ trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp);
return error;
}
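With this rework, xfs_attr_set_iter() no longer returns -EAGAIN itself: it advances xattri_dela_state, traces, and returns, leaving re-entry to the deferred-ops machinery. A minimal sketch of that driver, with an illustrative function name (the real finish callback lives outside this hunk):

	/* Illustrative driver: re-enter the state machine after each roll. */
	static int xfs_attr_drive_one(struct xfs_attr_intent *attr)
	{
		int error;

		error = xfs_attr_set_iter(attr);
		if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
			error = -EAGAIN;	/* defer framework rolls and retries */
		return error;
	}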
@@ -648,12 +856,13 @@ xfs_attr_lookup(
{
struct xfs_inode *dp = args->dp;
struct xfs_buf *bp = NULL;
+ struct xfs_da_state *state;
int error;
if (!xfs_inode_hasattr(dp))
return -ENOATTR;
- if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
+ if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
return xfs_attr_sf_findname(args, NULL, NULL);
if (xfs_attr_is_leaf(dp)) {
@@ -665,33 +874,85 @@ xfs_attr_lookup(
return error;
}
- return xfs_attr_node_hasname(args, NULL);
+ state = xfs_da_state_alloc(args);
+ error = xfs_attr_node_lookup(args, state);
+ xfs_da_state_free(state);
+ return error;
}
-/*
- * Remove the attribute specified in @args.
- */
-int
-xfs_attr_remove_args(
+static int
+xfs_attr_intent_init(
+ struct xfs_da_args *args,
+	unsigned int		op_flags,	/* op flag (set, replace or remove) */
+ struct xfs_attr_intent **attr) /* new xfs_attr_intent */
+{
+ struct xfs_attr_intent *new;
+
+ new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ new->xattri_op_flags = op_flags;
+ new->xattri_da_args = args;
+
+ *attr = new;
+ return 0;
+}
+
+/* Sets an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_add(
struct xfs_da_args *args)
{
- int error;
- struct xfs_delattr_context dac = {
- .da_args = args,
- };
+ struct xfs_attr_intent *new;
+ int error = 0;
- do {
- error = xfs_attr_remove_iter(&dac);
- if (error != -EAGAIN)
- break;
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new);
+ if (error)
+ return error;
- error = xfs_attr_trans_roll(&dac);
- if (error)
- return error;
+ new->xattri_dela_state = xfs_attr_init_add_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
- } while (true);
+ return 0;
+}
- return error;
+/* Replaces an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_replace(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_intent *new;
+ int error = 0;
+
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new);
+ if (error)
+ return error;
+
+ new->xattri_dela_state = xfs_attr_init_replace_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp);
+
+ return 0;
+}
+
+/* Removes an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_remove(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_intent *new;
+ int error;
+
+ error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new);
+ if (error)
+ return error;
+
+ new->xattri_dela_state = xfs_attr_init_remove_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp);
+
+ return 0;
}
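All three helpers share one shape: allocate an xfs_attr_intent, pick its starting state, and queue it with xfs_defer_add(). A hedged sketch of the XFS_DEFER_OPS_TYPE_ATTR glue that later feeds a queued intent back into the state machine; the table lives outside this file and the callback names here are assumptions:

	const struct xfs_defer_op_type xfs_attr_defer_type = {
		.max_items	= 1,
		.create_intent	= xfs_attr_create_intent,
		.abort_intent	= xfs_attr_abort_intent,
		.create_done	= xfs_attr_create_done,
		.finish_item	= xfs_attr_finish_item,	/* ends up in xfs_attr_set_iter() */
		.cancel_item	= xfs_attr_cancel_item,
	};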
/*
@@ -724,21 +985,21 @@ xfs_attr_set(
/*
* We have no control over the attribute names that userspace passes us
* to remove, so we have to allow the name lookup prior to attribute
- * removal to fail as well.
+ * removal to fail as well. Preserve the logged flag, since we need
+ * to pass that through to the logging code.
*/
- args->op_flags = XFS_DA_OP_OKNOENT;
+ args->op_flags = XFS_DA_OP_OKNOENT |
+ (args->op_flags & XFS_DA_OP_LOGGED);
if (args->value) {
XFS_STATS_INC(mp, xs_attr_set);
-
- args->op_flags |= XFS_DA_OP_ADDNAME;
args->total = xfs_attr_calc_size(args, &local);
/*
* If the inode doesn't have an attribute fork, add one.
* (inode must not be locked when we call this routine)
*/
- if (XFS_IFORK_Q(dp) == 0) {
+ if (xfs_inode_has_attr_fork(dp) == 0) {
int sf_size = sizeof(struct xfs_attr_sf_hdr) +
xfs_attr_sf_entsize_byname(args->namelen,
args->valuelen);
@@ -748,20 +1009,10 @@ xfs_attr_set(
return error;
}
- tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
- M_RES(mp)->tr_attrsetrt.tr_logres *
- args->total;
- tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- total = args->total;
-
if (!local)
rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen);
} else {
XFS_STATS_INC(mp, xs_attr_remove);
-
- tres = M_RES(mp)->tr_attrrm;
- total = XFS_ATTRRM_SPACE_RES(mp);
rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
}
@@ -769,6 +1020,7 @@ xfs_attr_set(
* Root fork attributes can use reserved data blocks for this
* operation if necessary
*/
+ xfs_init_attr_trans(args, &tres, &total);
error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans);
if (error)
return error;
@@ -776,33 +1028,43 @@ xfs_attr_set(
if (args->value || xfs_inode_hasattr(dp)) {
error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(args->trans, dp,
+ XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
if (error)
goto out_trans_cancel;
}
error = xfs_attr_lookup(args);
- if (args->value) {
- if (error == -EEXIST && (args->attr_flags & XATTR_CREATE))
- goto out_trans_cancel;
- if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- goto out_trans_cancel;
- if (error != -ENOATTR && error != -EEXIST)
+ switch (error) {
+ case -EEXIST:
+	/* If no value, we are performing a remove operation */
+ if (!args->value) {
+ error = xfs_attr_defer_remove(args);
+ break;
+ }
+ /* Pure create fails if the attr already exists */
+ if (args->attr_flags & XATTR_CREATE)
goto out_trans_cancel;
- error = xfs_attr_set_args(args);
- if (error)
- goto out_trans_cancel;
- /* shortform attribute has already been committed */
- if (!args->trans)
- goto out_unlock;
- } else {
- if (error != -EEXIST)
+ error = xfs_attr_defer_replace(args);
+ break;
+ case -ENOATTR:
+ /* Can't remove what isn't there. */
+ if (!args->value)
goto out_trans_cancel;
- error = xfs_attr_remove_args(args);
- if (error)
+ /* Pure replace fails if no existing attr to replace. */
+ if (args->attr_flags & XATTR_REPLACE)
goto out_trans_cancel;
+
+ error = xfs_attr_defer_add(args);
+ break;
+ default:
+ goto out_trans_cancel;
}
+ if (error)
+ goto out_trans_cancel;
/*
* If this is a synchronous mount, make sure that the
@@ -837,7 +1099,7 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
{
struct xfs_attr_shortform *sf;
- sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
return be16_to_cpu(sf->hdr.totsize);
}
@@ -845,28 +1107,41 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
* Add a name to the shortform attribute list structure
* This is the external routine.
*/
-STATIC int
-xfs_attr_shortform_addname(xfs_da_args_t *args)
+static int
+xfs_attr_shortform_addname(
+ struct xfs_da_args *args)
{
- int newsize, forkoff, retval;
+ int newsize, forkoff;
+ int error;
trace_xfs_attr_sf_addname(args);
- retval = xfs_attr_shortform_lookup(args);
- if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- return retval;
- if (retval == -EEXIST) {
- if (args->attr_flags & XATTR_CREATE)
- return retval;
- retval = xfs_attr_sf_removename(args);
- if (retval)
- return retval;
+ error = xfs_attr_shortform_lookup(args);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ return error;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ return error;
+
+ error = xfs_attr_sf_removename(args);
+ if (error)
+ return error;
+
/*
- * Since we have removed the old attr, clear ATTR_REPLACE so
- * that the leaf format add routine won't trip over the attr
- * not being around.
+ * Since we have removed the old attr, clear XFS_DA_OP_REPLACE
+		 * so that if the new attr doesn't fit in shortform format, the
+ * leaf format add routine won't trip over the attr not being
+ * around.
*/
- args->attr_flags &= ~XATTR_REPLACE;
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ break;
+ case 0:
+ break;
+ default:
+ return error;
}
if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
@@ -889,8 +1164,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
* External routines when attribute list is one block
*========================================================================*/
-/* Store info about a remote block */
-STATIC void
+/* Save the current remote block info and clear the current pointers. */
+static void
xfs_attr_save_rmt_blk(
struct xfs_da_args *args)
{
@@ -899,10 +1174,13 @@ xfs_attr_save_rmt_blk(
args->rmtblkno2 = args->rmtblkno;
args->rmtblkcnt2 = args->rmtblkcnt;
args->rmtvaluelen2 = args->rmtvaluelen;
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
}
/* Set stored info about a remote block */
-STATIC void
+static void
xfs_attr_restore_rmt_blk(
struct xfs_da_args *args)
{
@@ -925,48 +1203,47 @@ xfs_attr_restore_rmt_blk(
*/
STATIC int
xfs_attr_leaf_try_add(
- struct xfs_da_args *args,
- struct xfs_buf *bp)
+ struct xfs_da_args *args)
{
- int retval;
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
/*
- * Look up the given attribute in the leaf block. Figure out if
- * the given flags produce an error or call for an atomic rename.
+ * Look up the xattr name to set the insertion point for the new xattr.
*/
- retval = xfs_attr_leaf_hasname(args, &bp);
- if (retval != -ENOATTR && retval != -EEXIST)
- return retval;
- if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- goto out_brelse;
- if (retval == -EEXIST) {
- if (args->attr_flags & XATTR_CREATE)
+ error = xfs_attr3_leaf_lookup_int(bp, args);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ goto out_brelse;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
goto out_brelse;
trace_xfs_attr_leaf_replace(args);
-
- /* save the attribute state for later removal*/
- args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
- xfs_attr_save_rmt_blk(args);
-
/*
- * clear the remote attr state now that it is saved so that the
- * values reflect the state of the attribute we are about to
+ * Save the existing remote attr state so that the current
+ * values reflect the state of the new attribute we are about to
* add, not the attribute we just found and will remove later.
*/
- args->rmtblkno = 0;
- args->rmtblkcnt = 0;
- args->rmtvaluelen = 0;
+ xfs_attr_save_rmt_blk(args);
+ break;
+ case 0:
+ break;
+ default:
+ goto out_brelse;
}
- /*
- * Add the attribute to the leaf block
- */
return xfs_attr3_leaf_add(bp, args);
out_brelse:
xfs_trans_brelse(args->trans, bp);
- return retval;
+ return error;
}
/*
@@ -1012,9 +1289,10 @@ xfs_attr_leaf_removename(
dp = args->dp;
error = xfs_attr_leaf_hasname(args, &bp);
-
if (error == -ENOATTR) {
xfs_trans_brelse(args->trans, bp);
+ if (args->op_flags & XFS_DA_OP_RECOVERY)
+ return 0;
return error;
} else if (error != -EEXIST)
return error;
@@ -1062,32 +1340,20 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
return error;
}
-/*
- * Return EEXIST if attr is found, or ENOATTR if not
- * statep: If not null is set to point at the found state. Caller will
- * be responsible for freeing the state in this case.
- */
+/* Return EEXIST if attr is found, or ENOATTR if not. */
STATIC int
-xfs_attr_node_hasname(
+xfs_attr_node_lookup(
struct xfs_da_args *args,
- struct xfs_da_state **statep)
+ struct xfs_da_state *state)
{
- struct xfs_da_state *state;
int retval, error;
- state = xfs_da_state_alloc(args);
- if (statep != NULL)
- *statep = state;
-
/*
* Search to see if name exists, and get back a pointer to it.
*/
error = xfs_da3_node_lookup_int(state, &retval);
if (error)
- retval = error;
-
- if (!statep)
- xfs_da_state_free(state);
+ return error;
return retval;
}
@@ -1098,46 +1364,48 @@ xfs_attr_node_hasname(
STATIC int
xfs_attr_node_addname_find_attr(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = dac->da_args;
- int retval;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
/*
* Search to see if name already exists, and get back a pointer
* to where it should go.
*/
- retval = xfs_attr_node_hasname(args, &dac->da_state);
- if (retval != -ENOATTR && retval != -EEXIST)
- goto error;
-
- if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- goto error;
- if (retval == -EEXIST) {
- if (args->attr_flags & XATTR_CREATE)
+ xfs_attr_item_init_da_state(attr);
+ error = xfs_attr_node_lookup(args, attr->xattri_da_state);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ goto error;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
goto error;
- trace_xfs_attr_node_replace(args);
-
- /* save the attribute state for later removal*/
- args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
- xfs_attr_save_rmt_blk(args);
+ trace_xfs_attr_node_replace(args);
/*
- * clear the remote attr state now that it is saved so that the
- * values reflect the state of the attribute we are about to
+ * Save the existing remote attr state so that the current
+ * values reflect the state of the new attribute we are about to
* add, not the attribute we just found and will remove later.
*/
- args->rmtblkno = 0;
- args->rmtblkcnt = 0;
- args->rmtvaluelen = 0;
+ xfs_attr_save_rmt_blk(args);
+ break;
+ case 0:
+ break;
+ default:
+ goto error;
}
return 0;
error:
- if (dac->da_state)
- xfs_da_state_free(dac->da_state);
- return retval;
+ if (attr->xattri_da_state) {
+ xfs_da_state_free(attr->xattri_da_state);
+ attr->xattri_da_state = NULL;
+ }
+ return error;
}
/*
@@ -1146,25 +1414,16 @@ error:
* This will involve walking down the Btree, and may involve splitting
* leaf nodes and even splitting intermediate nodes up to and including
* the root node (a special case of an intermediate node).
- *
- * "Remote" attribute values confuse the issue and atomic rename operations
- * add a whole extra layer of confusion on top of that.
- *
- * This routine is meant to function as a delayed operation, and may return
- * -EAGAIN when the transaction needs to be rolled. Calling functions will need
- * to handle this, and recall the function until a successful error code is
- *returned.
*/
-STATIC int
-xfs_attr_node_addname(
- struct xfs_delattr_context *dac)
+static int
+xfs_attr_node_try_addname(
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_da_state *state = dac->da_state;
+ struct xfs_da_state *state = attr->xattri_da_state;
struct xfs_da_state_blk *blk;
int error;
- trace_xfs_attr_node_addname(args);
+ trace_xfs_attr_node_addname(state->args);
blk = &state->path.blk[state->path.active-1];
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
@@ -1175,25 +1434,9 @@ xfs_attr_node_addname(
/*
* Its really a single leaf node, but it had
* out-of-line values so it looked like it *might*
- * have been a b-tree.
+ * have been a b-tree. Let the caller deal with this.
*/
- xfs_da_state_free(state);
- state = NULL;
- error = xfs_attr3_leaf_to_node(args);
- if (error)
- goto out;
-
- /*
- * Now that we have converted the leaf to a node, we can
- * roll the transaction, and try xfs_attr3_leaf_add
- * again on re-entry. No need to set dela_state to do
- * this. dela_state is still unset by this function at
- * this point.
- */
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_node_addname_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
+ goto out;
}
/*
@@ -1205,7 +1448,6 @@ xfs_attr_node_addname(
error = xfs_da3_split(state);
if (error)
goto out;
- dac->flags |= XFS_DAC_DEFER_FINISH;
} else {
/*
* Addition succeeded, update Btree hashvals.
@@ -1214,148 +1456,12 @@ xfs_attr_node_addname(
}
out:
- if (state)
- xfs_da_state_free(state);
- return error;
-}
-
-
-STATIC int
-xfs_attr_node_addname_clear_incomplete(
- struct xfs_delattr_context *dac)
-{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_da_state *state = NULL;
- int retval = 0;
- int error = 0;
-
- /*
- * Re-find the "old" attribute entry after any split ops. The INCOMPLETE
- * flag means that we will find the "old" attr, not the "new" one.
- */
- args->attr_filter |= XFS_ATTR_INCOMPLETE;
- state = xfs_da_state_alloc(args);
- state->inleaf = 0;
- error = xfs_da3_node_lookup_int(state, &retval);
- if (error)
- goto out;
-
- error = xfs_attr_node_removename(args, state);
-
- /*
- * Check to see if the tree needs to be collapsed.
- */
- if (retval && (state->path.active > 1)) {
- error = xfs_da3_join(state);
- if (error)
- goto out;
- }
- retval = error = 0;
-
-out:
- if (state)
- xfs_da_state_free(state);
- if (error)
- return error;
- return retval;
-}
-
-/*
- * Shrink an attribute from leaf to shortform
- */
-STATIC int
-xfs_attr_node_shrink(
- struct xfs_da_args *args,
- struct xfs_da_state *state)
-{
- struct xfs_inode *dp = args->dp;
- int error, forkoff;
- struct xfs_buf *bp;
-
- /*
- * Have to get rid of the copy of this dabuf in the state.
- */
- ASSERT(state->path.active == 1);
- ASSERT(state->path.blk[0].bp);
- state->path.blk[0].bp = NULL;
-
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
- if (error)
- return error;
-
- forkoff = xfs_attr_shortform_allfit(bp, dp);
- if (forkoff) {
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
- /* bp is gone due to xfs_da_shrink_inode */
- } else
- xfs_trans_brelse(args->trans, bp);
-
- return error;
-}
-
-/*
- * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
- * for later deletion of the entry.
- */
-STATIC int
-xfs_attr_leaf_mark_incomplete(
- struct xfs_da_args *args,
- struct xfs_da_state *state)
-{
- int error;
-
- /*
- * Fill in disk block numbers in the state structure
- * so that we can get the buffers back after we commit
- * several transactions in the following calls.
- */
- error = xfs_attr_fillstate(state);
- if (error)
- return error;
-
- /*
- * Mark the attribute as INCOMPLETE
- */
- return xfs_attr3_leaf_setflag(args);
-}
-
-/*
- * Initial setup for xfs_attr_node_removename. Make sure the attr is there and
- * the blocks are valid. Attr keys with remote blocks will be marked
- * incomplete.
- */
-STATIC
-int xfs_attr_node_removename_setup(
- struct xfs_delattr_context *dac)
-{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_da_state **state = &dac->da_state;
- int error;
-
- error = xfs_attr_node_hasname(args, state);
- if (error != -EEXIST)
- goto out;
- error = 0;
-
- ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL);
- ASSERT((*state)->path.blk[(*state)->path.active - 1].magic ==
- XFS_ATTR_LEAF_MAGIC);
-
- if (args->rmtblkno > 0) {
- error = xfs_attr_leaf_mark_incomplete(args, *state);
- if (error)
- goto out;
-
- error = xfs_attr_rmtval_invalidate(args);
- }
-out:
- if (error)
- xfs_da_state_free(*state);
-
+ xfs_da_state_free(state);
+ attr->xattri_da_state = NULL;
return error;
}
-STATIC int
+static int
xfs_attr_node_removename(
struct xfs_da_args *args,
struct xfs_da_state *state)
@@ -1374,246 +1480,42 @@ xfs_attr_node_removename(
return retval;
}
-/*
- * Remove the attribute specified in @args.
- *
- * This will involve walking down the Btree, and may involve joining
- * leaf nodes and even joining intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- *
- * This routine is meant to function as either an in-line or delayed operation,
- * and may return -EAGAIN when the transaction needs to be rolled. Calling
- * functions will need to handle this, and call the function until a
- * successful error code is returned.
- */
-int
-xfs_attr_remove_iter(
- struct xfs_delattr_context *dac)
-{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_da_state *state = dac->da_state;
- int retval, error = 0;
- struct xfs_inode *dp = args->dp;
-
- trace_xfs_attr_node_removename(args);
-
- switch (dac->dela_state) {
- case XFS_DAS_UNINIT:
- if (!xfs_inode_hasattr(dp))
- return -ENOATTR;
-
- /*
- * Shortform or leaf formats don't require transaction rolls and
- * thus state transitions. Call the right helper and return.
- */
- if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
- return xfs_attr_sf_removename(args);
-
- if (xfs_attr_is_leaf(dp))
- return xfs_attr_leaf_removename(args);
-
- /*
- * Node format may require transaction rolls. Set up the
- * state context and fall into the state machine.
- */
- if (!dac->da_state) {
- error = xfs_attr_node_removename_setup(dac);
- if (error)
- return error;
- state = dac->da_state;
- }
-
- fallthrough;
- case XFS_DAS_RMTBLK:
- dac->dela_state = XFS_DAS_RMTBLK;
-
- /*
- * If there is an out-of-line value, de-allocate the blocks.
- * This is done before we remove the attribute so that we don't
- * overflow the maximum size of a transaction and/or hit a
- * deadlock.
- */
- if (args->rmtblkno > 0) {
- /*
- * May return -EAGAIN. Roll and repeat until all remote
- * blocks are removed.
- */
- error = xfs_attr_rmtval_remove(dac);
- if (error == -EAGAIN) {
- trace_xfs_attr_remove_iter_return(
- dac->dela_state, args->dp);
- return error;
- } else if (error) {
- goto out;
- }
-
- /*
- * Refill the state structure with buffers (the prior
- * calls released our buffers) and close out this
- * transaction before proceeding.
- */
- ASSERT(args->rmtblkno == 0);
- error = xfs_attr_refillstate(state);
- if (error)
- goto out;
- dac->dela_state = XFS_DAS_RM_NAME;
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_remove_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- }
-
- fallthrough;
- case XFS_DAS_RM_NAME:
- /*
- * If we came here fresh from a transaction roll, reattach all
- * the buffers to the current transaction.
- */
- if (dac->dela_state == XFS_DAS_RM_NAME) {
- error = xfs_attr_refillstate(state);
- if (error)
- goto out;
- }
-
- retval = xfs_attr_node_removename(args, state);
-
- /*
- * Check to see if the tree needs to be collapsed. If so, roll
- * the transacton and fall into the shrink state.
- */
- if (retval && (state->path.active > 1)) {
- error = xfs_da3_join(state);
- if (error)
- goto out;
-
- dac->flags |= XFS_DAC_DEFER_FINISH;
- dac->dela_state = XFS_DAS_RM_SHRINK;
- trace_xfs_attr_remove_iter_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
- }
-
- fallthrough;
- case XFS_DAS_RM_SHRINK:
- /*
- * If the result is small enough, push it all into the inode.
- * This is our final state so it's safe to return a dirty
- * transaction.
- */
- if (xfs_attr_is_leaf(dp))
- error = xfs_attr_node_shrink(args, state);
- ASSERT(error != -EAGAIN);
- break;
- default:
- ASSERT(0);
- error = -EINVAL;
- goto out;
- }
-out:
- if (state)
- xfs_da_state_free(state);
- return error;
-}
-
-/*
- * Fill in the disk block numbers in the state structure for the buffers
- * that are attached to the state structure.
- * This is done so that we can quickly reattach ourselves to those buffers
- * after some set of transaction commits have released these buffers.
- */
-STATIC int
-xfs_attr_fillstate(xfs_da_state_t *state)
+static int
+xfs_attr_node_remove_attr(
+ struct xfs_attr_intent *attr)
{
- xfs_da_state_path_t *path;
- xfs_da_state_blk_t *blk;
- int level;
-
- trace_xfs_attr_fillstate(state->args);
-
- /*
- * Roll down the "path" in the state structure, storing the on-disk
- * block number for those buffers in the "path".
- */
- path = &state->path;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->bp) {
- blk->disk_blkno = xfs_buf_daddr(blk->bp);
- blk->bp = NULL;
- } else {
- blk->disk_blkno = 0;
- }
- }
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_da_state *state = xfs_da_state_alloc(args);
+ int retval = 0;
+ int error = 0;
/*
- * Roll down the "altpath" in the state structure, storing the on-disk
- * block number for those buffers in the "altpath".
+ * The attr we are removing has already been marked incomplete, so
+ * we need to set the filter appropriately to re-find the "old"
+ * attribute entry after any split ops.
*/
- path = &state->altpath;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->bp) {
- blk->disk_blkno = xfs_buf_daddr(blk->bp);
- blk->bp = NULL;
- } else {
- blk->disk_blkno = 0;
- }
- }
-
- return 0;
-}
-
-/*
- * Reattach the buffers to the state structure based on the disk block
- * numbers stored in the state structure.
- * This is done after some set of transaction commits have released those
- * buffers from our grip.
- */
-STATIC int
-xfs_attr_refillstate(xfs_da_state_t *state)
-{
- xfs_da_state_path_t *path;
- xfs_da_state_blk_t *blk;
- int level, error;
-
- trace_xfs_attr_refillstate(state->args);
+ args->attr_filter |= XFS_ATTR_INCOMPLETE;
+ error = xfs_da3_node_lookup_int(state, &retval);
+ if (error)
+ goto out;
- /*
- * Roll down the "path" in the state structure, storing the on-disk
- * block number for those buffers in the "path".
- */
- path = &state->path;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->disk_blkno) {
- error = xfs_da3_node_read_mapped(state->args->trans,
- state->args->dp, blk->disk_blkno,
- &blk->bp, XFS_ATTR_FORK);
- if (error)
- return error;
- } else {
- blk->bp = NULL;
- }
- }
+ error = xfs_attr_node_removename(args, state);
/*
- * Roll down the "altpath" in the state structure, storing the on-disk
- * block number for those buffers in the "altpath".
+ * Check to see if the tree needs to be collapsed.
*/
- path = &state->altpath;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->disk_blkno) {
- error = xfs_da3_node_read_mapped(state->args->trans,
- state->args->dp, blk->disk_blkno,
- &blk->bp, XFS_ATTR_FORK);
- if (error)
- return error;
- } else {
- blk->bp = NULL;
- }
+ if (retval && (state->path.active > 1)) {
+ error = xfs_da3_join(state);
+ if (error)
+ goto out;
}
+ retval = error = 0;
- return 0;
+out:
+ xfs_da_state_free(state);
+ if (error)
+ return error;
+ return retval;
}
/*
@@ -1639,7 +1541,8 @@ xfs_attr_node_get(
/*
* Search to see if name exists, and get back a pointer to it.
*/
- error = xfs_attr_node_hasname(args, &state);
+ state = xfs_da_state_alloc(args);
+ error = xfs_attr_node_lookup(args, state);
if (error != -EEXIST)
goto out_release;
@@ -1653,13 +1556,12 @@ xfs_attr_node_get(
* If not in a transaction, we have to release all the buffers.
*/
out_release:
- for (i = 0; state != NULL && i < state->path.active; i++) {
+ for (i = 0; i < state->path.active; i++) {
xfs_trans_brelse(args->trans, state->path.blk[i].bp);
state->path.blk[i].bp = NULL;
}
- if (state)
- xfs_da_state_free(state);
+ xfs_da_state_free(state);
return error;
}
@@ -1679,3 +1581,20 @@ xfs_attr_namecheck(
/* There shouldn't be any nulls here */
return !memchr(name, 0, length);
}
+
+int __init
+xfs_attr_intent_init_cache(void)
+{
+ xfs_attr_intent_cache = kmem_cache_create("xfs_attr_intent",
+ sizeof(struct xfs_attr_intent),
+ 0, 0, NULL);
+
+ return xfs_attr_intent_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_attr_intent_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_attr_intent_cache);
+ xfs_attr_intent_cache = NULL;
+}
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 5e71f719bdd5..81be9b3e4004 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -425,7 +425,7 @@ struct xfs_attr_list_context {
*/
/*
- * Enum values for xfs_delattr_context.da_state
+ * Enum values for xfs_attr_intent.xattri_dela_state
*
* These values are used by delayed attribute operations to keep track of where
* they were before they returned -EAGAIN. A return code of -EAGAIN signals the
@@ -434,46 +434,102 @@ struct xfs_attr_list_context {
* to where it was and resume executing where it left off.
*/
enum xfs_delattr_state {
- XFS_DAS_UNINIT = 0, /* No state has been set yet */
- XFS_DAS_RMTBLK, /* Removing remote blks */
- XFS_DAS_RM_NAME, /* Remove attr name */
- XFS_DAS_RM_SHRINK, /* We are shrinking the tree */
- XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */
- XFS_DAS_FOUND_NBLK, /* We found node blk for attr */
- XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */
- XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */
- XFS_DAS_RD_LEAF, /* Read in the new leaf */
- XFS_DAS_ALLOC_NODE, /* We are allocating node blocks */
- XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */
- XFS_DAS_RM_NBLK, /* A rename is removing node blocks */
- XFS_DAS_CLR_FLAG, /* Clear incomplete flag */
+ XFS_DAS_UNINIT = 0, /* No state has been set yet */
+
+ /*
+ * Initial sequence states. The replace setup code relies on the
+ * ADD and REMOVE states for a specific format to be sequential so
+ * that we can transform the initial operation to be performed
+ * according to the xfs_has_larp() state easily.
+ */
+ XFS_DAS_SF_ADD, /* Initial sf add state */
+ XFS_DAS_SF_REMOVE, /* Initial sf replace/remove state */
+
+ XFS_DAS_LEAF_ADD, /* Initial leaf add state */
+ XFS_DAS_LEAF_REMOVE, /* Initial leaf replace/remove state */
+
+ XFS_DAS_NODE_ADD, /* Initial node add state */
+ XFS_DAS_NODE_REMOVE, /* Initial node replace/remove state */
+
+ /* Leaf state set/replace/remove sequence */
+ XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */
+ XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */
+ XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */
+ XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */
+ XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */
+ XFS_DAS_LEAF_REMOVE_ATTR, /* Remove the old attr from a leaf */
+
+ /* Node state sequence, must match leaf state above */
+ XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */
+ XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */
+ XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */
+ XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */
+ XFS_DAS_NODE_REMOVE_RMT, /* A rename is removing remote blocks */
+ XFS_DAS_NODE_REMOVE_ATTR, /* Remove the old attr from a node */
+
+ XFS_DAS_DONE, /* finished operation */
};
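The attr->xattri_dela_state++ transitions above rely on this declaration order: each increment must land on the next step of the same leaf or node sequence. A sketch of compile-time assertions that would pin that ordering (illustrative, not part of the patch):

	static inline void xfs_attr_check_state_order(void)
	{
		BUILD_BUG_ON(XFS_DAS_SF_REMOVE != XFS_DAS_SF_ADD + 1);
		BUILD_BUG_ON(XFS_DAS_LEAF_ALLOC_RMT != XFS_DAS_LEAF_SET_RMT + 1);
		BUILD_BUG_ON(XFS_DAS_LEAF_REMOVE_RMT != XFS_DAS_LEAF_REMOVE_OLD + 1);
		BUILD_BUG_ON(XFS_DAS_NODE_ALLOC_RMT != XFS_DAS_NODE_SET_RMT + 1);
	}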
-/*
- * Defines for xfs_delattr_context.flags
- */
-#define XFS_DAC_DEFER_FINISH 0x01 /* finish the transaction */
-#define XFS_DAC_LEAF_ADDNAME_INIT 0x02 /* xfs_attr_leaf_addname init*/
+#define XFS_DAS_STRINGS \
+ { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \
+ { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \
+ { XFS_DAS_SF_REMOVE, "XFS_DAS_SF_REMOVE" }, \
+ { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \
+ { XFS_DAS_LEAF_REMOVE, "XFS_DAS_LEAF_REMOVE" }, \
+ { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \
+ { XFS_DAS_NODE_REMOVE, "XFS_DAS_NODE_REMOVE" }, \
+ { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \
+ { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \
+ { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \
+ { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \
+ { XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \
+ { XFS_DAS_LEAF_REMOVE_ATTR, "XFS_DAS_LEAF_REMOVE_ATTR" }, \
+ { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \
+ { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \
+ { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \
+ { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \
+ { XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \
+ { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \
+ { XFS_DAS_DONE, "XFS_DAS_DONE" }
+
+struct xfs_attri_log_nameval;
/*
* Context used for keeping track of delayed attribute operations
*/
-struct xfs_delattr_context {
- struct xfs_da_args *da_args;
-
- /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */
- struct xfs_bmbt_irec map;
- xfs_dablk_t lblkno;
- int blkcnt;
+struct xfs_attr_intent {
+ /*
+	 * Used to log this item to an intent containing a list of attrs to
+	 * commit later.
+ */
+ struct list_head xattri_list;
/* Used in xfs_attr_node_removename to roll through removing blocks */
- struct xfs_da_state *da_state;
+ struct xfs_da_state *xattri_da_state;
+
+ struct xfs_da_args *xattri_da_args;
+
+ /*
+ * Shared buffer containing the attr name and value so that the logging
+ * code can share large memory buffers between log items.
+ */
+ struct xfs_attri_log_nameval *xattri_nameval;
/* Used to keep track of current state of delayed operation */
- unsigned int flags;
- enum xfs_delattr_state dela_state;
+ enum xfs_delattr_state xattri_dela_state;
+
+ /*
+ * Attr operation being performed - XFS_ATTRI_OP_FLAGS_*
+ */
+ unsigned int xattri_op_flags;
+
+ /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */
+ xfs_dablk_t xattri_lblkno;
+ int xattri_blkcnt;
+ struct xfs_bmbt_irec xattri_map;
};
+
/*========================================================================
* Function prototypes for the kernel.
*========================================================================*/
@@ -489,11 +545,77 @@ bool xfs_attr_is_leaf(struct xfs_inode *ip);
int xfs_attr_get_ilocked(struct xfs_da_args *args);
int xfs_attr_get(struct xfs_da_args *args);
int xfs_attr_set(struct xfs_da_args *args);
-int xfs_attr_set_args(struct xfs_da_args *args);
-int xfs_attr_remove_args(struct xfs_da_args *args);
-int xfs_attr_remove_iter(struct xfs_delattr_context *dac);
+int xfs_attr_set_iter(struct xfs_attr_intent *attr);
+int xfs_attr_remove_iter(struct xfs_attr_intent *attr);
bool xfs_attr_namecheck(const void *name, size_t length);
-void xfs_delattr_context_init(struct xfs_delattr_context *dac,
- struct xfs_da_args *args);
+int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
+void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
+ unsigned int *total);
+
+/*
+ * Check to see if the attr should be upgraded from non-existent or shortform
+ * to a single-leaf-block attribute list.
+ */
+static inline bool
+xfs_attr_is_shortform(
+ struct xfs_inode *ip)
+{
+ return ip->i_af.if_format == XFS_DINODE_FMT_LOCAL ||
+ (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_af.if_nextents == 0);
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_add_state(struct xfs_da_args *args)
+{
+ /*
+ * When called from the completion of a attr remove to determine the
+	 * When called from the completion of an attr remove to determine the
+	 * next state, the attribute fork may be null. This can only occur
+ * replace operation is being performed. If we are called from any other
+ * context, i_af is guaranteed to exist. Hence if the attr fork is
+ * null, we were called from a pure remove operation and so we are done.
+ */
+ if (!xfs_inode_has_attr_fork(args->dp))
+ return XFS_DAS_DONE;
+
+ args->op_flags |= XFS_DA_OP_ADDNAME;
+ if (xfs_attr_is_shortform(args->dp))
+ return XFS_DAS_SF_ADD;
+ if (xfs_attr_is_leaf(args->dp))
+ return XFS_DAS_LEAF_ADD;
+ return XFS_DAS_NODE_ADD;
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_remove_state(struct xfs_da_args *args)
+{
+ args->op_flags |= XFS_DA_OP_REMOVE;
+ if (xfs_attr_is_shortform(args->dp))
+ return XFS_DAS_SF_REMOVE;
+ if (xfs_attr_is_leaf(args->dp))
+ return XFS_DAS_LEAF_REMOVE;
+ return XFS_DAS_NODE_REMOVE;
+}
+
+/*
+ * If we are logging the attributes, then we have to start with removal of the
+ * old attribute so that there is always consistent state that we can recover
+ * from if the system goes down part way through. We always log the new attr
+ * value, so even when we remove the attr first we still have the information in
+ * the log to finish the replace operation atomically.
+ */
+static inline enum xfs_delattr_state
+xfs_attr_init_replace_state(struct xfs_da_args *args)
+{
+ args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE;
+ if (args->op_flags & XFS_DA_OP_LOGGED)
+ return xfs_attr_init_remove_state(args);
+ return xfs_attr_init_add_state(args);
+}
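Taken together, the initializers resolve as follows for a leaf-format fork; this walk-through is illustrative:

	/*
	 * set:               xfs_attr_init_add_state()     -> XFS_DAS_LEAF_ADD
	 * remove:            xfs_attr_init_remove_state()  -> XFS_DAS_LEAF_REMOVE
	 * replace, unlogged: xfs_attr_init_replace_state() -> XFS_DAS_LEAF_ADD
	 *                    (new attr added first; the old one is flagged
	 *                    INCOMPLETE and removed afterwards)
	 * replace, logged:   xfs_attr_init_replace_state() -> XFS_DAS_LEAF_REMOVE
	 *                    (old attr torn down first; the logged new value
	 *                    keeps the replace recoverable)
	 */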
+
+extern struct kmem_cache *xfs_attr_intent_cache;
+int __init xfs_attr_intent_init_cache(void);
+void xfs_attr_intent_destroy_cache(void);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 014daa8c542d..beee51ad75ce 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -28,6 +28,7 @@
#include "xfs_dir2.h"
#include "xfs_log.h"
#include "xfs_ag.h"
+#include "xfs_errortag.h"
/*
@@ -288,6 +289,23 @@ xfs_attr3_leaf_verify_entry(
return NULL;
}
+/*
+ * Validate an attribute leaf block.
+ *
+ * Empty leaf blocks can occur under the following circumstances:
+ *
+ * 1. setxattr adds a new extended attribute to a file;
+ * 2. The file has zero existing attributes;
+ * 3. The attribute is too large to fit in the attribute fork;
+ * 4. The attribute is small enough to fit in a leaf block;
+ * 5. A log flush occurs after committing the transaction that creates
+ * the (empty) leaf block; and
+ * 6. The filesystem goes down after the log flush but before the new
+ * attribute can be committed to the leaf block.
+ *
+ * Hence we need to ensure that we don't fail the validation purely
+ * because the leaf is empty.
+ */
static xfs_failaddr_t
xfs_attr3_leaf_verify(
struct xfs_buf *bp)
@@ -445,6 +463,14 @@ xfs_attr3_leaf_read(
* Namespace helper routines
*========================================================================*/
+/*
+ * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE
+ * flag on disk - if there's an incomplete attr then recovery needs to tear it
+ * down. If there's no incomplete attr, then recovery needs to tear the existing attr
+ * down to replace it with the attr that has been logged. In this case, the
+ * INCOMPLETE flag will not be set in attr->attr_filter, but rather
+ * XFS_DA_OP_RECOVERY will be set in args->op_flags.
+ */
static bool
xfs_attr_match(
struct xfs_da_args *args,
@@ -452,14 +478,18 @@ xfs_attr_match(
unsigned char *name,
int flags)
{
if (args->namelen != namelen)
return false;
if (memcmp(args->name, name, namelen) != 0)
return false;
- /*
- * If we are looking for incomplete entries, show only those, else only
- * show complete entries.
- */
+
+ /* Recovery ignores the INCOMPLETE flag. */
+ if ((args->op_flags & XFS_DA_OP_RECOVERY) &&
+ args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK))
+ return true;
+
+ /* All remaining matches need to be filtered by INCOMPLETE state. */
if (args->attr_filter !=
(flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE)))
return false;
@@ -560,7 +590,7 @@ xfs_attr_shortform_bytesfit(
* to real extents, or the delalloc conversion will take care of the
* literal area rebalancing.
*/
- if (bytes <= XFS_IFORK_ASIZE(dp))
+ if (bytes <= xfs_inode_attr_fork_size(dp))
return dp->i_forkoff;
/*
@@ -652,7 +682,7 @@ xfs_attr_shortform_create(
struct xfs_da_args *args)
{
struct xfs_inode *dp = args->dp;
- struct xfs_ifork *ifp = dp->i_afp;
+ struct xfs_ifork *ifp = &dp->i_af;
struct xfs_attr_sf_hdr *hdr;
trace_xfs_attr_sf_create(args);
@@ -689,7 +719,7 @@ xfs_attr_sf_findname(
int end;
int i;
- sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
sfe = &sf->list[0];
end = sf->hdr.count;
for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe),
@@ -734,7 +764,7 @@ xfs_attr_shortform_add(
mp = dp->i_mount;
dp->i_forkoff = forkoff;
- ifp = dp->i_afp;
+ ifp = &dp->i_af;
ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
@@ -767,11 +797,9 @@ xfs_attr_fork_remove(
struct xfs_inode *ip,
struct xfs_trans *tp)
{
- ASSERT(ip->i_afp->if_nextents == 0);
+ ASSERT(ip->i_af.if_nextents == 0);
- xfs_idestroy_fork(ip->i_afp);
- kmem_cache_free(xfs_ifork_cache, ip->i_afp);
- ip->i_afp = NULL;
+ xfs_ifork_zap_attr(ip);
ip->i_forkoff = 0;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -795,9 +823,17 @@ xfs_attr_sf_removename(
dp = args->dp;
mp = dp->i_mount;
- sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
error = xfs_attr_sf_findname(args, &sfe, &base);
+
+ /*
+ * If we are recovering an operation, finding nothing to
+ * remove is not an error - it just means there was nothing
+ * to clean up.
+ */
+ if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY))
+ return 0;
if (error != -EEXIST)
return error;
size = xfs_attr_sf_entsize(sfe);
@@ -818,7 +854,7 @@ xfs_attr_sf_removename(
totsize -= size;
if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
- !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+ !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) {
xfs_attr_fork_remove(dp, args->trans);
} else {
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
@@ -851,7 +887,7 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
trace_xfs_attr_sf_lookup(args);
- ifp = args->dp->i_afp;
+ ifp = &args->dp->i_af;
ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
sfe = &sf->list[0];
@@ -879,8 +915,8 @@ xfs_attr_shortform_getvalue(
struct xfs_attr_sf_entry *sfe;
int i;
- ASSERT(args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL);
- sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
+ ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL);
+ sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count;
sfe = xfs_attr_sf_nextentry(sfe), i++) {
@@ -892,14 +928,10 @@ xfs_attr_shortform_getvalue(
return -ENOATTR;
}
-/*
- * Convert from using the shortform to the leaf. On success, return the
- * buffer so that we can keep it locked until we're totally done with it.
- */
+/* Convert from using the shortform to the leaf format. */
int
xfs_attr_shortform_to_leaf(
- struct xfs_da_args *args,
- struct xfs_buf **leaf_bp)
+ struct xfs_da_args *args)
{
struct xfs_inode *dp;
struct xfs_attr_shortform *sf;
@@ -914,7 +946,7 @@ xfs_attr_shortform_to_leaf(
trace_xfs_attr_sf_to_leaf(args);
dp = args->dp;
- ifp = dp->i_afp;
+ ifp = &dp->i_af;
sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
size = be16_to_cpu(sf->hdr.totsize);
tmpbuffer = kmem_alloc(size, 0);
@@ -961,7 +993,6 @@ xfs_attr_shortform_to_leaf(
sfe = xfs_attr_sf_nextentry(sfe);
}
error = 0;
- *leaf_bp = bp;
out:
kmem_free(tmpbuffer);
return error;
@@ -1022,8 +1053,8 @@ xfs_attr_shortform_verify(
int i;
int64_t size;
- ASSERT(ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL);
- ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
+ ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL);
+ ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK);
sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
size = ifp->if_bytes;
@@ -1127,9 +1158,17 @@ xfs_attr3_leaf_to_shortform(
goto out;
if (forkoff == -1) {
- ASSERT(xfs_has_attr2(dp->i_mount));
- ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
- xfs_attr_fork_remove(dp, args->trans);
+ /*
+	 * part of an attr replace operation. We're going to add a new
+ * part of a attr replace operations. We're going to add a new
+ * attr immediately, so we need to keep the attr fork around in
+ * this case.
+ */
+ if (!(args->op_flags & XFS_DA_OP_REPLACE)) {
+ ASSERT(xfs_has_attr2(dp->i_mount));
+ ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
+ xfs_attr_fork_remove(dp, args->trans);
+ }
goto out;
}
@@ -1189,6 +1228,11 @@ xfs_attr3_leaf_to_node(
trace_xfs_attr_leaf_to_node(args);
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
+ error = -EIO;
+ goto out;
+ }
+
error = xfs_da_grow_inode(args, &blkno);
if (error)
goto out;
@@ -1486,8 +1530,9 @@ xfs_attr3_leaf_add_work(
entry->flags = args->attr_filter;
if (tmp)
entry->flags |= XFS_ATTR_LOCAL;
- if (args->op_flags & XFS_DA_OP_RENAME) {
- entry->flags |= XFS_ATTR_INCOMPLETE;
+ if (args->op_flags & XFS_DA_OP_REPLACE) {
+ if (!(args->op_flags & XFS_DA_OP_LOGGED))
+ entry->flags |= XFS_ATTR_INCOMPLETE;
if ((args->blkno2 == args->blkno) &&
(args->index2 <= args->index)) {
args->index2++;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index efa757f1e912..368f4d9fa1d5 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -49,8 +49,7 @@ void xfs_attr_shortform_create(struct xfs_da_args *args);
void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
int xfs_attr_shortform_lookup(struct xfs_da_args *args);
int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
-int xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
- struct xfs_buf **leaf_bp);
+int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int xfs_attr_sf_removename(struct xfs_da_args *args);
int xfs_attr_sf_findname(struct xfs_da_args *args,
struct xfs_attr_sf_entry **sfep,
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 83b95be9ded8..d440393b40eb 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -543,6 +543,7 @@ xfs_attr_rmtval_stale(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_buf *bp;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -550,14 +551,18 @@ xfs_attr_rmtval_stale(
XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK))
return -EFSCORRUPTED;
- bp = xfs_buf_incore(mp->m_ddev_targp,
+ error = xfs_buf_incore(mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map->br_startblock),
- XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags);
- if (bp) {
- xfs_buf_stale(bp);
- xfs_buf_relse(bp);
+ XFS_FSB_TO_BB(mp, map->br_blockcount),
+ incore_flags, &bp);
+ if (error) {
+ if (error == -ENOENT)
+ return 0;
+ return error;
}
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
return 0;
}
@@ -568,14 +573,14 @@ xfs_attr_rmtval_stale(
*/
int
xfs_attr_rmtval_find_space(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_bmbt_irec *map = &dac->map;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_bmbt_irec *map = &attr->xattri_map;
int error;
- dac->lblkno = 0;
- dac->blkcnt = 0;
+ attr->xattri_lblkno = 0;
+ attr->xattri_blkcnt = 0;
args->rmtblkcnt = 0;
args->rmtblkno = 0;
memset(map, 0, sizeof(struct xfs_bmbt_irec));
@@ -584,8 +589,8 @@ xfs_attr_rmtval_find_space(
if (error)
return error;
- dac->blkcnt = args->rmtblkcnt;
- dac->lblkno = args->rmtblkno;
+ attr->xattri_blkcnt = args->rmtblkcnt;
+ attr->xattri_lblkno = args->rmtblkno;
return 0;
}
@@ -598,17 +603,18 @@ xfs_attr_rmtval_find_space(
*/
int
xfs_attr_rmtval_set_blk(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = dac->da_args;
+ struct xfs_da_args *args = attr->xattri_da_args;
struct xfs_inode *dp = args->dp;
- struct xfs_bmbt_irec *map = &dac->map;
+ struct xfs_bmbt_irec *map = &attr->xattri_map;
int nmap;
int error;
nmap = 1;
- error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)dac->lblkno,
- dac->blkcnt, XFS_BMAPI_ATTRFORK, args->total,
+ error = xfs_bmapi_write(args->trans, dp,
+ (xfs_fileoff_t)attr->xattri_lblkno,
+ attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total,
map, &nmap);
if (error)
return error;
@@ -618,8 +624,8 @@ xfs_attr_rmtval_set_blk(
(map->br_startblock != HOLESTARTBLOCK));
/* roll attribute extent map forwards */
- dac->lblkno += map->br_blockcount;
- dac->blkcnt -= map->br_blockcount;
+ attr->xattri_lblkno += map->br_blockcount;
+ attr->xattri_blkcnt -= map->br_blockcount;
return 0;
}
@@ -673,9 +679,9 @@ xfs_attr_rmtval_invalidate(
*/
int
xfs_attr_rmtval_remove(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = dac->da_args;
+ struct xfs_da_args *args = attr->xattri_da_args;
int error, done;
/*
@@ -695,8 +701,8 @@ xfs_attr_rmtval_remove(
* the parent
*/
if (!done) {
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_rmtval_remove_return(dac->dela_state, args->dp);
+ trace_xfs_attr_rmtval_remove_return(attr->xattri_dela_state,
+ args->dp);
return -EAGAIN;
}
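For context, a hedged sketch of the unmap step inside xfs_attr_rmtval_remove() that produces this -EAGAIN (the function body is outside this hunk, so the details are assumptions): unmap what one transaction reservation allows, then ask to be re-entered.

	error = xfs_bunmapi(args->trans, args->dp, args->rmtblkno,
			args->rmtblkcnt, XFS_BMAPI_ATTRFORK, 1, &done);
	if (error)
		return error;
	if (!done)
		return -EAGAIN;	/* stay in *_REMOVE_RMT; the defer roll re-enters */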
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index d72eff30ca18..d097ec6c4dc3 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -12,9 +12,9 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args);
int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
xfs_buf_flags_t incore_flags);
int xfs_attr_rmtval_invalidate(struct xfs_da_args *args);
-int xfs_attr_rmtval_remove(struct xfs_delattr_context *dac);
+int xfs_attr_rmtval_remove(struct xfs_attr_intent *attr);
int xfs_attr_rmt_find_hole(struct xfs_da_args *args);
int xfs_attr_rmtval_set_value(struct xfs_da_args *args);
-int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac);
-int xfs_attr_rmtval_find_space(struct xfs_delattr_context *dac);
+int xfs_attr_rmtval_set_blk(struct xfs_attr_intent *attr);
+int xfs_attr_rmtval_find_space(struct xfs_attr_intent *attr);
#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 74198dd82b03..e56723dc9cd5 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -52,19 +52,17 @@ xfs_bmap_compute_maxlevels(
xfs_mount_t *mp, /* file system mount structure */
int whichfork) /* data or attr fork */
{
+ uint64_t maxblocks; /* max blocks at this level */
+ xfs_extnum_t maxleafents; /* max leaf entries possible */
int level; /* btree level */
- uint maxblocks; /* max blocks at this level */
- uint maxleafents; /* max leaf entries possible */
int maxrootrecs; /* max records in root block */
int minleafrecs; /* min records in leaf block */
int minnoderecs; /* min records in node block */
int sz; /* root block size */
/*
- * The maximum number of extents in a file, hence the maximum number of
- * leaf entries, is controlled by the size of the on-disk extent count,
- * either a signed 32-bit number for the data fork, or a signed 16-bit
- * number for the attr fork.
+ * The maximum number of extents in a fork, hence the maximum number of
+ * leaf entries, is controlled by the size of the on-disk extent count.
*
* Note that we can no longer assume that if we are in ATTR1 that the
* fork offset of all the inodes will be
@@ -74,22 +72,22 @@ xfs_bmap_compute_maxlevels(
* ATTR2 we have to assume the worst case scenario of a minimum size
* available.
*/
- if (whichfork == XFS_DATA_FORK) {
- maxleafents = MAXEXTNUM;
+ maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
+ whichfork);
+ if (whichfork == XFS_DATA_FORK)
sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
- } else {
- maxleafents = MAXAEXTNUM;
+ else
sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
- }
+
maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
minleafrecs = mp->m_bmap_dmnr[0];
minnoderecs = mp->m_bmap_dmnr[1];
- maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+ maxblocks = howmany_64(maxleafents, minleafrecs);
for (level = 1; maxblocks > 1; level++) {
if (maxblocks <= maxrootrecs)
maxblocks = 1;
else
- maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+ maxblocks = howmany_64(maxblocks, minnoderecs);
}
mp->m_bm_maxlevels[whichfork] = level;
ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk());
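To make the level computation concrete, a worked example with assumed record counts:

	/*
	 * Suppose maxleafents = 1,000,000 and minleafrecs = minnoderecs =
	 * maxrootrecs = 100:
	 *
	 *	maxblocks = howmany_64(1000000, 100) = 10000	(leaf blocks)
	 *	level 1: 10000 > 100 -> howmany_64(10000, 100) = 100
	 *	level 2: 100 <= 100  -> maxblocks = 1 (fits in the root)
	 *
	 * The loop exits with level == 3, so m_bm_maxlevels[whichfork] = 3.
	 */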
@@ -130,7 +128,7 @@ xfs_bmbt_lookup_first(
*/
static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
return whichfork != XFS_COW_FORK &&
ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
@@ -142,7 +140,7 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
*/
static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
return whichfork != XFS_COW_FORK &&
ifp->if_format == XFS_DINODE_FMT_BTREE &&
@@ -321,7 +319,7 @@ xfs_bmap_check_leaf_extents(
int whichfork) /* data or attr fork */
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_block *block; /* current btree block */
xfs_fsblock_t bno; /* block # of "block" */
struct xfs_buf *bp; /* buffer for "block" */
@@ -468,7 +466,7 @@ error0:
if (bp_release)
xfs_trans_brelse(NULL, bp);
error_norelse:
- xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+ xfs_warn(mp, "%s: BAD after btree leaves for %llu extents",
__func__, i);
xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -485,7 +483,7 @@ STATIC void
xfs_bmap_validate_ret(
xfs_fileoff_t bno,
xfs_filblks_t len,
- int flags,
+ uint32_t flags,
xfs_bmbt_irec_t *mval,
int nmap,
int ret_nmap)
@@ -540,7 +538,7 @@ xfs_bmap_btree_to_extents(
int *logflagsp, /* inode logging flags */
int whichfork) /* data or attr fork */
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
struct xfs_btree_block *rblock = ifp->if_broot;
struct xfs_btree_block *cblock;/* child btree block */
@@ -618,7 +616,7 @@ xfs_bmap_extents_to_btree(
mp = ip->i_mount;
ASSERT(whichfork != XFS_COW_FORK);
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS);
/*
@@ -747,7 +745,7 @@ xfs_bmap_local_to_extents_empty(
struct xfs_inode *ip,
int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(whichfork != XFS_COW_FORK);
ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
@@ -787,7 +785,7 @@ xfs_bmap_local_to_extents(
* So sending the data fork of a regular inode is invalid.
*/
ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK));
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
if (!ifp->if_bytes) {
@@ -882,7 +880,7 @@ xfs_bmap_add_attrfork_btree(
mp = ip->i_mount;
- if (XFS_BMAP_BMDR_SPACE(block) <= XFS_IFORK_DSIZE(ip))
+ if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip))
*flags |= XFS_ILOG_DBROOT;
else {
cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
@@ -922,7 +920,7 @@ xfs_bmap_add_attrfork_extents(
int error; /* error return value */
if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <=
- XFS_IFORK_DSIZE(ip))
+ xfs_inode_data_fork_size(ip))
return 0;
cur = NULL;
error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags,
@@ -953,7 +951,7 @@ xfs_bmap_add_attrfork_local(
{
struct xfs_da_args dargs; /* args for dir/attr code */
- if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
+ if (ip->i_df.if_bytes <= xfs_inode_data_fork_size(ip))
return 0;
if (S_ISDIR(VFS_I(ip)->i_mode)) {
@@ -1025,7 +1023,7 @@ xfs_bmap_add_attrfork(
int logflags; /* logging flags */
int error; /* error return value */
- ASSERT(XFS_IFORK_Q(ip) == 0);
+ ASSERT(xfs_inode_has_attr_fork(ip) == 0);
mp = ip->i_mount;
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
@@ -1036,16 +1034,15 @@ xfs_bmap_add_attrfork(
rsvd, &tp);
if (error)
return error;
- if (XFS_IFORK_Q(ip))
+ if (xfs_inode_has_attr_fork(ip))
goto trans_cancel;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_bmap_set_attrforkoff(ip, size, &version);
if (error)
goto trans_cancel;
- ASSERT(ip->i_afp == NULL);
- ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
+ xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
logflags = 0;
switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_LOCAL:
@@ -1118,7 +1115,7 @@ xfs_iread_bmbt_block(
xfs_extnum_t num_recs;
xfs_extnum_t j;
int whichfork = cur->bc_ino.whichfork;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
block = xfs_btree_get_block(cur, level, &bp);
@@ -1166,7 +1163,7 @@ xfs_iread_extents(
int whichfork)
{
struct xfs_iread_state ir;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
struct xfs_btree_cur *cur;
int error;
@@ -1210,7 +1207,7 @@ xfs_bmap_first_unused(
xfs_fileoff_t *first_unused, /* unused block */
int whichfork) /* data or attr fork */
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec got;
struct xfs_iext_cursor icur;
xfs_fileoff_t lastaddr = 0;
@@ -1257,7 +1254,7 @@ xfs_bmap_last_before(
xfs_fileoff_t *last_block, /* last block */
int whichfork) /* data or attr fork */
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec got;
struct xfs_iext_cursor icur;
int error;
@@ -1291,7 +1288,7 @@ xfs_bmap_last_extent(
struct xfs_bmbt_irec *rec,
int *is_empty)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_iext_cursor icur;
int error;
@@ -1357,7 +1354,7 @@ xfs_bmap_last_offset(
xfs_fileoff_t *last_block,
int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec rec;
int is_empty;
int error;
@@ -1391,7 +1388,7 @@ xfs_bmap_add_extent_delay_real(
int whichfork)
{
struct xfs_mount *mp = bma->ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
struct xfs_bmbt_irec *new = &bma->got;
int error; /* error return value */
int i; /* temp state */
@@ -1399,7 +1396,7 @@ xfs_bmap_add_extent_delay_real(
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t da_new; /* new count del alloc blocks used */
xfs_filblks_t da_old; /* old count del alloc blocks used */
xfs_filblks_t temp=0; /* value for da_new calculations */
@@ -1452,7 +1449,7 @@ xfs_bmap_add_extent_delay_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
/*
@@ -1470,13 +1467,13 @@ xfs_bmap_add_extent_delay_real(
new_endoff == RIGHT.br_startoff &&
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
new->br_state == RIGHT.br_state &&
- new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING)) !=
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= MAXEXTLEN))
+ <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
error = 0;
@@ -1950,14 +1947,14 @@ xfs_bmap_add_extent_unwritten_real(
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_mount *mp = ip->i_mount;
struct xfs_bmbt_irec old;
*logflagsp = 0;
cur = *curp;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(!isnullstartblock(new->br_startblock));
@@ -2000,7 +1997,7 @@ xfs_bmap_add_extent_unwritten_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
/*
@@ -2018,13 +2015,13 @@ xfs_bmap_add_extent_unwritten_real(
new_endoff == RIGHT.br_startoff &&
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
new->br_state == RIGHT.br_state &&
- new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING)) !=
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= MAXEXTLEN))
+ <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
/*
@@ -2479,10 +2476,10 @@ xfs_bmap_add_extent_hole_delay(
xfs_filblks_t newlen=0; /* new indirect size */
xfs_filblks_t oldlen=0; /* old indirect size */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t temp; /* temp for indirect calculations */
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(isnullstartblock(new->br_startblock));
/*
@@ -2510,15 +2507,15 @@ xfs_bmap_add_extent_hole_delay(
*/
if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
left.br_startoff + left.br_blockcount == new->br_startoff &&
- left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
- new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
(left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= MAXEXTLEN)))
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
state |= BMAP_RIGHT_CONTIG;
/*
@@ -2616,9 +2613,9 @@ xfs_bmap_add_extent_hole_real(
struct xfs_btree_cur **curp,
struct xfs_bmbt_irec *new,
int *logflagsp,
- int flags)
+ uint32_t flags)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
struct xfs_btree_cur *cur = *curp;
int error; /* error return value */
@@ -2626,7 +2623,7 @@ xfs_bmap_add_extent_hole_real(
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_bmbt_irec old;
ASSERT(!isnullstartblock(new->br_startblock));
@@ -2661,17 +2658,17 @@ xfs_bmap_add_extent_hole_real(
left.br_startoff + left.br_blockcount == new->br_startoff &&
left.br_startblock + left.br_blockcount == new->br_startblock &&
left.br_state == new->br_state &&
- left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
new->br_startblock + new->br_blockcount == right.br_startblock &&
new->br_state == right.br_state &&
- new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= MAXEXTLEN))
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
error = 0;
@@ -2906,15 +2903,15 @@ xfs_bmap_extsize_align(
/*
* For large extent hint sizes, the aligned extent might be larger than
- * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls
- * the length back under MAXEXTLEN. The outer allocation loops handle
- * short allocation just fine, so it is safe to do this. We only want to
- * do it when we are forced to, though, because it means more allocation
- * operations are required.
+ * XFS_MAX_BMBT_EXTLEN. In that case, reduce the size by an extsz so
+ * that it pulls the length back under XFS_MAX_BMBT_EXTLEN. The outer
+ * allocation loops handle short allocation just fine, so it is safe to
+ * do this. We only want to do it when we are forced to, though, because
+ * it means more allocation operations are required.
*/
- while (align_alen > MAXEXTLEN)
+ while (align_alen > XFS_MAX_BMBT_EXTLEN)
align_alen -= extsz;
- ASSERT(align_alen <= MAXEXTLEN);
+ ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN);
/*
* If the previous block overlaps with this proposed allocation
@@ -3004,9 +3001,9 @@ xfs_bmap_extsize_align(
return -EINVAL;
} else {
ASSERT(orig_off >= align_off);
- /* see MAXEXTLEN handling above */
+ /* see XFS_MAX_BMBT_EXTLEN handling above */
ASSERT(orig_end <= align_off + align_alen ||
- align_alen + extsz > MAXEXTLEN);
+ align_alen + extsz > XFS_MAX_BMBT_EXTLEN);
}
#ifdef DEBUG
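The capping rule described in the comment above is worth seeing in isolation: the aligned length is walked back in whole extsz units until it fits in a single BMBT extent. A minimal userspace sketch, with stand-in types and a local SKETCH_MAX_EXTLEN constant in place of the kernel's XFS_MAX_BMBT_EXTLEN (the BMBT on-disk length field is 21 bits wide, hence the 2^21 - 1 limit):

    #include <assert.h>

    typedef unsigned long long filblks_t;   /* stand-in for xfs_filblks_t */
    #define SKETCH_MAX_EXTLEN ((filblks_t)((1ULL << 21) - 1))

    /* Trim whole extsz units until the aligned length fits in one extent. */
    static filblks_t cap_aligned_len(filblks_t align_alen, filblks_t extsz)
    {
        while (align_alen > SKETCH_MAX_EXTLEN)
            align_alen -= extsz;
        assert(align_alen <= SKETCH_MAX_EXTLEN);
        return align_alen;
    }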
@@ -3187,7 +3184,8 @@ xfs_bmap_longest_free_extent(
pag = xfs_perag_get(mp, ag);
if (!pag->pagf_init) {
- error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
+ error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_TRYLOCK,
+ NULL);
if (error) {
/* Couldn't lock the AGF, so skip this AG. */
if (error == -EAGAIN) {
@@ -3766,7 +3764,7 @@ xfs_bmapi_trim_map(
xfs_fileoff_t obno,
xfs_fileoff_t end,
int n,
- int flags)
+ uint32_t flags)
{
if ((flags & XFS_BMAPI_ENTIRE) ||
got->br_startoff + got->br_blockcount <= obno) {
@@ -3811,7 +3809,7 @@ xfs_bmapi_update_map(
xfs_fileoff_t obno,
xfs_fileoff_t end,
int *n,
- int flags)
+ uint32_t flags)
{
xfs_bmbt_irec_t *mval = *map;
@@ -3864,11 +3862,11 @@ xfs_bmapi_read(
xfs_filblks_t len,
struct xfs_bmbt_irec *mval,
int *nmap,
- int flags)
+ uint32_t flags)
{
struct xfs_mount *mp = ip->i_mount;
int whichfork = xfs_bmapi_whichfork(flags);
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec got;
xfs_fileoff_t obno;
xfs_fileoff_t end;
@@ -3961,7 +3959,7 @@ xfs_bmapi_reserve_delalloc(
int eof)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_extlen_t alen;
xfs_extlen_t indlen;
int error;
@@ -3971,7 +3969,7 @@ xfs_bmapi_reserve_delalloc(
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
- alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN);
+ alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
@@ -4088,7 +4086,7 @@ xfs_bmapi_allocate(
{
struct xfs_mount *mp = bma->ip->i_mount;
int whichfork = xfs_bmapi_whichfork(bma->flags);
- struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
int tmp_logflags = 0;
int error;
@@ -4104,7 +4102,7 @@ xfs_bmapi_allocate(
if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
bma->prev.br_startoff = NULLFILEOFF;
} else {
- bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
+ bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN);
if (!bma->eof)
bma->length = XFS_FILBLKS_MIN(bma->length,
bma->got.br_startoff - bma->offset);
@@ -4184,10 +4182,10 @@ xfs_bmapi_convert_unwritten(
struct xfs_bmalloca *bma,
struct xfs_bmbt_irec *mval,
xfs_filblks_t len,
- int flags)
+ uint32_t flags)
{
int whichfork = xfs_bmapi_whichfork(flags);
- struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
int tmp_logflags = 0;
int error;
@@ -4264,7 +4262,7 @@ xfs_bmapi_minleft(
struct xfs_inode *ip,
int fork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, fork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, fork);
if (tp && tp->t_firstblock != NULLFSBLOCK)
return 0;
@@ -4285,7 +4283,7 @@ xfs_bmapi_finish(
int whichfork,
int error)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork);
if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
ifp->if_format != XFS_DINODE_FMT_EXTENTS)
@@ -4312,7 +4310,7 @@ xfs_bmapi_write(
struct xfs_inode *ip, /* incore inode */
xfs_fileoff_t bno, /* starting file offs. mapped */
xfs_filblks_t len, /* length to map in file */
- int flags, /* XFS_BMAPI_... */
+ uint32_t flags, /* XFS_BMAPI_... */
xfs_extlen_t total, /* total blocks needed */
struct xfs_bmbt_irec *mval, /* output: map values */
int *nmap) /* i/o: mval size/count */
@@ -4324,7 +4322,7 @@ xfs_bmapi_write(
};
struct xfs_mount *mp = ip->i_mount;
int whichfork = xfs_bmapi_whichfork(flags);
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_fileoff_t end; /* end of mapped file region */
bool eof = false; /* after the end of extents */
int error; /* error return */
@@ -4424,8 +4422,8 @@ xfs_bmapi_write(
* xfs_extlen_t and therefore 32 bits. Hence we have to
* check for 32-bit overflows and handle them here.
*/
- if (len > (xfs_filblks_t)MAXEXTLEN)
- bma.length = MAXEXTLEN;
+ if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN)
+ bma.length = XFS_MAX_BMBT_EXTLEN;
else
bma.length = len;
@@ -4505,7 +4503,7 @@ xfs_bmapi_convert_delalloc(
struct iomap *iomap,
unsigned int *seq)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
struct xfs_bmalloca bma = { NULL };
@@ -4526,14 +4524,16 @@ xfs_bmapi_convert_delalloc(
return error;
xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
error = xfs_iext_count_may_overflow(ip, whichfork,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
- xfs_trans_ijoin(tp, ip, 0);
-
if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
bma.got.br_startoff > offset_fsb) {
/*
@@ -4560,7 +4560,8 @@ xfs_bmapi_convert_delalloc(
bma.ip = ip;
bma.wasdel = true;
bma.offset = bma.got.br_startoff;
- bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+ bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount,
+ XFS_MAX_BMBT_EXTLEN);
bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
/*
@@ -4629,7 +4630,7 @@ xfs_bmapi_remap(
xfs_fileoff_t bno,
xfs_filblks_t len,
xfs_fsblock_t startblock,
- int flags)
+ uint32_t flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
@@ -4639,9 +4640,9 @@ xfs_bmapi_remap(
int whichfork = xfs_bmapi_whichfork(flags);
int logflags = 0, error;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(len > 0);
- ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
+ ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
XFS_BMAPI_NORMAP)));
@@ -4796,12 +4797,12 @@ xfs_bmap_del_extent_delay(
struct xfs_bmbt_irec *del)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec new;
int64_t da_old, da_new, da_diff = 0;
xfs_fileoff_t del_endoff, got_endoff;
xfs_filblks_t got_indlen, new_indlen, stolen;
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
int error = 0;
bool isrt;
@@ -4923,10 +4924,10 @@ xfs_bmap_del_extent_cow(
struct xfs_bmbt_irec *del)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
struct xfs_bmbt_irec new;
xfs_fileoff_t del_endoff, got_endoff;
- int state = BMAP_COWFORK;
+ uint32_t state = BMAP_COWFORK;
XFS_STATS_INC(mp, xs_del_exlist);
@@ -4999,7 +5000,7 @@ xfs_bmap_del_extent_real(
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
int whichfork, /* data or attr fork */
- int bflags) /* bmapi flags */
+ uint32_t bflags) /* bmapi flags */
{
xfs_fsblock_t del_endblock=0; /* first block past del */
xfs_fileoff_t del_endoff; /* first offset past del */
@@ -5015,13 +5016,13 @@ xfs_bmap_del_extent_real(
xfs_bmbt_irec_t new; /* new record to be inserted */
/* REFERENCED */
uint qfield; /* quota field to update */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_bmbt_irec old;
mp = ip->i_mount;
XFS_STATS_INC(mp, xs_del_exlist);
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(del->br_blockcount > 0);
xfs_iext_get_extent(ifp, icur, &got);
ASSERT(got.br_startoff <= del->br_startoff);
@@ -5148,26 +5149,6 @@ xfs_bmap_del_extent_real(
* Deleting the middle of the extent.
*/
- /*
- * For directories, -ENOSPC is returned since a directory entry
- * remove operation must not fail due to low extent count
- * availability. -ENOSPC will be handled by higher layers of XFS
- * by letting the corresponding empty Data/Free blocks to linger
- * until a future remove operation. Dabtree blocks would be
- * swapped with the last block in the leaf space and then the
- * new last block will be unmapped.
- *
- * The above logic also applies to the source directory entry of
- * a rename operation.
- */
- error = xfs_iext_count_may_overflow(ip, whichfork, 1);
- if (error) {
- ASSERT(S_ISDIR(VFS_I(ip)->i_mode) &&
- whichfork == XFS_DATA_FORK);
- error = -ENOSPC;
- goto done;
- }
-
old = got;
got.br_blockcount = del->br_startoff - got.br_startoff;
@@ -5281,7 +5262,7 @@ __xfs_bunmapi(
struct xfs_inode *ip, /* incore inode */
xfs_fileoff_t start, /* first file offset deleted */
xfs_filblks_t *rlen, /* i/o: amount remaining */
- int flags, /* misc flags */
+ uint32_t flags, /* misc flags */
xfs_extnum_t nexts) /* number of extents max */
{
struct xfs_btree_cur *cur; /* bmap btree cursor */
@@ -5299,7 +5280,6 @@ __xfs_bunmapi(
int whichfork; /* data or attribute fork */
xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
- xfs_fileoff_t max_len;
xfs_fileoff_t end;
struct xfs_iext_cursor icur;
bool done = false;
@@ -5308,7 +5288,7 @@ __xfs_bunmapi(
whichfork = xfs_bmapi_whichfork(flags);
ASSERT(whichfork != XFS_COW_FORK);
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)))
return -EFSCORRUPTED;
if (xfs_is_shutdown(mp))
@@ -5318,16 +5298,6 @@ __xfs_bunmapi(
ASSERT(len > 0);
ASSERT(nexts >= 0);
- /*
- * Guesstimate how many blocks we can unmap without running the risk of
- * blowing out the transaction with a mix of EFIs and reflink
- * adjustments.
- */
- if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
- max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
- else
- max_len = len;
-
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
@@ -5366,7 +5336,7 @@ __xfs_bunmapi(
extno = 0;
while (end != (xfs_fileoff_t)-1 && end >= start &&
- (nexts == 0 || extno < nexts) && max_len > 0) {
+ (nexts == 0 || extno < nexts)) {
/*
* Is the found extent after a hole in which end lives?
* Just back up to the previous extent, if so.
@@ -5400,14 +5370,6 @@ __xfs_bunmapi(
if (del.br_startoff + del.br_blockcount > end + 1)
del.br_blockcount = end + 1 - del.br_startoff;
- /* How much can we safely unmap? */
- if (max_len < del.br_blockcount) {
- del.br_startoff += del.br_blockcount - max_len;
- if (!wasdel)
- del.br_startblock += del.br_blockcount - max_len;
- del.br_blockcount = max_len;
- }
-
if (!isrt)
goto delete;
@@ -5543,7 +5505,6 @@ delete:
if (error)
goto error0;
- max_len -= del.br_blockcount;
end = del.br_startoff - 1;
nodelete:
/*
@@ -5609,7 +5570,7 @@ xfs_bunmapi(
struct xfs_inode *ip,
xfs_fileoff_t bno,
xfs_filblks_t len,
- int flags,
+ uint32_t flags,
xfs_extnum_t nexts,
int *done)
{
@@ -5641,7 +5602,7 @@ xfs_bmse_can_merge(
if ((left->br_startoff + left->br_blockcount != startoff) ||
(left->br_startblock + left->br_blockcount != got->br_startblock) ||
(left->br_state != got->br_state) ||
- (left->br_blockcount + got->br_blockcount > MAXEXTLEN))
+ (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN))
return false;
return true;
@@ -5668,7 +5629,7 @@ xfs_bmse_merge(
struct xfs_btree_cur *cur,
int *logflags) /* output */
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec new;
xfs_filblks_t blockcount;
int error, i;
@@ -5789,7 +5750,7 @@ xfs_bmap_collapse_extents(
{
int whichfork = XFS_DATA_FORK;
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur = NULL;
struct xfs_bmbt_irec got, prev;
struct xfs_iext_cursor icur;
@@ -5904,7 +5865,7 @@ xfs_bmap_insert_extents(
{
int whichfork = XFS_DATA_FORK;
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur = NULL;
struct xfs_bmbt_irec got, next;
struct xfs_iext_cursor icur;
@@ -6004,7 +5965,7 @@ xfs_bmap_split_extent(
xfs_fileoff_t split_fsb)
{
int whichfork = XFS_DATA_FORK;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur = NULL;
struct xfs_bmbt_irec got;
struct xfs_bmbt_irec new; /* split extent */
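Before the header changes, note the reworked flow in the xfs_bmapi_convert_delalloc() hunk above: the inode is joined to the transaction first, and an -EFBIG from the extent-count check is retried through an upgrade to large extent counters rather than failing outright. The shape of that fallback, sketched with stub functions (the names are illustrative, not the kernel API):

    #include <errno.h>

    static int count_may_overflow(void);  /* stub: returns 0 or -EFBIG */
    static int count_upgrade(void);       /* stub: returns 0 on success */

    static int reserve_extent_records(void)
    {
        int error = count_may_overflow();

        /* On -EFBIG, try the one-way counter upgrade instead of failing. */
        if (error == -EFBIG)
            error = count_upgrade();
        return error;
    }

    static int count_may_overflow(void) { return -EFBIG; }
    static int count_upgrade(void) { return 0; }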
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 03d9aaf87413..16db95b11589 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -39,7 +39,7 @@ struct xfs_bmalloca {
bool aeof; /* allocated space at eof */
bool conv; /* overwriting unwritten extents */
int datatype;/* data type being allocated */
- int flags;
+ uint32_t flags;
};
#define XFS_BMAP_MAX_NMAP 4
@@ -47,17 +47,17 @@ struct xfs_bmalloca {
/*
* Flags for xfs_bmapi_*
*/
-#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */
-#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */
-#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */
-#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */
-#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */
+#define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */
+#define XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */
+#define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */
/*
* unwritten extent conversion - this needs write cache flushing and no additional
* allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
* from written to unwritten, otherwise it converts from unwritten to written.
*/
-#define XFS_BMAPI_CONVERT 0x040
+#define XFS_BMAPI_CONVERT (1u << 5)
/*
* allocate zeroed extents - this requires all newly allocated user data extents
@@ -65,7 +65,7 @@ struct xfs_bmalloca {
* Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
* during the allocation range to zeroed written extents.
*/
-#define XFS_BMAPI_ZERO 0x080
+#define XFS_BMAPI_ZERO (1u << 6)
/*
* Map the inode offset to the block given in ap->firstblock. Primarily
@@ -75,16 +75,16 @@ struct xfs_bmalloca {
* For bunmapi, this flag unmaps the range without adjusting quota, reducing
* refcount, or freeing the blocks.
*/
-#define XFS_BMAPI_REMAP 0x100
+#define XFS_BMAPI_REMAP (1u << 7)
/* Map something in the CoW fork. */
-#define XFS_BMAPI_COWFORK 0x200
+#define XFS_BMAPI_COWFORK (1u << 8)
/* Skip online discard of freed extents */
-#define XFS_BMAPI_NODISCARD 0x1000
+#define XFS_BMAPI_NODISCARD (1u << 9)
/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
-#define XFS_BMAPI_NORMAP 0x2000
+#define XFS_BMAPI_NORMAP (1u << 10)
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
@@ -106,7 +106,7 @@ static inline int xfs_bmapi_aflag(int w)
(w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0));
}
-static inline int xfs_bmapi_whichfork(int bmapi_flags)
+static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags)
{
if (bmapi_flags & XFS_BMAPI_COWFORK)
return XFS_COW_FORK;
@@ -124,16 +124,16 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
/*
* Flags for xfs_bmap_add_extent*.
*/
-#define BMAP_LEFT_CONTIG (1 << 0)
-#define BMAP_RIGHT_CONTIG (1 << 1)
-#define BMAP_LEFT_FILLING (1 << 2)
-#define BMAP_RIGHT_FILLING (1 << 3)
-#define BMAP_LEFT_DELAY (1 << 4)
-#define BMAP_RIGHT_DELAY (1 << 5)
-#define BMAP_LEFT_VALID (1 << 6)
-#define BMAP_RIGHT_VALID (1 << 7)
-#define BMAP_ATTRFORK (1 << 8)
-#define BMAP_COWFORK (1 << 9)
+#define BMAP_LEFT_CONTIG (1u << 0)
+#define BMAP_RIGHT_CONTIG (1u << 1)
+#define BMAP_LEFT_FILLING (1u << 2)
+#define BMAP_RIGHT_FILLING (1u << 3)
+#define BMAP_LEFT_DELAY (1u << 4)
+#define BMAP_RIGHT_DELAY (1u << 5)
+#define BMAP_LEFT_VALID (1u << 6)
+#define BMAP_RIGHT_VALID (1u << 7)
+#define BMAP_ATTRFORK (1u << 8)
+#define BMAP_COWFORK (1u << 9)
#define XFS_BMAP_EXT_FLAGS \
{ BMAP_LEFT_CONTIG, "LC" }, \
@@ -183,15 +183,15 @@ int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
int whichfork);
int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
xfs_filblks_t len, struct xfs_bmbt_irec *mval,
- int *nmap, int flags);
+ int *nmap, uint32_t flags);
int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap);
int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags,
xfs_extnum_t nexts);
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extnum_t nexts, int *done);
int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
@@ -243,7 +243,7 @@ void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_bmbt_irec *imap);
-static inline int xfs_bmap_fork_to_state(int whichfork)
+static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
{
switch (whichfork) {
case XFS_ATTR_FORK:
@@ -260,7 +260,7 @@ xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
- int flags);
+ uint32_t flags);
extern struct kmem_cache *xfs_bmap_intent_cache;
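A note on the flag churn in this header: every mask moves from plain hex or int-typed (1 << n) literals to (1u << n), and every flags parameter widens to uint32_t. With a signed int, shifting 1 into bit 31 is undefined behaviour, and a signed flag word sign-extends when widened; unsigned literals avoid both hazards. A self-contained illustration (the EXAMPLE_* names are not kernel flags):

    #include <stdint.h>

    #define EXAMPLE_LOW  (1u << 0)
    #define EXAMPLE_HIGH (1u << 31)  /* well-defined: unsigned shift */

    static int example_has_high(uint32_t flags)
    {
        /* No sign-extension surprises when flags is widened or compared. */
        return (flags & EXAMPLE_HIGH) != 0;
    }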
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 453309fc85f2..cfa052d40105 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -304,7 +304,7 @@ xfs_bmbt_get_minrecs(
if (level == cur->bc_nlevels - 1) {
struct xfs_ifork *ifp;
- ifp = XFS_IFORK_PTR(cur->bc_ino.ip,
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip,
cur->bc_ino.whichfork);
return xfs_bmbt_maxrecs(cur->bc_mp,
@@ -322,7 +322,7 @@ xfs_bmbt_get_maxrecs(
if (level == cur->bc_nlevels - 1) {
struct xfs_ifork *ifp;
- ifp = XFS_IFORK_PTR(cur->bc_ino.ip,
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip,
cur->bc_ino.whichfork);
return xfs_bmbt_maxrecs(cur->bc_mp,
@@ -550,7 +550,7 @@ xfs_bmbt_init_cursor(
struct xfs_inode *ip, /* inode owning the btree */
int whichfork) /* data or attr fork */
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur;
ASSERT(whichfork != XFS_COW_FORK);
@@ -564,7 +564,7 @@ xfs_bmbt_init_cursor(
if (xfs_has_crc(mp))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork);
+ cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
cur->bc_ino.ip = ip;
cur->bc_ino.allocated = 0;
cur->bc_ino.flags = 0;
@@ -597,7 +597,11 @@ xfs_bmbt_maxrecs(
return xfs_bmbt_block_maxrecs(blocklen, leaf);
}
-/* Compute the max possible height for block mapping btrees. */
+/*
+ * Calculate the maximum possible height of the btree that the on-disk format
+ * supports. This is used for sizing structures large enough to support every
+ * possible configuration of a filesystem that might get mounted.
+ */
unsigned int
xfs_bmbt_maxlevels_ondisk(void)
{
@@ -611,7 +615,8 @@ xfs_bmbt_maxlevels_ondisk(void)
minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;
/* One extra level for the inode root. */
- return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1;
+ return xfs_btree_compute_maxlevels(minrecs,
+ XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1;
}
/*
@@ -659,7 +664,7 @@ xfs_bmbt_change_owner(
ASSERT(tp || buffer_list);
ASSERT(!(tp && buffer_list));
- ASSERT(XFS_IFORK_PTR(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE);
+ ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c1500b238520..4c16c8c31fcb 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -52,6 +52,70 @@ xfs_btree_magic(
}
/*
+ * These sibling pointer checks are optimised for null sibling pointers. This
+ * happens a lot, and we don't need to byte swap at runtime if the sibling
+ * pointer is NULL.
+ *
+ * These are explicitly marked as inline because the cost of calling them as
+ * functions instead of inlining them is about 36 bytes extra code per call site
+ * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these
+ * two sibling check functions reduces the compiled code size by over 300
+ * bytes.
+ */
+static inline xfs_failaddr_t
+xfs_btree_check_lblock_siblings(
+ struct xfs_mount *mp,
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_fsblock_t fsb,
+ __be64 dsibling)
+{
+ xfs_fsblock_t sibling;
+
+ if (dsibling == cpu_to_be64(NULLFSBLOCK))
+ return NULL;
+
+ sibling = be64_to_cpu(dsibling);
+ if (sibling == fsb)
+ return __this_address;
+ if (level >= 0) {
+ if (!xfs_btree_check_lptr(cur, sibling, level + 1))
+ return __this_address;
+ } else {
+ if (!xfs_verify_fsbno(mp, sibling))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static inline xfs_failaddr_t
+xfs_btree_check_sblock_siblings(
+ struct xfs_perag *pag,
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_agblock_t agbno,
+ __be32 dsibling)
+{
+ xfs_agblock_t sibling;
+
+ if (dsibling == cpu_to_be32(NULLAGBLOCK))
+ return NULL;
+
+ sibling = be32_to_cpu(dsibling);
+ if (sibling == agbno)
+ return __this_address;
+ if (level >= 0) {
+ if (!xfs_btree_check_sptr(cur, sibling, level + 1))
+ return __this_address;
+ } else {
+ if (!xfs_verify_agbno(pag, sibling))
+ return __this_address;
+ }
+ return NULL;
+}
+
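The fast path the comment block above describes hinges on comparing the raw on-disk (big-endian) sibling pointer against the pre-encoded NULL sentinel, so the common NULL case costs no byte swap. A userspace sketch with glibc endian helpers standing in for cpu_to_be64()/be64_to_cpu(), and NULLFSBLOCK stubbed as all-ones:

    #include <stdint.h>
    #include <endian.h>  /* htobe64(), be64toh() */

    #define SKETCH_NULLFSBLOCK ((uint64_t)-1)

    static inline int sibling_check(uint64_t disk_sib, uint64_t own_fsb)
    {
        if (disk_sib == htobe64(SKETCH_NULLFSBLOCK))
            return 0;       /* null sibling: no byte swap needed */
        if (be64toh(disk_sib) == own_fsb)
            return -1;      /* sibling points at itself: corrupt */
        return 0;
    }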
+/*
* Check a long btree block header. Return the address of the failing check,
* or NULL if everything is ok.
*/
@@ -65,6 +129,8 @@ __xfs_btree_check_lblock(
struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum;
int crc = xfs_has_crc(mp);
+ xfs_failaddr_t fa;
+ xfs_fsblock_t fsb = NULLFSBLOCK;
if (crc) {
if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
@@ -83,16 +149,16 @@ __xfs_btree_check_lblock(
if (be16_to_cpu(block->bb_numrecs) >
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
- level + 1))
- return __this_address;
- if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
- level + 1))
- return __this_address;
- return NULL;
+ if (bp)
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+
+ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ block->bb_u.l.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ block->bb_u.l.bb_rightsib);
+ return fa;
}
/* Check a long btree block header. */
@@ -128,8 +194,11 @@ __xfs_btree_check_sblock(
struct xfs_buf *bp)
{
struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_perag *pag = cur->bc_ag.pag;
xfs_btnum_t btnum = cur->bc_btnum;
int crc = xfs_has_crc(mp);
+ xfs_failaddr_t fa;
+ xfs_agblock_t agbno = NULLAGBLOCK;
if (crc) {
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
@@ -146,16 +215,16 @@ __xfs_btree_check_sblock(
if (be16_to_cpu(block->bb_numrecs) >
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
- level + 1))
- return __this_address;
- if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
- level + 1))
- return __this_address;
- return NULL;
+ if (bp)
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+
+ fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ block->bb_u.s.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+ block->bb_u.s.bb_rightsib);
+ return fa;
}
/* Check a short btree block header. */
@@ -216,7 +285,7 @@ xfs_btree_check_sptr(
{
if (level <= 0)
return false;
- return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno);
+ return xfs_verify_agbno(cur->bc_ag.pag, agbno);
}
/*
@@ -373,8 +442,14 @@ xfs_btree_del_cursor(
break;
}
+ /*
+ * If we are doing a BMBT update, the number of unaccounted blocks
+ * allocated during this cursor lifetime should be zero. If it's not
+ * zero, then we should be shut down or on our way to shutdown due to
+ * cancelling a dirty transaction on error.
+ */
ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
- xfs_is_shutdown(cur->bc_mp));
+ xfs_is_shutdown(cur->bc_mp) || error != 0);
if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
kmem_free(cur->bc_ops);
if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
@@ -647,7 +722,7 @@ xfs_btree_ifork_ptr(
if (cur->bc_flags & XFS_BTREE_STAGING)
return cur->bc_ino.ifake->if_fork;
- return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork);
+ return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork);
}
/*
@@ -751,20 +826,20 @@ xfs_btree_lastrec(
*/
void
xfs_btree_offsets(
- int64_t fields, /* bitmask of fields */
+ uint32_t fields, /* bitmask of fields */
const short *offsets, /* table of field offsets */
int nbits, /* number of bits to inspect */
int *first, /* output: first byte offset */
int *last) /* output: last byte offset */
{
int i; /* current bit number */
- int64_t imask; /* mask for current bit number */
+ uint32_t imask; /* mask for current bit number */
ASSERT(fields != 0);
/*
* Find the lowest bit, so the first byte offset.
*/
- for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
+ for (i = 0, imask = 1u; ; i++, imask <<= 1) {
if (imask & fields) {
*first = offsets[i];
break;
@@ -773,7 +848,7 @@ xfs_btree_offsets(
/*
* Find the highest bit, so the last byte offset.
*/
- for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
+ for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
if (imask & fields) {
*last = offsets[i + 1] - 1;
break;
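xfs_btree_offsets() in the two hunks above turns a field bitmask into a contiguous byte range by scanning for the lowest and highest set bits against an offset table. A standalone sketch of the same computation; the table needs nbits + 1 entries so that offsets[i + 1] - 1 is the last byte of field i, and the caller must pass a non-zero mask (which the kernel ASSERTs):

    #include <stdint.h>

    static void field_byte_range(uint32_t fields, const short *offsets,
                                 int nbits, int *first, int *last)
    {
        uint32_t imask;
        int i;

        /* Lowest set bit gives the first logged byte. */
        for (i = 0, imask = 1u; i < nbits; i++, imask <<= 1) {
            if (imask & fields) {
                *first = offsets[i];
                break;
            }
        }
        /* Highest set bit gives the last logged byte. */
        for (i = nbits - 1, imask = 1u << (nbits - 1); i >= 0;
             i--, imask >>= 1) {
            if (imask & fields) {
                *last = offsets[i + 1] - 1;
                break;
            }
        }
    }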
@@ -1456,7 +1531,7 @@ void
xfs_btree_log_block(
struct xfs_btree_cur *cur, /* btree cursor */
struct xfs_buf *bp, /* buffer containing btree block */
- int fields) /* mask of fields: XFS_BB_... */
+ uint32_t fields) /* mask of fields: XFS_BB_... */
{
int first; /* first byte offset logged */
int last; /* last byte offset logged */
@@ -3194,7 +3269,7 @@ xfs_btree_insrec(
struct xfs_btree_block *block; /* btree block */
struct xfs_buf *bp; /* buffer for block */
union xfs_btree_ptr nptr; /* new block ptr */
- struct xfs_btree_cur *ncur; /* new btree cursor */
+ struct xfs_btree_cur *ncur = NULL; /* new btree cursor */
union xfs_btree_key nkey; /* new block key */
union xfs_btree_key *lkey;
int optr; /* old key/record index */
@@ -3274,7 +3349,7 @@ xfs_btree_insrec(
#ifdef DEBUG
error = xfs_btree_check_block(cur, block, level, bp);
if (error)
- return error;
+ goto error0;
#endif
/*
@@ -3294,7 +3369,7 @@ xfs_btree_insrec(
for (i = numrecs - ptr; i >= 0; i--) {
error = xfs_btree_debug_check_ptr(cur, pp, i, level);
if (error)
- return error;
+ goto error0;
}
xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
@@ -3379,6 +3454,8 @@ xfs_btree_insrec(
return 0;
error0:
+ if (ncur)
+ xfs_btree_del_cursor(ncur, error);
return error;
}
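The xfs_btree_insrec() hunks above fix a cursor leak: ncur now starts as NULL, the DEBUG-only failures jump to error0 instead of returning directly, and error0 frees the new cursor if one was created. The pattern in miniature (cursor type and allocator are stand-ins):

    #include <stdlib.h>

    struct cursor { int level; };

    static int do_insert(int fail_mid)
    {
        struct cursor *ncur = NULL;  /* only sometimes created below */
        int error = 0;

        if (fail_mid) {
            error = -1;
            goto error0;             /* safe: ncur is still NULL */
        }
        ncur = calloc(1, sizeof(*ncur));
        if (!ncur) {
            error = -2;
            goto error0;
        }
        /* ... more work; any failure also jumps to error0 ... */
        free(ncur);
        return 0;

    error0:
        if (ncur)
            free(ncur);              /* single cleanup point */
        return error;
    }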
@@ -3479,7 +3556,7 @@ xfs_btree_kill_iroot(
{
int whichfork = cur->bc_ino.whichfork;
struct xfs_inode *ip = cur->bc_ino.ip;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_block *block;
struct xfs_btree_block *cblock;
union xfs_btree_key *kp;
@@ -4271,6 +4348,21 @@ xfs_btree_visit_block(
if (xfs_btree_ptr_is_null(cur, &rptr))
return -ENOENT;
+ /*
+ * We only visit blocks once in this walk, so we have to avoid the
+ * internal xfs_btree_lookup_get_block() optimisation where it will
+ * return the same block without checking whether the right sibling points
+ * back to us, which would create a cyclic reference in the btree.
+ */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(bp)))
+ return -EFSCORRUPTED;
+ } else {
+ if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
+ xfs_buf_daddr(bp)))
+ return -EFSCORRUPTED;
+ }
return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
}
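The guard added to xfs_btree_visit_block() above only has to catch the degenerate cycle where a block's right sibling resolves to the block itself; the once-per-block walk would otherwise spin forever on such corruption. A schematic of the walk with that check (longer cycles are left to the sibling verifiers elsewhere in this patch):

    #include <stdint.h>

    #define SKETCH_NULLBLOCK ((uint64_t)-1)

    /* right_of() maps a block number to its right sibling (assumed). */
    static int walk_level(uint64_t start, uint64_t (*right_of)(uint64_t))
    {
        uint64_t cur = start;

        while (cur != SKETCH_NULLBLOCK) {
            uint64_t next = right_of(cur);

            if (next == cur)
                return -1;  /* self-referencing sibling: corrupt */
            cur = next;
        }
        return 0;
    }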
@@ -4445,20 +4537,21 @@ xfs_btree_lblock_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_fsblock_t fsb;
+ xfs_failaddr_t fa;
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
- if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
- return __this_address;
- if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
- return __this_address;
-
- return NULL;
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ block->bb_u.l.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ block->bb_u.l.bb_rightsib);
+ return fa;
}
/**
@@ -4499,22 +4592,21 @@ xfs_btree_sblock_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- xfs_agblock_t agno;
+ xfs_agblock_t agbno;
+ xfs_failaddr_t fa;
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
- agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
- if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
- return __this_address;
- if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
- return __this_address;
-
- return NULL;
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+ block->bb_u.s.bb_leftsib);
+ if (!fa)
+ fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+ block->bb_u.s.bb_rightsib);
+ return fa;
}
/*
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 22d9f411fde6..eef27858a013 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -68,19 +68,19 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
/*
* For logging record fields.
*/
-#define XFS_BB_MAGIC (1 << 0)
-#define XFS_BB_LEVEL (1 << 1)
-#define XFS_BB_NUMRECS (1 << 2)
-#define XFS_BB_LEFTSIB (1 << 3)
-#define XFS_BB_RIGHTSIB (1 << 4)
-#define XFS_BB_BLKNO (1 << 5)
-#define XFS_BB_LSN (1 << 6)
-#define XFS_BB_UUID (1 << 7)
-#define XFS_BB_OWNER (1 << 8)
+#define XFS_BB_MAGIC (1u << 0)
+#define XFS_BB_LEVEL (1u << 1)
+#define XFS_BB_NUMRECS (1u << 2)
+#define XFS_BB_LEFTSIB (1u << 3)
+#define XFS_BB_RIGHTSIB (1u << 4)
+#define XFS_BB_BLKNO (1u << 5)
+#define XFS_BB_LSN (1u << 6)
+#define XFS_BB_UUID (1u << 7)
+#define XFS_BB_OWNER (1u << 8)
#define XFS_BB_NUM_BITS 5
-#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
+#define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1)
#define XFS_BB_NUM_BITS_CRC 9
-#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1)
+#define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1)
/*
* Generic stats interface
@@ -345,7 +345,7 @@ xfs_btree_dup_cursor(
*/
void
xfs_btree_offsets(
- int64_t fields, /* bitmask of fields */
+ uint32_t fields, /* bitmask of fields */
const short *offsets,/* table of field offsets */
int nbits, /* number of bits to inspect */
int *first, /* output: first byte offset */
@@ -435,7 +435,7 @@ bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
/*
* Internal btree helpers also used by xfs_bmap.c.
*/
-void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t);
void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
/*
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 9dc1ecb9713d..e7201dc68f43 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -22,6 +22,7 @@
#include "xfs_trace.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_errortag.h"
/*
* xfs_da_btree.c
@@ -116,6 +117,17 @@ xfs_da_state_free(xfs_da_state_t *state)
kmem_cache_free(xfs_da_state_cache, state);
}
+void
+xfs_da_state_reset(
+ struct xfs_da_state *state,
+ struct xfs_da_args *args)
+{
+ xfs_da_state_kill_altpath(state);
+ memset(state, 0, sizeof(struct xfs_da_state));
+ state->args = args;
+ state->mp = state->args->dp->i_mount;
+}
+
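xfs_da_state_reset() above lets a caller reuse one state allocation across retries instead of freeing and reallocating: kill the alternate path, zero everything, rebind the args. Roughly, with stand-in types:

    #include <string.h>

    struct sketch_args  { int op; };
    struct sketch_state { struct sketch_args *args; int altpath_live; };

    static void kill_altpath(struct sketch_state *s)
    {
        s->altpath_live = 0;             /* release secondary-path buffers */
    }

    static void state_reset(struct sketch_state *state,
                            struct sketch_args *args)
    {
        kill_altpath(state);
        memset(state, 0, sizeof(*state)); /* back to pristine */
        state->args = args;               /* rebind to the operation */
    }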
static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork)
{
if (whichfork == XFS_DATA_FORK)
@@ -482,6 +494,9 @@ xfs_da3_split(
trace_xfs_da_split(state->args);
+ if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
+ return -EIO;
+
/*
* Walk back up the tree splitting/inserting/adjusting as necessary.
* If we need to insert and there isn't room, split the node, then
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 0faf7d9ac241..ffa3df5b2893 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -30,6 +30,7 @@ struct xfs_da_geometry {
unsigned int free_hdr_size; /* dir2 free header size */
unsigned int free_max_bests; /* # of bests entries in dir2 free */
xfs_dablk_t freeblk; /* blockno of free data v2 */
+ xfs_extnum_t max_extents; /* Max. extents in corresponding fork */
xfs_dir2_data_aoff_t data_first_offset;
size_t data_entry_offset;
@@ -76,27 +77,33 @@ typedef struct xfs_da_args {
xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
int rmtblkcnt2; /* remote attr value block count */
int rmtvaluelen2; /* remote attr value length in bytes */
- int op_flags; /* operation flags */
+ uint32_t op_flags; /* operation flags */
enum xfs_dacmp cmpresult; /* name compare result for lookups */
} xfs_da_args_t;
/*
* Operation flags:
*/
-#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */
-#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */
-#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */
-#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
-#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
-#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */
+#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */
+#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */
+#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */
+#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */
+#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */
+#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */
+#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */
+#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
- { XFS_DA_OP_RENAME, "RENAME" }, \
+ { XFS_DA_OP_REPLACE, "REPLACE" }, \
{ XFS_DA_OP_ADDNAME, "ADDNAME" }, \
{ XFS_DA_OP_OKNOENT, "OKNOENT" }, \
{ XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
- { XFS_DA_OP_NOTIME, "NOTIME" }
+ { XFS_DA_OP_NOTIME, "NOTIME" }, \
+ { XFS_DA_OP_REMOVE, "REMOVE" }, \
+ { XFS_DA_OP_RECOVERY, "RECOVERY" }, \
+ { XFS_DA_OP_LOGGED, "LOGGED" }
/*
* Storage for holding state during Btree searches and split/join ops.
@@ -197,7 +204,7 @@ int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp,
* Utility routines.
*/
-#define XFS_DABUF_MAP_HOLE_OK (1 << 0)
+#define XFS_DABUF_MAP_HOLE_OK (1u << 0)
int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
@@ -220,6 +227,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args);
void xfs_da_state_free(xfs_da_state_t *state);
+void xfs_da_state_reset(struct xfs_da_state *state, struct xfs_da_args *args);
void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from);
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 5a49caa5c9df..25e2841084e1 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
* Directory address space divided into sections,
* spaces separated by 32GB.
*/
+#define XFS_DIR2_MAX_SPACES 3
#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
#define XFS_DIR2_DATA_SPACE 0
#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
@@ -688,10 +689,10 @@ struct xfs_attr3_leafblock {
#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
-#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
-#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
-#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
-#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
+#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT)
#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
/*
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 0805ade2d300..5a321b783398 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -22,6 +22,10 @@
#include "xfs_refcount.h"
#include "xfs_bmap.h"
#include "xfs_alloc.h"
+#include "xfs_buf.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
static struct kmem_cache *xfs_defer_pending_cache;
@@ -184,36 +188,61 @@ static const struct xfs_defer_op_type *defer_op_types[] = {
[XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type,
[XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type,
[XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
+ [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type,
};
-static void
+/*
+ * Ensure there's a log intent item associated with this deferred work item if
+ * the operation must be restarted on crash. Returns 1 if there's a log item;
+ * 0 if there isn't; or a negative errno.
+ */
+static int
xfs_defer_create_intent(
struct xfs_trans *tp,
struct xfs_defer_pending *dfp,
bool sort)
{
const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+ struct xfs_log_item *lip;
- if (!dfp->dfp_intent)
- dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
- dfp->dfp_count, sort);
+ if (dfp->dfp_intent)
+ return 1;
+
+ lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort);
+ if (!lip)
+ return 0;
+ if (IS_ERR(lip))
+ return PTR_ERR(lip);
+
+ dfp->dfp_intent = lip;
+ return 1;
}
/*
* For each pending item in the intake list, log its intent item and the
* associated extents, then add the entire intake list to the end of
* the pending list.
+ *
+ * Returns 1 if at least one log item was associated with the deferred work;
+ * 0 if there are no log items; or a negative errno.
*/
-STATIC void
+static int
xfs_defer_create_intents(
struct xfs_trans *tp)
{
struct xfs_defer_pending *dfp;
+ int ret = 0;
list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
+ int ret2;
+
trace_xfs_defer_create_intent(tp->t_mountp, dfp);
- xfs_defer_create_intent(tp, dfp, true);
+ ret2 = xfs_defer_create_intent(tp, dfp, true);
+ if (ret2 < 0)
+ return ret2;
+ ret |= ret2;
}
+ return ret;
}
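The new return convention threads through both functions above: each item reports 1 (intent logged), 0 (no intent needed), or a negative errno, and the loop ORs the positive results so the caller learns whether anything at all was logged. In isolation:

    static int create_one(int i)
    {
        /* stub: pretend only even-indexed items log an intent */
        return (i % 2) == 0 ? 1 : 0;
    }

    static int create_all(int n)
    {
        int ret = 0;

        for (int i = 0; i < n; i++) {
            int ret2 = create_one(i);

            if (ret2 < 0)
                return ret2;  /* first failure wins */
            ret |= ret2;      /* remember any logged intent */
        }
        return ret;           /* 1 if anything was logged */
    }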
/* Abort all the intents that were committed. */
@@ -449,6 +478,8 @@ xfs_defer_finish_one(
dfp->dfp_count--;
error = ops->finish_item(tp, dfp->dfp_done, li, &state);
if (error == -EAGAIN) {
+ int ret;
+
/*
* Caller wants a fresh transaction; put the work item
* back on the list and log a new log intent item to
@@ -459,7 +490,9 @@ xfs_defer_finish_one(
dfp->dfp_count++;
dfp->dfp_done = NULL;
dfp->dfp_intent = NULL;
- xfs_defer_create_intent(tp, dfp, false);
+ ret = xfs_defer_create_intent(tp, dfp, false);
+ if (ret < 0)
+ error = ret;
}
if (error)
@@ -487,7 +520,7 @@ int
xfs_defer_finish_noroll(
struct xfs_trans **tp)
{
- struct xfs_defer_pending *dfp;
+ struct xfs_defer_pending *dfp = NULL;
int error = 0;
LIST_HEAD(dop_pending);
@@ -506,17 +539,24 @@ xfs_defer_finish_noroll(
* of time that any one intent item can stick around in memory,
* pinning the log tail.
*/
- xfs_defer_create_intents(*tp);
- list_splice_init(&(*tp)->t_dfops, &dop_pending);
+ int has_intents = xfs_defer_create_intents(*tp);
- error = xfs_defer_trans_roll(tp);
- if (error)
- goto out_shutdown;
+ list_splice_init(&(*tp)->t_dfops, &dop_pending);
- /* Possibly relog intent items to keep the log moving. */
- error = xfs_defer_relog(tp, &dop_pending);
- if (error)
+ if (has_intents < 0) {
+ error = has_intents;
goto out_shutdown;
+ }
+ if (has_intents || dfp) {
+ error = xfs_defer_trans_roll(tp);
+ if (error)
+ goto out_shutdown;
+
+ /* Relog intent items to keep the log moving. */
+ error = xfs_defer_relog(tp, &dop_pending);
+ if (error)
+ goto out_shutdown;
+ }
dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
dfp_list);
@@ -665,13 +705,15 @@ xfs_defer_ops_capture(
if (list_empty(&tp->t_dfops))
return NULL;
+ error = xfs_defer_create_intents(tp);
+ if (error < 0)
+ return ERR_PTR(error);
+
/* Create an object to capture the defer ops. */
dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
INIT_LIST_HEAD(&dfc->dfc_list);
INIT_LIST_HEAD(&dfc->dfc_dfops);
- xfs_defer_create_intents(tp);
-
/* Move the dfops chain and transaction state to the capture struct. */
list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
@@ -748,6 +790,10 @@ xfs_defer_ops_capture_and_commit(
/* If we don't capture anything, commit transaction and exit. */
dfc = xfs_defer_ops_capture(tp);
+ if (IS_ERR(dfc)) {
+ xfs_trans_cancel(tp);
+ return PTR_ERR(dfc);
+ }
if (!dfc)
return xfs_trans_commit(tp);
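Since xfs_defer_ops_capture() can now fail, it reports through the kernel's ERR_PTR convention: one pointer value carries either a valid object, NULL for "nothing to capture", or an encoded errno. A userspace re-creation of the helpers and the three-way dispatch above:

    #include <stdint.h>
    #include <stddef.h>

    #define MAX_ERRNO 4095

    static void *err_ptr(long error) { return (void *)(intptr_t)error; }
    static long ptr_err(const void *p) { return (long)(intptr_t)p; }
    static int is_err(const void *p)
    {
        return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
    }

    struct capture { int n; };

    static struct capture *try_capture(int fail)
    {
        if (fail)
            return err_ptr(-12);  /* errno travels in the pointer */
        return NULL;              /* nothing to capture */
    }

    static int capture_and_commit(int fail)
    {
        struct capture *dfc = try_capture(fail);

        if (is_err(dfc))
            return ptr_err(dfc);  /* cancel-the-transaction path */
        if (!dfc)
            return 0;             /* plain commit */
        /* ... stash dfc for log recovery, then commit ... */
        return 0;
    }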
@@ -774,17 +820,25 @@ xfs_defer_ops_continue(
struct xfs_trans *tp,
struct xfs_defer_resources *dres)
{
+ unsigned int i;
+
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
- /* Lock and join the captured inode to the new transaction. */
+ /* Lock the captured resources for the new transaction. */
if (dfc->dfc_held.dr_inos == 2)
xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
else if (dfc->dfc_held.dr_inos == 1)
xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_lock(dfc->dfc_held.dr_bp[i]);
+
+ /* Join the captured resources to the new transaction. */
xfs_defer_restore_resources(tp, &dfc->dfc_held);
memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
+ dres->dr_bufs = 0;
/* Move captured dfops chain and state to the transaction. */
list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
@@ -854,7 +908,9 @@ xfs_defer_init_item_caches(void)
error = xfs_extfree_intent_init_cache();
if (error)
goto err;
-
+ error = xfs_attr_intent_init_cache();
+ if (error)
+ goto err;
return 0;
err:
xfs_defer_destroy_item_caches();
@@ -865,6 +921,7 @@ err:
void
xfs_defer_destroy_item_caches(void)
{
+ xfs_attr_intent_destroy_cache();
xfs_extfree_intent_destroy_cache();
xfs_bmap_intent_destroy_cache();
xfs_refcount_intent_destroy_cache();
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 7bb8a31ad65b..114a3a4930a3 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -19,6 +19,7 @@ enum xfs_defer_ops_type {
XFS_DEFER_OPS_TYPE_RMAP,
XFS_DEFER_OPS_TYPE_FREE,
XFS_DEFER_OPS_TYPE_AGFL_FREE,
+ XFS_DEFER_OPS_TYPE_ATTR,
XFS_DEFER_OPS_TYPE_MAX,
};
@@ -63,6 +64,8 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+extern const struct xfs_defer_op_type xfs_attr_defer_type;
+
/*
* Deferred operation item relogging limits.
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 5f1e4799e8fa..76eedc2756b3 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -150,6 +150,8 @@ xfs_da_mount(
dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
+ dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >>
+ mp->m_sb.sb_blocklog;
dageo->magicpct = (dageo->blksize * 37) / 100;
/* set up attribute geometry - single fsb only */
@@ -161,6 +163,12 @@ xfs_da_mount(
dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size;
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
+
+ if (xfs_has_large_extent_counts(mp))
+ dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+ else
+ dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
dageo->magicpct = (dageo->blksize * 37) / 100;
return 0;
}
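The new dir geometry bound is pure arithmetic: the directory address space is XFS_DIR2_MAX_SPACES (3) regions of XFS_DIR2_SPACE_SIZE bytes (1 << 35, i.e. 32GB each, per the xfs_da_format.h hunk above), so a directory's data fork can never need more extents than that span divided by the block size. Worked through for 4096-byte blocks:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint64_t space_size = 1ULL << 35;  /* 32GB per region */
        const unsigned int max_spaces = 3;
        const unsigned int blocklog = 12;        /* 4096-byte blocks */
        uint64_t max_extents = (max_spaces * space_size) >> blocklog;

        /* 3 * 2^35 / 2^12 = 3 * 2^23 = 25165824 extents */
        printf("max directory extents: %llu\n",
               (unsigned long long)max_extents);
        return 0;
    }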
@@ -185,7 +193,7 @@ xfs_dir_isempty(
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
if (dp->i_disk_size == 0) /* might happen during shutdown. */
return 1;
- if (dp->i_disk_size > XFS_IFORK_DSIZE(dp))
+ if (dp->i_disk_size > xfs_inode_data_fork_size(dp))
return 0;
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
return !sfp->count;
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index df0869bba275..00f960a703b2 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -842,7 +842,7 @@ xfs_dir2_block_removename(
* See if the size as a shortform is good enough.
*/
size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
- if (size > XFS_IFORK_DSIZE(dp))
+ if (size > xfs_inode_data_fork_size(dp))
return 0;
/*
@@ -1055,7 +1055,7 @@ xfs_dir2_leaf_to_block(
* Now see if the resulting block can be shrunken to shortform.
*/
size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
- if (size > XFS_IFORK_DSIZE(dp))
+ if (size > xfs_inode_data_fork_size(dp))
return 0;
return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
@@ -1071,7 +1071,7 @@ xfs_dir2_sf_to_block(
struct xfs_trans *tp = args->trans;
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
struct xfs_da_geometry *geo = args->geo;
xfs_dir2_db_t blkno; /* dir-relative block # (0) */
xfs_dir2_data_hdr_t *hdr; /* block header */
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 5a97a87eaa20..003812fd7d35 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -237,7 +237,7 @@ xfs_dir2_block_sfsize(
(i8count ? /* inumber */
count * XFS_INO64_SIZE :
count * XFS_INO32_SIZE);
- if (size > XFS_IFORK_DSIZE(dp))
+ if (size > xfs_inode_data_fork_size(dp))
return size; /* size value is a failure */
}
/*
@@ -406,7 +406,7 @@ xfs_dir2_sf_addname(
* Won't fit as shortform any more (due to size),
* or the pick routine says it won't (due to offset values).
*/
- if (new_isize > XFS_IFORK_DSIZE(dp) ||
+ if (new_isize > xfs_inode_data_fork_size(dp) ||
(pick =
xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
/*
@@ -710,7 +710,7 @@ xfs_dir2_sf_verify(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
struct xfs_dir2_sf_hdr *sfp;
struct xfs_dir2_sf_entry *sfep;
struct xfs_dir2_sf_entry *next_sfep;
@@ -1031,7 +1031,7 @@ xfs_dir2_sf_replace_needblock(
newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
return inum > XFS_DIR2_MAX_SHORT_INUM &&
- sfp->i8count == 0 && newsize > XFS_IFORK_DSIZE(dp);
+ sfp->i8count == 0 && newsize > xfs_inode_data_fork_size(dp);
}
/*
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index a23a52e643ad..5362908164b0 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -59,7 +59,10 @@
#define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36
#define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37
#define XFS_ERRTAG_AG_RESV_FAIL 38
-#define XFS_ERRTAG_MAX 39
+#define XFS_ERRTAG_LARP 39
+#define XFS_ERRTAG_DA_LEAF_SPLIT 40
+#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41
+#define XFS_ERRTAG_MAX 42
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -103,5 +106,8 @@
#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1
#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1
#define XFS_RANDOM_AG_RESV_FAIL 1
+#define XFS_RANDOM_LARP 1
+#define XFS_RANDOM_DA_LEAF_SPLIT 1
+#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index d665c04e69dd..b55bdfa9c8a8 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -372,12 +372,14 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */
+#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE| \
XFS_SB_FEAT_INCOMPAT_SPINODES| \
XFS_SB_FEAT_INCOMPAT_META_UUID| \
XFS_SB_FEAT_INCOMPAT_BIGTIME| \
- XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
+ XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
+ XFS_SB_FEAT_INCOMPAT_NREXT64)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -388,7 +390,9 @@ xfs_sb_has_incompat_feature(
return (sbp->sb_features_incompat & feature) != 0;
}
-#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS (1 << 0) /* Delayed Attributes */
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL \
+ (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS)
#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
static inline bool
xfs_sb_has_incompat_log_feature(
@@ -413,6 +417,11 @@ xfs_sb_add_incompat_log_features(
sbp->sb_features_log_incompat |= features;
}
+static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp)
+{
+ return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat &
+ XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
+}
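
Log-incompat bits only matter while the log is dirty, so a predicate like the one above is typically consulted before emitting the new item type. A hedged sketch of a caller; the helper name and error code are assumptions, not kernel API:

        /* Illustrative gate for logged-xattr processing. */
        static int example_require_logged_xattrs(struct xfs_mount *mp)
        {
                if (!xfs_sb_version_haslogxattrs(&mp->m_sb))
                        return -EOPNOTSUPP;     /* assumed error choice */
                return 0;
        }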
static inline bool
xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
@@ -525,26 +534,26 @@ typedef struct xfs_agf {
#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
-#define XFS_AGF_MAGICNUM 0x00000001
-#define XFS_AGF_VERSIONNUM 0x00000002
-#define XFS_AGF_SEQNO 0x00000004
-#define XFS_AGF_LENGTH 0x00000008
-#define XFS_AGF_ROOTS 0x00000010
-#define XFS_AGF_LEVELS 0x00000020
-#define XFS_AGF_FLFIRST 0x00000040
-#define XFS_AGF_FLLAST 0x00000080
-#define XFS_AGF_FLCOUNT 0x00000100
-#define XFS_AGF_FREEBLKS 0x00000200
-#define XFS_AGF_LONGEST 0x00000400
-#define XFS_AGF_BTREEBLKS 0x00000800
-#define XFS_AGF_UUID 0x00001000
-#define XFS_AGF_RMAP_BLOCKS 0x00002000
-#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000
-#define XFS_AGF_REFCOUNT_ROOT 0x00008000
-#define XFS_AGF_REFCOUNT_LEVEL 0x00010000
-#define XFS_AGF_SPARE64 0x00020000
+#define XFS_AGF_MAGICNUM (1u << 0)
+#define XFS_AGF_VERSIONNUM (1u << 1)
+#define XFS_AGF_SEQNO (1u << 2)
+#define XFS_AGF_LENGTH (1u << 3)
+#define XFS_AGF_ROOTS (1u << 4)
+#define XFS_AGF_LEVELS (1u << 5)
+#define XFS_AGF_FLFIRST (1u << 6)
+#define XFS_AGF_FLLAST (1u << 7)
+#define XFS_AGF_FLCOUNT (1u << 8)
+#define XFS_AGF_FREEBLKS (1u << 9)
+#define XFS_AGF_LONGEST (1u << 10)
+#define XFS_AGF_BTREEBLKS (1u << 11)
+#define XFS_AGF_UUID (1u << 12)
+#define XFS_AGF_RMAP_BLOCKS (1u << 13)
+#define XFS_AGF_REFCOUNT_BLOCKS (1u << 14)
+#define XFS_AGF_REFCOUNT_ROOT (1u << 15)
+#define XFS_AGF_REFCOUNT_LEVEL (1u << 16)
+#define XFS_AGF_SPARE64 (1u << 17)
#define XFS_AGF_NUM_BITS 18
-#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
+#define XFS_AGF_ALL_BITS ((1u << XFS_AGF_NUM_BITS) - 1)
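
The switch from (1 << n) to (1u << n) here and in the AGI, dquot, and quota flags below is defensive rather than cosmetic: shifting a signed int into bit 31 is formally undefined in C, and the resulting negative value sign-extends when widened to a 64-bit field. A tiny userspace illustration:

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                /* Well-defined: bit 31 of an unsigned int, zero-extended. */
                uint64_t ok = (uint64_t)(1u << 31);
                printf("%#llx\n", (unsigned long long)ok); /* 0x80000000 */

                /* (uint64_t)(1 << 31) is formally undefined and, on typical
                 * two's-complement compilers, widens to 0xffffffff80000000 -
                 * exactly the surprise these conversions head off as flag
                 * fields grow toward bit 31 (e.g. XFS_QMOPT_INHERIT below). */
                return 0;
        }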
#define XFS_AGF_FLAGS \
{ XFS_AGF_MAGICNUM, "MAGICNUM" }, \
@@ -619,22 +628,22 @@ typedef struct xfs_agi {
#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
-#define XFS_AGI_MAGICNUM (1 << 0)
-#define XFS_AGI_VERSIONNUM (1 << 1)
-#define XFS_AGI_SEQNO (1 << 2)
-#define XFS_AGI_LENGTH (1 << 3)
-#define XFS_AGI_COUNT (1 << 4)
-#define XFS_AGI_ROOT (1 << 5)
-#define XFS_AGI_LEVEL (1 << 6)
-#define XFS_AGI_FREECOUNT (1 << 7)
-#define XFS_AGI_NEWINO (1 << 8)
-#define XFS_AGI_DIRINO (1 << 9)
-#define XFS_AGI_UNLINKED (1 << 10)
+#define XFS_AGI_MAGICNUM (1u << 0)
+#define XFS_AGI_VERSIONNUM (1u << 1)
+#define XFS_AGI_SEQNO (1u << 2)
+#define XFS_AGI_LENGTH (1u << 3)
+#define XFS_AGI_COUNT (1u << 4)
+#define XFS_AGI_ROOT (1u << 5)
+#define XFS_AGI_LEVEL (1u << 6)
+#define XFS_AGI_FREECOUNT (1u << 7)
+#define XFS_AGI_NEWINO (1u << 8)
+#define XFS_AGI_DIRINO (1u << 9)
+#define XFS_AGI_UNLINKED (1u << 10)
#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
-#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
-#define XFS_AGI_FREE_ROOT (1 << 11)
-#define XFS_AGI_FREE_LEVEL (1 << 12)
-#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */
+#define XFS_AGI_ALL_BITS_R1 ((1u << XFS_AGI_NUM_BITS_R1) - 1)
+#define XFS_AGI_FREE_ROOT (1u << 11)
+#define XFS_AGI_FREE_LEVEL (1u << 12)
+#define XFS_AGI_IBLOCKS (1u << 13) /* both inobt/finobt block counters */
#define XFS_AGI_NUM_BITS_R2 14
/* disk block (xfs_daddr_t) in the AG */
@@ -695,7 +704,7 @@ struct xfs_agfl {
* When the bigtime feature is enabled, ondisk inode timestamps become an
* unsigned 64-bit nanoseconds counter. This means that the bigtime inode
* timestamp epoch is the start of the classic timestamp range, which is
- * Dec 31 20:45:52 UTC 1901. Because the epochs are not the same, callers
+ * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers
* /must/ use the bigtime conversion functions when encoding and decoding raw
* timestamps.
*/
@@ -791,16 +800,41 @@ struct xfs_dinode {
__be32 di_nlink; /* number of links to file */
__be16 di_projid_lo; /* lower part of owner's project id */
__be16 di_projid_hi; /* higher part owner's project id */
- __u8 di_pad[6]; /* unused, zeroed space */
- __be16 di_flushiter; /* incremented on flush */
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ __be64 di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ __be64 di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ __u8 di_v2_pad[6];
+ __be16 di_flushiter;
+ };
+ };
xfs_timestamp_t di_atime; /* time last accessed */
xfs_timestamp_t di_mtime; /* time last modified */
xfs_timestamp_t di_ctime; /* time created/inode modified */
__be64 di_size; /* number of bytes in file */
__be64 di_nblocks; /* # of direct & btree blocks used */
__be32 di_extsize; /* basic/minimum extent size for file */
- __be32 di_nextents; /* number of extents in data fork */
- __be16 di_anextents; /* number of extents in attribute fork*/
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ __be32 di_nextents;
+ __be16 di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ __be32 di_big_anextents;
+ __be16 di_nrext64_pad;
+ } __packed;
+ } __packed;
__u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
__s8 di_aformat; /* format of attr fork's data */
__be32 di_dmevmask; /* DMIG event mask */
@@ -870,6 +904,56 @@ enum xfs_dinode_fmt {
{ XFS_DINODE_FMT_UUID, "uuid" }
/*
+ * Max values for extnum and aextnum.
+ *
+ * The original on-disk extent counts were held in signed fields, resulting in
+ * maximum extent counts of 2^31 - 1 and 2^15 - 1 for the data and attr forks
+ * respectively. Similarly the maximum extent length is limited to 2^21 - 1
+ * blocks by the 21-bit wide blockcount field of a BMBT extent record.
+ *
+ * The newly introduced data fork extent counter can hold a 64-bit value,
+ * however the maximum number of extents in a file is also limited to 2^54
+ * extents by the 54-bit wide startoff field of a BMBT extent record.
+ *
+ * It is further limited by the maximum supported file size of 2^63
+ * *bytes*. This leads to a maximum extent count for maximally sized filesystem
+ * blocks (64kB) of:
+ *
+ * 2^63 bytes / 2^16 bytes per block = 2^47 blocks
+ *
+ * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence
+ * 2^48 was chosen as the maximum data fork extent count.
+ *
+ * The worst case for the maximum file size that the data fork extent
+ * counter can represent occurs when all extents are 1 block in length and
+ * each block is 1KB in size.
+ *
+ * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing the maximum extent count
+ * and with 1KB sized blocks, a file can reach up to:
+ * 1KB * (2^31 - 1) ~= 2TB
+ *
+ * This is much larger than the theoretical maximum size of a directory
+ * i.e. XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB.
+ *
+ * Hence, a directory inode can never overflow its data fork extent counter.
+ */
+#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1))
+#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1))
+
+/*
+ * When we upgrade an inode to the large extent counts, the maximum value by
+ * which the extent count can increase is bound by the change in size of the
+ * on-disk field. No upgrade operation should ever be adding more than a few
+ * tens of extents, so if we get a really large value it is a sign of a code bug
+ * or corruption.
+ */
+#define XFS_MAX_EXTCNT_UPGRADE_NR \
+ min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \
+ XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL)
+
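
The derivation above reduces to a few shifts, which is easy to sanity-check. A minimal userspace sketch of the same arithmetic (no kernel definitions required):

        #include <assert.h>
        #include <stdint.h>

        int main(void)
        {
                /* 2^63 bytes of max file size / 2^16 bytes per 64kB block. */
                uint64_t max_blocks = (1ULL << 63) >> 16;
                assert(max_blocks == (1ULL << 47));

                /* Worst case is one extent per block, so 2^48 - 1 covers it
                 * once 47 is rounded up to a whole multiple of 8 bits. */
                assert(max_blocks <= (1ULL << 48) - 1);

                /* Old 32-bit counter, 1KB blocks: ~2TB of 1-block extents. */
                uint64_t small_max = ((1ULL << 31) - 1) * 1024;
                assert(small_max < (1ULL << 41));
                return 0;
        }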
+/*
* Inode minimum and maximum sizes.
*/
#define XFS_DINODE_MIN_LOG 8
@@ -918,10 +1002,6 @@ enum xfs_dinode_fmt {
((w) == XFS_DATA_FORK ? \
(dip)->di_format : \
(dip)->di_aformat)
-#define XFS_DFORK_NEXTENTS(dip,w) \
- ((w) == XFS_DATA_FORK ? \
- be32_to_cpu((dip)->di_nextents) : \
- be16_to_cpu((dip)->di_anextents))
/*
* For block and character special files the 32bit dev_t is stored at the
@@ -988,15 +1068,17 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
+#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
+#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
#define XFS_DIFLAG2_ANY \
(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
- XFS_DIFLAG2_BIGTIME)
+ XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
{
@@ -1004,6 +1086,13 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME));
}
+static inline bool xfs_dinode_has_large_extent_counts(
+ const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64));
+}
+
/*
* Inode number format:
* low inopblog bits - offset in block
@@ -1085,10 +1174,10 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
#define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */
-#define XFS_DQTYPE_USER 0x01 /* user dquot record */
-#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */
-#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */
-#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */
+#define XFS_DQTYPE_USER (1u << 0) /* user dquot record */
+#define XFS_DQTYPE_PROJ (1u << 1) /* project dquot record */
+#define XFS_DQTYPE_GROUP (1u << 2) /* group dquot record */
+#define XFS_DQTYPE_BIGTIME (1u << 7) /* large expiry timestamps */
/* bitmask to determine if this is a user/group/project dquot */
#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \
@@ -1596,6 +1685,8 @@ typedef struct xfs_bmdr_block {
#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
+#define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK))
+
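
For reference, those masks fall out of the bmbt record packing: two big-endian 64-bit words, with the extent-state bit, the 54-bit startoff, and the top 9 bits of startblock in l0, and the remaining startblock bits plus the 21-bit blockcount in l1. A hedged decode sketch modelled on xfs_bmbt_disk_get_all(); the struct is cut down for illustration:

        #include <stdint.h>

        struct bmbt_irec {              /* illustrative subset */
                uint64_t br_startoff;
                uint64_t br_startblock;
                uint64_t br_blockcount;
                int      br_unwritten;
        };

        /* l0/l1 are the two on-disk words after be64 conversion. */
        static void bmbt_decode(uint64_t l0, uint64_t l1, struct bmbt_irec *ir)
        {
                ir->br_unwritten  = (l0 >> 63) & 1;
                ir->br_startoff   = (l0 >> 9) & ((1ULL << 54) - 1);
                ir->br_startblock = ((l0 & ((1ULL << 9) - 1)) << 43) | (l1 >> 21);
                ir->br_blockcount = l1 & ((1ULL << 21) - 1);
        }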
/*
* bmbt records have a file offset (block) field that is 54 bits wide, so this
* is the largest xfs_fileoff_t that we ever expect to see.
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 505533c43a92..1cfd5bc6520a 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */
+#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */
/*
* Minimum and maximum sizes need for growth checks.
@@ -377,7 +378,7 @@ struct xfs_bulkstat {
uint32_t bs_extsize_blks; /* extent size hint, blocks */
uint32_t bs_nlink; /* number of links */
- uint32_t bs_extents; /* number of extents */
+ uint32_t bs_extents; /* 32-bit data fork extent counter */
uint32_t bs_aextents; /* attribute number of extents */
uint16_t bs_version; /* structure version */
uint16_t bs_forkoff; /* inode fork offset in bytes */
@@ -386,8 +387,9 @@ struct xfs_bulkstat {
uint16_t bs_checked; /* checked inode metadata */
uint16_t bs_mode; /* type and mode */
uint16_t bs_pad2; /* zeroed */
+ uint64_t bs_extents64; /* 64-bit data fork extent counter */
- uint64_t bs_pad[7]; /* zeroed */
+ uint64_t bs_pad[6]; /* zeroed */
};
#define XFS_BULKSTAT_VERSION_V1 (1)
@@ -459,17 +461,28 @@ struct xfs_bulk_ireq {
* Only return results from the specified @agno. If @ino is zero, start
* with the first inode of @agno.
*/
-#define XFS_BULK_IREQ_AGNO (1 << 0)
+#define XFS_BULK_IREQ_AGNO (1U << 0)
/*
* Return bulkstat information for a single inode, where @ino value is a
* special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_*
* values below. Not compatible with XFS_BULK_IREQ_AGNO.
*/
-#define XFS_BULK_IREQ_SPECIAL (1 << 1)
+#define XFS_BULK_IREQ_SPECIAL (1U << 1)
-#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
- XFS_BULK_IREQ_SPECIAL)
+/*
+ * Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign
+ * 0 to xfs_bulkstat->bs_extents when the flag is set. Otherwise, use
+ * xfs_bulkstat->bs_extents for returning data fork extent count and set
+ * xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and
+ * assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than
+ * XFS_MAX_EXTCNT_DATA_FORK_SMALL.
+ */
+#define XFS_BULK_IREQ_NREXT64 (1U << 2)
+
+#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
+ XFS_BULK_IREQ_SPECIAL | \
+ XFS_BULK_IREQ_NREXT64)
/* Operate on the root directory inode. */
#define XFS_BULK_IREQ_SPECIAL_ROOT (1)
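
From userspace the new counter is opt-in: a caller that understands bs_extents64 sets XFS_BULK_IREQ_NREXT64 in the request header. A hedged sketch of the calling convention; it assumes the usual xfs_fs.h userspace definitions of struct xfs_bulkstat_req and XFS_IOC_BULKSTAT, and elides real error handling:

        #include <stdint.h>
        #include <string.h>
        #include <sys/ioctl.h>
        /* struct xfs_bulkstat_req and XFS_IOC_BULKSTAT come from xfs_fs.h */

        /* req must be allocated with room for at least one bulkstat record */
        static uint64_t get_dfork_extents(int fd, struct xfs_bulkstat_req *req)
        {
                memset(&req->hdr, 0, sizeof(req->hdr));
                req->hdr.icount = 1;
                req->hdr.flags = XFS_BULK_IREQ_NREXT64;

                if (ioctl(fd, XFS_IOC_BULKSTAT, req) < 0)
                        return 0;       /* a real caller inspects errno */

                /* With the flag set, bs_extents is zeroed and bs_extents64
                 * carries the count; without it, an overlarge count makes
                 * the ioctl fail with EOVERFLOW as described above. */
                return req->bulkstat[0].bs_extents64;
        }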
@@ -699,34 +712,34 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_NR 25
/* i: Repair this metadata. */
-#define XFS_SCRUB_IFLAG_REPAIR (1 << 0)
+#define XFS_SCRUB_IFLAG_REPAIR (1u << 0)
/* o: Metadata object needs repair. */
-#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1)
+#define XFS_SCRUB_OFLAG_CORRUPT (1u << 1)
/*
* o: Metadata object could be optimized. It's not corrupt, but
* we could improve on it somehow.
*/
-#define XFS_SCRUB_OFLAG_PREEN (1 << 2)
+#define XFS_SCRUB_OFLAG_PREEN (1u << 2)
/* o: Cross-referencing failed. */
-#define XFS_SCRUB_OFLAG_XFAIL (1 << 3)
+#define XFS_SCRUB_OFLAG_XFAIL (1u << 3)
/* o: Metadata object disagrees with cross-referenced metadata. */
-#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4)
+#define XFS_SCRUB_OFLAG_XCORRUPT (1u << 4)
/* o: Scan was not complete. */
-#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5)
+#define XFS_SCRUB_OFLAG_INCOMPLETE (1u << 5)
/* o: Metadata object looked funny but isn't corrupt. */
-#define XFS_SCRUB_OFLAG_WARNING (1 << 6)
+#define XFS_SCRUB_OFLAG_WARNING (1u << 6)
/*
* o: IFLAG_REPAIR was set but metadata object did not need fixing or
* optimization and has therefore not been altered.
*/
-#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7)
+#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7)
#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR)
#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index b418fe0c0679..6cdfd64bc56b 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -105,7 +105,6 @@ xfs_inobt_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
union xfs_btree_rec *rec;
int error;
uint64_t realfree;
@@ -116,7 +115,7 @@ xfs_inobt_get_rec(
xfs_inobt_btrec_to_irec(mp, rec, irec);
- if (!xfs_verify_agino(mp, agno, irec->ir_startino))
+ if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino))
goto out_bad_rec;
if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
irec->ir_count > XFS_INODES_PER_CHUNK)
@@ -137,7 +136,8 @@ xfs_inobt_get_rec(
out_bad_rec:
xfs_warn(mp,
"%s Inode BTree record corruption in AG %d detected!",
- cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", agno);
+ cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free",
+ cur->bc_ag.pag->pag_agno);
xfs_warn(mp,
"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
irec->ir_startino, irec->ir_count, irec->ir_freecount,
@@ -1610,7 +1610,7 @@ xfs_dialloc_good_ag(
return false;
if (!pag->pagi_init) {
- error = xfs_ialloc_pagi_init(mp, tp, pag->pag_agno);
+ error = xfs_ialloc_read_agi(pag, tp, NULL);
if (error)
return false;
}
@@ -1621,7 +1621,7 @@ xfs_dialloc_good_ag(
return false;
if (!pag->pagf_init) {
- error = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, flags);
+ error = xfs_alloc_read_agf(pag, tp, flags, NULL);
if (error)
return false;
}
@@ -1679,7 +1679,7 @@ xfs_dialloc_try_ag(
* Then read in the AGI buffer and recheck with the AGI buffer
* lock held.
*/
- error = xfs_ialloc_read_agi(pag->pag_mount, *tpp, pag->pag_agno, &agbp);
+ error = xfs_ialloc_read_agi(pag, *tpp, &agbp);
if (error)
return error;
@@ -2169,7 +2169,7 @@ xfs_difree(
/*
* Get the allocation group header.
*/
- error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
if (error) {
xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
__func__, error);
@@ -2215,7 +2215,7 @@ xfs_imap_lookup(
int error;
int i;
- error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
if (error) {
xfs_alert(mp,
"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
@@ -2414,9 +2414,9 @@ out_drop:
*/
void
xfs_ialloc_log_agi(
- xfs_trans_t *tp, /* transaction pointer */
- struct xfs_buf *bp, /* allocation group header buffer */
- int fields) /* bitmask of fields to log */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint32_t fields)
{
int first; /* first byte number */
int last; /* last byte number */
@@ -2571,47 +2571,48 @@ const struct xfs_buf_ops xfs_agi_buf_ops = {
*/
int
xfs_read_agi(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- struct xfs_buf **bpp) /* allocation group hdr buf */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **agibpp)
{
+ struct xfs_mount *mp = pag->pag_mount;
int error;
- trace_xfs_read_agi(mp, agno);
+ trace_xfs_read_agi(pag->pag_mount, pag->pag_agno);
- ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
+ XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops);
if (error)
return error;
if (tp)
- xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF);
+ xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);
- xfs_buf_set_ref(*bpp, XFS_AGI_REF);
+ xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
return 0;
}
+/*
+ * Read in the agi and initialise the per-ag data. If the caller supplies a
+ * @agibpp, return the locked AGI buffer to them, otherwise release it.
+ */
int
xfs_ialloc_read_agi(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- struct xfs_buf **bpp) /* allocation group hdr buf */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
+ struct xfs_buf **agibpp)
{
- struct xfs_agi *agi; /* allocation group header */
- struct xfs_perag *pag; /* per allocation group data */
+ struct xfs_buf *agibp;
+ struct xfs_agi *agi;
int error;
- trace_xfs_ialloc_read_agi(mp, agno);
+ trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno);
- error = xfs_read_agi(mp, tp, agno, bpp);
+ error = xfs_read_agi(pag, tp, &agibp);
if (error)
return error;
- agi = (*bpp)->b_addr;
- pag = (*bpp)->b_pag;
+ agi = agibp->b_addr;
if (!pag->pagi_init) {
pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
pag->pagi_count = be32_to_cpu(agi->agi_count);
@@ -2623,27 +2624,11 @@ xfs_ialloc_read_agi(
* we are in the middle of a forced shutdown.
*/
ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
- xfs_is_shutdown(mp));
- return 0;
-}
-
-/*
- * Read in the agi to initialise the per-ag data in the mount structure
- */
-int
-xfs_ialloc_pagi_init(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno) /* allocation group number */
-{
- struct xfs_buf *bp = NULL;
- int error;
-
- error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
- if (error)
- return error;
- if (bp)
- xfs_trans_brelse(tp, bp);
+ xfs_is_shutdown(pag->pag_mount));
+ if (agibpp)
+ *agibpp = agibp;
+ else
+ xfs_trans_brelse(tp, agibp);
return 0;
}
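
The NULL-@agibpp convention is what lets xfs_ialloc_pagi_init() be deleted below. Both surviving call styles, sketched (error handling elided):

        /* Prime pag->pagi_count/pagi_freecount only; the AGI buffer is
         * released inside the call. */
        error = xfs_ialloc_read_agi(pag, tp, NULL);

        /* Read the AGI and keep the locked buffer to modify and log. */
        struct xfs_buf *agibp;
        error = xfs_ialloc_read_agi(pag, tp, &agibp);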
@@ -2772,6 +2757,8 @@ xfs_ialloc_setup_geometry(
igeo->new_diflags2 = 0;
if (xfs_has_bigtime(mp))
igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
+ if (xfs_has_large_extent_counts(mp))
+ igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;
/* Compute inode btree geometry. */
igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
@@ -2910,8 +2897,7 @@ xfs_ialloc_calc_rootino(
* allocation group, or very odd geometries created by old mkfs
* versions on very small filesystems.
*/
- if (mp->m_sb.sb_logstart &&
- XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0)
+ if (xfs_ag_contains_log(mp, 0))
first_bno += mp->m_sb.sb_logblocks;
/*
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 8b5c2b709022..9bbbca6ac4ed 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -60,27 +60,12 @@ void
xfs_ialloc_log_agi(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *bp, /* allocation group header buffer */
- int fields); /* bitmask of fields to log */
+ uint32_t fields); /* bitmask of fields to log */
-/*
- * Read in the allocation group header (inode allocation section)
- */
-int /* error */
-xfs_ialloc_read_agi(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- struct xfs_buf **bpp); /* allocation group hdr buf */
-
-/*
- * Read in the allocation group header to initialise the per-ag data
- * in the mount structure
- */
-int
-xfs_ialloc_pagi_init(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno); /* allocation group number */
+int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **agibpp);
+int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp,
+ struct xfs_buf **agibpp);
/*
* Lookup a record by ino in the btree given by cur.
@@ -102,8 +87,6 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_agblock_t agbno,
xfs_agblock_t length, unsigned int gen);
-int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, struct xfs_buf **bpp);
union xfs_btree_rec;
void xfs_inobt_btrec_to_irec(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index b2ad2fdc40f5..8c83e265770c 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -683,10 +683,10 @@ xfs_inobt_rec_check_count(
static xfs_extlen_t
xfs_inobt_max_size(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
+ struct xfs_perag *pag)
{
- xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno);
+ struct xfs_mount *mp = pag->pag_mount;
+ xfs_agblock_t agblocks = pag->block_count;
/* Bail out if we're uninitialized, which can happen in mkfs. */
if (M_IGEO(mp)->inobt_mxr[0] == 0)
@@ -697,8 +697,7 @@ xfs_inobt_max_size(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (mp->m_sb.sb_logstart &&
- XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
agblocks -= mp->m_sb.sb_logblocks;
return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr,
@@ -722,7 +721,7 @@ xfs_inobt_cur(
ASSERT(*agi_bpp == NULL);
ASSERT(*curpp == NULL);
- error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, agi_bpp);
+ error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
if (error)
return error;
@@ -757,16 +756,15 @@ xfs_inobt_count_blocks(
/* Read finobt block count from AGI header. */
static int
xfs_finobt_read_blocks(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
struct xfs_perag *pag,
+ struct xfs_trans *tp,
xfs_extlen_t *tree_blocks)
{
struct xfs_buf *agbp;
struct xfs_agi *agi;
int error;
- error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp);
+ error = xfs_ialloc_read_agi(pag, tp, &agbp);
if (error)
return error;
@@ -794,14 +792,14 @@ xfs_finobt_calc_reserves(
return 0;
if (xfs_has_inobtcounts(mp))
- error = xfs_finobt_read_blocks(mp, tp, pag, &tree_len);
+ error = xfs_finobt_read_blocks(pag, tp, &tree_len);
else
error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO,
&tree_len);
if (error)
return error;
- *ask += xfs_inobt_max_size(mp, pag->pag_agno);
+ *ask += xfs_inobt_max_size(pag);
*used += tree_len;
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index cae9708c8587..758aacd8166b 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -10,6 +10,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_ag.h"
#include "xfs_inode.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
@@ -41,14 +42,12 @@ xfs_inode_buf_verify(
bool readahead)
{
struct xfs_mount *mp = bp->b_mount;
- xfs_agnumber_t agno;
int i;
int ni;
/*
* Validate the magic number and version of every inode in the buffer
*/
- agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
for (i = 0; i < ni; i++) {
struct xfs_dinode *dip;
@@ -59,7 +58,7 @@ xfs_inode_buf_verify(
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
xfs_dinode_good_version(mp, dip->di_version) &&
- xfs_verify_agino_or_null(mp, agno, unlinked_ino);
+ xfs_verify_agino_or_null(bp->b_pag, unlinked_ino);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
@@ -178,7 +177,6 @@ xfs_inode_from_disk(
xfs_failaddr_t fa;
ASSERT(ip->i_cowfp == NULL);
- ASSERT(ip->i_afp == NULL);
fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from);
if (fa) {
@@ -230,7 +228,8 @@ xfs_inode_from_disk(
ip->i_nblocks = be64_to_cpu(from->di_nblocks);
ip->i_extsize = be32_to_cpu(from->di_extsize);
ip->i_forkoff = from->di_forkoff;
- ip->i_diflags = be16_to_cpu(from->di_flags);
+ ip->i_diflags = be16_to_cpu(from->di_flags);
+ ip->i_next_unlinked = be32_to_cpu(from->di_next_unlinked);
if (from->di_dmevmask || from->di_dmstate)
xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS);
@@ -279,6 +278,25 @@ xfs_inode_to_disk_ts(
return ts;
}
+static inline void
+xfs_inode_to_disk_iext_counters(
+ struct xfs_inode *ip,
+ struct xfs_dinode *to)
+{
+ if (xfs_inode_has_large_extent_counts(ip)) {
+ to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df));
+ to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_af));
+ /*
+ * We might be upgrading the inode to use larger extent counters
+ * than it previously used. Hence zero the unused field.
+ */
+ to->di_nrext64_pad = cpu_to_be16(0);
+ } else {
+ to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
+ to->di_anextents = cpu_to_be16(xfs_ifork_nextents(&ip->i_af));
+ }
+}
+
void
xfs_inode_to_disk(
struct xfs_inode *ip,
@@ -296,7 +314,6 @@ xfs_inode_to_disk(
to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff);
to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16);
- memset(to->di_pad, 0, sizeof(to->di_pad));
to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
@@ -307,10 +324,8 @@ xfs_inode_to_disk(
to->di_size = cpu_to_be64(ip->i_disk_size);
to->di_nblocks = cpu_to_be64(ip->i_nblocks);
to->di_extsize = cpu_to_be32(ip->i_extsize);
- to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
- to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp));
to->di_forkoff = ip->i_forkoff;
- to->di_aformat = xfs_ifork_format(ip->i_afp);
+ to->di_aformat = xfs_ifork_format(&ip->i_af);
to->di_flags = cpu_to_be16(ip->i_diflags);
if (xfs_has_v3inodes(ip->i_mount)) {
@@ -323,11 +338,14 @@ xfs_inode_to_disk(
to->di_lsn = cpu_to_be64(lsn);
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
- to->di_flushiter = 0;
+ to->di_v3_pad = 0;
} else {
to->di_version = 2;
to->di_flushiter = cpu_to_be16(ip->i_flushiter);
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
+
+ xfs_inode_to_disk_iext_counters(ip, to);
}
static xfs_failaddr_t
@@ -336,20 +354,40 @@ xfs_dinode_verify_fork(
struct xfs_mount *mp,
int whichfork)
{
- uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork);
+ xfs_extnum_t di_nextents;
+ xfs_extnum_t max_extents;
+ mode_t mode = be16_to_cpu(dip->di_mode);
+ uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork);
+ uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork);
+
+ di_nextents = xfs_dfork_nextents(dip, whichfork);
+
+ /*
+ * For fork types that can contain local data, check that the fork
+ * format matches the size of local data contained within the fork.
+ *
+ * For all types, check that when the size says the fork should be in extent
+ * or btree format, the inode isn't claiming it is in local format.
+ */
+ if (whichfork == XFS_DATA_FORK) {
+ if (S_ISDIR(mode) || S_ISLNK(mode)) {
+ if (be64_to_cpu(dip->di_size) <= fork_size &&
+ fork_format != XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
- switch (XFS_DFORK_FORMAT(dip, whichfork)) {
+ if (be64_to_cpu(dip->di_size) > fork_size &&
+ fork_format == XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
+
+ switch (fork_format) {
case XFS_DINODE_FMT_LOCAL:
/*
- * no local regular files yet
+ * No local regular files yet.
*/
- if (whichfork == XFS_DATA_FORK) {
- if (S_ISREG(be16_to_cpu(dip->di_mode)))
- return __this_address;
- if (be64_to_cpu(dip->di_size) >
- XFS_DFORK_SIZE(dip, mp, whichfork))
- return __this_address;
- }
+ if (S_ISREG(mode) && whichfork == XFS_DATA_FORK)
+ return __this_address;
if (di_nextents)
return __this_address;
break;
@@ -358,12 +396,11 @@ xfs_dinode_verify_fork(
return __this_address;
break;
case XFS_DINODE_FMT_BTREE:
- if (whichfork == XFS_ATTR_FORK) {
- if (di_nextents > MAXAEXTNUM)
- return __this_address;
- } else if (di_nextents > MAXEXTNUM) {
+ max_extents = xfs_iext_max_nextents(
+ xfs_dinode_has_large_extent_counts(dip),
+ whichfork);
+ if (di_nextents > max_extents)
return __this_address;
- }
break;
default:
return __this_address;
@@ -396,6 +433,24 @@ xfs_dinode_verify_forkoff(
return NULL;
}
+static xfs_failaddr_t
+xfs_dinode_verify_nrext64(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip)) {
+ if (!xfs_has_large_extent_counts(mp))
+ return __this_address;
+ if (dip->di_nrext64_pad != 0)
+ return __this_address;
+ } else if (dip->di_version >= 3) {
+ if (dip->di_v3_pad != 0)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
xfs_failaddr_t
xfs_dinode_verify(
struct xfs_mount *mp,
@@ -407,6 +462,9 @@ xfs_dinode_verify(
uint16_t flags;
uint64_t flags2;
uint64_t di_size;
+ xfs_extnum_t nextents;
+ xfs_extnum_t naextents;
+ xfs_filblks_t nblocks;
if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
return __this_address;
@@ -437,10 +495,19 @@ xfs_dinode_verify(
if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
return __this_address;
+ fa = xfs_dinode_verify_nrext64(mp, dip);
+ if (fa)
+ return fa;
+
+ nextents = xfs_dfork_data_extents(dip);
+ naextents = xfs_dfork_attr_extents(dip);
+ nblocks = be64_to_cpu(dip->di_nblocks);
+
/* Fork checks carried over from xfs_iformat_fork */
- if (mode &&
- be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) >
- be64_to_cpu(dip->di_nblocks))
+ if (mode && nextents + naextents > nblocks)
+ return __this_address;
+
+ if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
return __this_address;
if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize)
@@ -497,7 +564,7 @@ xfs_dinode_verify(
default:
return __this_address;
}
- if (dip->di_anextents)
+ if (naextents)
return __this_address;
}
@@ -639,7 +706,7 @@ xfs_inode_validate_extsize(
if (extsize_bytes % blocksize_bytes)
return __this_address;
- if (extsize > MAXEXTLEN)
+ if (extsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2)
@@ -696,7 +763,7 @@ xfs_inode_validate_cowextsize(
if (cowextsize_bytes % mp->m_sb.sb_blocksize)
return __this_address;
- if (cowextsize > MAXEXTLEN)
+ if (cowextsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
if (cowextsize > mp->m_sb.sb_agblocks / 2)
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 9149f4f796fc..9327a4f39206 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -35,8 +35,8 @@ xfs_init_local_fork(
const void *data,
int64_t size)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- int mem_size = size, real_size = 0;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ int mem_size = size;
bool zero_terminate;
/*
@@ -50,8 +50,7 @@ xfs_init_local_fork(
mem_size++;
if (size) {
- real_size = roundup(mem_size, 4);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS);
+ ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS);
memcpy(ifp->if_u1.if_data, data, size);
if (zero_terminate)
ifp->if_u1.if_data[size] = '\0';
@@ -103,9 +102,9 @@ xfs_iformat_extents(
int whichfork)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
int state = xfs_bmap_fork_to_state(whichfork);
- int nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+ xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork);
int size = nex * sizeof(xfs_bmbt_rec_t);
struct xfs_iext_cursor icur;
struct xfs_bmbt_rec *dp;
@@ -117,8 +116,8 @@ xfs_iformat_extents(
* we just bail out rather than crash in kmem_alloc() or memcpy() below.
*/
if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
- xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
- (unsigned long long) ip->i_ino, nex);
+ xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).",
+ ip->i_ino, nex);
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_extents(1)", dip, sizeof(*dip),
__this_address);
@@ -174,7 +173,7 @@ xfs_iformat_btree(
int size;
int level;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
size = XFS_BMAP_BROOT_SPACE(mp, dfp);
nrecs = be16_to_cpu(dfp->bb_numrecs);
@@ -230,7 +229,7 @@ xfs_iformat_data_fork(
* depend on it.
*/
ip->i_df.if_format = dip->di_format;
- ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents);
+ ip->i_df.if_nextents = xfs_dfork_data_extents(dip);
switch (inode->i_mode & S_IFMT) {
case S_IFIFO:
@@ -277,17 +276,23 @@ xfs_dfork_attr_shortform_size(
return be16_to_cpu(atp->hdr.totsize);
}
-struct xfs_ifork *
-xfs_ifork_alloc(
+void
+xfs_ifork_init_attr(
+ struct xfs_inode *ip,
enum xfs_dinode_fmt format,
xfs_extnum_t nextents)
{
- struct xfs_ifork *ifp;
+ ip->i_af.if_format = format;
+ ip->i_af.if_nextents = nextents;
+}
- ifp = kmem_cache_zalloc(xfs_ifork_cache, GFP_NOFS | __GFP_NOFAIL);
- ifp->if_format = format;
- ifp->if_nextents = nextents;
- return ifp;
+void
+xfs_ifork_zap_attr(
+ struct xfs_inode *ip)
+{
+ xfs_idestroy_fork(&ip->i_af);
+ memset(&ip->i_af, 0, sizeof(struct xfs_ifork));
+ ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
}
int
@@ -295,16 +300,16 @@ xfs_iformat_attr_fork(
struct xfs_inode *ip,
struct xfs_dinode *dip)
{
+ xfs_extnum_t naextents = xfs_dfork_attr_extents(dip);
int error = 0;
/*
* Initialize the extent count early, as the per-format routines may
* depend on it.
*/
- ip->i_afp = xfs_ifork_alloc(dip->di_aformat,
- be16_to_cpu(dip->di_anextents));
+ xfs_ifork_init_attr(ip, dip->di_aformat, naextents);
- switch (ip->i_afp->if_format) {
+ switch (ip->i_af.if_format) {
case XFS_DINODE_FMT_LOCAL:
error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK,
xfs_dfork_attr_shortform_size(dip));
@@ -324,10 +329,8 @@ xfs_iformat_attr_fork(
break;
}
- if (error) {
- kmem_cache_free(xfs_ifork_cache, ip->i_afp);
- ip->i_afp = NULL;
- }
+ if (error)
+ xfs_ifork_zap_attr(ip);
return error;
}
@@ -371,7 +374,7 @@ xfs_iroot_realloc(
return;
}
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
if (rec_diff > 0) {
/*
* If there wasn't any memory allocated before, just
@@ -401,7 +404,7 @@ xfs_iroot_realloc(
(int)new_size);
ifp->if_broot_bytes = (int)new_size;
ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
- XFS_IFORK_SIZE(ip, whichfork));
+ xfs_inode_fork_size(ip, whichfork));
memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
return;
}
@@ -455,7 +458,7 @@ xfs_iroot_realloc(
ifp->if_broot_bytes = (int)new_size;
if (ifp->if_broot)
ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
- XFS_IFORK_SIZE(ip, whichfork));
+ xfs_inode_fork_size(ip, whichfork));
return;
}
@@ -481,11 +484,11 @@ xfs_idata_realloc(
int64_t byte_diff,
int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
int64_t new_size = ifp->if_bytes + byte_diff;
ASSERT(new_size >= 0);
- ASSERT(new_size <= XFS_IFORK_SIZE(ip, whichfork));
+ ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork));
if (byte_diff == 0)
return;
@@ -497,12 +500,7 @@ xfs_idata_realloc(
return;
}
- /*
- * For inline data, the underlying buffer must be a multiple of 4 bytes
- * in size so that it can be logged and stay on word boundaries.
- * We enforce that here.
- */
- ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4),
+ ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size,
GFP_NOFS | __GFP_NOFAIL);
ifp->if_bytes = new_size;
}
@@ -545,7 +543,7 @@ xfs_iextents_copy(
int whichfork)
{
int state = xfs_bmap_fork_to_state(whichfork);
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec rec;
int64_t copied = 0;
@@ -597,7 +595,7 @@ xfs_iflush_fork(
if (!iip)
return;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
/*
* This can happen if we gave up in iformat in an error path,
* for the attribute fork.
@@ -613,7 +611,7 @@ xfs_iflush_fork(
if ((iip->ili_fields & dataflag[whichfork]) &&
(ifp->if_bytes > 0)) {
ASSERT(ifp->if_u1.if_data != NULL);
- ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+ ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork));
memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
}
break;
@@ -632,7 +630,7 @@ xfs_iflush_fork(
(ifp->if_broot_bytes > 0)) {
ASSERT(ifp->if_broot != NULL);
ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
- XFS_IFORK_SIZE(ip, whichfork));
+ xfs_inode_fork_size(ip, whichfork));
xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
(xfs_bmdr_block_t *)cp,
XFS_DFORK_SIZE(dip, mp, whichfork));
@@ -662,7 +660,7 @@ xfs_iext_state_to_fork(
if (state & BMAP_COWFORK)
return ip->i_cowfp;
else if (state & BMAP_ATTRFORK)
- return ip->i_afp;
+ return &ip->i_af;
return &ip->i_df;
}
@@ -713,18 +711,17 @@ int
xfs_ifork_verify_local_attr(
struct xfs_inode *ip)
{
- struct xfs_ifork *ifp = ip->i_afp;
+ struct xfs_ifork *ifp = &ip->i_af;
xfs_failaddr_t fa;
- if (!ifp)
+ if (!xfs_inode_has_attr_fork(ip))
fa = __this_address;
else
fa = xfs_attr_shortform_verify(ip);
if (fa) {
xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
- ifp ? ifp->if_u1.if_data : NULL,
- ifp ? ifp->if_bytes : 0, fa);
+ ifp->if_u1.if_data, ifp->if_bytes, fa);
return -EFSCORRUPTED;
}
@@ -737,14 +734,15 @@ xfs_iext_count_may_overflow(
int whichfork,
int nr_to_add)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
uint64_t max_exts;
uint64_t nr_exts;
if (whichfork == XFS_COW_FORK)
return 0;
- max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM;
+ max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip),
+ whichfork);
if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
max_exts = 10;
@@ -755,3 +753,27 @@ xfs_iext_count_may_overflow(
return 0;
}
+
+/*
+ * Upgrade this inode's extent counter fields to be able to handle a potential
+ * increase in the extent count by nr_to_add. Normally this is the same
+ * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG.
+ */
+int
+xfs_iext_count_upgrade(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ uint nr_to_add)
+{
+ ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR);
+
+ if (!xfs_has_large_extent_counts(ip->i_mount) ||
+ xfs_inode_has_large_extent_counts(ip) ||
+ XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+ return -EFBIG;
+
+ ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ return 0;
+}
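
Callers pair this with xfs_iext_count_may_overflow(): try the cheap bounds check first and only upgrade the inode to 64-bit counters when it reports -EFBIG. A sketch of the expected pattern (transaction setup elided; nr_to_add stands for whichever XFS_IEXT_*_CNT applies):

        error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_to_add);
        if (error == -EFBIG)
                error = xfs_iext_count_upgrade(tp, ip, nr_to_add);
        if (error)
                return error;   /* still -EFBIG without NREXT64 support */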
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 3d64a3acb0ed..d3943d6ad0b9 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -21,9 +21,9 @@ struct xfs_ifork {
void *if_root; /* extent tree root */
char *if_data; /* inline file data */
} if_u1;
+ xfs_extnum_t if_nextents; /* # of extents in this fork */
short if_broot_bytes; /* bytes allocated for root */
int8_t if_format; /* format of this fork */
- xfs_extnum_t if_nextents; /* # of extents in this fork */
};
/*
@@ -40,19 +40,6 @@ struct xfs_ifork {
#define XFS_IEXT_PUNCH_HOLE_CNT (1)
/*
- * Directory entry addition can cause the following,
- * 1. Data block can be added/removed.
- * A new extent can cause extent count to increase by 1.
- * 2. Free disk block can be added/removed.
- * Same behaviour as described above for Data block.
- * 3. Dabtree blocks.
- * XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new
- * extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH.
- */
-#define XFS_IEXT_DIR_MANIP_CNT(mp) \
- ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount)
-
-/*
* Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to
* be added. One extra extent for dabtree in case a local attr is
* large enough to cause a double split. It can also cause extent
@@ -90,28 +77,8 @@ struct xfs_ifork {
/*
* Fork handling.
*/
-
-#define XFS_IFORK_Q(ip) ((ip)->i_forkoff != 0)
-#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_forkoff << 3))
-
-#define XFS_IFORK_PTR(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- &(ip)->i_df : \
- ((w) == XFS_ATTR_FORK ? \
- (ip)->i_afp : \
- (ip)->i_cowfp))
-#define XFS_IFORK_DSIZE(ip) \
- (XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount))
-#define XFS_IFORK_ASIZE(ip) \
- (XFS_IFORK_Q(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0)
-#define XFS_IFORK_SIZE(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- XFS_IFORK_DSIZE(ip) : \
- ((w) == XFS_ATTR_FORK ? \
- XFS_IFORK_ASIZE(ip) : \
- 0))
#define XFS_IFORK_MAXEXT(ip, w) \
- (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
+ (xfs_inode_fork_size(ip, w) / sizeof(xfs_bmbt_rec_t))
static inline bool xfs_ifork_has_extents(struct xfs_ifork *ifp)
{
@@ -133,8 +100,68 @@ static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp)
return ifp->if_format;
}
-struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format,
- xfs_extnum_t nextents);
+static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ case XFS_COW_FORK:
+ if (has_large_extent_counts)
+ return XFS_MAX_EXTCNT_DATA_FORK_LARGE;
+ return XFS_MAX_EXTCNT_DATA_FORK_SMALL;
+
+ case XFS_ATTR_FORK:
+ if (has_large_extent_counts)
+ return XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+ return XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
+ default:
+ ASSERT(0);
+ return 0;
+ }
+}
+
+static inline xfs_extnum_t
+xfs_dfork_data_extents(
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ return be64_to_cpu(dip->di_big_nextents);
+
+ return be32_to_cpu(dip->di_nextents);
+}
+
+static inline xfs_extnum_t
+xfs_dfork_attr_extents(
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ return be32_to_cpu(dip->di_big_anextents);
+
+ return be16_to_cpu(dip->di_anextents);
+}
+
+static inline xfs_extnum_t
+xfs_dfork_nextents(
+ struct xfs_dinode *dip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return xfs_dfork_data_extents(dip);
+ case XFS_ATTR_FORK:
+ return xfs_dfork_attr_extents(dip);
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ return 0;
+}
+
+void xfs_ifork_zap_attr(struct xfs_inode *ip);
+void xfs_ifork_init_attr(struct xfs_inode *ip, enum xfs_dinode_fmt format,
+ xfs_extnum_t nextents);
struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
int xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *);
@@ -229,6 +256,8 @@ int xfs_ifork_verify_local_data(struct xfs_inode *ip);
int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
int nr_to_add);
+int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
+ uint nr_to_add);
/* returns true if the fork has extents but they are not read in yet. */
static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index b322db523d65..b351b9dc6561 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -69,7 +69,6 @@ static inline uint xlog_get_cycle(char *ptr)
/* Log Clients */
#define XFS_TRANSACTION 0x69
-#define XFS_VOLUME 0x2
#define XFS_LOG 0xaa
#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
@@ -114,7 +113,12 @@ struct xfs_unmount_log_format {
#define XLOG_REG_TYPE_CUD_FORMAT 24
#define XLOG_REG_TYPE_BUI_FORMAT 25
#define XLOG_REG_TYPE_BUD_FORMAT 26
-#define XLOG_REG_TYPE_MAX 26
+#define XLOG_REG_TYPE_ATTRI_FORMAT 27
+#define XLOG_REG_TYPE_ATTRD_FORMAT 28
+#define XLOG_REG_TYPE_ATTR_NAME 29
+#define XLOG_REG_TYPE_ATTR_VALUE 30
+#define XLOG_REG_TYPE_MAX 30
+
/*
* Flags to log operation header
@@ -237,6 +241,8 @@ typedef struct xfs_trans_header {
#define XFS_LI_CUD 0x1243
#define XFS_LI_BUI 0x1244 /* bmbt update intent */
#define XFS_LI_BUD 0x1245
+#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/
+#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -252,7 +258,9 @@ typedef struct xfs_trans_header {
{ XFS_LI_CUI, "XFS_LI_CUI" }, \
{ XFS_LI_CUD, "XFS_LI_CUD" }, \
{ XFS_LI_BUI, "XFS_LI_BUI" }, \
- { XFS_LI_BUD, "XFS_LI_BUD" }
+ { XFS_LI_BUD, "XFS_LI_BUD" }, \
+ { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \
+ { XFS_LI_ATTRD, "XFS_LI_ATTRD" }
/*
* Inode Log Item Format definitions.
@@ -388,16 +396,41 @@ struct xfs_log_dinode {
uint32_t di_nlink; /* number of links to file */
uint16_t di_projid_lo; /* lower part of owner's project id */
uint16_t di_projid_hi; /* higher part of owner's project id */
- uint8_t di_pad[6]; /* unused, zeroed space */
- uint16_t di_flushiter; /* incremented on flush */
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ uint64_t di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ uint64_t di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ uint8_t di_v2_pad[6]; /* V2 inode zeroed space */
+ uint16_t di_flushiter; /* V2 inode incremented on flush */
+ };
+ };
xfs_log_timestamp_t di_atime; /* time last accessed */
xfs_log_timestamp_t di_mtime; /* time last modified */
xfs_log_timestamp_t di_ctime; /* time created/inode modified */
xfs_fsize_t di_size; /* number of bytes in file */
xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
- xfs_extnum_t di_nextents; /* number of extents in data fork */
- xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ uint32_t di_nextents;
+ uint16_t di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ uint32_t di_big_anextents;
+ uint16_t di_nrext64_pad;
+ } __packed;
+ } __packed;
uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
int8_t di_aformat; /* format of attr fork's data */
uint32_t di_dmevmask; /* DMIG event mask */
@@ -869,4 +902,44 @@ struct xfs_icreate_log {
__be32 icl_gen; /* inode generation number to use */
};
+/*
+ * Flags for deferred attribute operations.
+ * Upper bits are flags, the lower byte is the type code.
+ */
+#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */
+#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */
+#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */
+#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */
+
+/*
+ * alfi_attr_filter captures the state of xfs_da_args.attr_filter, so it should
+ * never have any other bits set.
+ */
+#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \
+ XFS_ATTR_SECURE | \
+ XFS_ATTR_INCOMPLETE)
+
+/*
+ * This is the structure used to lay out an attr log item in the
+ * log.
+ */
+struct xfs_attri_log_format {
+ uint16_t alfi_type; /* attri log item type */
+ uint16_t alfi_size; /* size of this item */
+ uint32_t __pad; /* pad to 64 bit aligned */
+ uint64_t alfi_id; /* attri identifier */
+ uint64_t alfi_ino; /* the inode for this attr operation */
+ uint32_t alfi_op_flags; /* marks the op as a set or remove */
+ uint32_t alfi_name_len; /* attr name length */
+ uint32_t alfi_value_len; /* attr value length */
+ uint32_t alfi_attr_filter;/* attr filter flags */
+};
+
+struct xfs_attrd_log_format {
+ uint16_t alfd_type; /* attrd log item type */
+ uint16_t alfd_size; /* size of this item */
+ uint32_t __pad; /* pad to 64 bit aligned */
+ uint64_t alfd_alf_id; /* id of corresponding attri */
+};
+
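
Recovery pulls the operation back out of alfi_op_flags with the type mask; the name and value payloads travel as the XLOG_REG_TYPE_ATTR_NAME/XLOG_REG_TYPE_ATTR_VALUE regions added above. A hedged dispatch sketch (function and return conventions are illustrative):

        /* Classify a recovered attri item by its type code. */
        static int attri_op_type(const struct xfs_attri_log_format *attrp)
        {
                unsigned int op = attrp->alfi_op_flags &
                                  XFS_ATTRI_OP_FLAGS_TYPE_MASK;

                switch (op) {
                case XFS_ATTRI_OP_FLAGS_SET:
                case XFS_ATTRI_OP_FLAGS_REPLACE:
                        /* a name and a value region follow the format item */
                        return op;
                case XFS_ATTRI_OP_FLAGS_REMOVE:
                        /* only a name region is logged */
                        return op;
                default:
                        return -1;      /* unknown type code: corruption */
                }
        }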
#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index ff69a0000817..2420865f3007 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -72,6 +72,8 @@ extern const struct xlog_recover_item_ops xlog_rui_item_ops;
extern const struct xlog_recover_item_ops xlog_rud_item_ops;
extern const struct xlog_recover_item_ops xlog_cui_item_ops;
extern const struct xlog_recover_item_ops xlog_cud_item_ops;
+extern const struct xlog_recover_item_ops xlog_attri_item_ops;
+extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
/*
* Macros, structures, prototypes for internal log manager use.
@@ -108,12 +110,6 @@ struct xlog_recover {
#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr)
-/*
- * This is the number of entries in the l_buf_cancel_table used during
- * recovery.
- */
-#define XLOG_BC_TABLE_SIZE 64
-
#define XLOG_RECOVER_CRCPASS 0
#define XLOG_RECOVER_PASS1 1
#define XLOG_RECOVER_PASS2 2
@@ -126,5 +122,13 @@ int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino,
struct xfs_inode **ipp);
void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
uint64_t intent_id);
+int xlog_alloc_buf_cancel_table(struct xlog *log);
+void xlog_free_buf_cancel_table(struct xlog *log);
+
+#ifdef DEBUG
+void xlog_check_buf_cancel_table(struct xlog *log);
+#else
+#define xlog_check_buf_cancel_table(log) do { } while (0)
+#endif
#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index 67798ff5e14e..9975b93a7412 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -14,6 +14,7 @@
#include "xfs_trans_space.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
+#include "xfs_trace.h"
/*
* Calculate the maximum length in bytes that would be required for a local
@@ -37,6 +38,65 @@ xfs_log_calc_max_attrsetm_res(
}
/*
+ * Compute an alternate set of log reservation sizes for use exclusively with
+ * minimum log size calculations.
+ */
+static void
+xfs_log_calc_trans_resv_for_minlogblocks(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resv)
+{
+ unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
+
+ /*
+ * In the early days of rmap+reflink, we always set the rmap maxlevels
+ * to 9 even if the AG was small enough that it would never grow to
+ * that height. Transaction reservation sizes influence the minimum
+ * log size calculation, which influences the size of the log that mkfs
+ * creates. Use the old value here to ensure that newly formatted
+ * small filesystems will mount on older kernels.
+ */
+ if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
+ mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+
+ xfs_trans_resv_calc(mp, resv);
+
+ if (xfs_has_reflink(mp)) {
+ /*
+ * In the early days of reflink, typical log operation counts
+ * were greatly overestimated.
+ */
+ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ resv->tr_itruncate.tr_logcount =
+ XFS_ITRUNCATE_LOG_COUNT_REFLINK;
+ resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ } else if (xfs_has_rmapbt(mp)) {
+ /*
+ * In the early days of non-reflink rmap, the impact of rmapbt
+ * updates on log counts were not taken into account at all.
+ */
+ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ }
+
+ /*
+ * In the early days of reflink, we did not use deferred refcount
+ * update log items, so log reservations must be recomputed using the
+ * old calculations.
+ */
+ resv->tr_write.tr_logres =
+ xfs_calc_write_reservation_minlogsize(mp);
+ resv->tr_itruncate.tr_logres =
+ xfs_calc_itruncate_reservation_minlogsize(mp);
+ resv->tr_qm_dqalloc.tr_logres =
+ xfs_calc_qm_dqalloc_reservation_minlogsize(mp);
+
+ /* Put everything back the way it was. This goes at the end. */
+ mp->m_rmap_maxlevels = rmap_maxlevels;
+}
+
+/*
* Iterate over the log space reservation table to figure out and return
* the maximum one in terms of the pre-calculated values which were done
* at mount time.
@@ -46,19 +106,25 @@ xfs_log_get_max_trans_res(
struct xfs_mount *mp,
struct xfs_trans_res *max_resp)
{
+ struct xfs_trans_resv resv = {};
struct xfs_trans_res *resp;
struct xfs_trans_res *end_resp;
+ unsigned int i;
int log_space = 0;
int attr_space;
attr_space = xfs_log_calc_max_attrsetm_res(mp);
- resp = (struct xfs_trans_res *)M_RES(mp);
- end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
- for (; resp < end_resp; resp++) {
+ xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv);
+
+ resp = (struct xfs_trans_res *)&resv;
+ end_resp = (struct xfs_trans_res *)(&resv + 1);
+ for (i = 0; resp < end_resp; i++, resp++) {
int tmp = resp->tr_logcount > 1 ?
resp->tr_logres * resp->tr_logcount :
resp->tr_logres;
+
+ trace_xfs_trans_resv_calc_minlogsize(mp, i, resp);
if (log_space < tmp) {
log_space = tmp;
*max_resp = *resp; /* struct copy */
@@ -66,9 +132,10 @@ xfs_log_get_max_trans_res(
}
if (attr_space > log_space) {
- *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */
+ *max_resp = resv.tr_attrsetm; /* struct copy */
max_resp->tr_logres = attr_space;
}
+ trace_xfs_log_get_max_trans_res(mp, max_resp);
}
/*
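Note: the rewritten loop above walks struct xfs_trans_resv as if it were a flat array of struct xfs_trans_res, which only works because the resv struct consists of nothing but members of that one type. A minimal standalone sketch of the idiom, using illustrative type names rather than the real XFS definitions:

struct res {
	unsigned int	logres;
	int		logcount;
};

struct resv {
	struct res	write;
	struct res	itruncate;
	struct res	rename;	/* ... one member per transaction type */
};

/* Walk the struct as an array of its (homogeneous) members. */
static unsigned int max_logres(const struct resv *rv)
{
	const struct res	*r = (const struct res *)rv;
	const struct res	*end = (const struct res *)(rv + 1);
	unsigned int		max = 0;

	for (; r < end; r++)
		if (r->logres > max)
			max = r->logres;
	return max;
}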
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index a02c5062f9b2..cb035da3f990 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -16,7 +16,6 @@
* and quota-limits. This is a waste in the common case, but hey ...
*/
typedef uint64_t xfs_qcnt_t;
-typedef uint16_t xfs_qwarncnt_t;
typedef uint8_t xfs_dqtype_t;
@@ -29,8 +28,8 @@ typedef uint8_t xfs_dqtype_t;
/*
* flags for q_flags field in the dquot.
*/
-#define XFS_DQFLAG_DIRTY (1 << 0) /* dquot is dirty */
-#define XFS_DQFLAG_FREEING (1 << 1) /* dquot is being torn down */
+#define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */
+#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */
#define XFS_DQFLAG_STRINGS \
{ XFS_DQFLAG_DIRTY, "DIRTY" }, \
@@ -73,29 +72,45 @@ typedef uint8_t xfs_dqtype_t;
* to a single function. None of these XFS_QMOPT_* flags are meant to have
* persistent values (ie. their values can and will change between versions)
*/
-#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
-#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
-#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
-#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
-#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
+#define XFS_QMOPT_UQUOTA (1u << 0) /* user dquot requested */
+#define XFS_QMOPT_GQUOTA (1u << 1) /* group dquot requested */
+#define XFS_QMOPT_PQUOTA (1u << 2) /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES (1u << 3) /* ignore quota limits */
+#define XFS_QMOPT_SBVERSION (1u << 4) /* change superblock version num */
/*
* flags to xfs_trans_mod_dquot to indicate which field needs to be
* modified.
*/
-#define XFS_QMOPT_RES_REGBLKS 0x0010000
-#define XFS_QMOPT_RES_RTBLKS 0x0020000
-#define XFS_QMOPT_BCOUNT 0x0040000
-#define XFS_QMOPT_ICOUNT 0x0080000
-#define XFS_QMOPT_RTBCOUNT 0x0100000
-#define XFS_QMOPT_DELBCOUNT 0x0200000
-#define XFS_QMOPT_DELRTBCOUNT 0x0400000
-#define XFS_QMOPT_RES_INOS 0x0800000
+#define XFS_QMOPT_RES_REGBLKS (1u << 7)
+#define XFS_QMOPT_RES_RTBLKS (1u << 8)
+#define XFS_QMOPT_BCOUNT (1u << 9)
+#define XFS_QMOPT_ICOUNT (1u << 10)
+#define XFS_QMOPT_RTBCOUNT (1u << 11)
+#define XFS_QMOPT_DELBCOUNT (1u << 12)
+#define XFS_QMOPT_DELRTBCOUNT (1u << 13)
+#define XFS_QMOPT_RES_INOS (1u << 14)
/*
* flags for dqalloc.
*/
-#define XFS_QMOPT_INHERIT 0x1000000
+#define XFS_QMOPT_INHERIT (1u << 31)
+
+#define XFS_QMOPT_FLAGS \
+ { XFS_QMOPT_UQUOTA, "UQUOTA" }, \
+ { XFS_QMOPT_PQUOTA, "PQUOTA" }, \
+ { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \
+ { XFS_QMOPT_SBVERSION, "SBVERSION" }, \
+ { XFS_QMOPT_GQUOTA, "GQUOTA" }, \
+ { XFS_QMOPT_INHERIT, "INHERIT" }, \
+ { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \
+ { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \
+ { XFS_QMOPT_BCOUNT, "BCOUNT" }, \
+ { XFS_QMOPT_ICOUNT, "ICOUNT" }, \
+ { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \
+ { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \
+ { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \
+ { XFS_QMOPT_RES_INOS, "RES_INOS" }
/*
* flags to xfs_trans_mod_dquot.
@@ -114,6 +129,7 @@ typedef uint8_t xfs_dqtype_t;
(XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
struct xfs_disk_dquot *ddq, xfs_dqid_t id);
extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
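Note: the new XFS_QMOPT_FLAGS table pairs each bit with a printable name, in the same shape as the XFS_DQFLAG_STRINGS table earlier in this header; tables of { mask, "NAME" } pairs like this are typically consumed by tracepoint flag formatting. A hedged, userspace-style sketch of walking a table in that shape (the names below are hypothetical, not XFS symbols):

#include <stdio.h>

struct flag_name {
	unsigned int	mask;
	const char	*name;
};

static const struct flag_name qmopt_names[] = {
	{ 1u << 0, "UQUOTA" },
	{ 1u << 1, "GQUOTA" },
	{ 1u << 2, "PQUOTA" },
	{ 1u << 3, "FORCE_RES" },
};

/* Print the name of every flag bit set in @flags. */
static void print_qmopt(unsigned int flags)
{
	size_t i;

	for (i = 0; i < sizeof(qmopt_names) / sizeof(qmopt_names[0]); i++)
		if (flags & qmopt_names[i].mask)
			printf("%s|", qmopt_names[i].name);
	printf("\n");
}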
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 327ba25e9e17..64b910caafaa 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -111,7 +111,7 @@ xfs_refcount_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = cur->bc_ag.pag;
union xfs_btree_rec *rec;
int error;
xfs_agblock_t realstart;
@@ -121,8 +121,6 @@ xfs_refcount_get_rec(
return error;
xfs_refcount_btrec_to_irec(rec, irec);
-
- agno = cur->bc_ag.pag->pag_agno;
if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
goto out_bad_rec;
@@ -137,22 +135,23 @@ xfs_refcount_get_rec(
}
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(mp, agno, realstart))
+ if (!xfs_verify_agbno(pag, realstart))
goto out_bad_rec;
if (realstart > realstart + irec->rc_blockcount)
goto out_bad_rec;
- if (!xfs_verify_agbno(mp, agno, realstart + irec->rc_blockcount - 1))
+ if (!xfs_verify_agbno(pag, realstart + irec->rc_blockcount - 1))
goto out_bad_rec;
if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
goto out_bad_rec;
- trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+ trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec);
return 0;
out_bad_rec:
xfs_warn(mp,
- "Refcount BTree record corruption in AG %d detected!", agno);
+ "Refcount BTree record corruption in AG %d detected!",
+ pag->pag_agno);
xfs_warn(mp,
"Start block 0x%x, block count 0x%x, references 0x%x",
irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
@@ -886,8 +885,13 @@ xfs_refcount_still_have_space(
{
unsigned long overhead;
- overhead = cur->bc_ag.refc.shape_changes *
- xfs_allocfree_log_count(cur->bc_mp, 1);
+ /*
+ * Worst case estimate: full splits of the free space and rmap btrees
+ * to handle each of the shape changes to the refcount btree.
+ */
+ overhead = xfs_allocfree_block_count(cur->bc_mp,
+ cur->bc_ag.refc.shape_changes);
+ overhead += cur->bc_mp->m_refc_maxlevels;
overhead *= cur->bc_mp->m_sb.sb_blocksize;
/*
@@ -960,6 +964,7 @@ xfs_refcount_adjust_extents(
* Either cover the hole (increment) or
* delete the range (decrement).
*/
+ cur->bc_ag.refc.nr_ops++;
if (tmp.rc_refcount) {
error = xfs_refcount_insert(cur, &tmp,
&found_tmp);
@@ -970,7 +975,6 @@ xfs_refcount_adjust_extents(
error = -EFSCORRUPTED;
goto out_error;
}
- cur->bc_ag.refc.nr_ops++;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
@@ -1001,11 +1005,11 @@ xfs_refcount_adjust_extents(
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, &ext);
+ cur->bc_ag.refc.nr_ops++;
if (ext.rc_refcount > 1) {
error = xfs_refcount_update(cur, &ext);
if (error)
goto out_error;
- cur->bc_ag.refc.nr_ops++;
} else if (ext.rc_refcount == 1) {
error = xfs_refcount_delete(cur, &found_rec);
if (error)
@@ -1014,7 +1018,6 @@ xfs_refcount_adjust_extents(
error = -EFSCORRUPTED;
goto out_error;
}
- cur->bc_ag.refc.nr_ops++;
goto advloop;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
@@ -1173,8 +1176,8 @@ xfs_refcount_finish_one(
*pcur = NULL;
}
if (rcur == NULL) {
- error = xfs_alloc_read_agf(tp->t_mountp, tp, pag->pag_agno,
- XFS_ALLOC_FLAG_FREEING, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING,
+ &agbp);
if (error)
goto out_drop;
@@ -1706,7 +1709,7 @@ xfs_refcount_recover_cow_leftovers(
if (error)
return error;
- error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
goto out_trans;
cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 9eb01edbd89d..e8b322de7f3d 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -67,14 +67,17 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
* log (plus any key updates) so we'll conservatively assume 32 bytes
* per record. We must also leave space for btree splits on both ends
* of the range and space for the CUD and a new CUI.
+ *
+ * Each EFI that we attach to the transaction is assumed to consume ~32 bytes.
+ * This is a low estimate for an EFI tracking a single extent (16 bytes for the
+ * EFI header, 16 for the extent, and 12 for the xlog op header), but the
+ * estimate is acceptable if there's more than one extent being freed.
+ * In the worst case of freeing every other block during a refcount decrease
+ * operation, we amortize the space used for one EFI log item across 16
+ * extents.
*/
#define XFS_REFCOUNT_ITEM_OVERHEAD 32
-static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
-{
- return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
-}
-
extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
union xfs_btree_rec;
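Note: worked numbers behind that 32-byte figure, under the assumptions stated in the comment above: a single-extent EFI costs roughly 16 + 16 + 12 = 44 bytes, which exceeds the budget, but one EFI carrying 16 extents costs roughly 16 + 12 + 16 * 16 = 284 bytes, i.e. about 18 bytes per extent, comfortably inside the 32-byte-per-record estimate.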
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index d14c1720b0fb..316c1ec0c3c2 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -493,7 +493,7 @@ xfs_refcountbt_calc_reserves(
if (!xfs_has_reflink(mp))
return 0;
- error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
return error;
@@ -507,8 +507,7 @@ xfs_refcountbt_calc_reserves(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (mp->m_sb.sb_logstart &&
- XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno)
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
agblocks -= mp->m_sb.sb_logblocks;
*ask += xfs_refcountbt_max_size(mp, agblocks);
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index cd322174dbff..094dfc897ebc 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -34,18 +34,32 @@ int
xfs_rmap_lookup_le(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
- xfs_extlen_t len,
uint64_t owner,
uint64_t offset,
unsigned int flags,
+ struct xfs_rmap_irec *irec,
int *stat)
{
+ int get_stat = 0;
+ int error;
+
cur->bc_rec.r.rm_startblock = bno;
- cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_blockcount = 0;
cur->bc_rec.r.rm_owner = owner;
cur->bc_rec.r.rm_offset = offset;
cur->bc_rec.r.rm_flags = flags;
- return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ if (error || !(*stat) || !irec)
+ return error;
+
+ error = xfs_rmap_get_rec(cur, irec, &get_stat);
+ if (error)
+ return error;
+ if (!get_stat)
+ return -EFSCORRUPTED;
+
+ return 0;
}
/*
@@ -201,7 +215,7 @@ xfs_rmap_get_rec(
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = cur->bc_ag.pag;
union xfs_btree_rec *rec;
int error;
@@ -221,12 +235,12 @@ xfs_rmap_get_rec(
goto out_bad_rec;
} else {
/* check for valid extent range, including overflow */
- if (!xfs_verify_agbno(mp, agno, irec->rm_startblock))
+ if (!xfs_verify_agbno(pag, irec->rm_startblock))
goto out_bad_rec;
if (irec->rm_startblock >
irec->rm_startblock + irec->rm_blockcount)
goto out_bad_rec;
- if (!xfs_verify_agbno(mp, agno,
+ if (!xfs_verify_agbno(pag,
irec->rm_startblock + irec->rm_blockcount - 1))
goto out_bad_rec;
}
@@ -240,7 +254,7 @@ xfs_rmap_get_rec(
out_bad_rec:
xfs_warn(mp,
"Reverse Mapping BTree record corruption in AG %d detected!",
- agno);
+ pag->pag_agno);
xfs_warn(mp,
"Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x",
irec->rm_owner, irec->rm_flags, irec->rm_startblock,
@@ -251,7 +265,6 @@ out_bad_rec:
struct xfs_find_left_neighbor_info {
struct xfs_rmap_irec high;
struct xfs_rmap_irec *irec;
- int *stat;
};
/* For each rmap given, figure out if it matches the key we want. */
@@ -276,7 +289,6 @@ xfs_rmap_find_left_neighbor_helper(
return 0;
*info->irec = *rec;
- *info->stat = 1;
return -ECANCELED;
}
@@ -285,7 +297,7 @@ xfs_rmap_find_left_neighbor_helper(
* return a match with the same owner and adjacent physical and logical
* block ranges.
*/
-int
+STATIC int
xfs_rmap_find_left_neighbor(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
@@ -296,6 +308,7 @@ xfs_rmap_find_left_neighbor(
int *stat)
{
struct xfs_find_left_neighbor_info info;
+ int found = 0;
int error;
*stat = 0;
@@ -313,21 +326,44 @@ xfs_rmap_find_left_neighbor(
info.high.rm_flags = flags;
info.high.rm_blockcount = 0;
info.irec = irec;
- info.stat = stat;
trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
- error = xfs_rmap_query_range(cur, &info.high, &info.high,
- xfs_rmap_find_left_neighbor_helper, &info);
- if (error == -ECANCELED)
- error = 0;
- if (*stat)
- trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, irec->rm_startblock,
- irec->rm_blockcount, irec->rm_owner,
- irec->rm_offset, irec->rm_flags);
- return error;
+ /*
+ * Historically, we always used the range query to walk every reverse
+ * mapping that could possibly overlap the key that the caller asked
+ * for, and filter out the ones that don't. That is very slow when
+ * there are a lot of records.
+ *
+ * However, there are two scenarios where the classic btree search can
+ * produce correct results -- if the index contains a record that is an
+ * exact match for the lookup key; and if there are no other records
+ * between the record we want and the key we supplied.
+ *
+ * As an optimization, try a non-overlapped lookup first. This makes
+ * extent conversion and remap operations run a bit faster if the
+ * physical extents aren't being shared. If we don't find what we
+ * want, we fall back to the overlapped query.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+ &found);
+ if (error)
+ return error;
+ if (found)
+ error = xfs_rmap_find_left_neighbor_helper(cur, irec, &info);
+ if (!error)
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_find_left_neighbor_helper, &info);
+ if (error != -ECANCELED)
+ return error;
+
+ *stat = 1;
+ trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+ irec->rm_flags);
+ return 0;
}
/* For each rmap given, figure out if it matches the key we want. */
@@ -353,7 +389,6 @@ xfs_rmap_lookup_le_range_helper(
return 0;
*info->irec = *rec;
- *info->stat = 1;
return -ECANCELED;
}
@@ -374,6 +409,7 @@ xfs_rmap_lookup_le_range(
int *stat)
{
struct xfs_find_left_neighbor_info info;
+ int found = 0;
int error;
info.high.rm_startblock = bno;
@@ -386,20 +422,44 @@ xfs_rmap_lookup_le_range(
info.high.rm_blockcount = 0;
*stat = 0;
info.irec = irec;
- info.stat = stat;
- trace_xfs_rmap_lookup_le_range(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
- error = xfs_rmap_query_range(cur, &info.high, &info.high,
- xfs_rmap_lookup_le_range_helper, &info);
- if (error == -ECANCELED)
- error = 0;
- if (*stat)
- trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, irec->rm_startblock,
- irec->rm_blockcount, irec->rm_owner,
- irec->rm_offset, irec->rm_flags);
- return error;
+ trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ bno, 0, owner, offset, flags);
+
+ /*
+ * Historically, we always used the range query to walk every reverse
+ * mapping that could possibly overlap the key that the caller asked
+ * for, and filter out the ones that don't. That is very slow when
+ * there are a lot of records.
+ *
+ * However, there are two scenarios where the classic btree search can
+ * produce correct results -- if the index contains a record that is an
+ * exact match for the lookup key; and if there are no other records
+ * between the record we want and the key we supplied.
+ *
+ * As an optimization, try a non-overlapped lookup first. This makes
+ * scrub run much faster on most filesystems because bmbt records are
+ * usually an exact match for rmap records. If we don't find what we
+ * want, we fall back to the overlapped query.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+ &found);
+ if (error)
+ return error;
+ if (found)
+ error = xfs_rmap_lookup_le_range_helper(cur, irec, &info);
+ if (!error)
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_lookup_le_range_helper, &info);
+ if (error != -ECANCELED)
+ return error;
+
+ *stat = 1;
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+ irec->rm_flags);
+ return 0;
}
/*
@@ -510,7 +570,7 @@ xfs_rmap_unmap(
* for the AG headers at rm_startblock == 0 created by mkfs/growfs that
* will not ever be removed from the tree.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec, &i);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -518,13 +578,6 @@ xfs_rmap_unmap(
goto out_error;
}
- error = xfs_rmap_get_rec(cur, &ltrec, &i);
- if (error)
- goto out_error;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto out_error;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
@@ -786,18 +839,11 @@ xfs_rmap_map(
* record for our insertion point. This will also give us the record for
* start block contiguity tests.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec,
&have_lt);
if (error)
goto out_error;
if (have_lt) {
- error = xfs_rmap_get_rec(cur, &ltrec, &have_lt);
- if (error)
- goto out_error;
- if (XFS_IS_CORRUPT(mp, have_lt != 1)) {
- error = -EFSCORRUPTED;
- goto out_error;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
@@ -1022,7 +1068,7 @@ xfs_rmap_convert(
* record for our insertion point. This will also give us the record for
* start block contiguity tests.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, &PREV, &i);
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -1030,13 +1076,6 @@ xfs_rmap_convert(
goto done;
}
- error = xfs_rmap_get_rec(cur, &PREV, &i);
- if (error)
- goto done;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_ag.pag->pag_agno, PREV.rm_startblock,
PREV.rm_blockcount, PREV.rm_owner,
@@ -1140,7 +1179,7 @@ xfs_rmap_convert(
_RET_IP_);
/* reset the cursor back to PREV */
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i);
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -2677,7 +2716,7 @@ xfs_rmap_record_exists(
ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) ||
(flags & XFS_RMAP_BMBT_BLOCK));
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec,
&has_record);
if (error)
return error;
@@ -2686,14 +2725,6 @@ xfs_rmap_record_exists(
return 0;
}
- error = xfs_rmap_get_rec(cur, &irec, &has_record);
- if (error)
- return error;
- if (!has_record) {
- *has_rmap = false;
- return 0;
- }
-
*has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno &&
irec.rm_startblock + irec.rm_blockcount >= bno + len);
return 0;
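Note: both converted lookup paths above share the same shape: try the cheap non-overlapped LE lookup, run the caller's filter on any hit, and fall back to the expensive overlapped range query only when the fast path produced nothing acceptable (-ECANCELED serves as the "found and accepted" signal). A condensed sketch of that control flow, with hypothetical helper and type names standing in for the XFS ones:

struct cur;				/* stand-in for the btree cursor */
struct rec;				/* stand-in for the rmap record */

int fast_lookup_le(struct cur *cur, struct rec *irec, int *found);
int filter(struct cur *cur, const struct rec *rec, void *priv);
int overlapped_query(struct cur *cur,
		int (*fn)(struct cur *, const struct rec *, void *),
		void *priv);

static int lookup_with_fallback(struct cur *cur, struct rec *irec, int *stat)
{
	int	found = 0;
	int	error;

	*stat = 0;

	/* Fast path: plain LE lookup; correct for exact-match keys. */
	error = fast_lookup_le(cur, irec, &found);
	if (error)
		return error;
	if (found)
		error = filter(cur, irec, NULL); /* -ECANCELED == accepted */
	if (!error)
		/* Slow path: walk every possibly overlapping record. */
		error = overlapped_query(cur, filter, NULL);
	if (error != -ECANCELED)
		return error;	/* hard error, or no match at all */

	*stat = 1;
	return 0;
}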
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index b718ebeda372..54741a591a17 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -122,8 +122,8 @@ int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
const struct xfs_owner_info *oinfo);
int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, uint64_t owner, uint64_t offset,
- unsigned int flags, int *stat);
+ uint64_t owner, uint64_t offset, unsigned int flags,
+ struct xfs_rmap_irec *irec, int *stat);
int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, uint64_t owner, uint64_t offset,
unsigned int flags, int *stat);
@@ -184,9 +184,6 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
xfs_fsblock_t startblock, xfs_filblks_t blockcount,
xfs_exntst_t state, struct xfs_btree_cur **pcur);
-int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- uint64_t owner, uint64_t offset, unsigned int flags,
- struct xfs_rmap_irec *irec, int *stat);
int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
uint64_t owner, uint64_t offset, unsigned int flags,
struct xfs_rmap_irec *irec, int *stat);
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 69e104d0277f..7f83f62e51e0 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -90,7 +90,7 @@ xfs_rmapbt_alloc_block(
xfs_agblock_t bno;
/* Allocate the new block from the freelist. If we can't, give up. */
- error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp,
+ error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
&bno, 1);
if (error)
return error;
@@ -129,7 +129,7 @@ xfs_rmapbt_free_block(
bno, 1);
be32_add_cpu(&agf->agf_rmap_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
- error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+ error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
if (error)
return error;
@@ -652,7 +652,7 @@ xfs_rmapbt_calc_reserves(
if (!xfs_has_rmapbt(mp))
return 0;
- error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
return error;
@@ -666,8 +666,7 @@ xfs_rmapbt_calc_reserves(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (mp->m_sb.sb_logstart &&
- XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno)
+ if (xfs_ag_contains_log(mp, pag->pag_agno))
agblocks -= mp->m_sb.sb_logblocks;
/* Reserve 1% of the AG or enough for 1 block per record. */
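Note: two hunks above replace an open-coded internal-log test with xfs_ag_contains_log(). Judging purely from the lines being removed, the helper presumably reduces to something like this sketch:

static inline bool
xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno)
{
	/* An internal log exists and its start block falls in this AG. */
	return mp->m_sb.sb_logstart &&
	       XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno;
}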
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 5740ba664867..fa180ab66b73 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1008,6 +1008,7 @@ xfs_rtfree_extent(
/* Find all the free records within a given range. */
int
xfs_rtalloc_query_range(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *low_rec,
const struct xfs_rtalloc_rec *high_rec,
@@ -1015,7 +1016,6 @@ xfs_rtalloc_query_range(
void *priv)
{
struct xfs_rtalloc_rec rec;
- struct xfs_mount *mp = tp->t_mountp;
xfs_rtblock_t rtstart;
xfs_rtblock_t rtend;
xfs_rtblock_t high_key;
@@ -1048,7 +1048,7 @@ xfs_rtalloc_query_range(
rec.ar_startext = rtstart;
rec.ar_extcount = rtend - rtstart + 1;
- error = fn(tp, &rec, priv);
+ error = fn(mp, tp, &rec, priv);
if (error)
break;
}
@@ -1062,6 +1062,7 @@ xfs_rtalloc_query_range(
/* Find all the free records. */
int
xfs_rtalloc_query_all(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv)
@@ -1069,10 +1070,10 @@ xfs_rtalloc_query_all(
struct xfs_rtalloc_rec keys[2];
keys[0].ar_startext = 0;
- keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1;
+ keys[1].ar_startext = mp->m_sb.sb_rextents - 1;
keys[0].ar_extcount = keys[1].ar_extcount = 0;
- return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);
+ return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv);
}
/* Is the given extent all free? */
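Note: after this change the rtbitmap query functions take the mount explicitly instead of deriving it from a possibly-NULL transaction, and the callback gains the same parameter. A caller-side sketch under the new convention (the callback name and use case are hypothetical):

/* Accumulate the number of free rt extents reported by the query. */
static int
count_free_rtextents(
	struct xfs_mount		*mp,
	struct xfs_trans		*tp,
	const struct xfs_rtalloc_rec	*rec,
	void				*priv)
{
	uint64_t			*freexts = priv;

	*freexts += rec->ar_extcount;
	return 0;
}

/* ...then, in the caller: */
/* error = xfs_rtalloc_query_all(mp, tp, count_free_rtextents, &freexts); */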
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index f4e84aa1d50a..a20cade590e9 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -31,15 +31,66 @@
*/
/*
+ * Check that all the V4 feature bits that the V5 filesystem format requires are
+ * correctly set.
+ */
+static bool
+xfs_sb_validate_v5_features(
+ struct xfs_sb *sbp)
+{
+ /* We must not have any unknown V4 feature bits set */
+ if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS)
+ return false;
+
+ /*
+ * The CRC bit is considered an invalid V4 flag, so we have to add it
+ * manually to the OKBITS mask.
+ */
+ if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS |
+ XFS_SB_VERSION2_CRCBIT))
+ return false;
+
+ /* Now check all the required V4 feature flags are set. */
+
+#define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \
+ XFS_SB_VERSION_ALIGNBIT | \
+ XFS_SB_VERSION_LOGV2BIT | \
+ XFS_SB_VERSION_EXTFLGBIT | \
+ XFS_SB_VERSION_DIRV2BIT | \
+ XFS_SB_VERSION_MOREBITSBIT)
+
+#define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
+ XFS_SB_VERSION2_ATTR2BIT | \
+ XFS_SB_VERSION2_PROJID32BIT | \
+ XFS_SB_VERSION2_CRCBIT)
+
+ if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS)
+ return false;
+ if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS)
+ return false;
+ return true;
+}
+
+/*
* We support all XFS versions newer than a v4 superblock with V2 directories.
*/
bool
xfs_sb_good_version(
struct xfs_sb *sbp)
{
- /* all v5 filesystems are supported */
+ /*
+ * All v5 filesystems are supported, but we must check that all the
+ * required v4 feature flags are enabled correctly, because the code
+ * checks those feature flags rather than checking for v5 support.
+ */
if (xfs_sb_is_v5(sbp))
- return true;
+ return xfs_sb_validate_v5_features(sbp);
+
+ /* We must not have any unknown v4 feature bits set */
+ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+ ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+ (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+ return false;
/* versions prior to v4 are not supported */
if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4)
@@ -51,12 +102,6 @@ xfs_sb_good_version(
if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
return false;
- /* And must not have any unknown v4 feature bits set */
- if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
- ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
- (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
- return false;
-
/* It's a supported v4 filesystem */
return true;
}
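Note: the required-feature checks above rely on the standard all-bits-set mask idiom: (flags & MASK) == MASK holds only when every bit of MASK is present, whereas a bare (flags & MASK) would accept any subset. A tiny standalone illustration with made-up bits:

#define REQUIRED_BITS	((1u << 0) | (1u << 2))

static int has_all_required(unsigned int flags)
{
	return (flags & REQUIRED_BITS) == REQUIRED_BITS;
}

/* has_all_required(0x5) == 1; has_all_required(0x1) == 0 (bit 2 missing) */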
@@ -70,6 +115,8 @@ xfs_sb_version_to_features(
/* optional V4 features */
if (sbp->sb_rblocks > 0)
features |= XFS_FEAT_REALTIME;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)
+ features |= XFS_FEAT_NLINK;
if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)
features |= XFS_FEAT_ATTR;
if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT)
@@ -124,6 +171,9 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_BIGTIME;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
features |= XFS_FEAT_NEEDSREPAIR;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64)
+ features |= XFS_FEAT_NREXT64;
+
return features;
}
@@ -262,12 +312,15 @@ xfs_validate_sb_common(
bool has_dalign;
if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
- xfs_warn(mp, "bad magic number");
+ xfs_warn(mp,
+"Superblock has bad magic number 0x%x. Not an XFS filesystem?",
+ be32_to_cpu(dsb->sb_magicnum));
return -EWRONGFS;
}
if (!xfs_sb_good_version(sbp)) {
- xfs_warn(mp, "bad version");
+ xfs_warn(mp,
+"Superblock has unknown features enabled or corrupted feature masks.");
return -EWRONGFS;
}
@@ -911,6 +964,11 @@ xfs_log_sb(
* reservations that have been taken out percpu counters. If we have an
* unclean shutdown, this will be corrected by log recovery rebuilding
* the counters from the AGF block counts.
+ *
+ * Do not update sb_frextents here because it is not part of the lazy
+ * sb counters, despite having a percpu counter. It is always kept
+ * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
+ * and hence we don't have to update it here.
*/
if (xfs_has_lazysbcount(mp)) {
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
@@ -1135,6 +1193,8 @@ xfs_fs_geometry(
} else {
geo->logsectsize = BBSIZE;
}
+ if (xfs_has_large_extent_counts(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 25c4cab58851..c4381388c0c1 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -54,13 +54,23 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
/*
* Values for t_flags.
*/
-#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
-#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
-#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
-#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
-#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
-#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */
-#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */
+/* Transaction needs to be logged */
+#define XFS_TRANS_DIRTY (1u << 0)
+/* Superblock is dirty and needs to be logged */
+#define XFS_TRANS_SB_DIRTY (1u << 1)
+/* Transaction took a permanent log reservation */
+#define XFS_TRANS_PERM_LOG_RES (1u << 2)
+/* Synchronous transaction commit needed */
+#define XFS_TRANS_SYNC (1u << 3)
+/* Transaction can use reserve block pool */
+#define XFS_TRANS_RESERVE (1u << 4)
+/* Transaction should avoid VFS level superblock write accounting */
+#define XFS_TRANS_NO_WRITECOUNT (1u << 5)
+/* Transaction has freed blocks returned to its reservation */
+#define XFS_TRANS_RES_FDBLKS (1u << 6)
+/* Transaction contains an intent done log item */
+#define XFS_TRANS_HAS_INTENT_DONE (1u << 7)
+
/*
* LOWMODE is used by the allocator to activate the lowspace algorithm - when
* free space is running low the extent allocator may choose to allocate an
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index f0b38f4aba80..bdc777b9ec4a 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -204,7 +204,7 @@ xfs_failaddr_t
xfs_symlink_shortform_verify(
struct xfs_inode *ip)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
char *sfp = (char *)ifp->if_u1.if_data;
int size = ifp->if_bytes;
char *endp = sfp + size;
@@ -213,7 +213,7 @@ xfs_symlink_shortform_verify(
/*
* Zero length symlinks should never occur in memory as they are
- * never alllowed to exist on disk.
+ * never allowed to exist on disk.
*/
if (!size)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 6f83d9b306ee..2c4ad6e4bb14 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -56,15 +56,14 @@ xfs_calc_buf_res(
* Per-extent log reservation for the btree changes involved in freeing or
* allocating an extent. In classic XFS there were two trees that will be
* modified (bnobt + cntbt). With rmap enabled, there are three trees
- * (rmapbt). With reflink, there are four trees (refcountbt). The number of
- * blocks reserved is based on the formula:
+ * (rmapbt). The number of blocks reserved is based on the formula:
*
* num trees * ((2 blocks/level * max depth) - 1)
*
* Keep in mind that max depth is calculated separately for each type of tree.
*/
uint
-xfs_allocfree_log_count(
+xfs_allocfree_block_count(
struct xfs_mount *mp,
uint num_ops)
{
@@ -73,13 +72,24 @@ xfs_allocfree_log_count(
blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1);
if (xfs_has_rmapbt(mp))
blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
- if (xfs_has_reflink(mp))
- blocks += num_ops * (2 * mp->m_refc_maxlevels - 1);
return blocks;
}
/*
+ * Per-extent log reservation for refcount btree changes. These are never done
+ * in the same transaction as an allocation or a free, so we compute them
+ * separately.
+ */
+static unsigned int
+xfs_refcountbt_block_count(
+ struct xfs_mount *mp,
+ unsigned int num_ops)
+{
+ return num_ops * (2 * mp->m_refc_maxlevels - 1);
+}
+
+/*
* Logging inodes is really tricksy. They are logged in memory format,
* which means that what we write into the log doesn't directly translate into
* the amount of space they use on disk.
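Note: as a worked instance of the renamed function's formula, using illustrative values: with the two allocation btrees and m_alloc_maxlevels = 5, one operation reserves 2 * (2 * 5 - 1) = 18 blocks; with the rmapbt enabled at m_rmap_maxlevels = 9 (the historical fixed height mentioned earlier in this series), it adds 2 * 9 - 1 = 17 more, for 35 blocks per op.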
@@ -136,7 +146,7 @@ xfs_calc_inobt_res(
{
return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels,
XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -183,7 +193,7 @@ xfs_calc_inode_chunk_res(
{
uint res, size = 0;
- res = xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
if (alloc) {
/* icreate tx uses ordered buffers */
@@ -199,18 +209,18 @@ xfs_calc_inode_chunk_res(
/*
* Per-extent log reservation for the btree changes involved in freeing or
* allocating a realtime extent. We have to be able to log as many rtbitmap
- * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents,
- * as well as the realtime summary block.
+ * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime
+ * extents, as well as the realtime summary block.
*/
static unsigned int
-xfs_rtalloc_log_count(
+xfs_rtalloc_block_count(
struct xfs_mount *mp,
unsigned int num_ops)
{
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
unsigned int rtbmp_bytes;
- rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY;
+ rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY;
return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
}
@@ -233,6 +243,28 @@ xfs_rtalloc_log_count(
* register overflow from temporaries in the calculations.
*/
+/*
+ * Compute the log reservation required to handle the refcount update
+ * transaction. Refcount updates are always done via deferred log items.
+ *
+ * This is calculated as:
+ * Data device refcount updates (t1):
+ * the agfs of the ags containing the blocks: nr_ops * sector size
+ * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+static unsigned int
+xfs_calc_refcountbt_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+}
/*
* In a write transaction we can allocate a maximum of 2
@@ -247,7 +279,7 @@ xfs_rtalloc_log_count(
* the inode's bmap btree: max depth * block size
* the agfs of the ags from which the extents are allocated: 2 * sector
* the superblock free block counter: sector size
- * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ * the realtime bitmap: ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
* the realtime summary: 1 block
* the allocation btrees: 2 trees * (2 * max depth - 1) * block size
* And the bmap_finish transaction can free bmap blocks in a join (t3):
@@ -255,34 +287,65 @@ xfs_rtalloc_log_count(
* the agfls of the ags containing the blocks: 2 * sector size
* the super block free block counter: sector size
* the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
*/
STATIC uint
xfs_calc_write_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- unsigned int t1, t2, t3;
+ unsigned int t1, t2, t3, t4;
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) +
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
if (xfs_has_realtime(mp)) {
t2 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
blksz) +
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz);
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 1), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz);
} else {
t2 = 0;
}
t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
- return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ /*
+ * In the early days of reflink, we included enough reservation to log
+ * two refcountbt splits for each transaction. The codebase runs
+ * refcountbt updates in separate transactions now, so to compute the
+ * minimum log size, add the refcount btree splits back to t1 and t3 and
+ * do not account them separately as t4. Reflink did not support
+ * realtime when the reservations were established, so no adjustment to
+ * t2 is needed.
+ */
+ if (for_minlogsize) {
+ unsigned int adj = 0;
+
+ if (xfs_has_reflink(mp))
+ adj = xfs_calc_buf_res(
+ xfs_refcountbt_block_count(mp, 2),
+ blksz);
+ t1 += adj;
+ t3 += adj;
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ }
+
+ t4 = xfs_calc_refcountbt_reservation(mp, 1);
+ return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_write_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_write_reservation(mp, true);
}
/*
@@ -299,33 +362,62 @@ xfs_calc_write_reservation(
* the agf for each of the ags: 2 * sector size
* the agfl for each of the ags: 2 * sector size
* the super block to reflect the freed blocks: sector size
- * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ * the realtime bitmap:
+ * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
* the realtime summary: 2 exts * 1 block
* worst case split in allocation btrees per extent assuming 2 extents:
* 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
*/
STATIC uint
xfs_calc_itruncate_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- unsigned int t1, t2, t3;
+ unsigned int t1, t2, t3, t4;
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz);
if (xfs_has_realtime(mp)) {
t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
} else {
t3 = 0;
}
- return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ /*
+ * In the early days of reflink, we included enough reservation to log
+ * four refcountbt splits in the same transaction as bnobt/cntbt
+ * updates. The codebase runs refcountbt updates in separate
+ * transactions now, so to compute the minimum log size, add the
+ * refcount btree splits back here and do not compute them separately
+ * as t4. Reflink did not support realtime when the reservations were
+ * established, so do not adjust t3.
+ */
+ if (for_minlogsize) {
+ if (xfs_has_reflink(mp))
+ t2 += xfs_calc_buf_res(
+ xfs_refcountbt_block_count(mp, 4),
+ blksz);
+
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ }
+
+ t4 = xfs_calc_refcountbt_reservation(mp, 2);
+ return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_itruncate_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_itruncate_reservation(mp, true);
}
/*
@@ -349,7 +441,7 @@ xfs_calc_rename_reservation(
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
XFS_FSB_TO_B(mp, 1))));
}
@@ -389,7 +481,7 @@ xfs_calc_link_reservation(
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1))));
}
@@ -423,11 +515,11 @@ xfs_calc_remove_reservation(
{
return XFS_DQUOT_LOGRES(mp) +
xfs_calc_iunlink_add_reservation(mp) +
- max((xfs_calc_inode_res(mp, 1) +
+ max((xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -572,7 +664,7 @@ xfs_calc_growdata_reservation(
struct xfs_mount *mp)
{
return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -594,7 +686,7 @@ xfs_calc_growrtalloc_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_inode_res(mp, 1) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -670,7 +762,7 @@ xfs_calc_addafork_reservation(
xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -693,7 +785,7 @@ xfs_calc_attrinval_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4),
XFS_FSB_TO_B(mp, 1))));
}
@@ -760,7 +852,7 @@ xfs_calc_attrrm_reservation(
XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
(xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -791,13 +883,21 @@ xfs_calc_qm_setqlim_reservation(void)
*/
STATIC uint
xfs_calc_qm_dqalloc_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- return xfs_calc_write_reservation(mp) +
+ return xfs_calc_write_reservation(mp, for_minlogsize) +
xfs_calc_buf_res(1,
XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
}
+unsigned int
+xfs_calc_qm_dqalloc_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_qm_dqalloc_reservation(mp, true);
+}
+
/*
* Syncing the incore super block changes to disk.
* the super block to reflect the changes: sector size
@@ -814,36 +914,18 @@ xfs_trans_resv_calc(
struct xfs_mount *mp,
struct xfs_trans_resv *resp)
{
- unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
-
- /*
- * In the early days of rmap+reflink, we always set the rmap maxlevels
- * to 9 even if the AG was small enough that it would never grow to
- * that height. Transaction reservation sizes influence the minimum
- * log size calculation, which influences the size of the log that mkfs
- * creates. Use the old value here to ensure that newly formatted
- * small filesystems will mount on older kernels.
- */
- if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
- mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+ int logcount_adj = 0;
/*
* The following transactions are logged in physical format and
* require a permanent reservation on space.
*/
- resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
- if (xfs_has_reflink(mp))
- resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
- else
- resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false);
+ resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
- resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
- if (xfs_has_reflink(mp))
- resp->tr_itruncate.tr_logcount =
- XFS_ITRUNCATE_LOG_COUNT_REFLINK;
- else
- resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false);
+ resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
@@ -899,11 +981,9 @@ xfs_trans_resv_calc(
resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
- resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
- if (xfs_has_reflink(mp))
- resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
- else
- resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp,
+ false);
+ resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
/*
@@ -930,6 +1010,19 @@ xfs_trans_resv_calc(
resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
- /* Put everything back the way it was. This goes at the end. */
- mp->m_rmap_maxlevels = rmap_maxlevels;
+ /*
+ * Add one logcount for BUI items that appear with rmap or reflink,
+ * one logcount for refcount intent items, and one logcount for rmap
+ * intent items.
+ */
+ if (xfs_has_reflink(mp) || xfs_has_rmapbt(mp))
+ logcount_adj++;
+ if (xfs_has_reflink(mp))
+ logcount_adj++;
+ if (xfs_has_rmapbt(mp))
+ logcount_adj++;
+
+ resp->tr_itruncate.tr_logcount += logcount_adj;
+ resp->tr_write.tr_logcount += logcount_adj;
+ resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
}
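Note: the pattern applied throughout this file, sketched with hypothetical names: each affected reservation function grows a for_minlogsize flag that selects between the current runtime formula and the frozen historical one, and a thin public wrapper exposes only the minlogsize variant so callers cannot pick the wrong mode by accident.

static unsigned int historical_formula(struct xfs_mount *mp);
static unsigned int current_formula(struct xfs_mount *mp);

static unsigned int
xfs_calc_foo_reservation(
	struct xfs_mount	*mp,
	bool			for_minlogsize)
{
	if (for_minlogsize)	/* keep mkfs-era minimum log sizes stable */
		return historical_formula(mp);
	return current_formula(mp);
}

unsigned int
xfs_calc_foo_reservation_minlogsize(
	struct xfs_mount	*mp)
{
	return xfs_calc_foo_reservation(mp, true);
}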
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index fc4e9b369a3a..0554b9d775d2 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -73,7 +73,6 @@ struct xfs_trans_resv {
#define XFS_DEFAULT_LOG_COUNT 1
#define XFS_DEFAULT_PERM_LOG_COUNT 2
#define XFS_ITRUNCATE_LOG_COUNT 2
-#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
#define XFS_INACTIVE_LOG_COUNT 2
#define XFS_CREATE_LOG_COUNT 2
#define XFS_CREATE_TMPFILE_LOG_COUNT 2
@@ -83,13 +82,24 @@ struct xfs_trans_resv {
#define XFS_LINK_LOG_COUNT 2
#define XFS_RENAME_LOG_COUNT 2
#define XFS_WRITE_LOG_COUNT 2
-#define XFS_WRITE_LOG_COUNT_REFLINK 8
#define XFS_ADDAFORK_LOG_COUNT 2
#define XFS_ATTRINVAL_LOG_COUNT 1
#define XFS_ATTRSET_LOG_COUNT 3
#define XFS_ATTRRM_LOG_COUNT 3
+/*
+ * Original log operation counts were overestimated in the early days of
+ * reflink. These are retained here purely for minimum log size calculations
+ * and must not be used for runtime reservations.
+ */
+#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
+#define XFS_WRITE_LOG_COUNT_REFLINK 8
+
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
-uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops);
+uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
+
+unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index e810d23f2d97..5c2765934732 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -13,25 +13,13 @@
#include "xfs_mount.h"
#include "xfs_ag.h"
-/* Find the size of the AG, in blocks. */
-inline xfs_agblock_t
-xfs_ag_block_count(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
-{
- ASSERT(agno < mp->m_sb.sb_agcount);
-
- if (agno < mp->m_sb.sb_agcount - 1)
- return mp->m_sb.sb_agblocks;
- return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks);
-}
/*
* Verify that an AG block number pointer neither points outside the AG
* nor points at static metadata.
*/
-inline bool
-xfs_verify_agbno(
+static inline bool
+xfs_verify_agno_agbno(
struct xfs_mount *mp,
xfs_agnumber_t agno,
xfs_agblock_t agbno)
@@ -59,7 +47,7 @@ xfs_verify_fsbno(
if (agno >= mp->m_sb.sb_agcount)
return false;
- return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno));
+ return xfs_verify_agno_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno));
}
/*
@@ -85,40 +73,12 @@ xfs_verify_fsbext(
XFS_FSB_TO_AGNO(mp, fsbno + len - 1);
}
-/* Calculate the first and last possible inode number in an AG. */
-inline void
-xfs_agino_range(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agino_t *first,
- xfs_agino_t *last)
-{
- xfs_agblock_t bno;
- xfs_agblock_t eoag;
-
- eoag = xfs_ag_block_count(mp, agno);
-
- /*
- * Calculate the first inode, which will be in the first
- * cluster-aligned block after the AGFL.
- */
- bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align);
- *first = XFS_AGB_TO_AGINO(mp, bno);
-
- /*
- * Calculate the last inode, which will be at the end of the
- * last (aligned) cluster that can be allocated in the AG.
- */
- bno = round_down(eoag, M_IGEO(mp)->cluster_align);
- *last = XFS_AGB_TO_AGINO(mp, bno) - 1;
-}
-
/*
* Verify that an AG inode number pointer neither points outside the AG
* nor points at static metadata.
*/
-inline bool
-xfs_verify_agino(
+static inline bool
+xfs_verify_agno_agino(
struct xfs_mount *mp,
xfs_agnumber_t agno,
xfs_agino_t agino)
@@ -131,19 +91,6 @@ xfs_verify_agino(
}
/*
- * Verify that an AG inode number pointer neither points outside the AG
- * nor points at static metadata, or is NULLAGINO.
- */
-bool
-xfs_verify_agino_or_null(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agino_t agino)
-{
- return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
-}
-
-/*
* Verify that an FS inode number pointer neither points outside the
* filesystem nor points at static AG metadata.
*/
@@ -159,7 +106,7 @@ xfs_verify_ino(
return false;
if (XFS_AGINO_TO_INO(mp, agno, agino) != ino)
return false;
- return xfs_verify_agino(mp, agno, agino);
+ return xfs_verify_agno_agino(mp, agno, agino);
}
/* Is this an internal inode number? */
@@ -229,12 +176,8 @@ xfs_icount_range(
/* root, rtbitmap, rtsum all live in the first chunk */
*min = XFS_INODES_PER_CHUNK;
- for_each_perag(mp, agno, pag) {
- xfs_agino_t first, last;
-
- xfs_agino_range(mp, agno, &first, &last);
- nr_inos += last - first + 1;
- }
+ for_each_perag(mp, agno, pag)
+ nr_inos += pag->agino_max - pag->agino_min + 1;
*max = nr_inos;
}
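Note: the deleted helpers above did not disappear; callers now read precomputed per-AG fields (pag->block_count, pag->agino_min, pag->agino_max). Based on the arithmetic in the removed functions, the perag setup presumably computes those fields once per AG along these lines (a sketch, not the actual initialization code):

static void
init_perag_geometry(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag)
{
	xfs_agblock_t		bno;

	/* AG size in blocks: only the last AG may be short. */
	if (pag->pag_agno < mp->m_sb.sb_agcount - 1)
		pag->block_count = mp->m_sb.sb_agblocks;
	else
		pag->block_count = mp->m_sb.sb_dblocks -
			(xfs_rfsblock_t)pag->pag_agno * mp->m_sb.sb_agblocks;

	/* First inode: first cluster-aligned block after the AGFL. */
	bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align);
	pag->agino_min = XFS_AGB_TO_AGINO(mp, bno);

	/* Last inode: end of the last aligned cluster in the AG. */
	bno = round_down(pag->block_count, M_IGEO(mp)->cluster_align);
	pag->agino_max = XFS_AGB_TO_AGINO(mp, bno) - 1;
}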
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b6da06b40989..a6b7d98cf68f 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -12,8 +12,8 @@ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
typedef uint32_t xfs_extlen_t; /* extent length in blocks */
typedef uint32_t xfs_agnumber_t; /* allocation group number */
-typedef int32_t xfs_extnum_t; /* # of extents in a file */
-typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */
+typedef uint64_t xfs_extnum_t; /* # of extents in a file */
+typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
typedef int64_t xfs_fsize_t; /* bytes in a file */
typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
@@ -57,13 +57,6 @@ typedef void * xfs_failaddr_t;
#define NULLAGINO ((xfs_agino_t)-1)
/*
- * Max values for extlen, extnum, aextnum.
- */
-#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */
-#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */
-#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */
-
-/*
* Minimum and maximum blocksize and sectorsize.
* The blocksize upper limit is pretty much arbitrary.
* The sectorsize upper limit is due to sizeof(sb_sectsize).
@@ -186,19 +179,10 @@ enum xfs_ag_resv_type {
*/
struct xfs_mount;
-xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno);
-bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno);
bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno,
xfs_fsblock_t len);
-void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t *first, xfs_agino_t *last);
-bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t agino);
-bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t agino);
bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 90aebfe9dc5f..b7b838bd4ba4 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -541,16 +541,16 @@ xchk_agf(
/* Check the AG length */
eoag = be32_to_cpu(agf->agf_length);
- if (eoag != xfs_ag_block_count(mp, agno))
+ if (eoag != pag->block_count)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
/* Check the AGF btree roots and levels */
agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
@@ -563,7 +563,7 @@ xchk_agf(
if (xfs_has_rmapbt(mp)) {
agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
@@ -573,7 +573,7 @@ xchk_agf(
if (xfs_has_reflink(mp)) {
agbno = be32_to_cpu(agf->agf_refcount_root);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
level = be32_to_cpu(agf->agf_refcount_level);
@@ -639,9 +639,8 @@ xchk_agfl_block(
{
struct xchk_agfl_info *sai = priv;
struct xfs_scrub *sc = sai->sc;
- xfs_agnumber_t agno = sc->sa.pag->pag_agno;
- if (xfs_verify_agbno(mp, agno, agbno) &&
+ if (xfs_verify_agbno(sc->sa.pag, agbno) &&
sai->nr_entries < sai->sz_entries)
sai->entries[sai->nr_entries++] = agbno;
else
@@ -871,12 +870,12 @@ xchk_agi(
/* Check the AG length */
eoag = be32_to_cpu(agi->agi_length);
- if (eoag != xfs_ag_block_count(mp, agno))
+ if (eoag != pag->block_count)
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check btree roots and levels */
agbno = be32_to_cpu(agi->agi_root);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
level = be32_to_cpu(agi->agi_level);
@@ -885,7 +884,7 @@ xchk_agi(
if (xfs_has_finobt(mp)) {
agbno = be32_to_cpu(agi->agi_free_root);
- if (!xfs_verify_agbno(mp, agno, agbno))
+ if (!xfs_verify_agbno(pag, agbno))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
level = be32_to_cpu(agi->agi_free_level);
@@ -902,17 +901,17 @@ xchk_agi(
/* Check inode pointers */
agino = be32_to_cpu(agi->agi_newino);
- if (!xfs_verify_agino_or_null(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(pag, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
agino = be32_to_cpu(agi->agi_dirino);
- if (!xfs_verify_agino_or_null(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(pag, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check unlinked inode buckets */
for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
agino = be32_to_cpu(agi->agi_unlinked[i]);
- if (!xfs_verify_agino_or_null(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(pag, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 6da7f2ca77de..1b0b4e243f77 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -106,7 +106,7 @@ xrep_agf_check_agfl_block(
{
struct xfs_scrub *sc = priv;
- if (!xfs_verify_agbno(mp, sc->sa.pag->pag_agno, agbno))
+ if (!xfs_verify_agbno(sc->sa.pag, agbno))
return -EFSCORRUPTED;
return 0;
}
@@ -130,10 +130,7 @@ xrep_check_btree_root(
struct xfs_scrub *sc,
struct xrep_find_ag_btree *fab)
{
- struct xfs_mount *mp = sc->mp;
- xfs_agnumber_t agno = sc->sm->sm_agno;
-
- return xfs_verify_agbno(mp, agno, fab->root) &&
+ return xfs_verify_agbno(sc->sa.pag, fab->root) &&
fab->height <= fab->maxlevels;
}
@@ -201,8 +198,7 @@ xrep_agf_init_header(
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
agf->agf_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
- agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp,
- sc->sa.pag->pag_agno));
+ agf->agf_length = cpu_to_be32(sc->sa.pag->block_count);
agf->agf_flfirst = old_agf->agf_flfirst;
agf->agf_fllast = old_agf->agf_fllast;
agf->agf_flcount = old_agf->agf_flcount;
@@ -405,7 +401,7 @@ xrep_agf(
* btrees rooted in the AGF. If the AGFL contents are obviously bad
* then we'll bail out.
*/
- error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.pag->pag_agno, &agfl_bp);
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
if (error)
return error;
@@ -666,8 +662,7 @@ xrep_agfl(
* nothing wrong with the AGF, but all the AG header repair functions
* have this chicken-and-egg problem.
*/
- error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0,
- &agf_bp);
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
if (error)
return error;
@@ -742,8 +737,7 @@ xrep_agi_find_btrees(
int error;
/* Read the AGF. */
- error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0,
- &agf_bp);
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
if (error)
return error;
@@ -782,8 +776,7 @@ xrep_agi_init_header(
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
agi->agi_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
- agi->agi_length = cpu_to_be32(xfs_ag_block_count(mp,
- sc->sa.pag->pag_agno));
+ agi->agi_length = cpu_to_be32(sc->sa.pag->block_count);
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
if (xfs_has_crc(mp))
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 87518e1292f8..ab427b4d7fe0 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -93,8 +93,7 @@ xchk_allocbt_rec(
struct xchk_btree *bs,
const union xfs_btree_rec *rec)
{
- struct xfs_mount *mp = bs->cur->bc_mp;
- xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
xfs_agblock_t bno;
xfs_extlen_t len;
@@ -102,8 +101,8 @@ xchk_allocbt_rec(
len = be32_to_cpu(rec->alloc.ar_blockcount);
if (bno + len <= bno ||
- !xfs_verify_agbno(mp, agno, bno) ||
- !xfs_verify_agbno(mp, agno, bno + len - 1))
+ !xfs_verify_agbno(pag, bno) ||
+ !xfs_verify_agbno(pag, bno + len - 1))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
xchk_allocbt_xref(bs->sc, bno, len);
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index a4cbbc346f60..f0b9cb6506fd 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -133,29 +133,13 @@ xchk_bmap_get_rmap(
if (info->is_shared) {
error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno,
owner, offset, rflags, rmap, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
- return false;
- goto out;
+ } else {
+ error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno,
+ owner, offset, rflags, rmap, &has_rmap);
}
-
- /*
- * Otherwise, use the (faster) regular lookup.
- */
- error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, 0, owner,
- offset, rflags, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
+ if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur))
return false;
- if (!has_rmap)
- goto out;
- error = xfs_rmap_get_rec(info->sc->sa.rmap_cur, rmap, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
- return false;
-
-out:
if (!has_rmap)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@@ -350,7 +334,7 @@ xchk_bmap_iextent(
irec->br_startoff);
/* Make sure the extent points to a valid place. */
- if (irec->br_blockcount > MAXEXTLEN)
+ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (info->is_rt &&
@@ -393,7 +377,7 @@ xchk_bmapbt_rec(
struct xfs_inode *ip = bs->cur->bc_ino.ip;
struct xfs_buf *bp = NULL;
struct xfs_btree_block *block;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, info->whichfork);
uint64_t owner;
int i;
@@ -442,7 +426,7 @@ xchk_bmap_btree(
struct xchk_bmap_info *info)
{
struct xfs_owner_info oinfo;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
struct xfs_mount *mp = sc->mp;
struct xfs_inode *ip = sc->ip;
struct xfs_btree_cur *cur;
@@ -494,7 +478,7 @@ xchk_bmap_check_rmap(
return 0;
/* Now look up the bmbt record. */
- ifp = XFS_IFORK_PTR(sc->ip, sbcri->whichfork);
+ ifp = xfs_ifork_ptr(sc->ip, sbcri->whichfork);
if (!ifp) {
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
rec->rm_offset);
@@ -556,7 +540,7 @@ xchk_bmap_check_ag_rmaps(
struct xfs_buf *agf;
int error;
- error = xfs_alloc_read_agf(sc->mp, sc->tp, pag->pag_agno, 0, &agf);
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf);
if (error)
return error;
@@ -579,7 +563,7 @@ xchk_bmap_check_rmaps(
struct xfs_scrub *sc,
int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
struct xfs_perag *pag;
xfs_agnumber_t agno;
bool zero_size;
@@ -594,7 +578,7 @@ xchk_bmap_check_rmaps(
if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK)
return 0;
- ASSERT(XFS_IFORK_PTR(sc->ip, whichfork) != NULL);
+ ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
/*
* Only do this for complex maps that are in btree format, or for
@@ -640,7 +624,7 @@ xchk_bmap(
struct xchk_bmap_info info = { NULL };
struct xfs_mount *mp = sc->mp;
struct xfs_inode *ip = sc->ip;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_fileoff_t endoff;
struct xfs_iext_cursor icur;
int error = 0;
@@ -705,7 +689,7 @@ xchk_bmap(
/* Scrub extent records. */
info.lastoff = 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ ifp = xfs_ifork_ptr(ip, whichfork);
for_each_xfs_iext(ifp, &icur, &irec) {
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 39dd46f038fe..2f4519590dc1 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -462,7 +462,7 @@ xchk_btree_check_iroot_minrecs(
*/
if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
- XFS_IFORK_Q(bs->sc->ip))
+ xfs_inode_has_attr_fork(bs->sc->ip))
return false;
return true;
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index bf1f3607d0b6..9bbbf20f401b 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -23,6 +23,8 @@
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
@@ -414,15 +416,15 @@ xchk_ag_read_headers(
if (!sa->pag)
return -ENOENT;
- error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp);
+ error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
return error;
- error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp);
+ error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
return error;
- error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp);
+ error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp);
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
return error;
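
The xchk_ag_read_headers() hunk above shows the new reader signatures: the perag supplies both the mount and the AG number, so the (mp, agno) argument pairs disappear. A caller sketch using only the signatures visible in this diff; the wrapper function itself is hypothetical:

static int
read_ag_headers_sketch(struct xfs_perag *pag, struct xfs_trans *tp,
		       struct xfs_buf **agi_bp, struct xfs_buf **agf_bp,
		       struct xfs_buf **agfl_bp)
{
	int error;

	error = xfs_ialloc_read_agi(pag, tp, agi_bp);
	if (error)
		return error;
	error = xfs_alloc_read_agf(pag, tp, 0, agf_bp);
	if (error)
		return error;
	return xfs_alloc_read_agfl(pag, tp, agfl_bp);
}
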
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index b962cfbbd92b..84fe3d33d699 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -482,7 +482,7 @@ xchk_da_btree(
int error;
/* Skip short format data structures; no btree to scan. */
- if (!xfs_ifork_has_extents(XFS_IFORK_PTR(sc->ip, whichfork)))
+ if (!xfs_ifork_has_extents(xfs_ifork_ptr(sc->ip, whichfork)))
return 0;
/* Set up initial da state. */
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 38897adde7b5..b594f02a52c4 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -99,7 +99,7 @@ out:
* we check the inode number to make sure it's sane, then we check that
* we can look up this filename. Finally, we check the ftype.
*/
-STATIC int
+STATIC bool
xchk_dir_actor(
struct dir_context *dir_iter,
const char *name,
@@ -124,7 +124,7 @@ xchk_dir_actor(
xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos));
if (xchk_should_terminate(sdc->sc, &error))
- return error;
+ return !error;
/* Does this inode number make sense? */
if (!xfs_verify_dir_ino(mp, ino)) {
@@ -191,8 +191,8 @@ out:
* and return zero to xchk_directory.
*/
if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return -EFSCORRUPTED;
- return error;
+ return false;
+ return !error;
}
/* Scrub a directory btree record. */
@@ -667,7 +667,7 @@ xchk_directory_blocks(
{
struct xfs_bmbt_irec got;
struct xfs_da_args args;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
struct xfs_mount *mp = sc->mp;
xfs_fileoff_t leaf_lblk;
xfs_fileoff_t free_lblk;
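
The STATIC int to STATIC bool changes in xchk_dir_actor() above (and xchk_parent_actor() further down) track the VFS dir_context actor contract: actors now return true to continue iteration and false to stop, and any error is carried in the caller's private state rather than the return value. A hedged sketch of that contract; the context struct and its fields are hypothetical:

/* Hypothetical actor showing the bool-returning dir_context contract. */
struct dir_ctx_sketch {
	struct dir_context	dc;
	int			error;	/* inspected by the caller afterwards */
};

static bool
dir_actor_sketch(struct dir_context *dc, const char *name, int namelen,
		 loff_t pos, u64 ino, unsigned type)
{
	struct dir_ctx_sketch *ctx =
		container_of(dc, struct dir_ctx_sketch, dc);

	if (ctx->error)
		return false;	/* stop; the caller reads ctx->error */
	return true;		/* keep iterating */
}
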
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 48a6cbdf95d0..6a6f8fe7f87c 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -78,10 +78,10 @@ xchk_fscount_warmup(
continue;
/* Lock both AG headers. */
- error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp);
+ error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
if (error)
break;
- error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
if (error)
break;
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 2e61df3bca83..aa65ec88a0c0 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -8,6 +8,8 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_btree.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "scrub/scrub.h"
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 00848ee542fb..e1026e07bf94 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -104,13 +104,13 @@ xchk_iallocbt_chunk(
xfs_extlen_t len)
{
struct xfs_mount *mp = bs->cur->bc_mp;
- xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
xfs_agblock_t bno;
bno = XFS_AGINO_TO_AGBNO(mp, agino);
if (bno + len <= bno ||
- !xfs_verify_agbno(mp, agno, bno) ||
- !xfs_verify_agbno(mp, agno, bno + len - 1))
+ !xfs_verify_agbno(pag, bno) ||
+ !xfs_verify_agbno(pag, bno + len - 1))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len);
@@ -421,10 +421,10 @@ xchk_iallocbt_rec(
const union xfs_btree_rec *rec)
{
struct xfs_mount *mp = bs->cur->bc_mp;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
struct xchk_iallocbt *iabt = bs->private;
struct xfs_inobt_rec_incore irec;
uint64_t holes;
- xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
xfs_agino_t agino;
xfs_extlen_t len;
int holecount;
@@ -446,8 +446,8 @@ xchk_iallocbt_rec(
agino = irec.ir_startino;
/* Record has to be properly aligned within the AG. */
- if (!xfs_verify_agino(mp, agno, agino) ||
- !xfs_verify_agino(mp, agno, agino + XFS_INODES_PER_CHUNK - 1)) {
+ if (!xfs_verify_agino(pag, agino) ||
+ !xfs_verify_agino(pag, agino + XFS_INODES_PER_CHUNK - 1)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
goto out;
}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index eac15af7b08c..51820b40ab1c 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -232,7 +232,8 @@ xchk_dinode(
size_t fork_recs;
unsigned long long isize;
uint64_t flags2;
- uint32_t nextents;
+ xfs_extnum_t nextents;
+ xfs_extnum_t naextents;
prid_t prid;
uint16_t flags;
uint16_t mode;
@@ -390,8 +391,10 @@ xchk_dinode(
xchk_inode_extsize(sc, dip, ino, mode, flags);
+ nextents = xfs_dfork_data_extents(dip);
+ naextents = xfs_dfork_attr_extents(dip);
+
/* di_nextents */
- nextents = be32_to_cpu(dip->di_nextents);
fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
switch (dip->di_format) {
case XFS_DINODE_FMT_EXTENTS:
@@ -411,7 +414,7 @@ xchk_dinode(
/* di_forkoff */
if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
xchk_ino_set_corrupt(sc, ino);
- if (dip->di_anextents != 0 && dip->di_forkoff == 0)
+ if (naextents != 0 && dip->di_forkoff == 0)
xchk_ino_set_corrupt(sc, ino);
if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS)
xchk_ino_set_corrupt(sc, ino);
@@ -423,19 +426,18 @@ xchk_dinode(
xchk_ino_set_corrupt(sc, ino);
/* di_anextents */
- nextents = be16_to_cpu(dip->di_anextents);
fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
switch (dip->di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
- if (nextents > fork_recs)
+ if (naextents > fork_recs)
xchk_ino_set_corrupt(sc, ino);
break;
case XFS_DINODE_FMT_BTREE:
- if (nextents <= fork_recs)
+ if (naextents <= fork_recs)
xchk_ino_set_corrupt(sc, ino);
break;
default:
- if (nextents != 0)
+ if (naextents != 0)
xchk_ino_set_corrupt(sc, ino);
}
@@ -513,14 +515,14 @@ xchk_inode_xref_bmap(
&nextents, &count);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
- if (nextents < be32_to_cpu(dip->di_nextents))
+ if (nextents < xfs_dfork_data_extents(dip))
xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
&nextents, &acount);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
- if (nextents != be16_to_cpu(dip->di_anextents))
+ if (nextents != xfs_dfork_attr_extents(dip))
xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
/* Check nblocks against the inode. */
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index ab182a5cd0c0..d8dff3fd8053 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -38,7 +38,7 @@ struct xchk_parent_ctx {
};
/* Look for a single entry in a directory pointing to an inode. */
-STATIC int
+STATIC bool
xchk_parent_actor(
struct dir_context *dc,
const char *name,
@@ -62,7 +62,7 @@ xchk_parent_actor(
if (xchk_should_terminate(spc->sc, &error))
spc->cancelled = true;
- return error;
+ return !error;
}
/* Count the number of dentries in the parent dir that point to this inode. */
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 3c7506c7553c..21b4c9006859 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -185,7 +185,7 @@ xchk_quota_data_fork(
/* Check for data fork problems that apply only to quota files. */
max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
- ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
for_each_xfs_iext(ifp, &icur, &irec) {
if (xchk_should_terminate(sc, &error))
break;
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 2744eecdbaf0..c68b767dc08f 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -13,6 +13,8 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
#include "xfs_ag.h"
/*
@@ -332,9 +334,8 @@ xchk_refcountbt_rec(
struct xchk_btree *bs,
const union xfs_btree_rec *rec)
{
- struct xfs_mount *mp = bs->cur->bc_mp;
xfs_agblock_t *cow_blocks = bs->private;
- xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
xfs_agblock_t bno;
xfs_extlen_t len;
xfs_nlink_t refcount;
@@ -354,8 +355,8 @@ xchk_refcountbt_rec(
/* Check the extent. */
bno &= ~XFS_REFC_COW_START;
if (bno + len <= bno ||
- !xfs_verify_agbno(mp, agno, bno) ||
- !xfs_verify_agbno(mp, agno, bno + len - 1))
+ !xfs_verify_agbno(pag, bno) ||
+ !xfs_verify_agbno(pag, bno + len - 1))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
if (refcount == 0)
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 1e7b6b209ee8..c18bd039fce9 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -199,7 +199,7 @@ xrep_calc_ag_resblks(
icount = pag->pagi_count;
} else {
/* Try to get the actual counters from disk. */
- error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
+ error = xfs_ialloc_read_agi(pag, NULL, &bp);
if (!error) {
icount = pag->pagi_count;
xfs_buf_relse(bp);
@@ -207,9 +207,9 @@ xrep_calc_ag_resblks(
}
/* Now grab the block counters from the AGF. */
- error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
+ error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
if (error) {
- aglen = xfs_ag_block_count(mp, sm->sm_agno);
+ aglen = pag->block_count;
freelen = aglen;
usedlen = aglen;
} else {
@@ -220,25 +220,22 @@ xrep_calc_ag_resblks(
usedlen = aglen - freelen;
xfs_buf_relse(bp);
}
- xfs_perag_put(pag);
/* If the icount is impossible, make some worst-case assumptions. */
if (icount == NULLAGINO ||
- !xfs_verify_agino(mp, sm->sm_agno, icount)) {
- xfs_agino_t first, last;
-
- xfs_agino_range(mp, sm->sm_agno, &first, &last);
- icount = last - first + 1;
+ !xfs_verify_agino(pag, icount)) {
+ icount = pag->agino_max - pag->agino_min + 1;
}
/* If the block counts are impossible, make worst-case assumptions. */
if (aglen == NULLAGBLOCK ||
- aglen != xfs_ag_block_count(mp, sm->sm_agno) ||
+ aglen != pag->block_count ||
freelen >= aglen) {
- aglen = xfs_ag_block_count(mp, sm->sm_agno);
+ aglen = pag->block_count;
freelen = aglen;
usedlen = aglen;
}
+ xfs_perag_put(pag);
trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
freelen, usedlen);
@@ -300,13 +297,13 @@ xrep_alloc_ag_block(
switch (resv) {
case XFS_AG_RESV_AGFL:
case XFS_AG_RESV_RMAPBT:
- error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
+ error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp,
+ sc->sa.agf_bp, &bno, 1);
if (error)
return error;
if (bno == NULLAGBLOCK)
return -ENOSPC;
- xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno,
- 1, false);
+ xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false);
*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno);
if (resv == XFS_AG_RESV_RMAPBT)
xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno);
@@ -457,16 +454,19 @@ xrep_invalidate_blocks(
* assume it's owned by someone else.
*/
for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
+ int error;
+
/* Skip AG headers and post-EOFS blocks */
if (!xfs_verify_fsbno(sc->mp, fsbno))
continue;
- bp = xfs_buf_incore(sc->mp->m_ddev_targp,
+ error = xfs_buf_incore(sc->mp->m_ddev_targp,
XFS_FSB_TO_DADDR(sc->mp, fsbno),
- XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK);
- if (bp) {
- xfs_trans_bjoin(sc->tp, bp);
- xfs_trans_binval(sc->tp, bp);
- }
+ XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
+ if (error)
+ continue;
+
+ xfs_trans_bjoin(sc->tp, bp);
+ xfs_trans_binval(sc->tp, bp);
}
return 0;
@@ -516,8 +516,8 @@ xrep_put_freelist(
return error;
/* Put the block on the AGFL. */
- error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
- agbno, 0);
+ error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
+ sc->sa.agfl_bp, agbno, 0);
if (error)
return error;
xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
@@ -536,13 +536,12 @@ xrep_reap_block(
{
struct xfs_btree_cur *cur;
struct xfs_buf *agf_bp = NULL;
- xfs_agnumber_t agno;
xfs_agblock_t agbno;
bool has_other_rmap;
int error;
- agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+ ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
/*
* If we are repairing per-inode metadata, we need to read in the AGF
@@ -550,7 +549,7 @@ xrep_reap_block(
* the AGF buffer that the setup functions already grabbed.
*/
if (sc->ip) {
- error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
if (error)
return error;
} else {
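
Besides the perag conversions, the xrep_calc_ag_resblks() hunk above also moves xfs_perag_put() below the last pag dereference, since the worst-case fallbacks now read pag->block_count and pag->agino_min/agino_max. The corrected scoping pattern as a sketch; the helper is hypothetical:

static xfs_extlen_t
ag_usable_blocks_sketch(struct xfs_mount *mp, xfs_agnumber_t agno)
{
	struct xfs_perag	*pag = xfs_perag_get(mp, agno);
	xfs_extlen_t		len = pag->block_count;

	xfs_perag_put(pag);	/* drop the reference only after the final use */
	return len;
}
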
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 8dae0345c7df..229826b2e1c0 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -92,7 +92,7 @@ xchk_rmapbt_rec(
{
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_rmap_irec irec;
- xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = bs->cur->bc_ag.pag;
bool non_inode;
bool is_unwritten;
bool is_bmbt;
@@ -121,8 +121,8 @@ xchk_rmapbt_rec(
* Otherwise we must point somewhere past the static metadata
* but before the end of the FS. Run the regular check.
*/
- if (!xfs_verify_agbno(mp, agno, irec.rm_startblock) ||
- !xfs_verify_agbno(mp, agno, irec.rm_startblock +
+ if (!xfs_verify_agbno(pag, irec.rm_startblock) ||
+ !xfs_verify_agbno(pag, irec.rm_startblock +
irec.rm_blockcount - 1))
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 8fa012057405..0a3bde64c675 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -40,6 +40,7 @@ xchk_setup_rt(
/* Scrub a free extent record from the realtime bitmap. */
STATIC int
xchk_rtbitmap_rec(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
@@ -48,10 +49,10 @@ xchk_rtbitmap_rec(
xfs_rtblock_t startblock;
xfs_rtblock_t blockcount;
- startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize;
- blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize;
+ startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
+ blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
- if (!xfs_verify_rtext(sc->mp, startblock, blockcount))
+ if (!xfs_verify_rtext(mp, startblock, blockcount))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
return 0;
}
@@ -114,7 +115,7 @@ xchk_rtbitmap(
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
- error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc);
+ error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
goto out;
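
The xchk_rtbitmap_rec() change above reflects the new xfs_rtalloc_query_all() callback signature: the mount is passed explicitly, so callbacks no longer reach through tp->t_mountp and a NULL transaction becomes workable. A hypothetical callback under that contract, using only calls visible in this diff:

static int
rt_rec_sketch(struct xfs_mount *mp, struct xfs_trans *tp,
	      const struct xfs_rtalloc_rec *rec, void *priv)
{
	xfs_rtblock_t	start = rec->ar_startext * mp->m_sb.sb_rextsize;
	xfs_rtblock_t	count = rec->ar_extcount * mp->m_sb.sb_rextsize;

	/* A real scrub callback would flag corruption instead of failing. */
	return xfs_verify_rtext(mp, start, count) ? 0 : -EFSCORRUPTED;
}
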
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index b11870d07c56..2e8e400f10a9 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -340,20 +340,6 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
},
};
-/* This isn't a stable feature, warn once per day. */
-static inline void
-xchk_experimental_warning(
- struct xfs_mount *mp)
-{
- static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT(
- "xchk_warning", 86400 * HZ, 1);
- ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE);
-
- if (__ratelimit(&scrub_warning))
- xfs_alert(mp,
-"EXPERIMENTAL online scrub feature in use. Use at your own risk!");
-}
-
static int
xchk_validate_inputs(
struct xfs_mount *mp,
@@ -478,7 +464,8 @@ xfs_scrub_metadata(
if (error)
goto out;
- xchk_experimental_warning(mp);
+ xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB,
+ "EXPERIMENTAL online scrub feature in use. Use at your own risk!");
sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL);
if (!sc) {
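
xfs_warn_mount() replaces the once-per-day ratelimit with a once-per-mount warning keyed on an opstate bit. A plausible sketch of those semantics, assuming an atomic test-and-set on mp->m_opstate (the real helper body is not part of this diff):

static void
warn_mount_once_sketch(struct xfs_mount *mp, int warntag, const char *msg)
{
	/* Assumption: the first caller sets the bit and emits the warning. */
	if (!test_and_set_bit(warntag, &mp->m_opstate))
		xfs_alert(mp, "%s", msg);
}
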
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 599ee277bba2..75311f8daeeb 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -41,7 +41,7 @@ xchk_symlink(
if (!S_ISLNK(VFS_I(ip)->i_mode))
return -ENOENT;
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
len = ip->i_disk_size;
/* Plausible size? */
@@ -52,8 +52,8 @@ xchk_symlink(
/* Inline symlink? */
if (ifp->if_format == XFS_DINODE_FMT_LOCAL) {
- if (len > XFS_IFORK_DSIZE(ip) ||
- len > strnlen(ifp->if_u1.if_data, XFS_IFORK_DSIZE(ip)))
+ if (len > xfs_inode_data_fork_size(ip) ||
+ len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip)))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
goto out;
}
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 5c52ee869272..b744c62052b6 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -10,13 +10,14 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_acl.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_trans.h"
+#include "xfs_xattr.h"
#include <linux/posix_acl_xattr.h>
@@ -202,7 +203,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
xfs_acl_to_disk(args.value, acl);
}
- error = xfs_attr_set(&args);
+ error = xfs_attr_change(&args);
kmem_free(args.value);
/*
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index bb6abdcb265d..263404d0bfda 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -16,11 +16,13 @@ extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
void xfs_forget_acl(struct inode *inode, const char *name);
#else
-static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu)
+#define xfs_get_acl NULL
+#define xfs_set_acl NULL
+static inline int __xfs_set_acl(struct inode *inode, struct posix_acl *acl,
+ int type)
{
- return NULL;
+ return 0;
}
-# define xfs_set_acl NULL
static inline void xfs_forget_acl(struct inode *inode, const char *name)
{
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 90b7f4d127de..5d1a995b15f8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -464,7 +464,7 @@ xfs_discard_folio(
int error;
if (xfs_is_shutdown(mp))
- goto out_invalidate;
+ return;
xfs_alert_ratelimited(mp,
"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
@@ -474,8 +474,6 @@ xfs_discard_folio(
i_blocks_per_folio(inode, folio) - pageoff_fsb);
if (error && !xfs_is_shutdown(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
-out_invalidate:
- iomap_invalidate_folio(folio, offset, folio_size(folio) - offset);
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
@@ -538,11 +536,11 @@ xfs_vm_bmap(
}
STATIC int
-xfs_vm_readpage(
+xfs_vm_read_folio(
struct file *unused,
- struct page *page)
+ struct folio *folio)
{
- return iomap_readpage(page, &xfs_read_iomap_ops);
+ return iomap_read_folio(folio, &xfs_read_iomap_ops);
}
STATIC void
@@ -564,15 +562,15 @@ xfs_iomap_swapfile_activate(
}
const struct address_space_operations xfs_address_space_operations = {
- .readpage = xfs_vm_readpage,
+ .read_folio = xfs_vm_read_folio,
.readahead = xfs_vm_readahead,
.writepages = xfs_vm_writepages,
.dirty_folio = filemap_dirty_folio,
- .releasepage = iomap_releasepage,
+ .release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
.bmap = xfs_vm_bmap,
.direct_IO = noop_direct_IO,
- .migratepage = iomap_migrate_page,
+ .migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
.swap_activate = xfs_iomap_swapfile_activate,
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 27265771f247..5db87b34fb6e 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -158,6 +158,7 @@ xfs_attr3_node_inactive(
}
child_fsb = be32_to_cpu(ichdr.btree[0].before);
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
+ bp = NULL;
/*
* If this is the node level just above the leaves, simply loop
@@ -211,12 +212,8 @@ xfs_attr3_node_inactive(
&child_bp);
if (error)
return error;
- error = bp->b_error;
- if (error) {
- xfs_trans_brelse(*trans, child_bp);
- return error;
- }
xfs_trans_binval(*trans, child_bp);
+ child_bp = NULL;
/*
* If we're not done, re-read the parent to get the next
@@ -233,6 +230,7 @@ xfs_attr3_node_inactive(
bp->b_addr);
child_fsb = be32_to_cpu(phdr.btree[i + 1].before);
xfs_trans_brelse(*trans, bp);
+ bp = NULL;
}
/*
* Atomically commit the whole invalidate stuff.
@@ -338,7 +336,7 @@ xfs_attr_inactive(
ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
xfs_ilock(dp, lock_mode);
- if (!XFS_IFORK_Q(dp))
+ if (!xfs_inode_has_attr_fork(dp))
goto out_destroy_fork;
xfs_iunlock(dp, lock_mode);
@@ -351,7 +349,7 @@ xfs_attr_inactive(
lock_mode = XFS_ILOCK_EXCL;
xfs_ilock(dp, lock_mode);
- if (!XFS_IFORK_Q(dp))
+ if (!xfs_inode_has_attr_fork(dp))
goto out_cancel;
/*
@@ -362,12 +360,11 @@ xfs_attr_inactive(
/*
* Invalidate and truncate the attribute fork extents. Make sure the
- * fork actually has attributes as otherwise the invalidation has no
+ * fork actually has xattr blocks as otherwise the invalidation has no
* blocks to read and returns an error. In this case, just do the fork
* removal below.
*/
- if (xfs_inode_hasattr(dp) &&
- dp->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
+ if (dp->i_af.if_nextents > 0) {
error = xfs_attr3_root_inactive(&trans, dp);
if (error)
goto out_cancel;
@@ -388,11 +385,7 @@ out_cancel:
xfs_trans_cancel(trans);
out_destroy_fork:
/* kill the in-core attr fork before we drop the inode lock */
- if (dp->i_afp) {
- xfs_idestroy_fork(dp->i_afp);
- kmem_cache_free(xfs_ifork_cache, dp->i_afp);
- dp->i_afp = NULL;
- }
+ xfs_ifork_zap_attr(dp);
if (lock_mode)
xfs_iunlock(dp, lock_mode);
return error;
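
The i_afp to i_af changes above (and in xfs_attr_list.c below) reflect the attr fork being embedded in the inode rather than allocated behind a pointer, so teardown resets the fork via xfs_ifork_zap_attr() instead of freeing it. A sketch of the presence test, assuming it is tracked by the fork offset:

static inline bool
inode_has_attr_fork_sketch(struct xfs_inode *ip)
{
	return ip->i_forkoff > 0;	/* assumption: nonzero offset == fork present */
}
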
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
new file mode 100644
index 000000000000..5077a7ad5646
--- /dev/null
+++ b/fs/xfs/xfs_attr_item.c
@@ -0,0 +1,888 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022 Oracle. All Rights Reserved.
+ * Author: Allison Henderson <allison.henderson@oracle.com>
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_item.h"
+#include "xfs_trace.h"
+#include "xfs_trans_space.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+struct kmem_cache *xfs_attri_cache;
+struct kmem_cache *xfs_attrd_cache;
+
+static const struct xfs_item_ops xfs_attri_item_ops;
+static const struct xfs_item_ops xfs_attrd_item_ops;
+static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip);
+
+static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_attri_log_item, attri_item);
+}
+
+/*
+ * Shared xattr name/value buffers for logged extended attribute operations
+ *
+ * When logging updates to extended attributes, we can create quite a few
+ * attribute log intent items for a single xattr update. To avoid cycling the
+ * memory allocator and memcpy overhead, the name (and value, for setxattr)
+ * are kept in a refcounted object that is shared across all related log items
+ * and the upper-level deferred work state structure. The shared buffer has
+ * a control structure, followed by the name, and then the value.
+ */
+
+static inline struct xfs_attri_log_nameval *
+xfs_attri_log_nameval_get(
+ struct xfs_attri_log_nameval *nv)
+{
+ if (!refcount_inc_not_zero(&nv->refcount))
+ return NULL;
+ return nv;
+}
+
+static inline void
+xfs_attri_log_nameval_put(
+ struct xfs_attri_log_nameval *nv)
+{
+ if (!nv)
+ return;
+ if (refcount_dec_and_test(&nv->refcount))
+ kvfree(nv);
+}
+
+static inline struct xfs_attri_log_nameval *
+xfs_attri_log_nameval_alloc(
+ const void *name,
+ unsigned int name_len,
+ const void *value,
+ unsigned int value_len)
+{
+ struct xfs_attri_log_nameval *nv;
+
+ /*
+ * This could be over 64kB in length, so we have to use kvmalloc() for
+ * this. But kvmalloc() utterly sucks, so we use our own version.
+ */
+ nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) +
+ name_len + value_len);
+ if (!nv)
+ return nv;
+
+ nv->name.i_addr = nv + 1;
+ nv->name.i_len = name_len;
+ nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME;
+ memcpy(nv->name.i_addr, name, name_len);
+
+ if (value_len) {
+ nv->value.i_addr = nv->name.i_addr + name_len;
+ nv->value.i_len = value_len;
+ memcpy(nv->value.i_addr, value, value_len);
+ } else {
+ nv->value.i_addr = NULL;
+ nv->value.i_len = 0;
+ }
+ nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE;
+
+ refcount_set(&nv->refcount, 1);
+ return nv;
+}
+
+STATIC void
+xfs_attri_item_free(
+ struct xfs_attri_log_item *attrip)
+{
+ kmem_free(attrip->attri_item.li_lv_shadow);
+ xfs_attri_log_nameval_put(attrip->attri_nameval);
+ kmem_cache_free(xfs_attri_cache, attrip);
+}
+
+/*
+ * Freeing the attrip requires that we remove it from the AIL if it has already
+ * been placed there. However, the ATTRI may not yet have been placed in the
+ * AIL when called by xfs_attri_release() from ATTRD processing due to the
+ * ordering of committed vs unpin operations in bulk insert operations. Hence
+ * the reference count to ensure only the last caller frees the ATTRI.
+ */
+STATIC void
+xfs_attri_release(
+ struct xfs_attri_log_item *attrip)
+{
+ ASSERT(atomic_read(&attrip->attri_refcount) > 0);
+ if (!atomic_dec_and_test(&attrip->attri_refcount))
+ return;
+
+ xfs_trans_ail_delete(&attrip->attri_item, 0);
+ xfs_attri_item_free(attrip);
+}
+
+STATIC void
+xfs_attri_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+
+ *nvecs += 2;
+ *nbytes += sizeof(struct xfs_attri_log_format) +
+ xlog_calc_iovec_len(nv->name.i_len);
+
+ if (!nv->value.i_len)
+ return;
+
+ *nvecs += 1;
+ *nbytes += xlog_calc_iovec_len(nv->value.i_len);
+}
+
+/*
+ * This is called to fill in the log iovecs for the given attri log
+ * item. We use 1 iovec for the attri_format_item, 1 for the name, and
+ * another for the value if it is present.
+ */
+STATIC void
+xfs_attri_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+
+ attrip->attri_format.alfi_type = XFS_LI_ATTRI;
+ attrip->attri_format.alfi_size = 1;
+
+ /*
+ * This size accounting must be done before copying the attrip into the
+ * iovec. If we do it after, the wrong size will be recorded in the log
+ * and we will trip assertion checks for bad region sizes later during
+ * log recovery.
+ */
+
+ ASSERT(nv->name.i_len > 0);
+ attrip->attri_format.alfi_size++;
+
+ if (nv->value.i_len > 0)
+ attrip->attri_format.alfi_size++;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT,
+ &attrip->attri_format,
+ sizeof(struct xfs_attri_log_format));
+ xlog_copy_from_iovec(lv, &vecp, &nv->name);
+ if (nv->value.i_len > 0)
+ xlog_copy_from_iovec(lv, &vecp, &nv->value);
+}
+
+/*
+ * The unpin operation is the last place an ATTRI is manipulated in the log. It
+ * is either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the ATTRI transaction has been successfully committed to make
+ * it this far. Therefore, we expect whoever committed the ATTRI to either
+ * construct and commit the ATTRD or drop the ATTRD's reference in the event of
+ * error. Simply drop the log's ATTRI reference now that the log is done with
+ * it.
+ */
+STATIC void
+xfs_attri_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ xfs_attri_release(ATTRI_ITEM(lip));
+}
+
+STATIC void
+xfs_attri_item_release(
+ struct xfs_log_item *lip)
+{
+ xfs_attri_release(ATTRI_ITEM(lip));
+}
+
+/*
+ * Allocate and initialize an attri item. Caller may allocate an additional
+ * trailing buffer for name and value
+ */
+STATIC struct xfs_attri_log_item *
+xfs_attri_init(
+ struct xfs_mount *mp,
+ struct xfs_attri_log_nameval *nv)
+{
+ struct xfs_attri_log_item *attrip;
+
+ attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ /*
+ * Grab an extra reference to the name/value buffer for this log item.
+ * The caller retains its own reference!
+ */
+ attrip->attri_nameval = xfs_attri_log_nameval_get(nv);
+ ASSERT(attrip->attri_nameval);
+
+ xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI,
+ &xfs_attri_item_ops);
+ attrip->attri_format.alfi_id = (uintptr_t)(void *)attrip;
+ atomic_set(&attrip->attri_refcount, 2);
+
+ return attrip;
+}
+
+/*
+ * Copy an attr format buffer from the given buf, and into the destination attr
+ * format structure.
+ */
+STATIC int
+xfs_attri_copy_format(
+ struct xfs_log_iovec *buf,
+ struct xfs_attri_log_format *dst_attr_fmt)
+{
+ struct xfs_attri_log_format *src_attr_fmt = buf->i_addr;
+ size_t len;
+
+ len = sizeof(struct xfs_attri_log_format);
+ if (buf->i_len != len) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ return -EFSCORRUPTED;
+ }
+
+ memcpy((char *)dst_attr_fmt, (char *)src_attr_fmt, len);
+ return 0;
+}
+
+static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_attrd_log_item, attrd_item);
+}
+
+STATIC void
+xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp)
+{
+ kmem_free(attrdp->attrd_item.li_lv_shadow);
+ kmem_cache_free(xfs_attrd_cache, attrdp);
+}
+
+STATIC void
+xfs_attrd_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_attrd_log_format);
+}
+
+/*
+ * This is called to fill in the log iovecs for the given attrd log item. We use
+ * only 1 iovec for the attrd_format, and we point that at the attrd_log_format
+ * structure embedded in the attrd item.
+ */
+STATIC void
+xfs_attrd_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ attrdp->attrd_format.alfd_type = XFS_LI_ATTRD;
+ attrdp->attrd_format.alfd_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRD_FORMAT,
+ &attrdp->attrd_format,
+ sizeof(struct xfs_attrd_log_format));
+}
+
+/*
+ * The ATTRD is either committed or aborted if the transaction is canceled. If
+ * the transaction is canceled, drop our reference to the ATTRI and free the
+ * ATTRD.
+ */
+STATIC void
+xfs_attrd_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip);
+
+ xfs_attri_release(attrdp->attrd_attrip);
+ xfs_attrd_item_free(attrdp);
+}
+
+static struct xfs_log_item *
+xfs_attrd_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &ATTRD_ITEM(lip)->attrd_attrip->attri_item;
+}
+
+/*
+ * Performs one step of an attribute update intent and marks the attrd item
+ * dirty. An attr operation may be a set or a remove. Note that the
+ * transaction is marked dirty regardless of whether the operation succeeds
+ * or fails, in order to support the ATTRI/ATTRD lifecycle rules.
+ */
+STATIC int
+xfs_xattri_finish_update(
+ struct xfs_attr_intent *attr,
+ struct xfs_attrd_log_item *attrdp)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
+ error = -EIO;
+ goto out;
+ }
+
+ error = xfs_attr_set_iter(attr);
+ if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
+ error = -EAGAIN;
+out:
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the ATTRI and frees the ATTRD
+ * 2.) shuts down the filesystem
+ */
+ args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
+
+ /*
+ * attr intent/done items are NULL when logged attributes are disabled
+ */
+ if (attrdp)
+ set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+ return error;
+}
+
+/* Log an attr to the intent item. */
+STATIC void
+xfs_attr_log_item(
+ struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip,
+ const struct xfs_attr_intent *attr)
+{
+ struct xfs_attri_log_format *attrp;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags);
+
+ /*
+ * At this point the xfs_attr_intent has been constructed, and we've
+ * created the log intent. Fill in the attri log item and log format
+ * structure with fields from this xfs_attr_intent
+ */
+ attrp = &attrip->attri_format;
+ attrp->alfi_ino = attr->xattri_da_args->dp->i_ino;
+ ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK));
+ attrp->alfi_op_flags = attr->xattri_op_flags;
+ attrp->alfi_value_len = attr->xattri_nameval->value.i_len;
+ attrp->alfi_name_len = attr->xattri_nameval->name.i_len;
+ ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK));
+ attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter;
+}
+
+/* Get an ATTRI. */
+static struct xfs_log_item *
+xfs_attr_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attr_intent *attr;
+ struct xfs_da_args *args;
+
+ ASSERT(count == 1);
+
+ /*
+ * Each attr item only performs one attribute operation at a time, so
+ * this is a list of one
+ */
+ attr = list_first_entry_or_null(items, struct xfs_attr_intent,
+ xattri_list);
+ args = attr->xattri_da_args;
+
+ if (!(args->op_flags & XFS_DA_OP_LOGGED))
+ return NULL;
+
+ /*
+ * Create a buffer to store the attribute name and value. This buffer
+ * will be shared between the higher level deferred xattr work state
+ * and the lower level xattr log items.
+ */
+ if (!attr->xattri_nameval) {
+ /*
+ * Transfer our reference to the name/value buffer to the
+ * deferred work state structure.
+ */
+ attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name,
+ args->namelen, args->value, args->valuelen);
+ }
+ if (!attr->xattri_nameval)
+ return ERR_PTR(-ENOMEM);
+
+ attrip = xfs_attri_init(mp, attr->xattri_nameval);
+ xfs_trans_add_item(tp, &attrip->attri_item);
+ xfs_attr_log_item(tp, attrip, attr);
+
+ return &attrip->attri_item;
+}
+
+static inline void
+xfs_attr_free_item(
+ struct xfs_attr_intent *attr)
+{
+ if (attr->xattri_da_state)
+ xfs_da_state_free(attr->xattri_da_state);
+ xfs_attri_log_nameval_put(attr->xattri_nameval);
+ if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY)
+ kmem_free(attr);
+ else
+ kmem_cache_free(xfs_attr_intent_cache, attr);
+}
+
+/* Process an attr. */
+STATIC int
+xfs_attr_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_attr_intent *attr;
+ struct xfs_attrd_log_item *done_item = NULL;
+ int error;
+
+ attr = container_of(item, struct xfs_attr_intent, xattri_list);
+ if (done)
+ done_item = ATTRD_ITEM(done);
+
+ /*
+ * Always reset the transaction after an -EAGAIN cycle, since the
+ * transaction is new.
+ */
+ attr->xattri_da_args->trans = tp;
+
+ error = xfs_xattri_finish_update(attr, done_item);
+ if (error != -EAGAIN)
+ xfs_attr_free_item(attr);
+
+ return error;
+}
+
+/* Abort all pending ATTRs. */
+STATIC void
+xfs_attr_abort_intent(
+ struct xfs_log_item *intent)
+{
+ xfs_attri_release(ATTRI_ITEM(intent));
+}
+
+/* Cancel an attr */
+STATIC void
+xfs_attr_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_attr_intent *attr;
+
+ attr = container_of(item, struct xfs_attr_intent, xattri_list);
+ xfs_attr_free_item(attr);
+}
+
+STATIC bool
+xfs_attri_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id;
+}
+
+/* Is this recovered ATTRI format ok? */
+static inline bool
+xfs_attri_validate(
+ struct xfs_mount *mp,
+ struct xfs_attri_log_format *attrp)
+{
+ unsigned int op = attrp->alfi_op_flags &
+ XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+
+ if (attrp->__pad != 0)
+ return false;
+
+ if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)
+ return false;
+
+ if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK)
+ return false;
+
+ /* alfi_op_flags should be either a set or remove */
+ switch (op) {
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ break;
+ default:
+ return false;
+ }
+
+ if (attrp->alfi_value_len > XATTR_SIZE_MAX)
+ return false;
+
+ if ((attrp->alfi_name_len > XATTR_NAME_MAX) ||
+ (attrp->alfi_name_len == 0))
+ return false;
+
+ return xfs_verify_ino(mp, attrp->alfi_ino);
+}
+
+/*
+ * Process an attr intent item that was recovered from the log. We need to
+ * delete the attr that it describes.
+ */
+STATIC int
+xfs_attri_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_attr_intent *attr;
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_inode *ip;
+ struct xfs_da_args *args;
+ struct xfs_trans *tp;
+ struct xfs_trans_res tres;
+ struct xfs_attri_log_format *attrp;
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+ int error;
+ int total;
+ int local;
+ struct xfs_attrd_log_item *done_item = NULL;
+
+ /*
+ * First check the validity of the attr described by the ATTRI. If any
+ * are bad, then assume that all are bad and just toss the ATTRI.
+ */
+ attrp = &attrip->attri_format;
+ if (!xfs_attri_validate(mp, attrp) ||
+ !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
+ return -EFSCORRUPTED;
+
+ error = xlog_recover_iget(mp, attrp->alfi_ino, &ip);
+ if (error)
+ return error;
+
+ attr = kmem_zalloc(sizeof(struct xfs_attr_intent) +
+ sizeof(struct xfs_da_args), KM_NOFS);
+ args = (struct xfs_da_args *)(attr + 1);
+
+ attr->xattri_da_args = args;
+ attr->xattri_op_flags = attrp->alfi_op_flags &
+ XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+
+ /*
+ * We're reconstructing the deferred work state structure from the
+ * recovered log item. Grab a reference to the name/value buffer and
+ * attach it to the new work state.
+ */
+ attr->xattri_nameval = xfs_attri_log_nameval_get(nv);
+ ASSERT(attr->xattri_nameval);
+
+ args->dp = ip;
+ args->geo = mp->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->name = nv->name.i_addr;
+ args->namelen = nv->name.i_len;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
+ args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK;
+ args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT |
+ XFS_DA_OP_LOGGED;
+
+ ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb));
+
+ switch (attr->xattri_op_flags) {
+ case XFS_ATTRI_OP_FLAGS_SET:
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ args->value = nv->value.i_addr;
+ args->valuelen = nv->value.i_len;
+ args->total = xfs_attr_calc_size(args, &local);
+ if (xfs_inode_hasattr(args->dp))
+ attr->xattri_dela_state = xfs_attr_init_replace_state(args);
+ else
+ attr->xattri_dela_state = xfs_attr_init_add_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ if (!xfs_inode_hasattr(args->dp))
+ goto out;
+ attr->xattri_dela_state = xfs_attr_init_remove_state(args);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ xfs_init_attr_trans(args, &tres, &total);
+ error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ goto out;
+
+ args->trans = tp;
+ done_item = xfs_trans_get_attrd(tp, attrip);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_xattri_finish_update(attr, done_item);
+ if (error == -EAGAIN) {
+ /*
+ * There's more work to do, so add the intent item to this
+ * transaction so that we can continue it later.
+ */
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list);
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+ if (error)
+ goto out_unlock;
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+ return 0;
+ }
+ if (error) {
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+ }
+
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+out:
+ xfs_attr_free_item(attr);
+ return error;
+}
+
+/* Re-log an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_attri_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_attrd_log_item *attrdp;
+ struct xfs_attri_log_item *old_attrip;
+ struct xfs_attri_log_item *new_attrip;
+ struct xfs_attri_log_format *new_attrp;
+ struct xfs_attri_log_format *old_attrp;
+
+ old_attrip = ATTRI_ITEM(intent);
+ old_attrp = &old_attrip->attri_format;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ attrdp = xfs_trans_get_attrd(tp, old_attrip);
+ set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+ /*
+ * Create a new log item that shares the same name/value buffer as the
+ * old log item.
+ */
+ new_attrip = xfs_attri_init(tp->t_mountp, old_attrip->attri_nameval);
+ new_attrp = &new_attrip->attri_format;
+
+ new_attrp->alfi_ino = old_attrp->alfi_ino;
+ new_attrp->alfi_op_flags = old_attrp->alfi_op_flags;
+ new_attrp->alfi_value_len = old_attrp->alfi_value_len;
+ new_attrp->alfi_name_len = old_attrp->alfi_name_len;
+ new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter;
+
+ xfs_trans_add_item(tp, &new_attrip->attri_item);
+ set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags);
+
+ return &new_attrip->attri_item;
+}
+
+STATIC int
+xlog_recover_attri_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attri_log_format *attri_formatp;
+ struct xfs_attri_log_nameval *nv;
+ const void *attr_value = NULL;
+ const void *attr_name;
+ int error;
+
+ attri_formatp = item->ri_buf[0].i_addr;
+ attr_name = item->ri_buf[1].i_addr;
+
+ /* Validate xfs_attri_log_format before the large memory allocation */
+ if (!xfs_attri_validate(mp, attri_formatp)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ if (attri_formatp->alfi_value_len)
+ attr_value = item->ri_buf[2].i_addr;
+
+ /*
+ * Memory alloc failure will cause replay to abort. We attach the
+ * name/value buffer to the recovered incore log item and drop our
+ * reference.
+ */
+ nv = xfs_attri_log_nameval_alloc(attr_name,
+ attri_formatp->alfi_name_len, attr_value,
+ attri_formatp->alfi_value_len);
+ if (!nv)
+ return -ENOMEM;
+
+ attrip = xfs_attri_init(mp, nv);
+ error = xfs_attri_copy_format(&item->ri_buf[0], &attrip->attri_format);
+ if (error)
+ goto out;
+
+ /*
+ * The ATTRI has two references. One for the ATTRD and one for ATTRI to
+ * ensure it makes it into the AIL. Insert the ATTRI into the AIL
+ * directly and drop the ATTRI reference. Note that
+ * xfs_trans_ail_update() drops the AIL lock.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn);
+ xfs_attri_release(attrip);
+ xfs_attri_log_nameval_put(nv);
+ return 0;
+out:
+ xfs_attri_item_free(attrip);
+ xfs_attri_log_nameval_put(nv);
+ return error;
+}
+
+/*
+ * This routine is called to allocate an "attr free done" log item.
+ */
+static struct xfs_attrd_log_item *
+xfs_trans_get_attrd(struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip)
+{
+ struct xfs_attrd_log_item *attrdp;
+
+ ASSERT(tp != NULL);
+
+ attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
+ &xfs_attrd_item_ops);
+ attrdp->attrd_attrip = attrip;
+ attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id;
+
+ xfs_trans_add_item(tp, &attrdp->attrd_item);
+ return attrdp;
+}
+
+/* Get an ATTRD so we can process all the attrs. */
+static struct xfs_log_item *
+xfs_attr_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ if (!intent)
+ return NULL;
+
+ return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item;
+}
+
+const struct xfs_defer_op_type xfs_attr_defer_type = {
+ .max_items = 1,
+ .create_intent = xfs_attr_create_intent,
+ .abort_intent = xfs_attr_abort_intent,
+ .create_done = xfs_attr_create_done,
+ .finish_item = xfs_attr_finish_item,
+ .cancel_item = xfs_attr_cancel_item,
+};
+
+/*
+ * This routine is called when an ATTRD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding ATTRI if
+ * it is still in the log. To do this it searches the AIL for the ATTRI with
+ * an id equal to that in the ATTRD format structure. If we find it we drop
+ * the ATTRD reference, which removes the ATTRI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_attrd_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_attrd_log_format *attrd_formatp;
+
+ attrd_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_ATTRI,
+ attrd_formatp->alfd_alf_id);
+ return 0;
+}
+
+static const struct xfs_item_ops xfs_attri_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_attri_item_size,
+ .iop_format = xfs_attri_item_format,
+ .iop_unpin = xfs_attri_item_unpin,
+ .iop_release = xfs_attri_item_release,
+ .iop_recover = xfs_attri_item_recover,
+ .iop_match = xfs_attri_item_match,
+ .iop_relog = xfs_attri_item_relog,
+};
+
+const struct xlog_recover_item_ops xlog_attri_item_ops = {
+ .item_type = XFS_LI_ATTRI,
+ .commit_pass2 = xlog_recover_attri_commit_pass2,
+};
+
+static const struct xfs_item_ops xfs_attrd_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
+ .iop_size = xfs_attrd_item_size,
+ .iop_format = xfs_attrd_item_format,
+ .iop_release = xfs_attrd_item_release,
+ .iop_intent = xfs_attrd_item_intent,
+};
+
+const struct xlog_recover_item_ops xlog_attrd_item_ops = {
+ .item_type = XFS_LI_ATTRD,
+ .commit_pass2 = xlog_recover_attrd_commit_pass2,
+};
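
The refcounted name/value buffer introduced above is shared between the deferred work state and every ATTRI log item that carries it, so a relog never copies the xattr payload. A usage sketch built only from helpers defined in the new file; the wrapper is hypothetical:

static int
share_nameval_sketch(struct xfs_da_args *args, struct xfs_attr_intent *attr,
		     struct xfs_attri_log_item *attrip)
{
	struct xfs_attri_log_nameval	*nv;

	nv = xfs_attri_log_nameval_alloc(args->name, args->namelen,
					 args->value, args->valuelen);
	if (!nv)
		return -ENOMEM;
	attr->xattri_nameval = nv;	/* work state holds the first reference */
	attrip->attri_nameval = xfs_attri_log_nameval_get(nv);	/* second ref */
	return 0;
}

Teardown mirrors this with one xfs_attri_log_nameval_put() per holder; the final put frees the buffer.
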
diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h
new file mode 100644
index 000000000000..3280a7930287
--- /dev/null
+++ b/fs/xfs/xfs_attr_item.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2022 Oracle. All Rights Reserved.
+ * Author: Allison Henderson <allison.henderson@oracle.com>
+ */
+#ifndef __XFS_ATTR_ITEM_H__
+#define __XFS_ATTR_ITEM_H__
+
+/* kernel only ATTRI/ATTRD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+struct xfs_attri_log_nameval {
+ struct xfs_log_iovec name;
+ struct xfs_log_iovec value;
+ refcount_t refcount;
+
+ /* name and value follow the end of this struct */
+};
+
+/*
+ * This is the "attr intention" log item. It is used to log the fact that some
+ * extended attribute operations need to be processed. An operation is
+ * currently either a set or remove. Set or remove operations are described by
+ * the xfs_attr_intent which may be logged to this intent.
+ *
+ * During a normal attr operation, name and value point to the name and value
+ * fields of the caller's xfs_da_args structure. During a recovery, the name
+ * and value buffers are copied from the log, and stored in a trailing buffer
+ * attached to the xfs_attr_intent until they are committed. They are freed
+ * when the xfs_attr_intent itself is freed when the work is done.
+ */
+struct xfs_attri_log_item {
+ struct xfs_log_item attri_item;
+ atomic_t attri_refcount;
+ struct xfs_attri_log_nameval *attri_nameval;
+ struct xfs_attri_log_format attri_format;
+};
+
+/*
+ * This is the "attr done" log item. It is used to log the fact that some attrs
+ * earlier mentioned in an attri item have been freed.
+ */
+struct xfs_attrd_log_item {
+ struct xfs_log_item attrd_item;
+ struct xfs_attri_log_item *attrd_attrip;
+ struct xfs_attrd_log_format attrd_format;
+};
+
+extern struct kmem_cache *xfs_attri_cache;
+extern struct kmem_cache *xfs_attrd_cache;
+
+#endif /* __XFS_ATTR_ITEM_H__ */
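
The -EAGAIN convention in xfs_xattri_finish_update() above is what lets one logged xattr operation span several transactions: each step advances xattri_dela_state, and the deferred-ops machinery rolls the transaction and calls back until the state machine reaches XFS_DAS_DONE. A condensed sketch of one step; the wrapper is hypothetical:

static int
attr_step_sketch(struct xfs_attr_intent *attr)
{
	int error = xfs_attr_set_iter(attr);

	/* Not done yet: ask the caller to roll the transaction and retry. */
	if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
		return -EAGAIN;
	return error;
}
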
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 2d1e5134cebe..99bbbe1a0e44 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -15,6 +15,7 @@
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_sf.h"
#include "xfs_attr_leaf.h"
@@ -60,8 +61,7 @@ xfs_attr_shortform_list(
int sbsize, nsbuf, count, i;
int error = 0;
- ASSERT(dp->i_afp != NULL);
- sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
ASSERT(sf != NULL);
if (!sf->hdr.count)
return 0;
@@ -79,7 +79,7 @@ xfs_attr_shortform_list(
*/
if (context->bufsize == 0 ||
(XFS_ISRESET_CURSOR(cursor) &&
- (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
+ (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) {
for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
if (XFS_IS_CORRUPT(context->dp->i_mount,
!xfs_attr_namecheck(sfe->nameval,
@@ -120,7 +120,7 @@ xfs_attr_shortform_list(
for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
if (unlikely(
((char *)sfe < (char *)sf) ||
- ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
+ ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) {
XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
XFS_ERRLEVEL_LOW,
context->dp->i_mount, sfe,
@@ -512,7 +512,7 @@ xfs_attr_list_ilocked(
*/
if (!xfs_inode_hasattr(dp))
return 0;
- if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
+ if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
return xfs_attr_shortform_list(context);
if (xfs_attr_is_leaf(dp))
return xfs_attr_leaf_list(context);
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index ae4345b37621..fe21c76f75b8 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -15,7 +15,7 @@ xfs_rw_bdev(
sector_t sector,
unsigned int count,
char *data,
- unsigned int op)
+ enum req_op op)
{
unsigned int is_vmalloc = is_vmalloc_addr(data);
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 761dde155099..51f66e982484 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -39,6 +39,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
+ kmem_free(buip->bui_item.li_lv_shadow);
kmem_cache_free(xfs_bui_cache, buip);
}
@@ -54,10 +55,11 @@ xfs_bui_release(
struct xfs_bui_log_item *buip)
{
ASSERT(atomic_read(&buip->bui_refcount) > 0);
- if (atomic_dec_and_test(&buip->bui_refcount)) {
- xfs_trans_ail_delete(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_bui_item_free(buip);
- }
+ if (!atomic_dec_and_test(&buip->bui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&buip->bui_item, 0);
+ xfs_bui_item_free(buip);
}
@@ -198,14 +200,24 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
+ kmem_free(budp->bud_item.li_lv_shadow);
kmem_cache_free(xfs_bud_cache, budp);
}
+static struct xfs_log_item *
+xfs_bud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &BUD_ITEM(lip)->bud_buip->bui_item;
+}
+
static const struct xfs_item_ops xfs_bud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_bud_item_size,
.iop_format = xfs_bud_item_format,
.iop_release = xfs_bud_item_release,
+ .iop_intent = xfs_bud_item_intent,
};
static struct xfs_bud_log_item *
@@ -254,7 +266,7 @@ xfs_trans_log_finish_bmap_update(
* 1.) releases the BUI and frees the BUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
return error;
@@ -506,6 +518,8 @@ xfs_bui_item_recover(
iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto err_cancel;
@@ -584,6 +598,7 @@ xfs_bui_item_relog(
}
static const struct xfs_item_ops xfs_bui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_bui_item_size,
.iop_format = xfs_bui_item_format,
.iop_unpin = xfs_bui_item_unpin,
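The new ->iop_intent hook wired up above lets generic log code find the intent paired with a done item without knowing the concrete item type. A minimal userspace model of that callback shape (all names illustrative):

/*
 * Sketch of the ->iop_intent idea: a done item exposes its paired
 * intent through an ops callback, so generic code needs no type switch.
 */
#include <stdio.h>

struct log_item;

struct item_ops {
	struct log_item *(*iop_intent)(struct log_item *done);
};

struct log_item {
	const struct item_ops	*ops;
	struct log_item		*paired_intent;	/* like bud_buip->bui_item */
};

static struct log_item *done_item_intent(struct log_item *done)
{
	return done->paired_intent;
}

static const struct item_ops done_ops = { .iop_intent = done_item_intent };

int main(void)
{
	struct log_item intent = { 0 };
	struct log_item done = { .ops = &done_ops, .paired_intent = &intent };

	/* Generic log code can now match the pair generically. */
	printf("matched: %d\n", done.ops->iop_intent(&done) == &intent);
	return 0;
}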
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index eb2e387ba528..04d0c2bff67c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -119,14 +119,14 @@ retry:
*/
ralen = ap->length / mp->m_sb.sb_rextsize;
/*
- * If the old value was close enough to MAXEXTLEN that
+ * If the old value was close enough to XFS_MAX_BMBT_EXTLEN that
* we rounded up to it, cut it back so it's valid again.
* Note that if it's a really large request (bigger than
- * MAXEXTLEN), we don't hear about that number, and can't
+ * XFS_MAX_BMBT_EXTLEN), we don't hear about that number, and can't
* adjust the starting point to match it.
*/
- if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
- ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+ if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN)
+ ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize;
/*
* Lock out modifications to both the RT bitmap and summary inodes
@@ -256,7 +256,7 @@ xfs_bmap_count_blocks(
xfs_filblks_t *count)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur;
xfs_extlen_t btblocks = 0;
int error;
@@ -439,29 +439,28 @@ xfs_getbmap(
whichfork = XFS_COW_FORK;
else
whichfork = XFS_DATA_FORK;
- ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_ilock(ip, XFS_IOLOCK_SHARED);
switch (whichfork) {
case XFS_ATTR_FORK:
- if (!XFS_IFORK_Q(ip))
- goto out_unlock_iolock;
+ lock = xfs_ilock_attr_map_shared(ip);
+ if (!xfs_inode_has_attr_fork(ip))
+ goto out_unlock_ilock;
max_len = 1LL << 32;
- lock = xfs_ilock_attr_map_shared(ip);
break;
case XFS_COW_FORK:
+ lock = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, lock);
+
/* No CoW fork? Just return */
- if (!ifp)
- goto out_unlock_iolock;
+ if (!xfs_ifork_ptr(ip, whichfork))
+ goto out_unlock_ilock;
if (xfs_get_cowextsz_hint(ip))
max_len = mp->m_super->s_maxbytes;
else
max_len = XFS_ISIZE(ip);
-
- lock = XFS_ILOCK_SHARED;
- xfs_ilock(ip, lock);
break;
case XFS_DATA_FORK:
if (!(iflags & BMV_IF_DELALLOC) &&
@@ -491,6 +490,8 @@ xfs_getbmap(
break;
}
+ ifp = xfs_ifork_ptr(ip, whichfork);
+
switch (ifp->if_format) {
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
@@ -686,6 +687,8 @@ xfs_can_free_eofblocks(
* forever.
*/
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
+ if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
+ end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize);
last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
if (last_fsb <= end_fsb)
return false;
@@ -839,9 +842,11 @@ xfs_alloc_file_space(
* count, hence we need to limit the number of blocks we are
* trying to reserve to avoid an overflow. We can't allocate
* more than @nimaps extents, and an extent is limited on disk
- * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+ * to XFS_MAX_BMBT_EXTLEN (21 bits), so use that to enforce the
+ * limit.
*/
- resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
+ resblks = min_t(xfs_fileoff_t, (e - s),
+ (XFS_MAX_BMBT_EXTLEN * nimaps));
if (unlikely(rt)) {
dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
rblocks = resblks;
@@ -857,6 +862,9 @@ xfs_alloc_file_space(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto error;
@@ -912,6 +920,8 @@ xfs_unmap_extent(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
@@ -1193,6 +1203,8 @@ xfs_insert_file_space(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
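Several hunks above add the same fallback: if the extent-count check fails with -EFBIG, attempt an upgrade and re-evaluate before cancelling. A self-contained sketch of that control flow; check_fits() and upgrade_counter() are invented stand-ins, not kernel helpers:

/*
 * Sketch of the -EFBIG fallback pattern added above: when the extent
 * count would overflow, try an on-the-fly upgrade before giving up.
 */
#include <errno.h>
#include <stdio.h>

static int check_fits(unsigned int nextents, unsigned int add,
		      unsigned int max)
{
	return (nextents + add > max) ? -EFBIG : 0;
}

static int upgrade_counter(unsigned int *max)
{
	*max *= 2;		/* e.g. switch to a wider on-disk counter */
	return 0;
}

int main(void)
{
	unsigned int max = 8, nextents = 7, add = 3;
	int error;

	error = check_fits(nextents, add, max);
	if (error == -EFBIG)
		error = upgrade_counter(&max);	/* retry path from the diff */
	if (error)
		return 1;
	printf("ok, max now %u\n", max);
	return 0;
}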
@@ -1309,8 +1321,8 @@ xfs_swap_extents_check_format(
* extent format...
*/
if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
- if (XFS_IFORK_Q(ip) &&
- XFS_BMAP_BMDR_SPACE(tifp->if_broot) > XFS_IFORK_BOFF(ip))
+ if (xfs_inode_has_attr_fork(ip) &&
+ XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip))
return -EINVAL;
if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
return -EINVAL;
@@ -1318,8 +1330,8 @@ xfs_swap_extents_check_format(
/* Reciprocal target->temp btree format checks */
if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
- if (XFS_IFORK_Q(tip) &&
- XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
+ if (xfs_inode_has_attr_fork(tip) &&
+ XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
return -EINVAL;
if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
return -EINVAL;
@@ -1421,6 +1433,9 @@ xfs_swap_extent_rmap(
error = xfs_iext_count_may_overflow(ip,
XFS_DATA_FORK,
XFS_IEXT_SWAP_RMAP_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_SWAP_RMAP_CNT);
if (error)
goto out;
}
@@ -1429,6 +1444,9 @@ xfs_swap_extent_rmap(
error = xfs_iext_count_may_overflow(tip,
XFS_DATA_FORK,
XFS_IEXT_SWAP_RMAP_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_SWAP_RMAP_CNT);
if (error)
goto out;
}
@@ -1489,15 +1507,15 @@ xfs_swap_extent_forks(
/*
* Count the number of extended attribute blocks
*/
- if (XFS_IFORK_Q(ip) && ip->i_afp->if_nextents > 0 &&
- ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
+ if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 &&
+ ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
&aforkblks);
if (error)
return error;
}
- if (XFS_IFORK_Q(tip) && tip->i_afp->if_nextents > 0 &&
- tip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
+ if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 &&
+ tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
&taforkblks);
if (error)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index e1afb9e503e1..dde346450952 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -5,6 +5,7 @@
*/
#include "xfs.h"
#include <linux/backing-dev.h>
+#include <linux/dax.h>
#include "xfs_shared.h"
#include "xfs_format.h"
@@ -21,7 +22,7 @@
#include "xfs_error.h"
#include "xfs_ag.h"
-static struct kmem_cache *xfs_buf_cache;
+struct kmem_cache *xfs_buf_cache;
/*
* Locking orders
@@ -295,6 +296,16 @@ xfs_buf_free_pages(
}
static void
+xfs_buf_free_callback(
+ struct callback_head *cb)
+{
+ struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu);
+
+ xfs_buf_free_maps(bp);
+ kmem_cache_free(xfs_buf_cache, bp);
+}
+
+static void
xfs_buf_free(
struct xfs_buf *bp)
{
@@ -307,8 +318,7 @@ xfs_buf_free(
else if (bp->b_flags & _XBF_KMEM)
kmem_free(bp->b_addr);
- xfs_buf_free_maps(bp);
- kmem_cache_free(xfs_buf_cache, bp);
+ call_rcu(&bp->b_rcu, xfs_buf_free_callback);
}
static int
@@ -406,7 +416,7 @@ xfs_buf_alloc_pages(
STATIC int
_xfs_buf_map_pages(
struct xfs_buf *bp,
- uint flags)
+ xfs_buf_flags_t flags)
{
ASSERT(bp->b_flags & _XBF_PAGES);
if (bp->b_page_count == 1) {
@@ -503,100 +513,45 @@ xfs_buf_hash_destroy(
rhashtable_destroy(&pag->pag_buf_hash);
}
-/*
- * Look up a buffer in the buffer cache and return it referenced and locked
- * in @found_bp.
- *
- * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
- * cache.
- *
- * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
- * -EAGAIN if we fail to lock it.
- *
- * Return values are:
- * -EFSCORRUPTED if have been supplied with an invalid address
- * -EAGAIN on trylock failure
- * -ENOENT if we fail to find a match and @new_bp was NULL
- * 0, with @found_bp:
- * - @new_bp if we inserted it into the cache
- * - the buffer we found and locked.
- */
static int
-xfs_buf_find(
+xfs_buf_map_verify(
struct xfs_buftarg *btp,
- struct xfs_buf_map *map,
- int nmaps,
- xfs_buf_flags_t flags,
- struct xfs_buf *new_bp,
- struct xfs_buf **found_bp)
+ struct xfs_buf_map *map)
{
- struct xfs_perag *pag;
- struct xfs_buf *bp;
- struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
xfs_daddr_t eofs;
- int i;
-
- *found_bp = NULL;
-
- for (i = 0; i < nmaps; i++)
- cmap.bm_len += map[i].bm_len;
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
- ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
+ ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
/*
* Corrupted block numbers can get through to here, unfortunately, so we
* have to check that the buffer falls within the filesystem bounds.
*/
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
- if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
+ if (map->bm_bn < 0 || map->bm_bn >= eofs) {
xfs_alert(btp->bt_mount,
"%s: daddr 0x%llx out of range, EOFS 0x%llx",
- __func__, cmap.bm_bn, eofs);
+ __func__, map->bm_bn, eofs);
WARN_ON(1);
return -EFSCORRUPTED;
}
-
- pag = xfs_perag_get(btp->bt_mount,
- xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
-
- spin_lock(&pag->pag_buf_lock);
- bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
- xfs_buf_hash_params);
- if (bp) {
- atomic_inc(&bp->b_hold);
- goto found;
- }
-
- /* No match found */
- if (!new_bp) {
- XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
- spin_unlock(&pag->pag_buf_lock);
- xfs_perag_put(pag);
- return -ENOENT;
- }
-
- /* the buffer keeps the perag reference until it is freed */
- new_bp->b_pag = pag;
- rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
- xfs_buf_hash_params);
- spin_unlock(&pag->pag_buf_lock);
- *found_bp = new_bp;
return 0;
+}
-found:
- spin_unlock(&pag->pag_buf_lock);
- xfs_perag_put(pag);
-
- if (!xfs_buf_trylock(bp)) {
- if (flags & XBF_TRYLOCK) {
- xfs_buf_rele(bp);
- XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
+static int
+xfs_buf_find_lock(
+ struct xfs_buf *bp,
+ xfs_buf_flags_t flags)
+{
+ if (flags & XBF_TRYLOCK) {
+ if (!xfs_buf_trylock(bp)) {
+ XFS_STATS_INC(bp->b_mount, xb_busy_locked);
return -EAGAIN;
}
+ } else {
xfs_buf_lock(bp);
- XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
+ XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
}
/*
@@ -609,57 +564,59 @@ found:
bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
bp->b_ops = NULL;
}
-
- trace_xfs_buf_find(bp, flags, _RET_IP_);
- XFS_STATS_INC(btp->bt_mount, xb_get_locked);
- *found_bp = bp;
return 0;
}
-struct xfs_buf *
-xfs_buf_incore(
- struct xfs_buftarg *target,
- xfs_daddr_t blkno,
- size_t numblks,
- xfs_buf_flags_t flags)
+static inline int
+xfs_buf_lookup(
+ struct xfs_perag *pag,
+ struct xfs_buf_map *map,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
{
- struct xfs_buf *bp;
+ struct xfs_buf *bp;
int error;
- DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
- if (error)
- return NULL;
- return bp;
+ rcu_read_lock();
+ bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
+ if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+ rcu_read_unlock();
+
+ error = xfs_buf_find_lock(bp, flags);
+ if (error) {
+ xfs_buf_rele(bp);
+ return error;
+ }
+
+ trace_xfs_buf_find(bp, flags, _RET_IP_);
+ *bpp = bp;
+ return 0;
}
/*
- * Assembles a buffer covering the specified range. The code is optimised for
- * cache hits, as metadata intensive workloads will see 3 orders of magnitude
- * more hits than misses.
+ * Insert the new_bp into the hash table. This consumes the perag reference
+ * taken for the lookup regardless of the result of the insert.
*/
-int
-xfs_buf_get_map(
- struct xfs_buftarg *target,
+static int
+xfs_buf_find_insert(
+ struct xfs_buftarg *btp,
+ struct xfs_perag *pag,
+ struct xfs_buf_map *cmap,
struct xfs_buf_map *map,
int nmaps,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
- struct xfs_buf *bp;
struct xfs_buf *new_bp;
+ struct xfs_buf *bp;
int error;
- *bpp = NULL;
- error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
- if (!error)
- goto found;
- if (error != -ENOENT)
- return error;
-
- error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
+ error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
if (error)
- return error;
+ goto out_drop_pag;
/*
* For buffers that fit entirely within a single page, first attempt to
@@ -674,18 +631,94 @@ xfs_buf_get_map(
goto out_free_buf;
}
- error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
- if (error)
+ spin_lock(&pag->pag_buf_lock);
+ bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
+ &new_bp->b_rhash_head, xfs_buf_hash_params);
+ if (IS_ERR(bp)) {
+ error = PTR_ERR(bp);
+ spin_unlock(&pag->pag_buf_lock);
goto out_free_buf;
+ }
+ if (bp) {
+ /* found an existing buffer */
+ atomic_inc(&bp->b_hold);
+ spin_unlock(&pag->pag_buf_lock);
+ error = xfs_buf_find_lock(bp, flags);
+ if (error)
+ xfs_buf_rele(bp);
+ else
+ *bpp = bp;
+ goto out_free_buf;
+ }
- if (bp != new_bp)
- xfs_buf_free(new_bp);
+ /* The new buffer keeps the perag reference until it is freed. */
+ new_bp->b_pag = pag;
+ spin_unlock(&pag->pag_buf_lock);
+ *bpp = new_bp;
+ return 0;
+
+out_free_buf:
+ xfs_buf_free(new_bp);
+out_drop_pag:
+ xfs_perag_put(pag);
+ return error;
+}
-found:
+/*
+ * Assembles a buffer covering the specified range. The code is optimised for
+ * cache hits, as metadata intensive workloads will see 3 orders of magnitude
+ * more hits than misses.
+ */
+int
+xfs_buf_get_map(
+ struct xfs_buftarg *btp,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ struct xfs_perag *pag;
+ struct xfs_buf *bp = NULL;
+ struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
+ int error;
+ int i;
+
+ for (i = 0; i < nmaps; i++)
+ cmap.bm_len += map[i].bm_len;
+
+ error = xfs_buf_map_verify(btp, &cmap);
+ if (error)
+ return error;
+
+ pag = xfs_perag_get(btp->bt_mount,
+ xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
+
+ error = xfs_buf_lookup(pag, &cmap, flags, &bp);
+ if (error && error != -ENOENT)
+ goto out_put_perag;
+
+ /* cache hits always outnumber misses by at least 10:1 */
+ if (unlikely(!bp)) {
+ XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
+
+ if (flags & XBF_INCORE)
+ goto out_put_perag;
+
+ /* xfs_buf_find_insert() consumes the perag reference. */
+ error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
+ flags, &bp);
+ if (error)
+ return error;
+ } else {
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked);
+ xfs_perag_put(pag);
+ }
+
+ /* We do not hold a perag reference anymore. */
if (!bp->b_addr) {
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
- xfs_warn_ratelimited(target->bt_mount,
+ xfs_warn_ratelimited(btp->bt_mount,
"%s: failed to map %u pages", __func__,
bp->b_page_count);
xfs_buf_relse(bp);
@@ -700,12 +733,13 @@ found:
if (!(flags & XBF_READ))
xfs_buf_ioerror(bp, 0);
- XFS_STATS_INC(target->bt_mount, xb_get);
+ XFS_STATS_INC(btp->bt_mount, xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
*bpp = bp;
return 0;
-out_free_buf:
- xfs_buf_free(new_bp);
+
+out_put_perag:
+ xfs_perag_put(pag);
return error;
}
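The lockless lookup introduced above may only take a hold on a buffer whose count is still non-zero; a zero count means the buffer is already being torn down, so the lookup must be treated as a miss. A userspace sketch of that rule using a CAS loop (the RCU grace period itself is not modelled here):

/*
 * Stand-in for atomic_inc_not_zero(): take a reference only if the
 * hold count has not already dropped to zero.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool hold_if_live(atomic_int *hold)
{
	int old = atomic_load(hold);

	while (old != 0) {
		if (atomic_compare_exchange_weak(hold, &old, old + 1))
			return true;	/* reference taken */
		/* old was reloaded by the failed CAS; retry */
	}
	return false;			/* count hit zero: treat as cache miss */
}

int main(void)
{
	atomic_int live = 1, dying = 0;

	printf("live:  %s\n", hold_if_live(&live) ? "hit" : "miss");
	printf("dying: %s\n", hold_if_live(&dying) ? "hit" : "miss");
	return 0;
}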
@@ -868,7 +902,7 @@ xfs_buf_read_uncached(
struct xfs_buftarg *target,
xfs_daddr_t daddr,
size_t numblks,
- int flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
@@ -903,7 +937,7 @@ int
xfs_buf_get_uncached(
struct xfs_buftarg *target,
size_t numblks,
- int flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
int error;
@@ -1416,7 +1450,7 @@ xfs_buf_ioapply_map(
int map,
int *buf_offset,
int *count,
- int op)
+ blk_opf_t op)
{
int page_index;
unsigned int total_nr_pages = bp->b_page_count;
@@ -1493,7 +1527,7 @@ _xfs_buf_ioapply(
struct xfs_buf *bp)
{
struct blk_plug plug;
- int op;
+ blk_opf_t op;
int offset;
int size;
int i;
@@ -1911,7 +1945,7 @@ xfs_free_buftarg(
list_lru_destroy(&btp->bt_lru);
blkdev_issue_flush(btp->bt_bdev);
- fs_put_dax(btp->bt_daxdev);
+ fs_put_dax(btp->bt_daxdev, btp->bt_mount);
kmem_free(btp);
}
@@ -1958,13 +1992,18 @@ xfs_alloc_buftarg(
struct block_device *bdev)
{
xfs_buftarg_t *btp;
+ const struct dax_holder_operations *ops = NULL;
+#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
+ ops = &xfs_dax_holder_operations;
+#endif
btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
- btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
+ btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
+ mp, ops);
/*
* Buffer IO error rate limiting. Limit it to no more than 10 messages
@@ -1986,7 +2025,8 @@ xfs_alloc_buftarg(
btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
- if (register_shrinker(&btp->bt_shrinker))
+ if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
+ mp->m_super->s_id))
goto error_pcpu;
return btp;
@@ -2275,29 +2315,6 @@ xfs_buf_delwri_pushbuf(
return error;
}
-int __init
-xfs_buf_init(void)
-{
- xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
- SLAB_HWCACHE_ALIGN |
- SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD,
- NULL);
- if (!xfs_buf_cache)
- goto out;
-
- return 0;
-
- out:
- return -ENOMEM;
-}
-
-void
-xfs_buf_terminate(void)
-{
- kmem_cache_destroy(xfs_buf_cache);
-}
-
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
/*
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index edcb6254fa6a..549c60942208 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -15,6 +15,8 @@
#include <linux/uio.h>
#include <linux/list_lru.h>
+extern struct kmem_cache *xfs_buf_cache;
+
/*
* Base types
*/
@@ -22,28 +24,30 @@ struct xfs_buf;
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
-#define XBF_READ (1 << 0) /* buffer intended for reading from device */
-#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
-#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
-#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
-#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
-#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
-#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
-#define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */
+#define XBF_READ (1u << 0) /* buffer intended for reading from device */
+#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */
+#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */
+#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
+#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */
+#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */
+#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */
/* buffer type flags for write callbacks */
-#define _XBF_INODES (1 << 16)/* inode buffer */
-#define _XBF_DQUOTS (1 << 17)/* dquot buffer */
-#define _XBF_LOGRECOVERY (1 << 18)/* log recovery buffer */
+#define _XBF_INODES (1u << 16)/* inode buffer */
+#define _XBF_DQUOTS (1u << 17)/* dquot buffer */
+#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
/* flags used only internally */
-#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
-#define _XBF_KMEM (1 << 21)/* backed by heap memory */
-#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
+#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */
+#define _XBF_KMEM (1u << 21)/* backed by heap memory */
+#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */
/* flags used only as arguments to access routines */
-#define XBF_TRYLOCK (1 << 30)/* lock requested, but do not wait */
-#define XBF_UNMAPPED (1 << 31)/* do not map the buffer */
+#define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */
+#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */
+#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */
+
typedef unsigned int xfs_buf_flags_t;
@@ -58,11 +62,12 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_WRITE_FAIL, "WRITE_FAIL" }, \
{ _XBF_INODES, "INODES" }, \
{ _XBF_DQUOTS, "DQUOTS" }, \
- { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
+ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
/* The following interface flags should never be set */ \
+ { XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }, \
{ XBF_UNMAPPED, "UNMAPPED" }
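Why the hunks above move every flag from (1 << n) to (1u << n): with a 32-bit int, shifting a signed 1 into bit 31 is undefined behaviour in C and in practice yields a negative value, which then has to live in an unsigned flags word. A tiny demonstration, not XFS code:

/* Unsigned shifts keep the top flag bit well-defined and positive. */
#include <stdio.h>

#define FLAG_SIGNED_HI		(1 << 30)	/* highest safe signed shift */
#define FLAG_UNSIGNED_TOP	(1u << 31)	/* well-defined, stays positive */

int main(void)
{
	unsigned int flags = FLAG_UNSIGNED_TOP | FLAG_SIGNED_HI;

	printf("top bit flag = 0x%x\n", FLAG_UNSIGNED_TOP);
	printf("both set?    = %d\n",
	       (flags & (FLAG_UNSIGNED_TOP | FLAG_SIGNED_HI)) ==
	       (FLAG_UNSIGNED_TOP | FLAG_SIGNED_HI));
	return 0;
}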
@@ -193,13 +198,10 @@ struct xfs_buf {
int b_last_error;
const struct xfs_buf_ops *b_ops;
+ struct rcu_head b_rcu;
};
/* Finding and Reading Buffers */
-struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target,
- xfs_daddr_t blkno, size_t numblks,
- xfs_buf_flags_t flags);
-
int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp);
int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
@@ -210,6 +212,19 @@ void xfs_buf_readahead_map(struct xfs_buftarg *target,
const struct xfs_buf_ops *ops);
static inline int
+xfs_buf_incore(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
+{
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+
+ return xfs_buf_get_map(target, &map, 1, XBF_INCORE | flags, bpp);
+}
+
+static inline int
xfs_buf_get(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
@@ -247,11 +262,11 @@ xfs_buf_readahead(
return xfs_buf_readahead_map(target, &map, 1, ops);
}
-int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags,
- struct xfs_buf **bpp);
+int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
+ xfs_buf_flags_t flags, struct xfs_buf **bpp);
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
- size_t numblks, int flags, struct xfs_buf **bpp,
- const struct xfs_buf_ops *ops);
+ size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops);
int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags);
void xfs_buf_hold(struct xfs_buf *bp);
@@ -294,10 +309,6 @@ extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
-/* Buffer Daemon Setup Routines */
-extern int xfs_buf_init(void);
-extern void xfs_buf_terminate(void);
-
static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp)
{
return bp->b_maps[0].bm_bn;
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index e11e9ef2338f..4d8a6aece995 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -8,15 +8,18 @@
/* kernel only definitions */
+struct xfs_buf;
+struct xfs_mount;
+
/* buf log item flags */
-#define XFS_BLI_HOLD 0x01
-#define XFS_BLI_DIRTY 0x02
-#define XFS_BLI_STALE 0x04
-#define XFS_BLI_LOGGED 0x08
-#define XFS_BLI_INODE_ALLOC_BUF 0x10
-#define XFS_BLI_STALE_INODE 0x20
-#define XFS_BLI_INODE_BUF 0x40
-#define XFS_BLI_ORDERED 0x80
+#define XFS_BLI_HOLD (1u << 0)
+#define XFS_BLI_DIRTY (1u << 1)
+#define XFS_BLI_STALE (1u << 2)
+#define XFS_BLI_LOGGED (1u << 3)
+#define XFS_BLI_INODE_ALLOC_BUF (1u << 4)
+#define XFS_BLI_STALE_INODE (1u << 5)
+#define XFS_BLI_INODE_BUF (1u << 6)
+#define XFS_BLI_ORDERED (1u << 7)
#define XFS_BLI_FLAGS \
{ XFS_BLI_HOLD, "HOLD" }, \
@@ -28,11 +31,6 @@
{ XFS_BLI_INODE_BUF, "INODE_BUF" }, \
{ XFS_BLI_ORDERED, "ORDERED" }
-
-struct xfs_buf;
-struct xfs_mount;
-struct xfs_buf_log_item;
-
/*
* This is the in core log item structure used to track information
* needed to log buffers. It tracks how many times the lock has been
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index e484251dc9c8..ffa94102094d 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -24,6 +24,15 @@
#include "xfs_quota.h"
/*
+ * This is the number of entries in the l_buf_cancel_table used during
+ * recovery.
+ */
+#define XLOG_BC_TABLE_SIZE 64
+
+#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
+ ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
+
+/*
* This structure is used during recovery to record the buf log items which
* have been canceled and should not be replayed.
*/
@@ -993,3 +1002,60 @@ const struct xlog_recover_item_ops xlog_buf_item_ops = {
.commit_pass1 = xlog_recover_buf_commit_pass1,
.commit_pass2 = xlog_recover_buf_commit_pass2,
};
+
+#ifdef DEBUG
+void
+xlog_check_buf_cancel_table(
+ struct xlog *log)
+{
+ int i;
+
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+ ASSERT(list_empty(&log->l_buf_cancel_table[i]));
+}
+#endif
+
+int
+xlog_alloc_buf_cancel_table(
+ struct xlog *log)
+{
+ void *p;
+ int i;
+
+ ASSERT(log->l_buf_cancel_table == NULL);
+
+ p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head),
+ GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ log->l_buf_cancel_table = p;
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+
+ return 0;
+}
+
+void
+xlog_free_buf_cancel_table(
+ struct xlog *log)
+{
+ int i;
+
+ if (!log->l_buf_cancel_table)
+ return;
+
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) {
+ struct xfs_buf_cancel *bc;
+
+ while ((bc = list_first_entry_or_null(
+ &log->l_buf_cancel_table[i],
+ struct xfs_buf_cancel, bc_list))) {
+ list_del(&bc->bc_list);
+ kmem_free(bc);
+ }
+ }
+
+ kmem_free(log->l_buf_cancel_table);
+ log->l_buf_cancel_table = NULL;
+}
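A rough userspace analogue of the cancel table wired up above: a fixed-size array of chains, with the bucket selected by blkno modulo the table size. The entry type and TABLE_SIZE here are illustrative only:

/* Chained hash keyed by block number, like XLOG_BUF_CANCEL_BUCKET. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TABLE_SIZE 64

struct cancel_entry {
	uint64_t		blkno;
	struct cancel_entry	*next;
};

static struct cancel_entry *table[TABLE_SIZE];

static struct cancel_entry **bucket(uint64_t blkno)
{
	return &table[blkno % TABLE_SIZE];
}

static void insert(uint64_t blkno)
{
	struct cancel_entry *e = malloc(sizeof(*e));

	e->blkno = blkno;
	e->next = *bucket(blkno);
	*bucket(blkno) = e;
}

static int lookup(uint64_t blkno)
{
	struct cancel_entry *e;

	for (e = *bucket(blkno); e; e = e->next)
		if (e->blkno == blkno)
			return 1;
	return 0;
}

int main(void)
{
	insert(7);
	insert(7 + TABLE_SIZE);	/* same bucket, chained */
	printf("%d %d %d\n", lookup(7), lookup(7 + TABLE_SIZE), lookup(8));
	return 0;
}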
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index a7174a5b3203..e295fc8062d8 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -248,7 +248,7 @@ xfs_dir2_leaf_readbuf(
struct xfs_inode *dp = args->dp;
struct xfs_buf *bp = NULL;
struct xfs_da_geometry *geo = args->geo;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
struct xfs_bmbt_irec map;
struct blk_plug plug;
xfs_dir2_off_t new_off;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 0191de8ce9ce..bfc829c07f03 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -45,7 +45,7 @@ xfs_trim_extents(
*/
xfs_log_force(mp, XFS_LOG_SYNC);
- error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
if (error)
goto out_put_perag;
agf = agbp->b_addr;
@@ -114,7 +114,7 @@ xfs_trim_extents(
}
trace_xfs_discard_extent(mp, agno, fbno, flen);
- error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
+ error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS);
if (error)
goto out_del_cursor;
*blocks_trimmed += flen;
@@ -152,8 +152,8 @@ xfs_ioc_trim(
struct xfs_mount *mp,
struct fstrim_range __user *urange)
{
- struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev);
- unsigned int granularity = q->limits.discard_granularity;
+ unsigned int granularity =
+ bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
struct fstrim_range range;
xfs_daddr_t start, end, minlen;
xfs_agnumber_t start_agno, end_agno, agno;
@@ -162,7 +162,7 @@ xfs_ioc_trim(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev))
return -EOPNOTSUPP;
/*
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 5afedcbc78c7..8fb90da89787 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -136,10 +136,7 @@ xfs_qm_adjust_res_timer(
res->timer = xfs_dquot_set_timeout(mp,
ktime_get_real_seconds() + qlim->time);
} else {
- if (res->timer == 0)
- res->warnings = 0;
- else
- res->timer = 0;
+ res->timer = 0;
}
}
@@ -322,6 +319,9 @@ xfs_dquot_disk_alloc(
error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, quotip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto err_cancel;
@@ -549,7 +549,7 @@ xfs_dquot_check_type(
* at the same time. The non-user quota file can be switched between
* group and project quota uses depending on the mount options, which
* means that we can encounter the other type when we try to load quota
- * defaults. Quotacheck will soon reset the the entire quota file
+ * defaults. Quotacheck will soon reset the entire quota file
* (including the root dquot) anyway, but don't log scary corruption
* reports to dmesg.
*/
@@ -589,10 +589,6 @@ xfs_dquot_from_disk(
dqp->q_ino.count = be64_to_cpu(ddqp->d_icount);
dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount);
- dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns);
- dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns);
- dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns);
-
dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer);
dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer);
dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer);
@@ -634,9 +630,9 @@ xfs_dquot_to_disk(
ddqp->d_icount = cpu_to_be64(dqp->q_ino.count);
ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count);
- ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings);
- ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings);
- ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings);
+ ddqp->d_bwarns = 0;
+ ddqp->d_iwarns = 0;
+ ddqp->d_rtbwarns = 0;
ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer);
ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer);
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 6b5e3cf40c8b..80c8f851a2f3 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -44,14 +44,6 @@ struct xfs_dquot_res {
* in seconds since the Unix epoch.
*/
time64_t timer;
-
- /*
- * For root dquots, this is the maximum number of warnings that will
- * be issued for this quota type. Otherwise, this is the number of
- * warnings issued against this quota. Note that none of this is
- * implemented.
- */
- xfs_qwarncnt_t warnings;
};
static inline bool
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 749fd18c4f32..296faa41d81d 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -57,6 +57,9 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_REDUCE_MAX_IEXTENTS,
XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT,
XFS_RANDOM_AG_RESV_FAIL,
+ XFS_RANDOM_LARP,
+ XFS_RANDOM_DA_LEAF_SPLIT,
+ XFS_RANDOM_ATTR_LEAF_TO_NODE,
};
struct xfs_errortag_attr {
@@ -170,6 +173,9 @@ XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR);
XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
+XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP);
+XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
+XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -211,6 +217,9 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents),
XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
+ XFS_ERRORTAG_ATTR_LIST(larp),
+ XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
+ XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
NULL,
};
ATTRIBUTE_GROUPS(xfs_errortag);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 5735d5ea87ee..5191e9145e55 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -64,16 +64,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
* XFS panic tags -- allow a call to xfs_alert_tag() to be turned into
* a panic by setting xfs_panic_mask in a sysctl.
*/
-#define XFS_NO_PTAG 0
-#define XFS_PTAG_IFLUSH 0x00000001
-#define XFS_PTAG_LOGRES 0x00000002
-#define XFS_PTAG_AILDELETE 0x00000004
-#define XFS_PTAG_ERROR_REPORT 0x00000008
-#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010
-#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
-#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
-#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
-#define XFS_PTAG_VERIFIER_ERROR 0x00000100
+#define XFS_NO_PTAG 0u
+#define XFS_PTAG_IFLUSH (1u << 0)
+#define XFS_PTAG_LOGRES (1u << 1)
+#define XFS_PTAG_AILDELETE (1u << 2)
+#define XFS_PTAG_ERROR_REPORT (1u << 3)
+#define XFS_PTAG_SHUTDOWN_CORRUPT (1u << 4)
+#define XFS_PTAG_SHUTDOWN_IOERROR (1u << 5)
+#define XFS_PTAG_SHUTDOWN_LOGERROR (1u << 6)
+#define XFS_PTAG_FSBLOCK_ZERO (1u << 7)
+#define XFS_PTAG_VERIFIER_ERROR (1u << 8)
#define XFS_PTAG_STRINGS \
{ XFS_NO_PTAG, "none" }, \
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 0e50f2c9348e..27ccfcd82f04 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -11,6 +11,7 @@
#include "xfs_bit.h"
#include "xfs_shared.h"
#include "xfs_mount.h"
+#include "xfs_ag.h"
#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
@@ -58,10 +59,11 @@ xfs_efi_release(
struct xfs_efi_log_item *efip)
{
ASSERT(atomic_read(&efip->efi_refcount) > 0);
- if (atomic_dec_and_test(&efip->efi_refcount)) {
- xfs_trans_ail_delete(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_efi_item_free(efip);
- }
+ if (!atomic_dec_and_test(&efip->efi_refcount))
+ return;
+
+ xfs_trans_ail_delete(&efip->efi_item, 0);
+ xfs_efi_item_free(efip);
}
/*
@@ -186,12 +188,12 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
{
xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
uint i;
- uint len = sizeof(xfs_efi_log_format_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
- uint len32 = sizeof(xfs_efi_log_format_32_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);
- uint len64 = sizeof(xfs_efi_log_format_64_t) +
- (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);
+ uint len = sizeof(xfs_efi_log_format_t) +
+ (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
+ uint len32 = sizeof(xfs_efi_log_format_32_t) +
+ (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);
+ uint len64 = sizeof(xfs_efi_log_format_64_t) +
+ (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);
if (buf->i_len == len) {
memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
@@ -306,11 +308,20 @@ xfs_efd_item_release(
xfs_efd_item_free(efdp);
}
+static struct xfs_log_item *
+xfs_efd_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &EFD_ITEM(lip)->efd_efip->efi_item;
+}
+
static const struct xfs_item_ops xfs_efd_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_efd_item_size,
.iop_format = xfs_efd_item_format,
.iop_release = xfs_efd_item_release,
+ .iop_intent = xfs_efd_item_intent,
};
/*
@@ -380,7 +391,7 @@ xfs_trans_free_extent(
* 1.) releases the EFI and frees the EFD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
next_extent = efdp->efd_next_extent;
@@ -541,6 +552,7 @@ xfs_agfl_free_finish_item(
xfs_agnumber_t agno;
xfs_agblock_t agbno;
uint next_extent;
+ struct xfs_perag *pag;
free = container_of(item, struct xfs_extent_free_item, xefi_list);
ASSERT(free->xefi_blockcount == 1);
@@ -550,9 +562,11 @@ xfs_agfl_free_finish_item(
trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (!error)
error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo);
+ xfs_perag_put(pag);
/*
* Mark the transaction dirty, even on error. This ensures the
@@ -688,6 +702,7 @@ xfs_efi_item_relog(
}
static const struct xfs_item_ops xfs_efi_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_efi_item_size,
.iop_format = xfs_efi_item_format,
.iop_unpin = xfs_efi_item_unpin,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5bddb1e9e0b3..c6c80265c0b2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -25,6 +25,7 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
+#include <linux/dax.h>
#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
@@ -142,7 +143,7 @@ xfs_file_fsync(
{
struct xfs_inode *ip = XFS_I(file->f_mapping->host);
struct xfs_mount *mp = ip->i_mount;
- int error = 0;
+ int error, err2;
int log_flushed = 0;
trace_xfs_file_fsync(ip);
@@ -163,18 +164,21 @@ xfs_file_fsync(
* inode size in case of an extending write.
*/
if (XFS_IS_REALTIME_INODE(ip))
- blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
+ error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
else if (mp->m_logdev_targp != mp->m_ddev_targp)
- blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
+ error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
/*
* Any inode that has dirty modifications in the log is pinned. The
- * racy check here for a pinned inode while not catch modifications
+ * racy check here for a pinned inode will not catch modifications
* that happen concurrently to the fsync call, but fsync semantics
* only require to sync previously completed I/O.
*/
- if (xfs_ipincount(ip))
- error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
+ if (xfs_ipincount(ip)) {
+ err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
+ if (err2 && !error)
+ error = err2;
+ }
/*
* If we only have a single device, and the log force above was
@@ -184,8 +188,11 @@ xfs_file_fsync(
* commit.
*/
if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
- mp->m_logdev_targp == mp->m_ddev_targp)
- blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
+ mp->m_logdev_targp == mp->m_ddev_targp) {
+ err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
+ if (err2 && !error)
+ error = err2;
+ }
return error;
}
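The fsync rework above keeps running the remaining flush steps even after a failure and reports only the first error seen. A compact userspace sketch of that err2 accumulation pattern; step_a()/step_b() are stand-ins:

/* First error wins; later steps still run so caches get flushed. */
#include <errno.h>
#include <stdio.h>

static int step_a(void) { return -EIO; }	/* first failure, reported */
static int step_b(void) { return -EAGAIN; }	/* still runs, not reported */

int main(void)
{
	int error = 0, err2;

	err2 = step_a();
	if (err2 && !error)
		error = err2;

	err2 = step_b();
	if (err2 && !error)
		error = err2;

	printf("error = %d\n", error);	/* -EIO */
	return 0;
}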
@@ -225,7 +232,7 @@ xfs_file_dio_read(
ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
if (ret)
return ret;
- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0);
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -310,7 +317,7 @@ STATIC ssize_t
xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
- int *iolock)
+ unsigned int *iolock)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -410,7 +417,7 @@ restart:
spin_unlock(&ip->i_flags_lock);
out:
- return file_modified(file);
+ return kiocb_modified(iocb);
}
static int
@@ -513,7 +520,7 @@ xfs_file_dio_write_aligned(
struct kiocb *iocb,
struct iov_iter *from)
{
- int iolock = XFS_IOLOCK_SHARED;
+ unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
ret = xfs_ilock_iocb(iocb, iolock);
@@ -534,7 +541,7 @@ xfs_file_dio_write_aligned(
}
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, 0, 0);
+ &xfs_dio_write_ops, 0, NULL, 0);
out_unlock:
if (iolock)
xfs_iunlock(ip, iolock);
@@ -566,7 +573,7 @@ xfs_file_dio_write_unaligned(
{
size_t isize = i_size_read(VFS_I(ip));
size_t count = iov_iter_count(from);
- int iolock = XFS_IOLOCK_SHARED;
+ unsigned int iolock = XFS_IOLOCK_SHARED;
unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
ssize_t ret;
@@ -576,9 +583,9 @@ xfs_file_dio_write_unaligned(
* don't even bother trying the fast path in this case.
*/
if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
-retry_exclusive:
if (iocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
+retry_exclusive:
iolock = XFS_IOLOCK_EXCL;
flags = IOMAP_DIO_FORCE_WAIT;
}
@@ -612,7 +619,7 @@ retry_exclusive:
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, flags, 0);
+ &xfs_dio_write_ops, flags, NULL, 0);
/*
* Retry unaligned I/O with exclusive blocking semantics if the DIO
@@ -655,7 +662,7 @@ xfs_file_dax_write(
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- int iolock = XFS_IOLOCK_EXCL;
+ unsigned int iolock = XFS_IOLOCK_EXCL;
ssize_t ret, error = 0;
loff_t pos;
@@ -669,7 +676,7 @@ xfs_file_dax_write(
pos = iocb->ki_pos;
trace_xfs_file_dax_write(iocb, from);
- ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
+ ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
@@ -694,20 +701,17 @@ xfs_file_buffered_write(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
bool cleared_space = false;
- int iolock;
-
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EOPNOTSUPP;
+ unsigned int iolock;
write_retry:
iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
@@ -767,9 +771,7 @@ xfs_file_write_iter(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
size_t ocount = iov_iter_count(from);
@@ -811,7 +813,7 @@ xfs_wait_dax_page(
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}
-static int
+int
xfs_break_dax_layouts(
struct inode *inode,
bool *retry)
@@ -1167,12 +1169,10 @@ xfs_file_open(
struct inode *inode,
struct file *file)
{
- if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
- return -EFBIG;
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
- file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
- return 0;
+ file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
+ return generic_file_open(inode, file);
}
STATIC int
@@ -1181,7 +1181,7 @@ xfs_dir_open(
struct file *file)
{
struct xfs_inode *ip = XFS_I(inode);
- int mode;
+ unsigned int mode;
int error;
error = xfs_file_open(inode, file);
@@ -1260,6 +1260,31 @@ xfs_file_llseek(
return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
+#ifdef CONFIG_FS_DAX
+static int
+xfs_dax_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault,
+ pfn_t *pfn)
+{
+ return dax_iomap_fault(vmf, pe_size, pfn, NULL,
+ (write_fault && !vmf->cow_page) ?
+ &xfs_dax_write_iomap_ops :
+ &xfs_read_iomap_ops);
+}
+#else
+static int
+xfs_dax_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault,
+ pfn_t *pfn)
+{
+ return 0;
+}
+#endif
+
/*
* Locking for serialisation of IO during page faults. This results in a lock
* ordering of:
@@ -1291,10 +1316,7 @@ __xfs_filemap_fault(
pfn_t pfn;
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
- (write_fault && !vmf->cow_page) ?
- &xfs_direct_write_iomap_ops :
- &xfs_read_iomap_ops);
+ ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 6a3ce0f6dc9e..34b21a29c39b 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -126,13 +126,14 @@ xfs_filestream_pick_ag(
pag = xfs_perag_get(mp, ag);
if (!pag->pagf_init) {
- err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
+ err = xfs_alloc_read_agf(pag, NULL, trylock, NULL);
if (err) {
- xfs_perag_put(pag);
- if (err != -EAGAIN)
+ if (err != -EAGAIN) {
+ xfs_perag_put(pag);
return err;
+ }
/* Couldn't lock the AGF, skip this AG. */
- continue;
+ goto next_ag;
}
}
@@ -180,7 +181,7 @@ next_ag:
if (ag != startag)
continue;
- /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */
+ /* Allow sleeping in xfs_alloc_read_agf() on the 2nd pass. */
if (trylock != 0) {
trylock = 0;
continue;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 10e1cb71439e..d8337274c74d 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -450,11 +450,11 @@ xfs_getfsmap_logdev(
/* Transform a rtbitmap "record" into a fsmap */
STATIC int
xfs_getfsmap_rtdev_rtbitmap_helper(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
{
- struct xfs_mount *mp = tp->t_mountp;
struct xfs_getfsmap_info *info = priv;
struct xfs_rmap_irec irec;
xfs_daddr_t rec_daddr;
@@ -535,7 +535,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
do_div(alow.ar_startext, mp->m_sb.sb_rextsize);
if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize))
ahigh.ar_startext++;
- error = xfs_rtalloc_query_range(tp, &alow, &ahigh,
+ error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh,
xfs_getfsmap_rtdev_rtbitmap_helper, info);
if (error)
goto err;
@@ -547,7 +547,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
info->last = true;
ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext);
- error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info);
+ error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info);
if (error)
goto err;
err:
@@ -642,8 +642,7 @@ __xfs_getfsmap_datadev(
info->agf_bp = NULL;
}
- error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0,
- &info->agf_bp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &info->agf_bp);
if (error)
break;
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 68f74549fa22..13851c0d640b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -41,6 +41,7 @@ xfs_resizefs_init_new_ags(
xfs_agnumber_t oagcount,
xfs_agnumber_t nagcount,
xfs_rfsblock_t delta,
+ struct xfs_perag *last_pag,
bool *lastag_extended)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -73,7 +74,7 @@ xfs_resizefs_init_new_ags(
if (delta) {
*lastag_extended = true;
- error = xfs_ag_extend_space(mp, tp, id, delta);
+ error = xfs_ag_extend_space(last_pag, tp, delta);
}
return error;
}
@@ -96,6 +97,7 @@ xfs_growfs_data_private(
xfs_agnumber_t oagcount;
struct xfs_trans *tp;
struct aghdr_init_data id = {};
+ struct xfs_perag *last_pag;
nb = in->newblocks;
error = xfs_sb_validate_fsb_count(&mp->m_sb, nb);
@@ -128,10 +130,9 @@ xfs_growfs_data_private(
return -EINVAL;
oagcount = mp->m_sb.sb_agcount;
-
/* allocate the new per-ag structures */
if (nagcount > oagcount) {
- error = xfs_initialize_perag(mp, nagcount, &nagimax);
+ error = xfs_initialize_perag(mp, nagcount, nb, &nagimax);
if (error)
return error;
} else if (nagcount < oagcount) {
@@ -145,20 +146,17 @@ xfs_growfs_data_private(
if (error)
return error;
+ last_pag = xfs_perag_get(mp, oagcount - 1);
if (delta > 0) {
error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount,
- delta, &lastag_extended);
+ delta, last_pag, &lastag_extended);
} else {
- static struct ratelimit_state shrink_warning = \
- RATELIMIT_STATE_INIT("shrink_warning", 86400 * HZ, 1);
- ratelimit_set_flags(&shrink_warning, RATELIMIT_MSG_ON_RELEASE);
-
- if (__ratelimit(&shrink_warning))
- xfs_alert(mp,
+ xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK,
"EXPERIMENTAL online shrink feature in use. Use at your own risk!");
- error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta);
+ error = xfs_ag_shrink_space(last_pag, &tp, -delta);
}
+ xfs_perag_put(last_pag);
if (error)
goto out_trans_cancel;
@@ -349,10 +347,7 @@ xfs_fs_counts(
cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
xfs_fdblocks_unavailable(mp);
-
- spin_lock(&mp->m_sb_lock);
- cnt->freertx = mp->m_sb.sb_frextents;
- spin_unlock(&mp->m_sb_lock);
+ cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
}
/*
@@ -512,7 +507,7 @@ xfs_fs_goingdown(
void
xfs_do_force_shutdown(
struct xfs_mount *mp,
- int flags,
+ uint32_t flags,
char *fname,
int lnnum)
{
@@ -536,6 +531,9 @@ xfs_do_force_shutdown(
} else if (flags & SHUTDOWN_CORRUPT_INCORE) {
tag = XFS_PTAG_SHUTDOWN_CORRUPT;
why = "Corruption of in-memory data";
+ } else if (flags & SHUTDOWN_CORRUPT_ONDISK) {
+ tag = XFS_PTAG_SHUTDOWN_CORRUPT;
+ why = "Corruption of on-disk metadata";
} else {
tag = XFS_PTAG_SHUTDOWN_IOERROR;
why = "Metadata I/O Error";
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index f62fa652c2fd..4d0a98f920ca 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -41,5 +41,6 @@ struct xfs_globals xfs_globals = {
#endif
#ifdef DEBUG
.pwork_threads = -1, /* automatic thread detection */
+ .larp = false, /* log attribute replay */
#endif
};
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index bffd6eb0b298..2bbe7916a998 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -98,8 +98,9 @@ xfs_inode_alloc(
ip->i_ino = ino;
ip->i_mount = mp;
memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
- ip->i_afp = NULL;
ip->i_cowfp = NULL;
+ memset(&ip->i_af, 0, sizeof(ip->i_af));
+ ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
memset(&ip->i_df, 0, sizeof(ip->i_df));
ip->i_flags = 0;
ip->i_delayed_blks = 0;
@@ -111,6 +112,8 @@ xfs_inode_alloc(
INIT_WORK(&ip->i_ioend_work, xfs_end_io);
INIT_LIST_HEAD(&ip->i_ioend_list);
spin_lock_init(&ip->i_ioend_lock);
+ ip->i_next_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = NULLAGINO;
return ip;
}
@@ -130,10 +133,8 @@ xfs_inode_free_callback(
break;
}
- if (ip->i_afp) {
- xfs_idestroy_fork(ip->i_afp);
- kmem_cache_free(xfs_ifork_cache, ip->i_afp);
- }
+ xfs_ifork_zap_attr(ip);
+
if (ip->i_cowfp) {
xfs_idestroy_fork(ip->i_cowfp);
kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
@@ -440,7 +441,7 @@ xfs_inodegc_queue_all(
for_each_online_cpu(cpu) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list))
- queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
}
}
@@ -912,6 +913,7 @@ reclaim:
ip->i_checked = 0;
spin_unlock(&ip->i_flags_lock);
+ ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
@@ -1774,7 +1776,7 @@ xfs_check_delalloc(
struct xfs_inode *ip,
int whichfork)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_bmbt_irec got;
struct xfs_iext_cursor icur;
@@ -1841,8 +1843,8 @@ void
xfs_inodegc_worker(
struct work_struct *work)
{
- struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc,
- work);
+ struct xfs_inodegc *gc = container_of(to_delayed_work(work),
+ struct xfs_inodegc, work);
struct llist_node *node = llist_del_all(&gc->list);
struct xfs_inode *ip, *n;
@@ -1862,19 +1864,29 @@ xfs_inodegc_worker(
}
/*
- * Force all currently queued inode inactivation work to run immediately and
- * wait for the work to finish.
+ * Expedite all pending inodegc work to run immediately. This does not wait for
+ * completion of the work.
*/
void
-xfs_inodegc_flush(
+xfs_inodegc_push(
struct xfs_mount *mp)
{
if (!xfs_is_inodegc_enabled(mp))
return;
+ trace_xfs_inodegc_push(mp, __return_address);
+ xfs_inodegc_queue_all(mp);
+}
+/*
+ * Force all currently queued inode inactivation work to run immediately and
+ * wait for the work to finish.
+ */
+void
+xfs_inodegc_flush(
+ struct xfs_mount *mp)
+{
+ xfs_inodegc_push(mp);
trace_xfs_inodegc_flush(mp, __return_address);
-
- xfs_inodegc_queue_all(mp);
flush_workqueue(mp->m_inodegc_wq);
}
@@ -1916,13 +1928,16 @@ xfs_inodegc_want_queue_rt_file(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
- uint64_t freertx;
if (!XFS_IS_REALTIME_INODE(ip))
return false;
- freertx = READ_ONCE(mp->m_sb.sb_frextents);
- return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT];
+ if (__percpu_counter_compare(&mp->m_frextents,
+ mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
+ XFS_FDBLOCKS_BATCH) < 0)
+ return true;
+
+ return false;
}
#else
# define xfs_inodegc_want_queue_rt_file(ip) (false)
@@ -2011,6 +2026,7 @@ xfs_inodegc_queue(
struct xfs_inodegc *gc;
int items;
unsigned int shrinker_hits;
+ unsigned long queue_delay = 1;
trace_xfs_inode_set_need_inactive(ip);
spin_lock(&ip->i_flags_lock);
@@ -2022,19 +2038,26 @@ xfs_inodegc_queue(
items = READ_ONCE(gc->items);
WRITE_ONCE(gc->items, items + 1);
shrinker_hits = READ_ONCE(gc->shrinker_hits);
- put_cpu_ptr(gc);
- if (!xfs_is_inodegc_enabled(mp))
+ /*
+ * We queue the work while holding the current CPU so that the work
+ * is scheduled to run on this CPU.
+ */
+ if (!xfs_is_inodegc_enabled(mp)) {
+ put_cpu_ptr(gc);
return;
-
- if (xfs_inodegc_want_queue_work(ip, items)) {
- trace_xfs_inodegc_queue(mp, __return_address);
- queue_work(mp->m_inodegc_wq, &gc->work);
}
+ if (xfs_inodegc_want_queue_work(ip, items))
+ queue_delay = 0;
+
+ trace_xfs_inodegc_queue(mp, __return_address);
+ mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
+ put_cpu_ptr(gc);
+
if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
trace_xfs_inodegc_throttle(mp, __return_address);
- flush_work(&gc->work);
+ flush_delayed_work(&gc->work);
}
}
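xfs_inodegc_queue() above now defers new work by default and drops the delay to zero once enough items have accumulated. A small stand-alone model of that decision; BATCH_THRESHOLD is an assumed value, not the kernel's actual heuristic:

/* Default to a short delay so queueing batches; expedite when busy. */
#include <stdio.h>

#define BATCH_THRESHOLD 32

static unsigned long queue_delay_for(int items)
{
	unsigned long queue_delay = 1;	/* batch with later queueing */

	if (items >= BATCH_THRESHOLD)
		queue_delay = 0;	/* like mod_delayed_work(.., 0) */
	return queue_delay;
}

int main(void)
{
	printf("items=4  -> delay %lu\n", queue_delay_for(4));
	printf("items=64 -> delay %lu\n", queue_delay_for(64));
	return 0;
}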
@@ -2051,7 +2074,7 @@ xfs_inodegc_cpu_dead(
unsigned int count = 0;
dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
- cancel_work_sync(&dead_gc->work);
+ cancel_delayed_work_sync(&dead_gc->work);
if (llist_empty(&dead_gc->list))
return;
@@ -2070,12 +2093,12 @@ xfs_inodegc_cpu_dead(
llist_add_batch(first, last, &gc->list);
count += READ_ONCE(gc->items);
WRITE_ONCE(gc->items, count);
- put_cpu_ptr(gc);
if (xfs_is_inodegc_enabled(mp)) {
trace_xfs_inodegc_queue(mp, __return_address);
- queue_work(mp->m_inodegc_wq, &gc->work);
+ mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
}
+ put_cpu_ptr(gc);
}
/*
@@ -2170,7 +2193,7 @@ xfs_inodegc_shrinker_scan(
unsigned int h = READ_ONCE(gc->shrinker_hits);
WRITE_ONCE(gc->shrinker_hits, h + 1);
- queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
no_items = false;
}
}
@@ -2198,5 +2221,5 @@ xfs_inodegc_register_shrinker(
shrink->flags = SHRINKER_NONSLAB;
shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
- return register_shrinker(shrink);
+ return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id);
}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 2e4cfddf8b8e..6cd180721659 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp);
void xfs_blockgc_start(struct xfs_mount *mp);
void xfs_inodegc_worker(struct work_struct *work);
+void xfs_inodegc_push(struct xfs_mount *mp);
void xfs_inodegc_flush(struct xfs_mount *mp);
void xfs_inodegc_stop(struct xfs_mount *mp);
void xfs_inodegc_start(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 508e184e3b8f..b05314d48176 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,6 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
+ kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9de6205fe134..28493c8e9bb2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -20,6 +20,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
+#include "xfs_iunlink_item.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -125,13 +126,33 @@ xfs_ilock_attr_map_shared(
{
uint lock_mode = XFS_ILOCK_SHARED;
- if (ip->i_afp && xfs_need_iread_extents(ip->i_afp))
+ if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
lock_mode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lock_mode);
return lock_mode;
}
/*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
+ * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
+ * to set in lock_flags.
+ */
+static inline void
+xfs_lock_flags_assert(
+ uint lock_flags)
+{
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+ (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ ASSERT(lock_flags != 0);
+}
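+
To illustrate the invariant the new helper enforces, a few hypothetical call sites (not from this patch): any SHARED+EXCL combination within one lock class, or an empty mask, trips an ASSERT.

xfs_lock_flags_assert(XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED); /* ok: distinct classes */
xfs_lock_flags_assert(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);  /* fires: SHARED+EXCL in one class */
xfs_lock_flags_assert(0);                                  /* fires: no lock requested */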
+
+/*
* In addition to i_rwsem in the VFS inode, the xfs inode contains 2
* multi-reader locks: invalidate_lock and the i_lock. This routine allows
* various combinations of the locks to be obtained.
@@ -168,18 +189,7 @@ xfs_ilock(
{
trace_xfs_ilock(ip, lock_flags, _RET_IP_);
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL) {
down_write_nested(&VFS_I(ip)->i_rwsem,
@@ -222,18 +232,7 @@ xfs_ilock_nowait(
{
trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL) {
if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
@@ -291,19 +290,7 @@ xfs_iunlock(
xfs_inode_t *ip,
uint lock_flags)
{
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
- ASSERT(lock_flags != 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL)
up_write(&VFS_I(ip)->i_rwsem);
@@ -379,8 +366,8 @@ xfs_isilocked(
}
if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
- return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
- (lock_flags & XFS_IOLOCK_SHARED));
+ return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
+ (lock_flags & XFS_MMAPLOCK_SHARED));
}
if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
@@ -416,10 +403,12 @@ xfs_lockdep_subclass_ok(
* parent locking. Care must be taken to ensure we don't overrun the subclass
* storage fields in the class mask we build.
*/
-static inline int
-xfs_lock_inumorder(int lock_mode, int subclass)
+static inline uint
+xfs_lock_inumorder(
+ uint lock_mode,
+ uint subclass)
{
- int class = 0;
+ uint class = 0;
ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
XFS_ILOCK_RTSUM)));
@@ -464,7 +453,10 @@ xfs_lock_inodes(
int inodes,
uint lock_mode)
{
- int attempts = 0, i, j, try_lock;
+ int attempts = 0;
+ uint i;
+ int j;
+ bool try_lock;
struct xfs_log_item *lp;
/*
@@ -489,9 +481,9 @@ xfs_lock_inodes(
} else if (lock_mode & XFS_MMAPLOCK_EXCL)
ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
- try_lock = 0;
- i = 0;
again:
+ try_lock = false;
+ i = 0;
for (; i < inodes; i++) {
ASSERT(ips[i]);
@@ -506,7 +498,7 @@ again:
for (j = (i - 1); j >= 0 && !try_lock; j--) {
lp = &ips[j]->i_itemp->ili_item;
if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
- try_lock++;
+ try_lock = true;
}
}
@@ -546,8 +538,6 @@ again:
if ((attempts % 5) == 0) {
delay(1); /* Don't just spin the CPU */
}
- i = 0;
- try_lock = 0;
goto again;
}
}
@@ -646,7 +636,7 @@ xfs_ip2xflags(
flags |= FS_XFLAG_COWEXTSIZE;
}
- if (XFS_IFORK_Q(ip))
+ if (xfs_inode_has_attr_fork(ip))
flags |= FS_XFLAG_HASATTR;
return flags;
}
@@ -904,7 +894,7 @@ xfs_init_new_inode(
*/
if (init_xattrs && xfs_has_attr(mp)) {
ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
- ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0);
+ xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
}
/*
@@ -1024,11 +1014,6 @@ xfs_create(
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
- error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto out_trans_cancel;
-
/*
	 * A newly created regular or special file just has one directory
	 * entry pointing to it, but a directory also has the "." entry
@@ -1242,11 +1227,6 @@ xfs_link(
if (error)
goto std_return;
- error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto error_return;
-
/*
* If we are using project inheritance, we only allow hard link
* creation in our tree when the project IDs are the same; else
@@ -1314,8 +1294,8 @@ xfs_itruncate_clear_reflink_flags(
if (!xfs_is_reflink_inode(ip))
return;
- dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
if (cfork->if_bytes == 0)
@@ -1664,7 +1644,7 @@ xfs_inode_needs_inactive(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_ifork *cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
/*
* If the inode is already free, then there can be nothing
@@ -1783,13 +1763,12 @@ xfs_inactive(
* now. The code calls a routine that recursively deconstructs the
	 * attribute fork. It also blows away the in-core attribute fork.
*/
- if (XFS_IFORK_Q(ip)) {
+ if (xfs_inode_has_attr_fork(ip)) {
error = xfs_attr_inactive(ip);
if (error)
goto out;
}
- ASSERT(!ip->i_afp);
ASSERT(ip->i_forkoff == 0);
/*
@@ -1822,195 +1801,69 @@ out:
* because we must walk that list to find the inode that points to the inode
* being removed from the unlinked hash bucket list.
*
- * What if we modelled the unlinked list as a collection of records capturing
- * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd
- * have a fast way to look up unlinked list predecessors, which avoids the
- * slow list walk. That's exactly what we do here (in-core) with a per-AG
- * rhashtable.
- *
- * Because this is a backref cache, we ignore operational failures since the
- * iunlink code can fall back to the slow bucket walk. The only errors that
- * should bubble out are for obviously incorrect situations.
+ * Hence we keep an in-memory doubly linked list to link each inode on an
+ * unlinked list. Because there are 64 unlinked lists per AGI, keeping
+ * pointer-based lists would require 64 list heads in the perag, one for each
+ * list. This is expensive in terms of memory (think millions of AGs) and cache
+ * misses on lookups. Instead, use the fact that inodes on the unlinked list
+ * must be referenced at the VFS level to keep them on the list and hence we
+ * have an existence guarantee for inodes on the unlinked list.
*
- * All users of the backref cache MUST hold the AGI buffer lock to serialize
- * access or have otherwise provided for concurrency control.
+ * Given we have an existence guarantee, we can use lockless inode cache lookups
+ * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
+ * for the doubly linked unlinked list, and we don't need any extra locking to
+ * keep the list safe as all manipulations are done under the AGI buffer lock.
+ * Keeping the list up to date does not require memory allocation, just finding
+ * the XFS inode and updating the next/prev unlinked list aginos.
*/
-/* Capture a "X.next_unlinked = Y" relationship. */
-struct xfs_iunlink {
- struct rhash_head iu_rhash_head;
- xfs_agino_t iu_agino; /* X */
- xfs_agino_t iu_next_unlinked; /* Y */
-};
-
-/* Unlinked list predecessor lookup hashtable construction */
-static int
-xfs_iunlink_obj_cmpfn(
- struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const xfs_agino_t *key = arg->key;
- const struct xfs_iunlink *iu = obj;
-
- if (iu->iu_next_unlinked != *key)
- return 1;
- return 0;
-}
-
-static const struct rhashtable_params xfs_iunlink_hash_params = {
- .min_size = XFS_AGI_UNLINKED_BUCKETS,
- .key_len = sizeof(xfs_agino_t),
- .key_offset = offsetof(struct xfs_iunlink,
- iu_next_unlinked),
- .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head),
- .automatic_shrinking = true,
- .obj_cmpfn = xfs_iunlink_obj_cmpfn,
-};
-
/*
- * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such
- * relation is found.
+ * Find an inode on the unlinked list. This does not take references to the
+ * inode because holding the AGI buffer lock guarantees existence: only
+ * unlinked, referenced inodes can be on the unlinked inode list. If we
+ * don't find the inode in cache, then let the caller handle the situation.
*/
-static xfs_agino_t
-xfs_iunlink_lookup_backref(
+static struct xfs_inode *
+xfs_iunlink_lookup(
struct xfs_perag *pag,
xfs_agino_t agino)
{
- struct xfs_iunlink *iu;
-
- iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
- xfs_iunlink_hash_params);
- return iu ? iu->iu_agino : NULLAGINO;
-}
+ struct xfs_inode *ip;
-/*
- * Take ownership of an iunlink cache entry and insert it into the hash table.
- * If successful, the entry will be owned by the cache; if not, it is freed.
- * Either way, the caller does not own @iu after this call.
- */
-static int
-xfs_iunlink_insert_backref(
- struct xfs_perag *pag,
- struct xfs_iunlink *iu)
-{
- int error;
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
- error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
- &iu->iu_rhash_head, xfs_iunlink_hash_params);
/*
- * Fail loudly if there already was an entry because that's a sign of
- * corruption of in-memory data. Also fail loudly if we see an error
- * code we didn't anticipate from the rhashtable code. Currently we
- * only anticipate ENOMEM.
+	 * An inode that is not in memory or that is still in RCU freeing
+	 * limbo should not be found here. Warn about this and let the
+	 * caller handle the failure.
*/
- if (error) {
- WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
- kmem_free(iu);
+ if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
+ rcu_read_unlock();
+ return NULL;
}
- /*
- * Absorb any runtime errors that aren't a result of corruption because
- * this is a cache and we can always fall back to bucket list scanning.
- */
- if (error != 0 && error != -EEXIST)
- error = 0;
- return error;
+ ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM));
+ rcu_read_unlock();
+ return ip;
}
-/* Remember that @prev_agino.next_unlinked = @this_agino. */
+/* Update the prev pointer of the next agino. */
static int
-xfs_iunlink_add_backref(
+xfs_iunlink_update_backref(
struct xfs_perag *pag,
xfs_agino_t prev_agino,
- xfs_agino_t this_agino)
-{
- struct xfs_iunlink *iu;
-
- if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
- return 0;
-
- iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
- iu->iu_agino = prev_agino;
- iu->iu_next_unlinked = this_agino;
-
- return xfs_iunlink_insert_backref(pag, iu);
-}
-
-/*
- * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
- * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there
- * wasn't any such entry then we don't bother.
- */
-static int
-xfs_iunlink_change_backref(
- struct xfs_perag *pag,
- xfs_agino_t agino,
- xfs_agino_t next_unlinked)
+ xfs_agino_t next_agino)
{
- struct xfs_iunlink *iu;
- int error;
-
- /* Look up the old entry; if there wasn't one then exit. */
- iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
- xfs_iunlink_hash_params);
- if (!iu)
- return 0;
-
- /*
- * Remove the entry. This shouldn't ever return an error, but if we
- * couldn't remove the old entry we don't want to add it again to the
- * hash table, and if the entry disappeared on us then someone's
- * violated the locking rules and we need to fail loudly. Either way
- * we cannot remove the inode because internal state is or would have
- * been corrupt.
- */
- error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
- &iu->iu_rhash_head, xfs_iunlink_hash_params);
- if (error)
- return error;
+ struct xfs_inode *ip;
- /* If there is no new next entry just free our item and return. */
- if (next_unlinked == NULLAGINO) {
- kmem_free(iu);
+ /* No update necessary if we are at the end of the list. */
+ if (next_agino == NULLAGINO)
return 0;
- }
-
- /* Update the entry and re-add it to the hash table. */
- iu->iu_next_unlinked = next_unlinked;
- return xfs_iunlink_insert_backref(pag, iu);
-}
-
-/* Set up the in-core predecessor structures. */
-int
-xfs_iunlink_init(
- struct xfs_perag *pag)
-{
- return rhashtable_init(&pag->pagi_unlinked_hash,
- &xfs_iunlink_hash_params);
-}
-
-/* Free the in-core predecessor structures. */
-static void
-xfs_iunlink_free_item(
- void *ptr,
- void *arg)
-{
- struct xfs_iunlink *iu = ptr;
- bool *freed_anything = arg;
- *freed_anything = true;
- kmem_free(iu);
-}
-
-void
-xfs_iunlink_destroy(
- struct xfs_perag *pag)
-{
- bool freed_anything = false;
-
- rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
- xfs_iunlink_free_item, &freed_anything);
-
- ASSERT(freed_anything == false || xfs_is_shutdown(pag->pag_mount));
+ ip = xfs_iunlink_lookup(pag, next_agino);
+ if (!ip)
+ return -EFSCORRUPTED;
+ ip->i_prev_unlinked = prev_agino;
+ return 0;
}
/*
@@ -2029,7 +1882,7 @@ xfs_iunlink_update_bucket(
xfs_agino_t old_value;
int offset;
- ASSERT(xfs_verify_agino_or_null(tp->t_mountp, pag->pag_agno, new_agino));
+ ASSERT(xfs_verify_agino_or_null(pag, new_agino));
old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
@@ -2052,88 +1905,53 @@ xfs_iunlink_update_bucket(
return 0;
}
-/* Set an on-disk inode's next_unlinked pointer. */
-STATIC void
-xfs_iunlink_update_dinode(
- struct xfs_trans *tp,
- struct xfs_perag *pag,
- xfs_agino_t agino,
- struct xfs_buf *ibp,
- struct xfs_dinode *dip,
- struct xfs_imap *imap,
- xfs_agino_t next_agino)
-{
- struct xfs_mount *mp = tp->t_mountp;
- int offset;
-
- ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino));
-
- trace_xfs_iunlink_update_dinode(mp, pag->pag_agno, agino,
- be32_to_cpu(dip->di_next_unlinked), next_agino);
-
- dip->di_next_unlinked = cpu_to_be32(next_agino);
- offset = imap->im_boffset +
- offsetof(struct xfs_dinode, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
-}
-
-/* Set an in-core inode's unlinked pointer and return the old value. */
-STATIC int
-xfs_iunlink_update_inode(
+static int
+xfs_iunlink_insert_inode(
struct xfs_trans *tp,
- struct xfs_inode *ip,
struct xfs_perag *pag,
- xfs_agino_t next_agino,
- xfs_agino_t *old_next_agino)
+ struct xfs_buf *agibp,
+ struct xfs_inode *ip)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_dinode *dip;
- struct xfs_buf *ibp;
- xfs_agino_t old_value;
+ struct xfs_agi *agi = agibp->b_addr;
+ xfs_agino_t next_agino;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
int error;
- ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino));
-
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
- if (error)
- return error;
- dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
-
- /* Make sure the old pointer isn't garbage. */
- old_value = be32_to_cpu(dip->di_next_unlinked);
- if (!xfs_verify_agino_or_null(mp, pag->pag_agno, old_value)) {
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
- sizeof(*dip), __this_address);
- error = -EFSCORRUPTED;
- goto out;
+ /*
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the pointer isn't garbage and that this inode
+ * isn't already on the list.
+ */
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (next_agino == agino ||
+ !xfs_verify_agino_or_null(pag, next_agino)) {
+ xfs_buf_mark_corrupt(agibp);
+ return -EFSCORRUPTED;
}
/*
- * Since we're updating a linked list, we should never find that the
- * current pointer is the same as the new value, unless we're
- * terminating the list.
+ * Update the prev pointer in the next inode to point back to this
+ * inode.
*/
- *old_next_agino = old_value;
- if (old_value == next_agino) {
- if (next_agino != NULLAGINO) {
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
- dip, sizeof(*dip), __this_address);
- error = -EFSCORRUPTED;
- }
- goto out;
+ error = xfs_iunlink_update_backref(pag, agino, next_agino);
+ if (error)
+ return error;
+
+ if (next_agino != NULLAGINO) {
+ /*
+ * There is already another inode in the bucket, so point this
+ * inode to the current head of the list.
+ */
+ error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
+ if (error)
+ return error;
+ ip->i_next_unlinked = next_agino;
}
- /* Ok, update the new pointer. */
- xfs_iunlink_update_dinode(tp, pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
- ibp, dip, &ip->i_imap, next_agino);
- return 0;
-out:
- xfs_trans_brelse(tp, ibp);
- return error;
+	/* Point the head of the list at this inode. */
+ return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
}
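
For reference, the bucket selection above is plain modulo arithmetic over the 64 on-disk AGI buckets (XFS_AGI_UNLINKED_BUCKETS is 64), e.g.:

#include <assert.h>
#include <stdint.h>

#define XFS_AGI_UNLINKED_BUCKETS	64	/* on-disk constant */

int main(void)
{
	uint32_t agino = 131;
	short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;

	assert(bucket_index == 3);	/* 131 = 2 * 64 + 3 */
	return 0;
}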
/*
@@ -2150,11 +1968,7 @@ xfs_iunlink(
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_perag *pag;
- struct xfs_agi *agi;
struct xfs_buf *agibp;
- xfs_agino_t next_agino;
- xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
int error;
ASSERT(VFS_I(ip)->i_nlink == 0);
@@ -2164,202 +1978,38 @@ xfs_iunlink(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
/* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp);
+ error = xfs_read_agi(pag, tp, &agibp);
if (error)
goto out;
- agi = agibp->b_addr;
-
- /*
- * Get the index into the agi hash table for the list this inode will
- * go on. Make sure the pointer isn't garbage and that this inode
- * isn't already on the list.
- */
- next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- if (next_agino == agino ||
- !xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)) {
- xfs_buf_mark_corrupt(agibp);
- error = -EFSCORRUPTED;
- goto out;
- }
-
- if (next_agino != NULLAGINO) {
- xfs_agino_t old_agino;
-
- /*
- * There is already another inode in the bucket, so point this
- * inode to the current head of the list.
- */
- error = xfs_iunlink_update_inode(tp, ip, pag, next_agino,
- &old_agino);
- if (error)
- goto out;
- ASSERT(old_agino == NULLAGINO);
- /*
- * agino has been unlinked, add a backref from the next inode
- * back to agino.
- */
- error = xfs_iunlink_add_backref(pag, agino, next_agino);
- if (error)
- goto out;
- }
-
- /* Point the head of the list to point to this inode. */
- error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
+ error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
out:
xfs_perag_put(pag);
return error;
}
-/* Return the imap, dinode pointer, and buffer for an inode. */
-STATIC int
-xfs_iunlink_map_ino(
- struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agino_t agino,
- struct xfs_imap *imap,
- struct xfs_dinode **dipp,
- struct xfs_buf **bpp)
-{
- struct xfs_mount *mp = tp->t_mountp;
- int error;
-
- imap->im_blkno = 0;
- error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap returned error %d.",
- __func__, error);
- return error;
- }
-
- error = xfs_imap_to_bp(mp, tp, imap, bpp);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
-
- *dipp = xfs_buf_offset(*bpp, imap->im_boffset);
- return 0;
-}
-
-/*
- * Walk the unlinked chain from @head_agino until we find the inode that
- * points to @target_agino. Return the inode number, map, dinode pointer,
- * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
- *
- * @tp, @pag, @head_agino, and @target_agino are input parameters.
- * @agino, @imap, @dipp, and @bpp are all output parameters.
- *
- * Do not call this function if @target_agino is the head of the list.
- */
-STATIC int
-xfs_iunlink_map_prev(
- struct xfs_trans *tp,
- struct xfs_perag *pag,
- xfs_agino_t head_agino,
- xfs_agino_t target_agino,
- xfs_agino_t *agino,
- struct xfs_imap *imap,
- struct xfs_dinode **dipp,
- struct xfs_buf **bpp)
-{
- struct xfs_mount *mp = tp->t_mountp;
- xfs_agino_t next_agino;
- int error;
-
- ASSERT(head_agino != target_agino);
- *bpp = NULL;
-
- /* See if our backref cache can find it faster. */
- *agino = xfs_iunlink_lookup_backref(pag, target_agino);
- if (*agino != NULLAGINO) {
- error = xfs_iunlink_map_ino(tp, pag->pag_agno, *agino, imap,
- dipp, bpp);
- if (error)
- return error;
-
- if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
- return 0;
-
- /*
- * If we get here the cache contents were corrupt, so drop the
- * buffer and fall back to walking the bucket list.
- */
- xfs_trans_brelse(tp, *bpp);
- *bpp = NULL;
- WARN_ON_ONCE(1);
- }
-
- trace_xfs_iunlink_map_prev_fallback(mp, pag->pag_agno);
-
- /* Otherwise, walk the entire bucket until we find it. */
- next_agino = head_agino;
- while (next_agino != target_agino) {
- xfs_agino_t unlinked_agino;
-
- if (*bpp)
- xfs_trans_brelse(tp, *bpp);
-
- *agino = next_agino;
- error = xfs_iunlink_map_ino(tp, pag->pag_agno, next_agino, imap,
- dipp, bpp);
- if (error)
- return error;
-
- unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
- /*
- * Make sure this pointer is valid and isn't an obvious
- * infinite loop.
- */
- if (!xfs_verify_agino(mp, pag->pag_agno, unlinked_agino) ||
- next_agino == unlinked_agino) {
- XFS_CORRUPTION_ERROR(__func__,
- XFS_ERRLEVEL_LOW, mp,
- *dipp, sizeof(**dipp));
- error = -EFSCORRUPTED;
- return error;
- }
- next_agino = unlinked_agino;
- }
-
- return 0;
-}
-
-/*
- * Pull the on-disk inode from the AGI unlinked list.
- */
-STATIC int
-xfs_iunlink_remove(
+static int
+xfs_iunlink_remove_inode(
struct xfs_trans *tp,
struct xfs_perag *pag,
+ struct xfs_buf *agibp,
struct xfs_inode *ip)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi;
- struct xfs_buf *agibp;
- struct xfs_buf *last_ibp;
- struct xfs_dinode *last_dip = NULL;
+ struct xfs_agi *agi = agibp->b_addr;
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- xfs_agino_t next_agino;
xfs_agino_t head_agino;
short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
int error;
trace_xfs_iunlink_remove(ip);
- /* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp);
- if (error)
- return error;
- agi = agibp->b_addr;
-
/*
	 * Get the index into the agi hash table for the list this inode is
	 * on. Make sure the head pointer isn't garbage.
*/
head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- if (!xfs_verify_agino(mp, pag->pag_agno, head_agino)) {
+ if (!xfs_verify_agino(pag, head_agino)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
agi, sizeof(*agi));
return -EFSCORRUPTED;
@@ -2370,52 +2020,60 @@ xfs_iunlink_remove(
* the old pointer value so that we can update whatever was previous
* to us in the list to point to whatever was next in the list.
*/
- error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO, &next_agino);
+ error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
if (error)
return error;
/*
- * If there was a backref pointing from the next inode back to this
- * one, remove it because we've removed this inode from the list.
- *
- * Later, if this inode was in the middle of the list we'll update
- * this inode's backref to point from the next inode.
+	 * Update the prev pointer in the next inode to point back to the
+	 * previous inode in the chain.
*/
- if (next_agino != NULLAGINO) {
- error = xfs_iunlink_change_backref(pag, next_agino, NULLAGINO);
- if (error)
- return error;
- }
+ error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
+ ip->i_next_unlinked);
+ if (error)
+ return error;
if (head_agino != agino) {
- struct xfs_imap imap;
- xfs_agino_t prev_agino;
-
- /* We need to search the list for the inode being freed. */
- error = xfs_iunlink_map_prev(tp, pag, head_agino, agino,
- &prev_agino, &imap, &last_dip, &last_ibp);
- if (error)
- return error;
+ struct xfs_inode *prev_ip;
- /* Point the previous inode on the list to the next inode. */
- xfs_iunlink_update_dinode(tp, pag, prev_agino, last_ibp,
- last_dip, &imap, next_agino);
+ prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
+ if (!prev_ip)
+ return -EFSCORRUPTED;
- /*
- * Now we deal with the backref for this inode. If this inode
- * pointed at a real inode, change the backref that pointed to
- * us to point to our old next. If this inode was the end of
- * the list, delete the backref that pointed to us. Note that
- * change_backref takes care of deleting the backref if
- * next_agino is NULLAGINO.
- */
- return xfs_iunlink_change_backref(agibp->b_pag, agino,
- next_agino);
+ error = xfs_iunlink_log_inode(tp, prev_ip, pag,
+ ip->i_next_unlinked);
+ prev_ip->i_next_unlinked = ip->i_next_unlinked;
+ } else {
+ /* Point the head of the list to the next unlinked inode. */
+ error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
+ ip->i_next_unlinked);
}
- /* Point the head of the list to the next unlinked inode. */
- return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
- next_agino);
+ ip->i_next_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = NULLAGINO;
+ return error;
+}
+
+/*
+ * Pull the on-disk inode from the AGI unlinked list.
+ */
+STATIC int
+xfs_iunlink_remove(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ struct xfs_buf *agibp;
+ int error;
+
+ trace_xfs_iunlink_remove(ip);
+
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = xfs_read_agi(pag, tp, &agibp);
+ if (error)
+ return error;
+
+ return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
}
/*
@@ -2594,14 +2252,13 @@ xfs_ifree_cluster(
}
/*
- * This is called to return an inode to the inode free list.
- * The inode should already be truncated to 0 length and have
- * no pages associated with it. This routine also assumes that
- * the inode is already a part of the transaction.
+ * This is called to return an inode to the inode free list. The inode should
+ * already be truncated to 0 length and have no pages associated with it. This
+ * routine also assumes that the inode is already a part of the transaction.
*
- * The on-disk copy of the inode will have been added to the list
- * of unlinked inodes in the AGI. We need to remove the inode from
- * that list atomically with respect to freeing it here.
+ * The on-disk copy of the inode will have been added to the list of unlinked
+ * inodes in the AGI. We need to remove the inode from that list atomically with
+ * respect to freeing it here.
*/
int
xfs_ifree(
@@ -2623,13 +2280,16 @@ xfs_ifree(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
/*
- * Pull the on-disk inode from the AGI unlinked list.
+	 * Free the inode first so that the AGI lock is guaranteed to be
+	 * taken before we remove the inode from the unlinked list. This
+ * makes the AGI lock -> unlinked list modification order the same as
+ * used in O_TMPFILE creation.
*/
- error = xfs_iunlink_remove(tp, pag, ip);
+ error = xfs_difree(tp, pag, ip->i_ino, &xic);
if (error)
goto out;
- error = xfs_difree(tp, pag, ip->i_ino, &xic);
+ error = xfs_iunlink_remove(tp, pag, ip);
if (error)
goto out;
@@ -3051,10 +2711,12 @@ out_trans_abort:
static int
xfs_rename_alloc_whiteout(
struct user_namespace *mnt_userns,
+ struct xfs_name *src_name,
struct xfs_inode *dp,
struct xfs_inode **wip)
{
struct xfs_inode *tmpfile;
+ struct qstr name;
int error;
error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE,
@@ -3062,6 +2724,15 @@ xfs_rename_alloc_whiteout(
if (error)
return error;
+ name.name = src_name->name;
+ name.len = src_name->len;
+ error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name);
+ if (error) {
+ xfs_finish_inode_setup(tmpfile);
+ xfs_irele(tmpfile);
+ return error;
+ }
+
/*
* Prepare the tmpfile inode as if it were created through the VFS.
* Complete the inode setup and flag it as linkable. nlink is already
@@ -3112,7 +2783,8 @@ xfs_rename(
* appropriately.
*/
if (flags & RENAME_WHITEOUT) {
- error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip);
+ error = xfs_rename_alloc_whiteout(mnt_userns, src_name,
+ target_dp, &wip);
if (error)
return error;
@@ -3210,35 +2882,6 @@ retry:
/*
* Check for expected errors before we dirty the transaction
* so we can return an error without a transaction abort.
- *
- * Extent count overflow check:
- *
- * From the perspective of src_dp, a rename operation is essentially a
- * directory entry remove operation. Hence the only place where we check
- * for extent count overflow for src_dp is in
- * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns
- * -ENOSPC when it detects a possible extent count overflow and in
- * response, the higher layers of directory handling code do the
- * following:
- * 1. Data/Free blocks: XFS lets these blocks linger until a
- * future remove operation removes them.
- * 2. Dabtree blocks: XFS swaps the blocks with the last block in the
- * Leaf space and unmaps the last block.
- *
- * For target_dp, there are two cases depending on whether the
- * destination directory entry exists or not.
- *
- * When destination directory entry does not exist (i.e. target_ip ==
- * NULL), extent count overflow check is performed only when transaction
- * has a non-zero sized space reservation associated with it. With a
- * zero-sized space reservation, XFS allows a rename operation to
- * continue only when the directory has sufficient free space in its
- * data/leaf/free space blocks to hold the new entry.
- *
- * When destination directory entry exists (i.e. target_ip != NULL), all
- * we need to do is change the inode number associated with the already
- * existing entry. Hence there is no need to perform an extent count
- * overflow check.
*/
if (target_ip == NULL) {
/*
@@ -3249,12 +2892,6 @@ retry:
error = xfs_dir_canenter(tp, target_dp, target_name);
if (error)
goto out_trans_cancel;
- } else {
- error = xfs_iext_count_may_overflow(target_dp,
- XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto out_trans_cancel;
}
} else {
/*
@@ -3283,11 +2920,13 @@ retry:
if (inodes[i] == wip ||
(inodes[i] == target_ip &&
(VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
- struct xfs_buf *bp;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag;
+ struct xfs_buf *bp;
- agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino);
- error = xfs_read_agi(mp, tp, agno, &bp);
+ pag = xfs_perag_get(mp,
+ XFS_INO_TO_AGNO(mp, inodes[i]->i_ino));
+ error = xfs_read_agi(pag, tp, &bp);
+ xfs_perag_put(pag);
if (error)
goto out_trans_cancel;
}
@@ -3422,18 +3061,12 @@ retry:
* inode number of the whiteout inode rather than removing it
* altogether.
*/
- if (wip) {
+ if (wip)
error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
spaceres);
- } else {
- /*
- * NOTE: We don't need to check for extent count overflow here
- * because the dir remove name code will leave the dir block in
- * place if the extent count would overflow.
- */
+ else
error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
spaceres);
- }
if (error)
goto out_trans_cancel;
@@ -3512,13 +3145,13 @@ xfs_iflush(
goto flush_out;
}
}
- if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) >
+ if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: detected corrupt incore inode %Lu, "
- "total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
+ "%s: detected corrupt incore inode %llu, "
+		"total extents = %llu, nblocks = %lld, ptr "PTR_FMT,
__func__, ip->i_ino,
- ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp),
+ ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af),
ip->i_nblocks, ip);
goto flush_out;
}
@@ -3548,7 +3181,8 @@ xfs_iflush(
if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
xfs_ifork_verify_local_data(ip))
goto flush_out;
- if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL &&
+ if (xfs_inode_has_attr_fork(ip) &&
+ ip->i_af.if_format == XFS_DINODE_FMT_LOCAL &&
xfs_ifork_verify_local_attr(ip))
goto flush_out;
@@ -3566,7 +3200,7 @@ xfs_iflush(
}
xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
- if (XFS_IFORK_Q(ip))
+ if (xfs_inode_has_attr_fork(ip))
xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
/*
@@ -3813,6 +3447,50 @@ retry:
return 0;
}
+static int
+xfs_mmaplock_two_inodes_and_break_dax_layout(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ int error;
+ bool retry;
+ struct page *page;
+
+ if (ip1->i_ino > ip2->i_ino)
+ swap(ip1, ip2);
+
+again:
+ retry = false;
+ /* Lock the first inode */
+ xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
+ error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
+ if (error || retry) {
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ if (error == 0 && retry)
+ goto again;
+ return error;
+ }
+
+ if (ip1 == ip2)
+ return 0;
+
+ /* Nested lock the second inode */
+ xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1));
+ /*
+ * We cannot use xfs_break_dax_layouts() directly here because it may
+	 * need to unlock and relock XFS_MMAPLOCK_EXCL, which is not suitable
+	 * for this nested locking case.
+ */
+ page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
+ if (page && page_ref_count(page) != 1) {
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ goto again;
+ }
+
+ return 0;
+}
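
The i_ino swap at the top of the function above is the standard deadlock-avoidance idiom: always acquire the lower-numbered inode first. A standalone pthreads sketch of the same idea (hypothetical struct and names, not XFS code):

#include <pthread.h>

struct obj {
	unsigned long	id;
	pthread_mutex_t	lock;
};

static void lock_two(struct obj *a, struct obj *b)
{
	if (a->id > b->id) {	/* mirror the swap() above */
		struct obj *tmp = a;

		a = b;
		b = tmp;
	}
	pthread_mutex_lock(&a->lock);
	if (a != b)
		pthread_mutex_lock(&b->lock);
}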
+
/*
* Lock two inodes so that userspace cannot initiate I/O via file syscalls or
* mmap activity.
@@ -3827,8 +3505,19 @@ xfs_ilock2_io_mmap(
ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
if (ret)
return ret;
- filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
- VFS_I(ip2)->i_mapping);
+
+ if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
+ ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2);
+ if (ret) {
+ inode_unlock(VFS_I(ip2));
+ if (ip1 != ip2)
+ inode_unlock(VFS_I(ip1));
+ return ret;
+ }
+ } else
+ filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
+ VFS_I(ip2)->i_mapping);
+
return 0;
}
@@ -3838,8 +3527,14 @@ xfs_iunlock2_io_mmap(
struct xfs_inode *ip1,
struct xfs_inode *ip2)
{
- filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
- VFS_I(ip2)->i_mapping);
+ if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+ if (ip1 != ip2)
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ } else
+ filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
+ VFS_I(ip2)->i_mapping);
+
inode_unlock(VFS_I(ip2));
if (ip1 != ip2)
inode_unlock(VFS_I(ip1));
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 740ab13d1aa2..fa780f08dc89 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -33,9 +33,9 @@ typedef struct xfs_inode {
struct xfs_imap i_imap; /* location for xfs_imap() */
/* Extent information. */
- struct xfs_ifork *i_afp; /* attribute fork pointer */
struct xfs_ifork *i_cowfp; /* copy on write extents */
struct xfs_ifork i_df; /* data fork */
+ struct xfs_ifork i_af; /* attribute fork */
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
@@ -68,6 +68,10 @@ typedef struct xfs_inode {
uint64_t i_diflags2; /* XFS_DIFLAG2_... */
struct timespec64 i_crtime; /* time created */
+ /* unlinked list pointers */
+ xfs_agino_t i_next_unlinked;
+ xfs_agino_t i_prev_unlinked;
+
/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */
@@ -77,6 +81,66 @@ typedef struct xfs_inode {
struct list_head i_ioend_list;
} xfs_inode_t;
+static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip)
+{
+ return ip->i_forkoff > 0;
+}
+
+static inline struct xfs_ifork *
+xfs_ifork_ptr(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return &ip->i_df;
+ case XFS_ATTR_FORK:
+ if (!xfs_inode_has_attr_fork(ip))
+ return NULL;
+ return &ip->i_af;
+ case XFS_COW_FORK:
+ return ip->i_cowfp;
+ default:
+ ASSERT(0);
+ return NULL;
+ }
+}
+
+static inline unsigned int xfs_inode_fork_boff(struct xfs_inode *ip)
+{
+ return ip->i_forkoff << 3;
+}
+
+static inline unsigned int xfs_inode_data_fork_size(struct xfs_inode *ip)
+{
+ if (xfs_inode_has_attr_fork(ip))
+ return xfs_inode_fork_boff(ip);
+
+ return XFS_LITINO(ip->i_mount);
+}
+
+static inline unsigned int xfs_inode_attr_fork_size(struct xfs_inode *ip)
+{
+ if (xfs_inode_has_attr_fork(ip))
+ return XFS_LITINO(ip->i_mount) - xfs_inode_fork_boff(ip);
+ return 0;
+}
+
+static inline unsigned int
+xfs_inode_fork_size(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return xfs_inode_data_fork_size(ip);
+ case XFS_ATTR_FORK:
+ return xfs_inode_attr_fork_size(ip);
+ default:
+ return 0;
+ }
+}
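+
A worked example of the arithmetic, with illustrative numbers (a 336-byte literal area; the values are not taken from this patch): i_forkoff counts 8-byte units, so the shift by 3 converts it to the byte offset where the attr fork begins.

#include <assert.h>

#define LITINO	336	/* illustrative literal area size, in bytes */

int main(void)
{
	unsigned char forkoff = 15;		/* stored in 8-byte units */
	unsigned int boff = forkoff << 3;	/* byte offset of attr fork */

	assert(boff == 120);			/* data fork gets 120 bytes */
	assert(LITINO - boff == 216);		/* attr fork gets the rest */
	return 0;
}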
+
/* Convert from vfs inode to xfs inode */
static inline struct xfs_inode *XFS_I(struct inode *inode)
{
@@ -218,6 +282,11 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME;
}
+static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
+{
+ return ip->i_diflags2 & XFS_DIFLAG2_NREXT64;
+}
+
/*
* Return the buftarg used for data allocations on a given inode.
*/
@@ -278,12 +347,12 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
* 1<<16 - 1<<32-1 -- lockdep annotation (integers)
*/
-#define XFS_IOLOCK_EXCL (1<<0)
-#define XFS_IOLOCK_SHARED (1<<1)
-#define XFS_ILOCK_EXCL (1<<2)
-#define XFS_ILOCK_SHARED (1<<3)
-#define XFS_MMAPLOCK_EXCL (1<<4)
-#define XFS_MMAPLOCK_SHARED (1<<5)
+#define XFS_IOLOCK_EXCL (1u << 0)
+#define XFS_IOLOCK_SHARED (1u << 1)
+#define XFS_ILOCK_EXCL (1u << 2)
+#define XFS_ILOCK_SHARED (1u << 3)
+#define XFS_MMAPLOCK_EXCL (1u << 4)
+#define XFS_MMAPLOCK_SHARED (1u << 5)
#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
@@ -350,19 +419,19 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
*/
#define XFS_IOLOCK_SHIFT 16
#define XFS_IOLOCK_MAX_SUBCLASS 3
-#define XFS_IOLOCK_DEP_MASK 0x000f0000
+#define XFS_IOLOCK_DEP_MASK 0x000f0000u
#define XFS_MMAPLOCK_SHIFT 20
#define XFS_MMAPLOCK_NUMORDER 0
#define XFS_MMAPLOCK_MAX_SUBCLASS 3
-#define XFS_MMAPLOCK_DEP_MASK 0x00f00000
+#define XFS_MMAPLOCK_DEP_MASK 0x00f00000u
#define XFS_ILOCK_SHIFT 24
-#define XFS_ILOCK_PARENT_VAL 5
+#define XFS_ILOCK_PARENT_VAL 5u
#define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1)
-#define XFS_ILOCK_RTBITMAP_VAL 6
-#define XFS_ILOCK_RTSUM_VAL 7
-#define XFS_ILOCK_DEP_MASK 0xff000000
+#define XFS_ILOCK_RTBITMAP_VAL 6u
+#define XFS_ILOCK_RTSUM_VAL 7u
+#define XFS_ILOCK_DEP_MASK 0xff000000u
#define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT)
#define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT)
#define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT)
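
The new "u" suffixes matter because shifting a 1 into the sign bit of a signed int is undefined behaviour in C, and signed masks sign-extend when widened to 64 bits. A standalone demonstration (not XFS code):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t wide_signed = (int32_t)0xff000000;	/* sign-extends */
	uint64_t wide_unsigned = 0xff000000u;		/* zero-extends */

	assert(wide_signed == 0xffffffffff000000ull);
	assert(wide_unsigned == 0x00000000ff000000ull);
	return 0;
}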
@@ -462,6 +531,7 @@ xfs_itruncate_extents(
}
/* from xfs_file.c */
+int xfs_break_dax_layouts(struct inode *inode, bool *retry);
int xfs_break_layouts(struct inode *inode, uint *iolock,
enum layout_break_reason reason);
@@ -500,9 +570,6 @@ extern struct kmem_cache *xfs_inode_cache;
bool xfs_inode_needs_inactive(struct xfs_inode *ip);
-int xfs_iunlink_init(struct xfs_perag *pag);
-void xfs_iunlink_destroy(struct xfs_perag *pag);
-
void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 9e6ef55cf29e..6e19ece916bf 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -57,7 +57,7 @@ xfs_inode_item_data_fork_size(
ip->i_df.if_nextents > 0 &&
ip->i_df.if_bytes > 0) {
/* worst case, doesn't subtract delalloc extents */
- *nbytes += XFS_IFORK_DSIZE(ip);
+ *nbytes += xfs_inode_data_fork_size(ip);
*nvecs += 1;
}
break;
@@ -71,7 +71,7 @@ xfs_inode_item_data_fork_size(
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- *nbytes += roundup(ip->i_df.if_bytes, 4);
+ *nbytes += xlog_calc_iovec_len(ip->i_df.if_bytes);
*nvecs += 1;
}
break;
@@ -92,27 +92,27 @@ xfs_inode_item_attr_fork_size(
{
struct xfs_inode *ip = iip->ili_inode;
- switch (ip->i_afp->if_format) {
+ switch (ip->i_af.if_format) {
case XFS_DINODE_FMT_EXTENTS:
if ((iip->ili_fields & XFS_ILOG_AEXT) &&
- ip->i_afp->if_nextents > 0 &&
- ip->i_afp->if_bytes > 0) {
+ ip->i_af.if_nextents > 0 &&
+ ip->i_af.if_bytes > 0) {
/* worst case, doesn't subtract unused space */
- *nbytes += XFS_IFORK_ASIZE(ip);
+ *nbytes += xfs_inode_attr_fork_size(ip);
*nvecs += 1;
}
break;
case XFS_DINODE_FMT_BTREE:
if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
- ip->i_afp->if_broot_bytes > 0) {
- *nbytes += ip->i_afp->if_broot_bytes;
+ ip->i_af.if_broot_bytes > 0) {
+ *nbytes += ip->i_af.if_broot_bytes;
*nvecs += 1;
}
break;
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
- ip->i_afp->if_bytes > 0) {
- *nbytes += roundup(ip->i_afp->if_bytes, 4);
+ ip->i_af.if_bytes > 0) {
+ *nbytes += xlog_calc_iovec_len(ip->i_af.if_bytes);
*nvecs += 1;
}
break;
@@ -143,7 +143,7 @@ xfs_inode_item_size(
xfs_log_dinode_size(ip->i_mount);
xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
- if (XFS_IFORK_Q(ip))
+ if (xfs_inode_has_attr_fork(ip))
xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
}
@@ -204,17 +204,12 @@ xfs_inode_item_format_data_fork(
~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- /*
- * Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed
- * to be there by xfs_idata_realloc().
- */
- data_bytes = roundup(ip->i_df.if_bytes, 4);
ASSERT(ip->i_df.if_u1.if_data != NULL);
ASSERT(ip->i_disk_size > 0);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
- ip->i_df.if_u1.if_data, data_bytes);
- ilf->ilf_dsize = (unsigned)data_bytes;
+ ip->i_df.if_u1.if_data,
+ ip->i_df.if_bytes);
+ ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_DDATA;
@@ -242,18 +237,18 @@ xfs_inode_item_format_attr_fork(
struct xfs_inode *ip = iip->ili_inode;
size_t data_bytes;
- switch (ip->i_afp->if_format) {
+ switch (ip->i_af.if_format) {
case XFS_DINODE_FMT_EXTENTS:
iip->ili_fields &=
~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
if ((iip->ili_fields & XFS_ILOG_AEXT) &&
- ip->i_afp->if_nextents > 0 &&
- ip->i_afp->if_bytes > 0) {
+ ip->i_af.if_nextents > 0 &&
+ ip->i_af.if_bytes > 0) {
struct xfs_bmbt_rec *p;
- ASSERT(xfs_iext_count(ip->i_afp) ==
- ip->i_afp->if_nextents);
+ ASSERT(xfs_iext_count(&ip->i_af) ==
+ ip->i_af.if_nextents);
p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
@@ -270,13 +265,13 @@ xfs_inode_item_format_attr_fork(
~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
- ip->i_afp->if_broot_bytes > 0) {
- ASSERT(ip->i_afp->if_broot != NULL);
+ ip->i_af.if_broot_bytes > 0) {
+ ASSERT(ip->i_af.if_broot != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT,
- ip->i_afp->if_broot,
- ip->i_afp->if_broot_bytes);
- ilf->ilf_asize = ip->i_afp->if_broot_bytes;
+ ip->i_af.if_broot,
+ ip->i_af.if_broot_bytes);
+ ilf->ilf_asize = ip->i_af.if_broot_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ABROOT;
@@ -287,18 +282,12 @@ xfs_inode_item_format_attr_fork(
~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
- ip->i_afp->if_bytes > 0) {
- /*
- * Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed
- * to be there by xfs_idata_realloc().
- */
- data_bytes = roundup(ip->i_afp->if_bytes, 4);
- ASSERT(ip->i_afp->if_u1.if_data != NULL);
+ ip->i_af.if_bytes > 0) {
+ ASSERT(ip->i_af.if_u1.if_data != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
- ip->i_afp->if_u1.if_data,
- data_bytes);
- ilf->ilf_asize = (unsigned)data_bytes;
+ ip->i_af.if_u1.if_data,
+ ip->i_af.if_bytes);
+ ilf->ilf_asize = (unsigned)ip->i_af.if_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ADATA;
@@ -359,6 +348,21 @@ xfs_copy_dm_fields_to_log_dinode(
}
}
+static inline void
+xfs_inode_to_log_dinode_iext_counters(
+ struct xfs_inode *ip,
+ struct xfs_log_dinode *to)
+{
+ if (xfs_inode_has_large_extent_counts(ip)) {
+ to->di_big_nextents = xfs_ifork_nextents(&ip->i_df);
+ to->di_big_anextents = xfs_ifork_nextents(&ip->i_af);
+ to->di_nrext64_pad = 0;
+ } else {
+ to->di_nextents = xfs_ifork_nextents(&ip->i_df);
+ to->di_anextents = xfs_ifork_nextents(&ip->i_af);
+ }
+}
+
static void
xfs_inode_to_log_dinode(
struct xfs_inode *ip,
@@ -374,7 +378,6 @@ xfs_inode_to_log_dinode(
to->di_projid_lo = ip->i_projid & 0xffff;
to->di_projid_hi = ip->i_projid >> 16;
- memset(to->di_pad, 0, sizeof(to->di_pad));
memset(to->di_pad3, 0, sizeof(to->di_pad3));
to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
@@ -386,10 +389,8 @@ xfs_inode_to_log_dinode(
to->di_size = ip->i_disk_size;
to->di_nblocks = ip->i_nblocks;
to->di_extsize = ip->i_extsize;
- to->di_nextents = xfs_ifork_nextents(&ip->i_df);
- to->di_anextents = xfs_ifork_nextents(ip->i_afp);
to->di_forkoff = ip->i_forkoff;
- to->di_aformat = xfs_ifork_format(ip->i_afp);
+ to->di_aformat = xfs_ifork_format(&ip->i_af);
to->di_flags = ip->i_diflags;
xfs_copy_dm_fields_to_log_dinode(ip, to);
@@ -407,11 +408,14 @@ xfs_inode_to_log_dinode(
to->di_lsn = lsn;
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
- to->di_flushiter = 0;
+ to->di_v3_pad = 0;
} else {
to->di_version = 2;
to->di_flushiter = ip->i_flushiter;
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
+
+ xfs_inode_to_log_dinode_iext_counters(ip, to);
}
/*
@@ -476,7 +480,7 @@ xfs_inode_item_format(
xfs_inode_item_format_core(ip, lv, &vecp);
xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
- if (XFS_IFORK_Q(ip)) {
+ if (xfs_inode_has_attr_fork(ip)) {
xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
} else {
iip->ili_fields &=
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 239dd2e3384e..d28ffaebd067 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -142,6 +142,29 @@ xfs_log_dinode_to_disk_ts(
return ts;
}
+static inline bool xfs_log_dinode_has_large_extent_counts(
+ const struct xfs_log_dinode *ld)
+{
+ return ld->di_version >= 3 &&
+ (ld->di_flags2 & XFS_DIFLAG2_NREXT64);
+}
+
+static inline void
+xfs_log_dinode_to_disk_iext_counters(
+ struct xfs_log_dinode *from,
+ struct xfs_dinode *to)
+{
+ if (xfs_log_dinode_has_large_extent_counts(from)) {
+ to->di_big_nextents = cpu_to_be64(from->di_big_nextents);
+ to->di_big_anextents = cpu_to_be32(from->di_big_anextents);
+ to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad);
+ } else {
+ to->di_nextents = cpu_to_be32(from->di_nextents);
+ to->di_anextents = cpu_to_be16(from->di_anextents);
+ }
+}
+
STATIC void
xfs_log_dinode_to_disk(
struct xfs_log_dinode *from,
@@ -158,7 +181,6 @@ xfs_log_dinode_to_disk(
to->di_nlink = cpu_to_be32(from->di_nlink);
to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
- memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime);
to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime);
@@ -167,8 +189,6 @@ xfs_log_dinode_to_disk(
to->di_size = cpu_to_be64(from->di_size);
to->di_nblocks = cpu_to_be64(from->di_nblocks);
to->di_extsize = cpu_to_be32(from->di_extsize);
- to->di_nextents = cpu_to_be32(from->di_nextents);
- to->di_anextents = cpu_to_be16(from->di_anextents);
to->di_forkoff = from->di_forkoff;
to->di_aformat = from->di_aformat;
to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
@@ -184,12 +204,66 @@ xfs_log_dinode_to_disk(
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
to->di_lsn = cpu_to_be64(lsn);
- memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+ memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &from->di_uuid);
- to->di_flushiter = 0;
+ to->di_v3_pad = 0;
} else {
to->di_flushiter = cpu_to_be16(from->di_flushiter);
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
+
+ xfs_log_dinode_to_disk_iext_counters(from, to);
+}
+
+STATIC int
+xlog_dinode_verify_extent_counts(
+ struct xfs_mount *mp,
+ struct xfs_log_dinode *ldip)
+{
+ xfs_extnum_t nextents;
+ xfs_aextnum_t anextents;
+
+ if (xfs_log_dinode_has_large_extent_counts(ldip)) {
+ if (!xfs_has_large_extent_counts(mp) ||
+ (ldip->di_nrext64_pad != 0)) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode large extent count format",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, large extent counts %d, padding 0x%x",
+ ldip->di_ino, xfs_has_large_extent_counts(mp),
+ ldip->di_nrext64_pad);
+ return -EFSCORRUPTED;
+ }
+
+ nextents = ldip->di_big_nextents;
+ anextents = ldip->di_big_anextents;
+ } else {
+ if (ldip->di_version == 3 && ldip->di_v3_pad != 0) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode di_v3_pad",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, di_v3_pad 0x%llx",
+ ldip->di_ino, ldip->di_v3_pad);
+ return -EFSCORRUPTED;
+ }
+
+ nextents = ldip->di_nextents;
+ anextents = ldip->di_anextents;
+ }
+
+ if (unlikely(nextents + anextents > ldip->di_nblocks)) {
+ XFS_CORRUPTION_ERROR("Bad log dinode extent counts",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx",
+ ldip->di_ino, xfs_has_large_extent_counts(mp), nextents,
+ anextents, ldip->di_nblocks);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
}
STATIC int
@@ -317,13 +391,12 @@ xlog_recover_inode_commit_pass2(
if (unlikely(S_ISREG(ldip->di_mode))) {
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
(ldip->di_format != XFS_DINODE_FMT_BTREE)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode data fork format for regular file",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad regular inode log record, rec ptr "PTR_FMT", "
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
- __func__, item, dip, bp, in_f->ilf_ino);
+ "Bad inode 0x%llx, data fork format 0x%x",
+ in_f->ilf_ino, ldip->di_format);
error = -EFSCORRUPTED;
goto out_release;
}
@@ -331,49 +404,37 @@ xlog_recover_inode_commit_pass2(
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
(ldip->di_format != XFS_DINODE_FMT_BTREE) &&
(ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode data fork format for directory",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad dir inode log record, rec ptr "PTR_FMT", "
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
- __func__, item, dip, bp, in_f->ilf_ino);
+ "Bad inode 0x%llx, data fork format 0x%x",
+ in_f->ilf_ino, ldip->di_format);
error = -EFSCORRUPTED;
goto out_release;
}
}
- if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
- xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
- "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
- __func__, item, dip, bp, in_f->ilf_ino,
- ldip->di_nextents + ldip->di_anextents,
- ldip->di_nblocks);
- error = -EFSCORRUPTED;
+
+ error = xlog_dinode_verify_extent_counts(mp, ldip);
+ if (error)
goto out_release;
- }
+
if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR("Bad log dinode fork offset",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
- "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
- item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
+ "Bad inode 0x%llx, di_forkoff 0x%x",
+ in_f->ilf_ino, ldip->di_forkoff);
error = -EFSCORRUPTED;
goto out_release;
}
isize = xfs_log_dinode_size(mp);
if (unlikely(item->ri_buf[1].i_len > isize)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW,
+ mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad inode log record length %d, rec ptr "PTR_FMT,
- __func__, item->ri_buf[1].i_len, item);
+ "Bad inode 0x%llx log dinode size 0x%x",
+ in_f->ilf_ino, item->ri_buf[1].i_len);
error = -EFSCORRUPTED;
goto out_release;
}
@@ -401,7 +462,7 @@ xlog_recover_inode_commit_pass2(
ASSERT(in_f->ilf_size <= 4);
ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
ASSERT(!(fields & XFS_ILOG_DFORK) ||
- (len == in_f->ilf_dsize));
+ (len == xlog_calc_iovec_len(in_f->ilf_dsize)));
switch (fields & XFS_ILOG_DFORK) {
case XFS_ILOG_DDATA:
@@ -436,7 +497,7 @@ xlog_recover_inode_commit_pass2(
}
len = item->ri_buf[attr_index].i_len;
src = item->ri_buf[attr_index].i_addr;
- ASSERT(len == in_f->ilf_asize);
+ ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize));
switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
case XFS_ILOG_ADATA:
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 83481005317a..1f783e979629 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -15,6 +15,8 @@
#include "xfs_iwalk.h"
#include "xfs_itable.h"
#include "xfs_error.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -35,8 +37,7 @@
#include "xfs_health.h"
#include "xfs_reflink.h"
#include "xfs_ioctl.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
+#include "xfs_xattr.h"
#include <linux/mount.h>
#include <linux/namei.h>
@@ -524,7 +525,7 @@ xfs_attrmulti_attr_set(
args.valuelen = len;
}
- error = xfs_attr_set(&args);
+ error = xfs_attr_change(&args);
if (!error && (flags & XFS_IOC_ATTR_ROOT))
xfs_forget_acl(inode, name);
kfree(args.value);
@@ -813,6 +814,9 @@ xfs_bulk_ireq_setup(
if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
return -ECANCELED;
+ if (hdr->flags & XFS_BULK_IREQ_NREXT64)
+ breq->flags |= XFS_IBULK_NREXT64;
+
return 0;
}
@@ -951,6 +955,7 @@ xfs_ioc_ag_geometry(
struct xfs_mount *mp,
void __user *arg)
{
+ struct xfs_perag *pag;
struct xfs_ag_geometry ageo;
int error;
@@ -961,7 +966,12 @@ xfs_ioc_ag_geometry(
if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved)))
return -EINVAL;
- error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo);
+ pag = xfs_perag_get(mp, ageo.ag_number);
+ if (!pag)
+ return -EINVAL;
+
+ error = xfs_ag_get_geometry(pag, &ageo);
+ xfs_perag_put(pag);
if (error)
return error;
@@ -981,7 +991,7 @@ xfs_fill_fsxattr(
struct fileattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
fileattr_fill_xflags(fa, xfs_ip2xflags(ip));
@@ -1092,7 +1102,8 @@ xfs_flags2diflags2(
{
uint64_t di_flags2 =
(ip->i_diflags2 & (XFS_DIFLAG2_REFLINK |
- XFS_DIFLAG2_BIGTIME));
+ XFS_DIFLAG2_BIGTIME |
+ XFS_DIFLAG2_NREXT64));
if (xflags & FS_XFLAG_DAX)
di_flags2 |= XFS_DIFLAG2_DAX;
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index ca25ed89b706..2f54b701eead 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -17,6 +17,8 @@
#include "xfs_itable.h"
#include "xfs_fsops.h"
#include "xfs_rtalloc.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_ioctl.h"
#include "xfs_ioctl32.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index e552ce541ec2..07da03976ec1 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -159,7 +159,7 @@ xfs_iomap_eof_align_last_fsb(
struct xfs_inode *ip,
xfs_fileoff_t end_fsb)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
xfs_extlen_t extsz = xfs_get_extsz_hint(ip);
xfs_extlen_t align = xfs_eof_alignment(ip);
struct xfs_bmbt_irec irec;
@@ -251,6 +251,8 @@ xfs_iomap_write_direct(
return error;
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, nr_exts);
if (error)
goto out_trans_cancel;
@@ -368,7 +370,7 @@ xfs_iomap_prealloc_size(
struct xfs_iext_cursor ncur = *icur;
struct xfs_bmbt_irec prev, got;
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
int64_t freesp;
xfs_fsblock_t qblocks;
@@ -402,7 +404,7 @@ xfs_iomap_prealloc_size(
*/
plen = prev.br_blockcount;
while (xfs_iext_prev_extent(ifp, &ncur, &got)) {
- if (plen > MAXEXTLEN / 2 ||
+ if (plen > XFS_MAX_BMBT_EXTLEN / 2 ||
isnullstartblock(got.br_startblock) ||
got.br_startoff + got.br_blockcount != prev.br_startoff ||
got.br_startblock + got.br_blockcount != prev.br_startblock)
@@ -414,23 +416,23 @@ xfs_iomap_prealloc_size(
/*
* If the size of the extents is greater than half the maximum extent
* length, then use the current offset as the basis. This ensures that
- * for large files the preallocation size always extends to MAXEXTLEN
- * rather than falling short due to things like stripe unit/width
- * alignment of real extents.
+ * for large files the preallocation size always extends to
+ * XFS_BMBT_MAX_EXTLEN rather than falling short due to things like stripe
+ * unit/width alignment of real extents.
*/
alloc_blocks = plen * 2;
- if (alloc_blocks > MAXEXTLEN)
+ if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
alloc_blocks = XFS_B_TO_FSB(mp, offset);
qblocks = alloc_blocks;
/*
- * MAXEXTLEN is not a power of two value but we round the prealloc down
- * to the nearest power of two value after throttling. To prevent the
- * round down from unconditionally reducing the maximum supported
- * prealloc size, we round up first, apply appropriate throttling,
- * round down and cap the value to MAXEXTLEN.
+ * XFS_BMBT_MAX_EXTLEN is not a power of two value but we round the prealloc
+ * down to the nearest power of two value after throttling. To prevent
+ * the round down from unconditionally reducing the maximum supported
+ * prealloc size, we round up first, apply appropriate throttling, round
+ * down and cap the value to XFS_BMBT_MAX_EXTLEN.
*/
- alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
+ alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN),
alloc_blocks);
freesp = percpu_counter_read_positive(&mp->m_fdblocks);
@@ -478,14 +480,14 @@ xfs_iomap_prealloc_size(
*/
if (alloc_blocks)
alloc_blocks = rounddown_pow_of_two(alloc_blocks);
- if (alloc_blocks > MAXEXTLEN)
- alloc_blocks = MAXEXTLEN;
+ if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
+ alloc_blocks = XFS_MAX_BMBT_EXTLEN;
/*
* If we are still trying to allocate more space than is
* available, squash the prealloc hard. This can happen if we
* have a large file on a small filesystem and the above
- * lowspace thresholds are smaller than MAXEXTLEN.
+ * lowspace thresholds are smaller than XFS_BMBT_MAX_EXTLEN.
*/
while (alloc_blocks && alloc_blocks >= freesp)
alloc_blocks >>= 4;
@@ -555,6 +557,9 @@ xfs_iomap_write_unwritten(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_WRITE_UNWRITTEN_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_WRITE_UNWRITTEN_CNT);
if (error)
goto error_on_bmapi_transaction;
@@ -659,7 +664,7 @@ xfs_ilock_for_iomap(
unsigned flags,
unsigned *lockmode)
{
- unsigned mode = XFS_ILOCK_SHARED;
+ unsigned int mode = *lockmode;
bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);
/*
@@ -737,7 +742,7 @@ xfs_direct_write_iomap_begin(
int nimaps = 1, error = 0;
bool shared = false;
u16 iomap_flags = 0;
- unsigned lockmode;
+ unsigned int lockmode = XFS_ILOCK_SHARED;
ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
@@ -768,7 +773,8 @@ xfs_direct_write_iomap_begin(
/* may drop and re-acquire the ilock */
error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
- &lockmode, flags & IOMAP_DIRECT);
+ &lockmode,
+ (flags & IOMAP_DIRECT) || IS_DAX(inode));
if (error)
goto out_unlock;
if (shared)
@@ -863,6 +869,33 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
};
static int
+xfs_dax_write_iomap_end(
+ struct inode *inode,
+ loff_t pos,
+ loff_t length,
+ ssize_t written,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ if (!xfs_is_cow_inode(ip))
+ return 0;
+
+ if (!written) {
+ xfs_reflink_cancel_cow_range(ip, pos, length, true);
+ return 0;
+ }
+
+ return xfs_reflink_end_cow(ip, pos, written);
+}
+
+const struct iomap_ops xfs_dax_write_iomap_ops = {
+ .iomap_begin = xfs_direct_write_iomap_begin,
+ .iomap_end = xfs_dax_write_iomap_end,
+};
+
+static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
loff_t offset,
@@ -881,6 +914,7 @@ xfs_buffered_write_iomap_begin(
bool eof = false, cow_eof = false, shared = false;
int allocfork = XFS_DATA_FORK;
int error = 0;
+ unsigned int lockmode = XFS_ILOCK_EXCL;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -892,7 +926,9 @@ xfs_buffered_write_iomap_begin(
ASSERT(!XFS_IS_REALTIME_INODE(ip));
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
@@ -1167,7 +1203,7 @@ xfs_read_iomap_begin(
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
int nimaps = 1, error = 0;
bool shared = false;
- unsigned lockmode;
+ unsigned int lockmode = XFS_ILOCK_SHARED;
ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
@@ -1302,12 +1338,12 @@ xfs_xattr_iomap_begin(
lockmode = xfs_ilock_attr_map_shared(ip);
/* if there are no attribute fork or extents, return ENOENT */
- if (!XFS_IFORK_Q(ip) || !ip->i_afp->if_nextents) {
+ if (!xfs_inode_has_attr_fork(ip) || !ip->i_af.if_nextents) {
error = -ENOENT;
goto out_unlock;
}
- ASSERT(ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL);
+ ASSERT(ip->i_af.if_format != XFS_DINODE_FMT_LOCAL);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, XFS_BMAPI_ATTRFORK);
out_unlock:
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index e88dc162c785..c782e8c0479c 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -51,5 +51,6 @@ extern const struct iomap_ops xfs_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
+extern const struct iomap_ops xfs_dax_write_iomap_ops;
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index b34e8e4344a8..f51c60d7e205 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -13,6 +13,8 @@
#include "xfs_inode.h"
#include "xfs_acl.h"
#include "xfs_quota.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trans.h"
#include "xfs_trace.h"
@@ -22,6 +24,7 @@
#include "xfs_iomap.h"
#include "xfs_error.h"
#include "xfs_ioctl.h"
+#include "xfs_xattr.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
@@ -59,7 +62,7 @@ xfs_initxattrs(
.value = xattr->value,
.valuelen = xattr->value_len,
};
- error = xfs_attr_set(&args);
+ error = xfs_attr_change(&args);
if (error < 0)
break;
}
@@ -72,9 +75,8 @@ xfs_initxattrs(
* these attrs can be journalled at inode creation time (along with the
* inode, of course, such that log replay can't cause these to be lost).
*/
-
-STATIC int
-xfs_init_security(
+int
+xfs_inode_init_security(
struct inode *inode,
struct inode *dir,
const struct qstr *qstr)
@@ -119,7 +121,7 @@ xfs_cleanup_inode(
/* Oh, the horror.
* If we can't add the ACL or we fail in
- * xfs_init_security we must back out.
+ * xfs_inode_init_security we must back out.
* ENOSPC can hit here, among other things.
*/
xfs_dentry_to_name(&teardown, dentry);
@@ -205,11 +207,10 @@ xfs_generic_create(
inode = VFS_I(ip);
- error = xfs_init_security(inode, dir, &dentry->d_name);
+ error = xfs_inode_init_security(inode, dir, &dentry->d_name);
if (unlikely(error))
goto out_cleanup_inode;
-#ifdef CONFIG_XFS_POSIX_ACL
if (default_acl) {
error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
@@ -220,7 +221,6 @@ xfs_generic_create(
if (error)
goto out_cleanup_inode;
}
-#endif
xfs_setup_iops(ip);
@@ -423,7 +423,7 @@ xfs_vn_symlink(
inode = VFS_I(cip);
- error = xfs_init_security(inode, dir, &dentry->d_name);
+ error = xfs_inode_init_security(inode, dir, &dentry->d_name);
if (unlikely(error))
goto out_cleanup_inode;
@@ -604,6 +604,16 @@ xfs_vn_getattr(
stat->blksize = BLKDEV_IOSIZE;
stat->rdev = inode->i_rdev;
break;
+ case S_IFREG:
+ if (request_mask & STATX_DIOALIGN) {
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct block_device *bdev = target->bt_bdev;
+
+ stat->result_mask |= STATX_DIOALIGN;
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ }
+ fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
stat->rdev = 0;
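The STATX_DIOALIGN hunk above surfaces direct I/O alignment constraints to userspace. A minimal consumer sketch, assuming a statx(2) with DIO alignment support; on kernels or files without it the mask bit simply stays clear:

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <sys/stat.h>

/* Print the O_DIRECT buffer and offset alignment for a file, if known. */
static void print_dio_alignment(const char *path)
{
	struct statx stx;

	if (statx(AT_FDCWD, path, 0, STATX_DIOALIGN, &stx) != 0)
		return;
	if (stx.stx_mask & STATX_DIOALIGN)
		printf("%s: dio mem align %u, dio offset align %u\n",
		       path, stx.stx_dio_mem_align, stx.stx_dio_offset_align);
}
```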
@@ -666,13 +676,15 @@ xfs_setattr_nonsize(
uint qflags = 0;
if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
- uid = iattr->ia_uid;
+ uid = from_vfsuid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsuid);
qflags |= XFS_QMOPT_UQUOTA;
} else {
uid = inode->i_uid;
}
if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
- gid = iattr->ia_gid;
+ gid = from_vfsgid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsgid);
qflags |= XFS_QMOPT_GQUOTA;
} else {
gid = inode->i_gid;
@@ -703,13 +715,13 @@ xfs_setattr_nonsize(
* didn't have the inode locked, inode's dquot(s) would have changed
* also.
*/
- if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) &&
- !uid_eq(inode->i_uid, iattr->ia_uid)) {
+ if (XFS_IS_UQUOTA_ON(mp) &&
+ i_uid_needs_update(mnt_userns, iattr, inode)) {
ASSERT(udqp);
old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp);
}
- if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) &&
- !gid_eq(inode->i_gid, iattr->ia_gid)) {
+ if (XFS_IS_GQUOTA_ON(mp) &&
+ i_gid_needs_update(mnt_userns, iattr, inode)) {
ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp));
ASSERT(gdqp);
old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp);
@@ -1279,7 +1291,7 @@ xfs_setup_inode(
* If there is no attribute fork no ACL can exist on this inode,
* and it can't have any file capabilities attached to it either.
*/
- if (!XFS_IFORK_Q(ip)) {
+ if (!xfs_inode_has_attr_fork(ip)) {
inode_has_no_xattr(inode);
cache_no_acl(inode);
}
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 278949056048..cb5fc68c9ea0 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -17,4 +17,7 @@ extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
int xfs_vn_setattr_size(struct user_namespace *mnt_userns,
struct dentry *dentry, struct iattr *vap);
+int xfs_inode_init_security(struct inode *inode, struct inode *dir,
+ const struct qstr *qstr);
+
#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c08c79d9e311..36312b00b164 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -64,6 +64,7 @@ xfs_bulkstat_one_int(
struct xfs_inode *ip; /* incore inode pointer */
struct inode *inode;
struct xfs_bulkstat *buf = bc->buf;
+ xfs_extnum_t nextents;
int error = -EINVAL;
if (xfs_internal_inum(mp, ino))
@@ -102,10 +103,16 @@ xfs_bulkstat_one_int(
buf->bs_xflags = xfs_ip2xflags(ip);
buf->bs_extsize_blks = ip->i_extsize;
- buf->bs_extents = xfs_ifork_nextents(&ip->i_df);
+
+ nextents = xfs_ifork_nextents(&ip->i_df);
+ if (!(bc->breq->flags & XFS_IBULK_NREXT64))
+ buf->bs_extents = min(nextents, XFS_MAX_EXTCNT_DATA_FORK_SMALL);
+ else
+ buf->bs_extents64 = nextents;
+
xfs_bulkstat_health(ip, buf);
- buf->bs_aextents = xfs_ifork_nextents(ip->i_afp);
- buf->bs_forkoff = XFS_IFORK_BOFF(ip);
+ buf->bs_aextents = xfs_ifork_nextents(&ip->i_af);
+ buf->bs_forkoff = xfs_inode_fork_boff(ip);
buf->bs_version = XFS_BULKSTAT_VERSION_V5;
if (xfs_has_v3inodes(mp)) {
@@ -256,6 +263,7 @@ xfs_bulkstat(
.breq = breq,
};
struct xfs_trans *tp;
+ unsigned int iwalk_flags = 0;
int error;
if (breq->mnt_userns != &init_user_ns) {
@@ -279,7 +287,10 @@ xfs_bulkstat(
if (error)
goto out;
- error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags,
+ if (breq->flags & XFS_IBULK_SAME_AG)
+ iwalk_flags |= XFS_IWALK_SAME_AG;
+
+ error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags,
xfs_bulkstat_iwalk, breq->icount, &bc);
xfs_trans_cancel(tp);
out:
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 7078d10c9b12..e2d0eba43f35 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -17,7 +17,10 @@ struct xfs_ibulk {
};
/* Only iterate within the same AG as startino */
-#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG)
+#define XFS_IBULK_SAME_AG (1U << 0)
+
+/* Fill out the bs_extents64 field if set. */
+#define XFS_IBULK_NREXT64 (1U << 1)
/*
* Advance the user buffer pointer by one record of the given size. If the
diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c
new file mode 100644
index 000000000000..43005ce8bd48
--- /dev/null
+++ b/fs/xfs/xfs_iunlink_item.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020-2022, Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_ag.h"
+#include "xfs_iunlink_item.h"
+#include "xfs_trace.h"
+#include "xfs_error.h"
+
+struct kmem_cache *xfs_iunlink_cache;
+
+static inline struct xfs_iunlink_item *IUL_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_iunlink_item, item);
+}
+
+static void
+xfs_iunlink_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_iunlink_item *iup = IUL_ITEM(lip);
+
+ xfs_perag_put(iup->pag);
+ kmem_cache_free(xfs_iunlink_cache, IUL_ITEM(lip));
+}
+
+
+static uint64_t
+xfs_iunlink_item_sort(
+ struct xfs_log_item *lip)
+{
+ return IUL_ITEM(lip)->ip->i_ino;
+}
+
+/*
+ * Look up the inode cluster buffer and log the on-disk unlinked inode change
+ * we need to make.
+ */
+static int
+xfs_iunlink_log_dinode(
+ struct xfs_trans *tp,
+ struct xfs_iunlink_item *iup)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *ip = iup->ip;
+ struct xfs_dinode *dip;
+ struct xfs_buf *ibp;
+ int offset;
+ int error;
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
+ if (error)
+ return error;
+ /*
+ * Don't log the unlinked field on stale buffers as this may be the
+ * transaction that frees the inode cluster and relogging the buffer
+ * here will incorrectly remove the stale state.
+ */
+ if (ibp->b_flags & XBF_STALE)
+ goto out;
+
+ dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
+
+ /* Make sure the old pointer isn't garbage. */
+ if (be32_to_cpu(dip->di_next_unlinked) != iup->old_agino) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), __this_address);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ trace_xfs_iunlink_update_dinode(mp, iup->pag->pag_agno,
+ XFS_INO_TO_AGINO(mp, ip->i_ino),
+ be32_to_cpu(dip->di_next_unlinked), iup->next_agino);
+
+ dip->di_next_unlinked = cpu_to_be32(iup->next_agino);
+ offset = ip->i_imap.im_boffset +
+ offsetof(struct xfs_dinode, di_next_unlinked);
+
+ xfs_dinode_calc_crc(mp, dip);
+ xfs_trans_inode_buf(tp, ibp);
+ xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+out:
+ xfs_trans_brelse(tp, ibp);
+ return error;
+}
+
+/*
+ * On precommit, we grab the inode cluster buffer for the inode number we were
+ * passed, then update the next unlinked field for that inode in the buffer and
+ * log the buffer. This ensures that the inode cluster buffer was logged in the
+ * correct order w.r.t. other inode cluster buffers. We can then remove the
+ * iunlink item from the transaction and release it as it has now served its
+ * purpose.
+ */
+static int
+xfs_iunlink_item_precommit(
+ struct xfs_trans *tp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_iunlink_item *iup = IUL_ITEM(lip);
+ int error;
+
+ error = xfs_iunlink_log_dinode(tp, iup);
+ list_del(&lip->li_trans);
+ xfs_iunlink_item_release(lip);
+ return error;
+}
+
+static const struct xfs_item_ops xfs_iunlink_item_ops = {
+ .iop_release = xfs_iunlink_item_release,
+ .iop_sort = xfs_iunlink_item_sort,
+ .iop_precommit = xfs_iunlink_item_precommit,
+};
+
+
+/*
+ * Log the change to the on-disk unlinked list pointer of an inode. The
+ * actual update of the inode cluster buffer is deferred to transaction
+ * precommit (see xfs_iunlink_item_precommit() above).
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the iunlink item.
+ */
+int
+xfs_iunlink_log_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ xfs_agino_t next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_iunlink_item *iup;
+
+ ASSERT(xfs_verify_agino_or_null(pag, next_agino));
+ ASSERT(xfs_verify_agino_or_null(pag, ip->i_next_unlinked));
+
+ /*
+ * Since we're updating a linked list, we should never find that the
+ * current pointer is the same as the new value, unless we're
+ * terminating the list.
+ */
+ if (ip->i_next_unlinked == next_agino) {
+ if (next_agino != NULLAGINO)
+ return -EFSCORRUPTED;
+ return 0;
+ }
+
+ iup = kmem_cache_zalloc(xfs_iunlink_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(mp, &iup->item, XFS_LI_IUNLINK,
+ &xfs_iunlink_item_ops);
+
+ iup->ip = ip;
+ iup->next_agino = next_agino;
+ iup->old_agino = ip->i_next_unlinked;
+
+ atomic_inc(&pag->pag_ref);
+ iup->pag = pag;
+
+ xfs_trans_add_item(tp, &iup->item);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &iup->item.li_flags);
+ return 0;
+}
diff --git a/fs/xfs/xfs_iunlink_item.h b/fs/xfs/xfs_iunlink_item.h
new file mode 100644
index 000000000000..c793cdcaccde
--- /dev/null
+++ b/fs/xfs/xfs_iunlink_item.h
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020-2022, Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef XFS_IUNLINK_ITEM_H
+#define XFS_IUNLINK_ITEM_H 1
+
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_perag;
+
+/* in memory log item structure */
+struct xfs_iunlink_item {
+ struct xfs_log_item item;
+ struct xfs_inode *ip;
+ struct xfs_perag *pag;
+ xfs_agino_t next_agino;
+ xfs_agino_t old_agino;
+};
+
+extern struct kmem_cache *xfs_iunlink_cache;
+
+int xfs_iunlink_log_inode(struct xfs_trans *tp, struct xfs_inode *ip,
+ struct xfs_perag *pag, xfs_agino_t next_agino);
+
+#endif /* XFS_IUNLINK_ITEM_H */
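To orient readers, a hypothetical in-kernel sketch of a caller pushing an inode onto the head of its AGI unlinked list via the new item. Only xfs_iunlink_log_inode() is from this patch; the wrapper and the incore bookkeeping shown are illustrative:

```c
/*
 * Hypothetical sketch: queue the di_next_unlinked update. It is applied
 * at transaction precommit, where ->iop_sort orders items by inode
 * number so inode cluster buffers are always locked in ascending order.
 */
static int example_iunlink_insert(struct xfs_trans *tp, struct xfs_perag *pag,
				  struct xfs_inode *ip, xfs_agino_t old_head)
{
	int error;

	error = xfs_iunlink_log_inode(tp, ip, pag, old_head);
	if (error)
		return error;

	/* Track the new on-disk value in the incore inode as well. */
	ip->i_next_unlinked = old_head;
	return 0;
}
```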
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
index 37a795f03267..83699089755e 100644
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h
@@ -26,7 +26,7 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
unsigned int inode_records, bool poll, void *data);
/* Only iterate inodes within the same AG as @startino. */
-#define XFS_IWALK_SAME_AG (0x1)
+#define XFS_IWALK_SAME_AG (1U << 0)
#define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG)
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index cb9105d667db..f9878021e7d0 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -196,7 +196,7 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
}
int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
- char *data, unsigned int op);
+ char *data, enum req_op op);
#define ASSERT_ALWAYS(expr) \
(likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 499e15b24215..386b0307aed8 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -49,7 +49,6 @@ xlog_state_get_iclog_space(
int len,
struct xlog_in_core **iclog,
struct xlog_ticket *ticket,
- int *continued_write,
int *logoffsetp);
STATIC void
xlog_grant_push_ail(
@@ -58,13 +57,10 @@ xlog_grant_push_ail(
STATIC void
xlog_sync(
struct xlog *log,
- struct xlog_in_core *iclog);
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket);
#if defined(DEBUG)
STATIC void
-xlog_verify_dest_ptr(
- struct xlog *log,
- void *ptr);
-STATIC void
xlog_verify_grant_tail(
struct xlog *log);
STATIC void
@@ -77,7 +73,6 @@ xlog_verify_tail_lsn(
struct xlog *log,
struct xlog_in_core *iclog);
#else
-#define xlog_verify_dest_ptr(a,b)
#define xlog_verify_grant_tail(a)
#define xlog_verify_iclog(a,b,c)
#define xlog_verify_tail_lsn(a,b)
@@ -90,6 +85,62 @@ xlog_iclogs_empty(
static int
xfs_log_cover(struct xfs_mount *);
+/*
+ * We need to make sure the buffer pointer returned is naturally aligned for the
+ * biggest basic data type we put into it. We have already accounted for this
+ * padding when sizing the buffer.
+ *
+ * However, this padding does not get written into the log, and hence we have to
+ * track the space used by the log vectors separately to prevent log space hangs
+ * due to inaccurate accounting (i.e. a leak) of the used log space through the
+ * CIL context ticket.
+ *
+ * We also add space for the xlog_op_header that describes this region in the
+ * log. The ophdr is prepended to the data region we return to the caller to
+ * copy their data into, so do all the static initialisation of the ophdr now.
+ * Because the ophdr is not 8 byte aligned, we have to be careful to ensure
+ * that we align the start of the buffer such that the region we return to the
+ * caller is 8 byte aligned and packed against the tail of the ophdr.
+ */
+void *
+xlog_prepare_iovec(
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
+ uint type)
+{
+ struct xfs_log_iovec *vec = *vecp;
+ struct xlog_op_header *oph;
+ uint32_t len;
+ void *buf;
+
+ if (vec) {
+ ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
+ vec++;
+ } else {
+ vec = &lv->lv_iovecp[0];
+ }
+
+ len = lv->lv_buf_len + sizeof(struct xlog_op_header);
+ if (!IS_ALIGNED(len, sizeof(uint64_t))) {
+ lv->lv_buf_len = round_up(len, sizeof(uint64_t)) -
+ sizeof(struct xlog_op_header);
+ }
+
+ vec->i_type = type;
+ vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+
+ oph = vec->i_addr;
+ oph->oh_clientid = XFS_TRANSACTION;
+ oph->oh_res2 = 0;
+ oph->oh_flags = 0;
+
+ buf = vec->i_addr + sizeof(struct xlog_op_header);
+ ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t)));
+
+ *vecp = vec;
+ return buf;
+}
+
static void
xlog_grant_sub_space(
struct xlog *log,
@@ -322,30 +373,6 @@ xlog_grant_head_check(
return error;
}
-static void
-xlog_tic_reset_res(xlog_ticket_t *tic)
-{
- tic->t_res_num = 0;
- tic->t_res_arr_sum = 0;
- tic->t_res_num_ophdrs = 0;
-}
-
-static void
-xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
-{
- if (tic->t_res_num == XLOG_TIC_LEN_MAX) {
- /* add to overflow and start again */
- tic->t_res_o_flow += tic->t_res_arr_sum;
- tic->t_res_num = 0;
- tic->t_res_arr_sum = 0;
- }
-
- tic->t_res_arr[tic->t_res_num].r_len = len;
- tic->t_res_arr[tic->t_res_num].r_type = type;
- tic->t_res_arr_sum += len;
- tic->t_res_num++;
-}
-
bool
xfs_log_writable(
struct xfs_mount *mp)
@@ -395,8 +422,6 @@ xfs_log_regrant(
xlog_grant_push_ail(log, tic->t_unit_res);
tic->t_curr_res = tic->t_unit_res;
- xlog_tic_reset_res(tic);
-
if (tic->t_cnt > 0)
return 0;
@@ -434,10 +459,9 @@ out_error:
int
xfs_log_reserve(
struct xfs_mount *mp,
- int unit_bytes,
- int cnt,
+ int unit_bytes,
+ int cnt,
struct xlog_ticket **ticp,
- uint8_t client,
bool permanent)
{
struct xlog *log = mp->m_log;
@@ -445,15 +469,13 @@ xfs_log_reserve(
int need_bytes;
int error = 0;
- ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-
if (xlog_is_shutdown(log))
return -EIO;
XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
- tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent);
+ tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent);
*ticp = tic;
xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -546,7 +568,8 @@ xlog_state_shutdown_callbacks(
int
xlog_state_release_iclog(
struct xlog *log,
- struct xlog_in_core *iclog)
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket)
{
xfs_lsn_t tail_lsn;
bool last_ref;
@@ -593,7 +616,7 @@ xlog_state_release_iclog(
trace_xlog_iclog_syncing(iclog, _RET_IP_);
spin_unlock(&log->l_icloglock);
- xlog_sync(log, iclog);
+ xlog_sync(log, iclog, ticket);
spin_lock(&log->l_icloglock);
return 0;
}
@@ -860,7 +883,7 @@ xlog_force_iclog(
iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
if (iclog->ic_state == XLOG_STATE_ACTIVE)
xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
- return xlog_state_release_iclog(iclog->ic_log, iclog);
+ return xlog_state_release_iclog(iclog->ic_log, iclog, NULL);
}
/*
@@ -901,23 +924,39 @@ xlog_write_unmount_record(
struct xlog *log,
struct xlog_ticket *ticket)
{
- struct xfs_unmount_log_format ulf = {
- .magic = XLOG_UNMOUNT_TYPE,
+ struct {
+ struct xlog_op_header ophdr;
+ struct xfs_unmount_log_format ulf;
+ } unmount_rec = {
+ .ophdr = {
+ .oh_clientid = XFS_LOG,
+ .oh_tid = cpu_to_be32(ticket->t_tid),
+ .oh_flags = XLOG_UNMOUNT_TRANS,
+ },
+ .ulf = {
+ .magic = XLOG_UNMOUNT_TYPE,
+ },
};
struct xfs_log_iovec reg = {
- .i_addr = &ulf,
- .i_len = sizeof(ulf),
+ .i_addr = &unmount_rec,
+ .i_len = sizeof(unmount_rec),
.i_type = XLOG_REG_TYPE_UNMOUNT,
};
struct xfs_log_vec vec = {
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
+ LIST_HEAD(lv_chain);
+ list_add(&vec.lv_list, &lv_chain);
+
+ BUILD_BUG_ON((sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_unmount_log_format)) !=
+ sizeof(unmount_rec));
/* account for space used by record data */
- ticket->t_curr_res -= sizeof(ulf);
+ ticket->t_curr_res -= sizeof(unmount_rec);
- return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS);
+ return xlog_write(log, NULL, &lv_chain, ticket, reg.i_len);
}
/*
@@ -933,7 +972,7 @@ xlog_unmount_write(
struct xlog_ticket *tic = NULL;
int error;
- error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
+ error = xfs_log_reserve(mp, 600, 1, &tic, 0);
if (error)
goto out_err;
@@ -1584,9 +1623,6 @@ xlog_alloc_log(
GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!iclog->ic_data)
goto out_free_iclog;
-#ifdef DEBUG
- log->l_iclog_bak[i] = &iclog->ic_header;
-#endif
head = &iclog->ic_header;
memset(head, 0, sizeof(xlog_rec_header_t));
head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
@@ -1602,7 +1638,7 @@ xlog_alloc_log(
iclog->ic_log = log;
atomic_set(&iclog->ic_refcnt, 0);
INIT_LIST_HEAD(&iclog->ic_callbacks);
- iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
+ iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize;
init_waitqueue_head(&iclog->ic_force_wait);
init_waitqueue_head(&iclog->ic_write_wait);
@@ -1889,9 +1925,17 @@ xlog_write_iclog(
* device cache first to ensure all metadata writeback covered
* by the LSN in this iclog is on stable storage. This is slow,
* but it *must* complete before we issue the external log IO.
+ *
+ * If the flush fails, we cannot conclude that past metadata
+ * writeback from the log succeeded. Repeating the flush is
+ * not possible, hence we must shut down with log IO error to
+ * avoid shutdown re-entering this path and erroring out again.
*/
- if (log->l_targ != log->l_mp->m_ddev_targp)
- blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev);
+ if (log->l_targ != log->l_mp->m_ddev_targp &&
+ blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) {
+ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ return;
+ }
}
if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
iclog->ic_bio.bi_opf |= REQ_FUA;
@@ -1968,7 +2012,7 @@ xlog_calc_iclog_size(
}
/*
- * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
+ * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
* fashion. Previously, we should have moved the current iclog
* ptr in the log to point to the next available iclog. This allows further
* write to continue while this code syncs out an iclog ready to go.
@@ -1993,7 +2037,8 @@ xlog_calc_iclog_size(
STATIC void
xlog_sync(
struct xlog *log,
- struct xlog_in_core *iclog)
+ struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket)
{
unsigned int count; /* byte count of bwrite */
unsigned int roundoff; /* roundoff to BB or stripe */
@@ -2005,12 +2050,20 @@ xlog_sync(
count = xlog_calc_iclog_size(log, iclog, &roundoff);
- /* move grant heads by roundoff in sync */
- xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
- xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
+ /*
+ * If we have a ticket, account for the roundoff via the ticket
+ * reservation to avoid touching the hot grant heads needlessly.
+ * Otherwise, we have to move grant heads directly.
+ */
+ if (ticket) {
+ ticket->t_curr_res -= roundoff;
+ } else {
+ xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
+ xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
+ }
/* put cycle number in every block */
- xlog_pack_data(log, iclog, roundoff);
+ xlog_pack_data(log, iclog, roundoff);
/* real byte length */
size = iclog->ic_offset;
@@ -2060,8 +2113,6 @@ xlog_dealloc_log(
xlog_in_core_t *iclog, *next_iclog;
int i;
- xlog_cil_destroy(log);
-
/*
* Cycle all the iclogbuf locks to make sure all log IO completion
* is done before we tear down these buffers.
@@ -2073,6 +2124,13 @@ xlog_dealloc_log(
iclog = iclog->ic_next;
}
+ /*
+ * Destroy the CIL after waiting for iclog IO completion because an
+ * iclog EIO error will try to shut down the log, which accesses the
+ * CIL to wake up the waiters.
+ */
+ xlog_cil_destroy(log);
+
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
next_iclog = iclog->ic_next;
@@ -2111,63 +2169,11 @@ xlog_print_tic_res(
struct xfs_mount *mp,
struct xlog_ticket *ticket)
{
- uint i;
- uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
-
- /* match with XLOG_REG_TYPE_* in xfs_log.h */
-#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str
- static char *res_type_str[] = {
- REG_TYPE_STR(BFORMAT, "bformat"),
- REG_TYPE_STR(BCHUNK, "bchunk"),
- REG_TYPE_STR(EFI_FORMAT, "efi_format"),
- REG_TYPE_STR(EFD_FORMAT, "efd_format"),
- REG_TYPE_STR(IFORMAT, "iformat"),
- REG_TYPE_STR(ICORE, "icore"),
- REG_TYPE_STR(IEXT, "iext"),
- REG_TYPE_STR(IBROOT, "ibroot"),
- REG_TYPE_STR(ILOCAL, "ilocal"),
- REG_TYPE_STR(IATTR_EXT, "iattr_ext"),
- REG_TYPE_STR(IATTR_BROOT, "iattr_broot"),
- REG_TYPE_STR(IATTR_LOCAL, "iattr_local"),
- REG_TYPE_STR(QFORMAT, "qformat"),
- REG_TYPE_STR(DQUOT, "dquot"),
- REG_TYPE_STR(QUOTAOFF, "quotaoff"),
- REG_TYPE_STR(LRHEADER, "LR header"),
- REG_TYPE_STR(UNMOUNT, "unmount"),
- REG_TYPE_STR(COMMIT, "commit"),
- REG_TYPE_STR(TRANSHDR, "trans header"),
- REG_TYPE_STR(ICREATE, "inode create"),
- REG_TYPE_STR(RUI_FORMAT, "rui_format"),
- REG_TYPE_STR(RUD_FORMAT, "rud_format"),
- REG_TYPE_STR(CUI_FORMAT, "cui_format"),
- REG_TYPE_STR(CUD_FORMAT, "cud_format"),
- REG_TYPE_STR(BUI_FORMAT, "bui_format"),
- REG_TYPE_STR(BUD_FORMAT, "bud_format"),
- };
- BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
-#undef REG_TYPE_STR
-
xfs_warn(mp, "ticket reservation summary:");
- xfs_warn(mp, " unit res = %d bytes",
- ticket->t_unit_res);
- xfs_warn(mp, " current res = %d bytes",
- ticket->t_curr_res);
- xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)",
- ticket->t_res_arr_sum, ticket->t_res_o_flow);
- xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)",
- ticket->t_res_num_ophdrs, ophdr_spc);
- xfs_warn(mp, " ophdr + reg = %u bytes",
- ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc);
- xfs_warn(mp, " num regions = %u",
- ticket->t_res_num);
-
- for (i = 0; i < ticket->t_res_num; i++) {
- uint r_type = ticket->t_res_arr[i].r_type;
- xfs_warn(mp, "region[%u]: %s - %u bytes", i,
- ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
- "bad-rtype" : res_type_str[r_type]),
- ticket->t_res_arr[i].r_len);
- }
+ xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res);
+ xfs_warn(mp, " current res = %d bytes", ticket->t_curr_res);
+ xfs_warn(mp, " original count = %d", ticket->t_ocnt);
+ xfs_warn(mp, " remaining count = %d", ticket->t_cnt);
}
/*
@@ -2220,187 +2226,226 @@ xlog_print_trans(
}
}
+static inline void
+xlog_write_iovec(
+ struct xlog_in_core *iclog,
+ uint32_t *log_offset,
+ void *data,
+ uint32_t write_len,
+ int *bytes_left,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
+{
+ ASSERT(*log_offset < iclog->ic_log->l_iclog_size);
+ ASSERT(*log_offset % sizeof(int32_t) == 0);
+ ASSERT(write_len % sizeof(int32_t) == 0);
+
+ memcpy(iclog->ic_datap + *log_offset, data, write_len);
+ *log_offset += write_len;
+ *bytes_left -= write_len;
+ (*record_cnt)++;
+ *data_cnt += write_len;
+}
+
/*
- * Calculate the potential space needed by the log vector. We may need a start
- * record, and each region gets its own struct xlog_op_header and may need to be
- * double word aligned.
+ * Write log vectors into a single iclog which is guaranteed by the caller
+ * to have enough space to write the entire log vector into.
*/
-static int
-xlog_write_calc_vec_length(
+static void
+xlog_write_full(
+ struct xfs_log_vec *lv,
struct xlog_ticket *ticket,
- struct xfs_log_vec *log_vector,
- uint optype)
+ struct xlog_in_core *iclog,
+ uint32_t *log_offset,
+ uint32_t *len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
{
- struct xfs_log_vec *lv;
- int headers = 0;
- int len = 0;
- int i;
-
- if (optype & XLOG_START_TRANS)
- headers++;
-
- for (lv = log_vector; lv; lv = lv->lv_next) {
- /* we don't write ordered log vectors */
- if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
- continue;
+ int index;
- headers += lv->lv_niovecs;
+ ASSERT(*log_offset + *len <= iclog->ic_size ||
+ iclog->ic_state == XLOG_STATE_WANT_SYNC);
- for (i = 0; i < lv->lv_niovecs; i++) {
- struct xfs_log_iovec *vecp = &lv->lv_iovecp[i];
+ /*
+ * Ordered log vectors have no regions to write so this
+ * loop will naturally skip them.
+ */
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
+ struct xlog_op_header *ophdr = reg->i_addr;
- len += vecp->i_len;
- xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
- }
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ xlog_write_iovec(iclog, log_offset, reg->i_addr,
+ reg->i_len, len, record_cnt, data_cnt);
}
-
- ticket->t_res_num_ophdrs += headers;
- len += headers * sizeof(struct xlog_op_header);
-
- return len;
}
-static void
-xlog_write_start_rec(
- struct xlog_op_header *ophdr,
- struct xlog_ticket *ticket)
-{
- ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
- ophdr->oh_clientid = ticket->t_clientid;
- ophdr->oh_len = 0;
- ophdr->oh_flags = XLOG_START_TRANS;
- ophdr->oh_res2 = 0;
-}
-
-static xlog_op_header_t *
-xlog_write_setup_ophdr(
- struct xlog *log,
- struct xlog_op_header *ophdr,
+static int
+xlog_write_get_more_iclog_space(
struct xlog_ticket *ticket,
- uint flags)
+ struct xlog_in_core **iclogp,
+ uint32_t *log_offset,
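A plausible consumer of the new ops, sketched below: the DAX write path can drive them through dax_iomap_rw(), so CoW extents get remapped or cancelled in xfs_dax_write_iomap_end() above. The wrapper is illustrative, not necessarily the patch's actual caller:

```c
#include <linux/dax.h>
#include <linux/uio.h>

/*
 * Hypothetical sketch: dax_iomap_rw() drives ->iomap_begin/->iomap_end
 * for each mapping in the requested range.
 */
static ssize_t example_dax_write(struct kiocb *iocb, struct iov_iter *from)
{
	return dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
}
```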
+ uint32_t len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
{
- ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
- ophdr->oh_clientid = ticket->t_clientid;
- ophdr->oh_res2 = 0;
-
- /* are we copying a commit or unmount record? */
- ophdr->oh_flags = flags;
+ struct xlog_in_core *iclog = *iclogp;
+ struct xlog *log = iclog->ic_log;
+ int error;
- /*
- * We've seen logs corrupted with bad transaction client ids. This
- * makes sure that XFS doesn't generate them on. Turn this into an EIO
- * and shut down the filesystem.
- */
- switch (ophdr->oh_clientid) {
- case XFS_TRANSACTION:
- case XFS_VOLUME:
- case XFS_LOG:
- break;
- default:
- xfs_warn(log->l_mp,
- "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT,
- ophdr->oh_clientid, ticket);
- return NULL;
- }
+ spin_lock(&log->l_icloglock);
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC);
+ xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+ error = xlog_state_release_iclog(log, iclog, ticket);
+ spin_unlock(&log->l_icloglock);
+ if (error)
+ return error;
- return ophdr;
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ log_offset);
+ if (error)
+ return error;
+ *record_cnt = 0;
+ *data_cnt = 0;
+ *iclogp = iclog;
+ return 0;
}
/*
- * Set up the parameters of the region copy into the log. This has
- * to handle region write split across multiple log buffers - this
- * state is kept external to this function so that this code can
- * be written in an obvious, self documenting manner.
+ * Write a log vector into an iclog that does not have enough space for the
+ * whole log vector. We write as much of it as fits into the remaining iclog
+ * space, grab more iclog space as needed, and emit continuation opheaders
+ * until the entire log vector has been written.
*/
static int
-xlog_write_setup_copy(
+xlog_write_partial(
+ struct xfs_log_vec *lv,
struct xlog_ticket *ticket,
- struct xlog_op_header *ophdr,
- int space_available,
- int space_required,
- int *copy_off,
- int *copy_len,
- int *last_was_partial_copy,
- int *bytes_consumed)
-{
- int still_to_copy;
-
- still_to_copy = space_required - *bytes_consumed;
- *copy_off = *bytes_consumed;
-
- if (still_to_copy <= space_available) {
- /* write of region completes here */
- *copy_len = still_to_copy;
- ophdr->oh_len = cpu_to_be32(*copy_len);
- if (*last_was_partial_copy)
- ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
- *last_was_partial_copy = 0;
- *bytes_consumed = 0;
- return 0;
- }
+ struct xlog_in_core **iclogp,
+ uint32_t *log_offset,
+ uint32_t *len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
+{
+ struct xlog_in_core *iclog = *iclogp;
+ struct xlog_op_header *ophdr;
+ int index = 0;
+ uint32_t rlen;
+ int error;
- /* partial write of region, needs extra log op header reservation */
- *copy_len = space_available;
- ophdr->oh_len = cpu_to_be32(*copy_len);
- ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
- if (*last_was_partial_copy)
- ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
- *bytes_consumed += *copy_len;
- (*last_was_partial_copy)++;
+ /* walk the logvec, copying until we run out of space in the iclog */
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
+ uint32_t reg_offset = 0;
- /* account for new log op header */
- ticket->t_curr_res -= sizeof(struct xlog_op_header);
- ticket->t_res_num_ophdrs++;
+ /*
+ * The first region of a continuation must have a non-zero
+ * length otherwise log recovery will just skip over it and
+ * start recovering from the next opheader it finds. Because we
+ * mark the next opheader as a continuation, recovery will then
+ * incorrectly add the continuation to the previous region and
+ * that breaks stuff.
+ *
+ * Hence if there isn't space for region data after the
+ * opheader, then we need to start afresh with a new iclog.
+ */
+ if (iclog->ic_size - *log_offset <=
+ sizeof(struct xlog_op_header)) {
+ error = xlog_write_get_more_iclog_space(ticket,
+ &iclog, log_offset, *len, record_cnt,
+ data_cnt);
+ if (error)
+ return error;
+ }
- return sizeof(struct xlog_op_header);
-}
+ ophdr = reg->i_addr;
+ rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset);
-static int
-xlog_write_copy_finish(
- struct xlog *log,
- struct xlog_in_core *iclog,
- uint flags,
- int *record_cnt,
- int *data_cnt,
- int *partial_copy,
- int *partial_copy_len,
- int log_offset)
-{
- int error;
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header));
+ if (rlen != reg->i_len)
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+
+ xlog_write_iovec(iclog, log_offset, reg->i_addr,
+ rlen, len, record_cnt, data_cnt);
+
+ /* If we wrote the whole region, move to the next. */
+ if (rlen == reg->i_len)
+ continue;
- if (*partial_copy) {
/*
- * This iclog has already been marked WANT_SYNC by
- * xlog_state_get_iclog_space.
+ * We now have a partially written iovec, but it can span
+ * multiple iclogs so we loop here. First we release the iclog
+ * we currently have, then we get a new iclog and add a new
+ * opheader. Then we continue copying from where we were until
+ * we either complete the iovec or fill the iclog. If we
+ * complete the iovec, then we increment the index and go right
+ * back to the top of the outer loop. If we fill the iclog, we
+ * run the inner loop again.
+ *
+ * This is complicated by the tail of a region using all the
+ * space in an iclog and hence requiring us to release the iclog
+ * and get a new one before returning to the outer loop. We must
+ * always guarantee that we exit this inner loop with at least
+ * space for log transaction opheaders left in the current
+ * iclog, hence we cannot just terminate the loop at the end
+ * of the continuation. So we loop while there is no
+ * space left in the current iclog, and check for the end of the
+ * continuation after getting a new iclog.
*/
- spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
- *record_cnt = 0;
- *data_cnt = 0;
- goto release_iclog;
- }
+ do {
+ /*
+ * Ensure we include the continuation opheader in the
+ * space we need in the new iclog by adding that size
+ * to the length we require. This continuation opheader
+ * needs to be accounted to the ticket as the space it
+ * consumes hasn't been accounted to the lv we are
+ * writing.
+ */
+ error = xlog_write_get_more_iclog_space(ticket,
+ &iclog, log_offset,
+ *len + sizeof(struct xlog_op_header),
+ record_cnt, data_cnt);
+ if (error)
+ return error;
- *partial_copy = 0;
- *partial_copy_len = 0;
+ ophdr = iclog->ic_datap + *log_offset;
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_clientid = XFS_TRANSACTION;
+ ophdr->oh_res2 = 0;
+ ophdr->oh_flags = XLOG_WAS_CONT_TRANS;
- if (iclog->ic_size - log_offset > sizeof(xlog_op_header_t))
- return 0;
+ ticket->t_curr_res -= sizeof(struct xlog_op_header);
+ *log_offset += sizeof(struct xlog_op_header);
+ *data_cnt += sizeof(struct xlog_op_header);
- /* no more space in this iclog - push it. */
- spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
- *record_cnt = 0;
- *data_cnt = 0;
+ /*
+ * If rlen fits in the iclog, then end the region
+ * continuation. Otherwise we're going around again.
+ */
+ reg_offset += rlen;
+ rlen = reg->i_len - reg_offset;
+ if (rlen <= iclog->ic_size - *log_offset)
+ ophdr->oh_flags |= XLOG_END_TRANS;
+ else
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
- if (iclog->ic_state == XLOG_STATE_ACTIVE)
- xlog_state_switch_iclogs(log, iclog, 0);
- else
- ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- xlog_is_shutdown(log));
-release_iclog:
- error = xlog_state_release_iclog(log, iclog);
- spin_unlock(&log->l_icloglock);
- return error;
+ rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset);
+ ophdr->oh_len = cpu_to_be32(rlen);
+
+ xlog_write_iovec(iclog, log_offset,
+ reg->i_addr + reg_offset,
+ rlen, len, record_cnt, data_cnt);
+
+ } while (ophdr->oh_flags & XLOG_CONTINUE_TRANS);
+ }
+
+ /*
+ * No more iovecs remain in this logvec, so hand the current iclog back
+ * to the caller so it can go back to fast path copying with the next
+ * logvec.
+ */
+ *iclogp = iclog;
+ return 0;
}
/*
@@ -2447,29 +2492,18 @@ int
xlog_write(
struct xlog *log,
struct xfs_cil_ctx *ctx,
- struct xfs_log_vec *log_vector,
+ struct list_head *lv_chain,
struct xlog_ticket *ticket,
- uint optype)
+ uint32_t len)
+
{
struct xlog_in_core *iclog = NULL;
- struct xfs_log_vec *lv = log_vector;
- struct xfs_log_iovec *vecp = lv->lv_iovecp;
- int index = 0;
- int len;
- int partial_copy = 0;
- int partial_copy_len = 0;
- int contwr = 0;
- int record_cnt = 0;
- int data_cnt = 0;
+ struct xfs_log_vec *lv;
+ uint32_t record_cnt = 0;
+ uint32_t data_cnt = 0;
int error = 0;
+ int log_offset;
- /*
- * If this is a commit or unmount transaction, we don't need a start
- * record to be written. We do, however, have to account for the
- * commit or unmount header that gets written. Hence we always have
- * to account for an extra xlog_op_header here.
- */
- ticket->t_curr_res -= sizeof(struct xlog_op_header);
if (ticket->t_curr_res < 0) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"ctx ticket reservation ran out. Need to up reservation");
@@ -2477,145 +2511,54 @@ xlog_write(
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
- len = xlog_write_calc_vec_length(ticket, log_vector, optype);
- while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
- void *ptr;
- int log_offset;
-
- error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
- &contwr, &log_offset);
- if (error)
- return error;
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ &log_offset);
+ if (error)
+ return error;
- ASSERT(log_offset <= iclog->ic_size - 1);
- ptr = iclog->ic_datap + log_offset;
+ ASSERT(log_offset <= iclog->ic_size - 1);
- /*
- * If we have a context pointer, pass it the first iclog we are
- * writing to so it can record state needed for iclog write
- * ordering.
- */
- if (ctx) {
- xlog_cil_set_ctx_write_state(ctx, iclog);
- ctx = NULL;
- }
+ /*
+ * If we have a context pointer, pass it the first iclog we are
+ * writing to so it can record state needed for iclog write
+ * ordering.
+ */
+ if (ctx)
+ xlog_cil_set_ctx_write_state(ctx, iclog);
+ list_for_each_entry(lv, lv_chain, lv_list) {
/*
- * This loop writes out as many regions as can fit in the amount
- * of space which was allocated by xlog_state_get_iclog_space().
+ * If the entire log vec does not fit in the iclog, punt it to
+ * the partial copy loop which can handle this case.
*/
- while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
- struct xfs_log_iovec *reg;
- struct xlog_op_header *ophdr;
- int copy_len;
- int copy_off;
- bool ordered = false;
- bool wrote_start_rec = false;
-
- /* ordered log vectors have no regions to write */
- if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
- ASSERT(lv->lv_niovecs == 0);
- ordered = true;
- goto next_lv;
- }
-
- reg = &vecp[index];
- ASSERT(reg->i_len % sizeof(int32_t) == 0);
- ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
-
- /*
- * Before we start formatting log vectors, we need to
- * write a start record. Only do this for the first
- * iclog we write to.
- */
- if (optype & XLOG_START_TRANS) {
- xlog_write_start_rec(ptr, ticket);
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- sizeof(struct xlog_op_header));
- optype &= ~XLOG_START_TRANS;
- wrote_start_rec = true;
- }
-
- ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype);
- if (!ophdr)
- return -EIO;
-
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- sizeof(struct xlog_op_header));
-
- len += xlog_write_setup_copy(ticket, ophdr,
- iclog->ic_size-log_offset,
- reg->i_len,
- &copy_off, &copy_len,
- &partial_copy,
- &partial_copy_len);
- xlog_verify_dest_ptr(log, ptr);
-
- /*
- * Copy region.
- *
- * Unmount records just log an opheader, so can have
- * empty payloads with no data region to copy. Hence we
- * only copy the payload if the vector says it has data
- * to copy.
- */
- ASSERT(copy_len >= 0);
- if (copy_len > 0) {
- memcpy(ptr, reg->i_addr + copy_off, copy_len);
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- copy_len);
- }
- copy_len += sizeof(struct xlog_op_header);
- record_cnt++;
- if (wrote_start_rec) {
- copy_len += sizeof(struct xlog_op_header);
- record_cnt++;
- }
- data_cnt += contwr ? copy_len : 0;
-
- error = xlog_write_copy_finish(log, iclog, optype,
- &record_cnt, &data_cnt,
- &partial_copy,
- &partial_copy_len,
- log_offset);
- if (error)
+ if (lv->lv_niovecs &&
+ lv->lv_bytes > iclog->ic_size - log_offset) {
+ error = xlog_write_partial(lv, ticket, &iclog,
+ &log_offset, &len, &record_cnt,
+ &data_cnt);
+ if (error) {
+ /*
+ * We have no iclog to release, so just return
+ * the error immediately.
+ */
return error;
-
- /*
- * if we had a partial copy, we need to get more iclog
- * space but we don't want to increment the region
- * index because there is still more is this region to
- * write.
- *
- * If we completed writing this region, and we flushed
- * the iclog (indicated by resetting of the record
- * count), then we also need to get more log space. If
- * this was the last record, though, we are done and
- * can just return.
- */
- if (partial_copy)
- break;
-
- if (++index == lv->lv_niovecs) {
-next_lv:
- lv = lv->lv_next;
- index = 0;
- if (lv)
- vecp = lv->lv_iovecp;
- }
- if (record_cnt == 0 && !ordered) {
- if (!lv)
- return 0;
- break;
}
+ } else {
+ xlog_write_full(lv, ticket, iclog, &log_offset,
+ &len, &record_cnt, &data_cnt);
}
}
-
ASSERT(len == 0);
+ /*
+ * We've already been guaranteed that the last writes will fit inside
+ * the current iclog, and hence it will already have the space used by
+ * those writes accounted to it. Hence we do not need to update the
+ * iclog with the number of bytes written here.
+ */
spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
- error = xlog_state_release_iclog(log, iclog);
+ xlog_state_finish_copy(log, iclog, record_cnt, 0);
+ error = xlog_state_release_iclog(log, iclog, ticket);
spin_unlock(&log->l_icloglock);
return error;
@@ -2971,7 +2914,6 @@ xlog_state_get_iclog_space(
int len,
struct xlog_in_core **iclogp,
struct xlog_ticket *ticket,
- int *continued_write,
int *logoffsetp)
{
int log_offset;
@@ -3008,9 +2950,6 @@ restart:
*/
if (log_offset == 0) {
ticket->t_curr_res -= log->l_iclog_hsize;
- xlog_tic_add_region(ticket,
- log->l_iclog_hsize,
- XLOG_REG_TYPE_LRHEADER);
head->h_cycle = cpu_to_be32(log->l_curr_cycle);
head->h_lsn = cpu_to_be64(
xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
@@ -3039,7 +2978,7 @@ restart:
* reference to the iclog.
*/
if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, ticket);
spin_unlock(&log->l_icloglock);
if (error)
return error;
@@ -3052,13 +2991,10 @@ restart:
* iclogs (to mark it taken), this particular iclog will release/sync
* to disk in xlog_write().
*/
- if (len <= iclog->ic_size - iclog->ic_offset) {
- *continued_write = 0;
+ if (len <= iclog->ic_size - iclog->ic_offset)
iclog->ic_offset += len;
- } else {
- *continued_write = 1;
+ else
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
- }
*iclogp = iclog;
ASSERT(iclog->ic_offset <= iclog->ic_size);
@@ -3090,7 +3026,6 @@ xfs_log_ticket_regrant(
xlog_grant_sub_space(log, &log->l_write_head.grant,
ticket->t_curr_res);
ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
trace_xfs_log_ticket_regrant_sub(log, ticket);
@@ -3101,7 +3036,6 @@ xfs_log_ticket_regrant(
trace_xfs_log_ticket_regrant_exit(log, ticket);
ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
}
xfs_log_ticket_put(ticket);
@@ -3492,7 +3426,8 @@ xfs_log_ticket_get(
static int
xlog_calc_unit_res(
struct xlog *log,
- int unit_bytes)
+ int unit_bytes,
+ int *niclogs)
{
int iclog_space;
uint num_headers;
@@ -3572,6 +3507,8 @@ xlog_calc_unit_res(
/* roundoff padding for transaction data and one for commit record */
unit_bytes += 2 * log->l_iclog_roundoff;
+ if (niclogs)
+ *niclogs = num_headers;
return unit_bytes;
}
@@ -3580,7 +3517,7 @@ xfs_log_calc_unit_res(
struct xfs_mount *mp,
int unit_bytes)
{
- return xlog_calc_unit_res(mp->m_log, unit_bytes);
+ return xlog_calc_unit_res(mp->m_log, unit_bytes, NULL);
}
/*
@@ -3591,7 +3528,6 @@ xlog_ticket_alloc(
struct xlog *log,
int unit_bytes,
int cnt,
- char client,
bool permanent)
{
struct xlog_ticket *tic;
@@ -3599,7 +3535,7 @@ xlog_ticket_alloc(
tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL);
- unit_res = xlog_calc_unit_res(log, unit_bytes);
+ unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs);
atomic_set(&tic->t_ref, 1);
tic->t_task = current;
@@ -3609,40 +3545,14 @@ xlog_ticket_alloc(
tic->t_cnt = cnt;
tic->t_ocnt = cnt;
tic->t_tid = prandom_u32();
- tic->t_clientid = client;
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
- xlog_tic_reset_res(tic);
-
return tic;
}
#if defined(DEBUG)
/*
- * Make sure that the destination ptr is within the valid data region of
- * one of the iclogs. This uses backup pointers stored in a different
- * part of the log in case we trash the log structure.
- */
-STATIC void
-xlog_verify_dest_ptr(
- struct xlog *log,
- void *ptr)
-{
- int i;
- int good_ptr = 0;
-
- for (i = 0; i < log->l_iclog_bufs; i++) {
- if (ptr >= log->l_iclog_bak[i] &&
- ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
- good_ptr++;
- }
-
- if (!good_ptr)
- xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
-}
-
-/*
* Check to make sure the grant write head didn't just over lap the tail. If
* the cycles are the same, we can't be overlapping. Otherwise, make sure that
* the cycles differ by exactly one and check the byte count.
@@ -3769,7 +3679,7 @@ xlog_verify_iclog(
if (field_offset & 0x1ff) {
clientid = ophead->oh_clientid;
} else {
- idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
+ idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3780,11 +3690,12 @@ xlog_verify_iclog(
iclog->ic_header.h_cycle_data[idx]);
}
}
- if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
+ if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) {
xfs_warn(log->l_mp,
- "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx",
- __func__, clientid, ophead,
+ "%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx",
+ __func__, i, clientid, ophead,
(unsigned long)field_offset);
+ }
/* check length */
p = &ophead->oh_len;
@@ -3792,8 +3703,7 @@ xlog_verify_iclog(
if (field_offset & 0x1ff) {
op_len = be32_to_cpu(ophead->oh_len);
} else {
- idx = BTOBBT((uintptr_t)&ophead->oh_len -
- (uintptr_t)iclog->ic_datap);
+ idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3829,7 +3739,7 @@ xlog_verify_iclog(
bool
xlog_force_shutdown(
struct xlog *log,
- int shutdown_flags)
+ uint32_t shutdown_flags)
{
bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index dc1b77b92fc1..2728886c2963 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -9,7 +9,8 @@
struct xfs_cil_ctx;
struct xfs_log_vec {
- struct xfs_log_vec *lv_next; /* next lv in build list */
+ struct list_head lv_list; /* CIL lv chain ptrs */
+ uint32_t lv_order_id; /* chain ordering info */
int lv_niovecs; /* number of iovecs in lv */
struct xfs_log_iovec *lv_iovecp; /* iovec array */
struct xfs_log_item *lv_item; /* owner */
@@ -21,46 +22,59 @@ struct xfs_log_vec {
#define XFS_LOG_VEC_ORDERED (-1)
-static inline void *
-xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
- uint type)
+/*
+ * Calculate the log iovec length for a given user buffer length. Intended to be
+ * used by ->iop_size implementations when sizing buffers of arbitrary
+ * length.
+ */
+static inline int
+xlog_calc_iovec_len(int len)
{
- struct xfs_log_iovec *vec = *vecp;
+ return roundup(len, sizeof(uint32_t));
+}
+
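A brief sketch of how an ->iop_size implementation is expected to use this helper; the function and parameter names here are hypothetical:

```c
/*
 * Hypothetical sketch: size two odd-length regions for ->iop_size. Each
 * region is rounded up to a 4-byte boundary when written to the log, so
 * the reservation estimate must use the rounded lengths too.
 */
static void example_item_size(int name_len, int value_len,
			      int *nvecs, int *nbytes)
{
	*nvecs += 2;
	*nbytes += xlog_calc_iovec_len(name_len) +
		   xlog_calc_iovec_len(value_len);
}
```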
+void *xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ uint type);
- if (vec) {
- ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
- vec++;
- } else {
- vec = &lv->lv_iovecp[0];
+static inline void
+xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec,
+ int data_len)
+{
+ struct xlog_op_header *oph = vec->i_addr;
+ int len;
+
+ /*
+ * Always round up the length to the correct alignment so callers don't
+ * need to know anything about this log vec layout requirement. This
+ * means we have to zero the area that the data to be written does not cover.
+ * This is complicated by the fact that the payload region is offset into the
+ * logvec region by the opheader that tracks the payload.
+ */
+ len = xlog_calc_iovec_len(data_len);
+ if (len - data_len != 0) {
+ char *buf = vec->i_addr + sizeof(struct xlog_op_header);
+
+ memset(buf + data_len, 0, len - data_len);
}
- vec->i_type = type;
- vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+ /*
+ * The opheader tracks aligned payload length, whilst the logvec tracks
+ * the overall region length.
+ */
+ oph->oh_len = cpu_to_be32(len);
- ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t)));
+ len += sizeof(struct xlog_op_header);
+ lv->lv_buf_len += len;
+ lv->lv_bytes += len;
+ vec->i_len = len;
- *vecp = vec;
- return vec->i_addr;
+ /* Catch buffer overruns */
+ ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size);
}
/*
- * We need to make sure the next buffer is naturally aligned for the biggest
- * basic data type we put into it. We already accounted for this padding when
- * sizing the buffer.
- *
- * However, this padding does not get written into the log, and hence we have to
- * track the space used by the log vectors separately to prevent log space hangs
- * due to inaccurate accounting (i.e. a leak) of the used log space through the
- * CIL context ticket.
+ * Copy the amount of data requested by the caller into a new log iovec.
*/
-static inline void
-xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
-{
- lv->lv_buf_len += round_up(len, sizeof(uint64_t));
- lv->lv_bytes += len;
- vec->i_len = len;
-}
-
static inline void *
xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
uint type, void *data, int len)
@@ -73,6 +87,13 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
return buf;
}
+static inline void *
+xlog_copy_from_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ const struct xfs_log_iovec *src)
+{
+ return xlog_copy_iovec(lv, vecp, src->i_type, src->i_addr, src->i_len);
+}
+
/*
* By comparing each component, we don't have to worry about extra
* endian issues in treating two 32 bit numbers as one 64 bit number
@@ -117,15 +138,11 @@ int xfs_log_mount_finish(struct xfs_mount *mp);
void xfs_log_mount_cancel(struct xfs_mount *);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
-void xfs_log_space_wake(struct xfs_mount *mp);
-int xfs_log_reserve(struct xfs_mount *mp,
- int length,
- int count,
- struct xlog_ticket **ticket,
- uint8_t clientid,
- bool permanent);
-int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
-void xfs_log_unmount(struct xfs_mount *mp);
+void xfs_log_space_wake(struct xfs_mount *mp);
+int xfs_log_reserve(struct xfs_mount *mp, int length, int count,
+ struct xlog_ticket **ticket, bool permanent);
+int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
+void xfs_log_unmount(struct xfs_mount *mp);
bool xfs_log_writable(struct xfs_mount *mp);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
@@ -140,9 +157,10 @@ void xfs_log_clean(struct xfs_mount *mp);
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
-bool xlog_force_shutdown(struct xlog *log, int shutdown_flags);
+bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags);
void xlog_use_incompat_feat(struct xlog *log);
void xlog_drop_incompat_feat(struct xlog *log);
+int xfs_attr_use_log_assist(struct xfs_mount *mp);
#endif /* __XFS_LOG_H__ */
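
The iovec sizing rules that xlog_calc_iovec_len() and xlog_finish_iovec() implement are easy to check in isolation: the payload is rounded up to 32-bit alignment, the padding is zeroed, and the logvec accounts the padded payload plus the opheader in front of it. A minimal user-space sketch, with roundup() open-coded and an assumed 12-byte op header standing in for sizeof(struct xlog_op_header):

#include <stdio.h>
#include <stdint.h>

#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))
#define OPHDR_LEN	12	/* assumed opheader size, for illustration */

static int xlog_calc_iovec_len(int len)
{
	return ROUNDUP(len, sizeof(uint32_t));
}

int main(void)
{
	int data_len[] = { 1, 4, 7, 12, 13 };

	for (unsigned i = 0; i < sizeof(data_len) / sizeof(data_len[0]); i++) {
		int len = xlog_calc_iovec_len(data_len[i]);

		/* oh_len records the padded payload; the logvec adds the
		 * opheader on top of that for its region accounting. */
		printf("data %2d -> oh_len %2d, region %2d\n",
		       data_len[i], len, len + OPHDR_LEN);
	}
	return 0;
}
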
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ba57323bfdce..eccbfb99e894 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -37,16 +37,59 @@ xlog_cil_ticket_alloc(
{
struct xlog_ticket *tic;
- tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0);
+ tic = xlog_ticket_alloc(log, 0, 1, 0);
/*
* set the current reservation to zero so we know to steal the basic
* transaction overhead reservation from the first transaction commit.
*/
tic->t_curr_res = 0;
+ tic->t_iclog_hdrs = 0;
return tic;
}
+static inline void
+xlog_cil_set_iclog_hdr_count(struct xfs_cil *cil)
+{
+ struct xlog *log = cil->xc_log;
+
+ atomic_set(&cil->xc_iclog_hdrs,
+ (XLOG_CIL_BLOCKING_SPACE_LIMIT(log) /
+ (log->l_iclog_size - log->l_iclog_hsize)));
+}
+
+/*
+ * Check if the current log item was first committed in this sequence.
+ * We can't rely on just the log item being in the CIL, we have to check
+ * the recorded commit sequence number.
+ *
+ * Note: for this to be used in a non-racy manner, it has to be called with
+ * CIL flushing locked out. As a result, it should only be used during the
+ * transaction commit process when deciding what to format into the item.
+ */
+static bool
+xlog_item_in_current_chkpt(
+ struct xfs_cil *cil,
+ struct xfs_log_item *lip)
+{
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
+ return false;
+
+ /*
+ * li_seq is written on the first commit of a log item to record the
+ * first checkpoint it is written to. Hence if it is different to the
+ * current sequence, we're in a new checkpoint.
+ */
+ return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
+}
+
+bool
+xfs_log_item_in_current_chkpt(
+ struct xfs_log_item *lip)
+{
+ return xlog_item_in_current_chkpt(lip->li_log->l_cilp, lip);
+}
+
/*
* Unavoidable forward declaration - xlog_cil_push_work() calls
* xlog_cil_ctx_alloc() itself.
@@ -61,15 +104,88 @@ xlog_cil_ctx_alloc(void)
ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents);
+ INIT_LIST_HEAD(&ctx->log_items);
+ INIT_LIST_HEAD(&ctx->lv_chain);
INIT_WORK(&ctx->push_work, xlog_cil_push_work);
return ctx;
}
+/*
+ * Aggregate the CIL per cpu structures into global counts, lists, etc and
+ * clear the percpu state ready for the next context to use. This is called
+ * from the push code with the context lock held exclusively, hence nothing else
+ * will be accessing or modifying the per-cpu counters.
+ */
+static void
+xlog_cil_push_pcp_aggregate(
+ struct xfs_cil *cil,
+ struct xfs_cil_ctx *ctx)
+{
+ struct xlog_cil_pcp *cilpcp;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+
+ ctx->ticket->t_curr_res += cilpcp->space_reserved;
+ cilpcp->space_reserved = 0;
+
+ if (!list_empty(&cilpcp->busy_extents)) {
+ list_splice_init(&cilpcp->busy_extents,
+ &ctx->busy_extents);
+ }
+ if (!list_empty(&cilpcp->log_items))
+ list_splice_init(&cilpcp->log_items, &ctx->log_items);
+
+ /*
+ * We're in the middle of switching cil contexts. Reset the
+ * counter we use to detect when the current context is nearing
+ * full.
+ */
+ cilpcp->space_used = 0;
+ }
+}
+
+/*
+ * Aggregate the CIL per-cpu space used counters into the global atomic value.
+ * This is called when the per-cpu counter aggregation will first pass the soft
+ * limit threshold so we can switch to atomic counter aggregation for accurate
+ * detection of hard limit traversal.
+ */
+static void
+xlog_cil_insert_pcp_aggregate(
+ struct xfs_cil *cil,
+ struct xfs_cil_ctx *ctx)
+{
+ struct xlog_cil_pcp *cilpcp;
+ int cpu;
+ int count = 0;
+
+ /* Trigger atomic updates then aggregate only for the first caller */
+ if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags))
+ return;
+
+ for_each_online_cpu(cpu) {
+ int old, prev;
+
+ cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+ do {
+ old = cilpcp->space_used;
+ prev = cmpxchg(&cilpcp->space_used, old, 0);
+ } while (old != prev);
+ count += old;
+ }
+ atomic_add(count, &ctx->space_used);
+}
+
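
The cmpxchg() loop above drains each per-cpu counter to zero without locking: it re-samples until the swap succeeds against the value it read, so any space added concurrently is picked up on the retry. A user-space analogue with C11 atomics (only a model; the kernel counter is a plain int updated with preemption disabled):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int space_used = 37;	/* models one cilpcp->space_used */

/* swap the counter to zero and return whatever it held */
static int drain(_Atomic int *ctr)
{
	int old;

	do {
		old = atomic_load(ctr);
		/* retry if the value moved between sample and swap */
	} while (!atomic_compare_exchange_weak(ctr, &old, 0));
	return old;
}

int main(void)
{
	printf("drained %d, counter now %d\n",
	       drain(&space_used), atomic_load(&space_used));
	return 0;
}
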
static void
xlog_cil_ctx_switch(
struct xfs_cil *cil,
struct xfs_cil_ctx *ctx)
{
+ xlog_cil_set_iclog_hdr_count(cil);
+ set_bit(XLOG_CIL_EMPTY, &cil->xc_flags);
+ set_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags);
ctx->sequence = ++cil->xc_current_sequence;
ctx->cil = cil;
cil->xc_ctx = ctx;
@@ -91,6 +207,7 @@ xlog_cil_init_post_recovery(
{
log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
log->l_cilp->xc_ctx->sequence = 1;
+ xlog_cil_set_iclog_hdr_count(log->l_cilp);
}
static inline int
@@ -103,39 +220,6 @@ xlog_cil_iovec_space(
}
/*
- * shadow buffers can be large, so we need to use kvmalloc() here to ensure
- * success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall
- * back to vmalloc, so we can't actually do anything useful with gfp flags to
- * control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() will do
- * direct reclaim and compaction in the slow path, both of which are
- * horrendously expensive. We just want kmalloc to fail fast and fall back to
- * vmalloc if it can't get somethign straight away from the free lists or buddy
- * allocator. Hence we have to open code kvmalloc outselves here.
- *
- * Also, we are in memalloc_nofs_save task context here, so despite the use of
- * GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations. This
- * is actually the only way to make vmalloc() do GFP_NOFS allocations, so lets
- * just all pretend this is a GFP_KERNEL context operation....
- */
-static inline void *
-xlog_cil_kvmalloc(
- size_t buf_size)
-{
- gfp_t flags = GFP_KERNEL;
- void *p;
-
- flags &= ~__GFP_DIRECT_RECLAIM;
- flags |= __GFP_NOWARN | __GFP_NORETRY;
- do {
- p = kmalloc(buf_size, flags);
- if (!p)
- p = vmalloc(buf_size);
- } while (!p);
-
- return p;
-}
-
-/*
* Allocate or pin log vector buffers for CIL insertion.
*
* The CIL currently uses disposable buffers for copying a snapshot of the
@@ -214,13 +298,20 @@ xlog_cil_alloc_shadow_bufs(
}
/*
- * We 64-bit align the length of each iovec so that the start
- * of the next one is naturally aligned. We'll need to
- * account for that slack space here. Then round nbytes up
- * to 64-bit alignment so that the initial buffer alignment is
- * easy to calculate and verify.
+ * We 64-bit align the length of each iovec so that the start of
+ * the next one is naturally aligned. We'll need to account for
+ * that slack space here.
+ *
+ * We also add the xlog_op_header to each region when
+ * formatting, but that's not accounted to the size of the item
+	 * at this point. Hence we'll need an additional number of bytes
+ * for each vector to hold an opheader.
+ *
+ * Then round nbytes up to 64-bit alignment so that the initial
+ * buffer alignment is easy to calculate and verify.
*/
- nbytes += niovecs * sizeof(uint64_t);
+ nbytes += niovecs *
+ (sizeof(uint64_t) + sizeof(struct xlog_op_header));
nbytes = round_up(nbytes, sizeof(uint64_t));
/*
@@ -244,10 +335,11 @@ xlog_cil_alloc_shadow_bufs(
* storage.
*/
kmem_free(lip->li_lv_shadow);
- lv = xlog_cil_kvmalloc(buf_size);
+ lv = xlog_kvmalloc(buf_size);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
+ INIT_LIST_HEAD(&lv->lv_list);
lv->lv_item = lip;
lv->lv_size = buf_size;
if (ordered)
@@ -263,7 +355,6 @@ xlog_cil_alloc_shadow_bufs(
else
lv->lv_buf_len = 0;
lv->lv_bytes = 0;
- lv->lv_next = NULL;
}
/* Ensure the lv is set up according to ->iop_size */
@@ -277,22 +368,18 @@ xlog_cil_alloc_shadow_bufs(
/*
* Prepare the log item for insertion into the CIL. Calculate the difference in
- * log space and vectors it will consume, and if it is a new item pin it as
- * well.
+ * log space it will consume, and if it is a new item pin it as well.
*/
STATIC void
xfs_cil_prepare_item(
struct xlog *log,
struct xfs_log_vec *lv,
struct xfs_log_vec *old_lv,
- int *diff_len,
- int *diff_iovecs)
+ int *diff_len)
{
/* Account for the new LV being passed in */
- if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
*diff_len += lv->lv_bytes;
- *diff_iovecs += lv->lv_niovecs;
- }
/*
* If there is no old LV, this is the first time we've seen the item in
@@ -309,7 +396,6 @@ xfs_cil_prepare_item(
ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
*diff_len -= old_lv->lv_bytes;
- *diff_iovecs -= old_lv->lv_niovecs;
lv->lv_item->li_lv_shadow = old_lv;
}
@@ -358,12 +444,10 @@ static void
xlog_cil_insert_format_items(
struct xlog *log,
struct xfs_trans *tp,
- int *diff_len,
- int *diff_iovecs)
+ int *diff_len)
{
struct xfs_log_item *lip;
-
/* Bail out if we didn't find a log item. */
if (list_empty(&tp->t_items)) {
ASSERT(0);
@@ -397,7 +481,6 @@ xlog_cil_insert_format_items(
if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
/* same or smaller, optimise common overwrite case */
lv = lip->li_lv;
- lv->lv_next = NULL;
if (ordered)
goto insert;
@@ -406,7 +489,6 @@ xlog_cil_insert_format_items(
* set the item up as though it is a new insertion so
* that the space reservation accounting is correct.
*/
- *diff_iovecs -= lv->lv_niovecs;
*diff_len -= lv->lv_bytes;
/* Ensure the lv is set up according to ->iop_size */
@@ -431,11 +513,28 @@ xlog_cil_insert_format_items(
ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
lip->li_ops->iop_format(lip, lv);
insert:
- xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
+ xfs_cil_prepare_item(log, lv, old_lv, diff_len);
}
}
/*
+ * The use of lockless waitqueue_active() requires that the caller has
+ * serialised itself against the wakeup call in xlog_cil_push_work(). That
+ * can be done by either holding the push lock or the context lock.
+ */
+static inline bool
+xlog_cil_over_hard_limit(
+ struct xlog *log,
+ int32_t space_used)
+{
+ if (waitqueue_active(&log->l_cilp->xc_push_wait))
+ return true;
+ if (space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
+ return true;
+ return false;
+}
+
+/*
* Insert the log items into the CIL and calculate the difference in space
* consumed by the item. Add the space to the checkpoint ticket and calculate
* if the change requires additional log metadata. If it does, take that space
@@ -445,15 +544,17 @@ insert:
static void
xlog_cil_insert_items(
struct xlog *log,
- struct xfs_trans *tp)
+ struct xfs_trans *tp,
+ uint32_t released_space)
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_cil_ctx *ctx = cil->xc_ctx;
struct xfs_log_item *lip;
int len = 0;
- int diff_iovecs = 0;
- int iclog_space;
int iovhdr_res = 0, split_res = 0, ctx_res = 0;
+ int space_used;
+ int order;
+ struct xlog_cil_pcp *cilpcp;
ASSERT(tp);
@@ -461,98 +562,137 @@ xlog_cil_insert_items(
* We can do this safely because the context can't checkpoint until we
* are done so it doesn't matter exactly how we update the CIL.
*/
- xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
+ xlog_cil_insert_format_items(log, tp, &len);
- spin_lock(&cil->xc_cil_lock);
-
- /* account for space used by new iovec headers */
- iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t);
- len += iovhdr_res;
- ctx->nvecs += diff_iovecs;
+ /*
+ * Subtract the space released by intent cancelation from the space we
+ * consumed so that we remove it from the CIL space and add it back to
+ * the current transaction reservation context.
+ */
+ len -= released_space;
- /* attach the transaction to the CIL if it has any busy extents */
- if (!list_empty(&tp->t_busy))
- list_splice_init(&tp->t_busy, &ctx->busy_extents);
+ /*
+ * Grab the per-cpu pointer for the CIL before we start any accounting.
+ * That ensures that we are running with pre-emption disabled and so we
+ * can't be scheduled away between split sample/update operations that
+ * are done without outside locking to serialise them.
+ */
+ cilpcp = get_cpu_ptr(cil->xc_pcp);
/*
- * Now transfer enough transaction reservation to the context ticket
- * for the checkpoint. The context ticket is special - the unit
- * reservation has to grow as well as the current reservation as we
- * steal from tickets so we can correctly determine the space used
- * during the transaction commit.
+ * We need to take the CIL checkpoint unit reservation on the first
+ * commit into the CIL. Test the XLOG_CIL_EMPTY bit first so we don't
+ * unnecessarily do an atomic op in the fast path here. We can clear the
+ * XLOG_CIL_EMPTY bit as we are under the xc_ctx_lock here and that
+ * needs to be held exclusively to reset the XLOG_CIL_EMPTY bit.
*/
- if (ctx->ticket->t_curr_res == 0) {
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) &&
+ test_and_clear_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
ctx_res = ctx->ticket->t_unit_res;
- ctx->ticket->t_curr_res = ctx_res;
- tp->t_ticket->t_curr_res -= ctx_res;
- }
- /* do we need space for more log record headers? */
- iclog_space = log->l_iclog_size - log->l_iclog_hsize;
- if (len > 0 && (ctx->space_used / iclog_space !=
- (ctx->space_used + len) / iclog_space)) {
- split_res = (len + iclog_space - 1) / iclog_space;
- /* need to take into account split region headers, too */
- split_res *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
- ctx->ticket->t_unit_res += split_res;
- ctx->ticket->t_curr_res += split_res;
- tp->t_ticket->t_curr_res -= split_res;
- ASSERT(tp->t_ticket->t_curr_res >= len);
+ /*
+ * Check if we need to steal iclog headers. atomic_read() is not a
+ * locked atomic operation, so we can check the value before we do any
+ * real atomic ops in the fast path. If we've already taken the CIL unit
+ * reservation from this commit, we've already got one iclog header
+ * space reserved so we have to account for that otherwise we risk
+ * overrunning the reservation on this ticket.
+ *
+ * If the CIL is already at the hard limit, we might need more header
+ * space that originally reserved. So steal more header space from every
+	 * space than we originally reserved. So steal more header space from every
+ * push won't run out of reservation space.
+ *
+ * This can steal more than we need, but that's OK.
+ *
+ * The cil->xc_ctx_lock provides the serialisation necessary for safely
+ * calling xlog_cil_over_hard_limit() in this context.
+ */
+ space_used = atomic_read(&ctx->space_used) + cilpcp->space_used + len;
+ if (atomic_read(&cil->xc_iclog_hdrs) > 0 ||
+ xlog_cil_over_hard_limit(log, space_used)) {
+ split_res = log->l_iclog_hsize +
+ sizeof(struct xlog_op_header);
+ if (ctx_res)
+ ctx_res += split_res * (tp->t_ticket->t_iclog_hdrs - 1);
+ else
+ ctx_res = split_res * tp->t_ticket->t_iclog_hdrs;
+ atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs);
}
- tp->t_ticket->t_curr_res -= len;
- ctx->space_used += len;
+ cilpcp->space_reserved += ctx_res;
/*
- * If we've overrun the reservation, dump the tx details before we move
- * the log items. Shutdown is imminent...
+ * Accurately account when over the soft limit, otherwise fold the
+ * percpu count into the global count if over the per-cpu threshold.
*/
- if (WARN_ON(tp->t_ticket->t_curr_res < 0)) {
- xfs_warn(log->l_mp, "Transaction log reservation overrun:");
- xfs_warn(log->l_mp,
- " log items: %d bytes (iov hdrs: %d bytes)",
- len, iovhdr_res);
- xfs_warn(log->l_mp, " split region headers: %d bytes",
- split_res);
- xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res);
- xlog_print_trans(tp);
+ if (!test_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) {
+ atomic_add(len, &ctx->space_used);
+ } else if (cilpcp->space_used + len >
+ (XLOG_CIL_SPACE_LIMIT(log) / num_online_cpus())) {
+ space_used = atomic_add_return(cilpcp->space_used + len,
+ &ctx->space_used);
+ cilpcp->space_used = 0;
+
+ /*
+ * If we just transitioned over the soft limit, we need to
+ * transition to the global atomic counter.
+ */
+ if (space_used >= XLOG_CIL_SPACE_LIMIT(log))
+ xlog_cil_insert_pcp_aggregate(cil, ctx);
+ } else {
+ cilpcp->space_used += len;
}
+ /* attach the transaction to the CIL if it has any busy extents */
+ if (!list_empty(&tp->t_busy))
+ list_splice_init(&tp->t_busy, &cilpcp->busy_extents);
/*
- * Now (re-)position everything modified at the tail of the CIL.
+ * Now update the order of everything modified in the transaction
+ * and insert items into the CIL if they aren't already there.
* We do this here so we only need to take the CIL lock once during
* the transaction commit.
*/
+ order = atomic_inc_return(&ctx->order_id);
list_for_each_entry(lip, &tp->t_items, li_trans) {
-
/* Skip items which aren't dirty in this transaction. */
if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
continue;
- /*
- * Only move the item if it isn't already at the tail. This is
- * to prevent a transient list_empty() state when reinserting
- * an item that is already the only item in the CIL.
- */
- if (!list_is_last(&lip->li_cil, &cil->xc_cil))
- list_move_tail(&lip->li_cil, &cil->xc_cil);
+ lip->li_order_id = order;
+ if (!list_empty(&lip->li_cil))
+ continue;
+ list_add_tail(&lip->li_cil, &cilpcp->log_items);
}
+ put_cpu_ptr(cilpcp);
- spin_unlock(&cil->xc_cil_lock);
-
- if (tp->t_ticket->t_curr_res < 0)
+ /*
+ * If we've overrun the reservation, dump the tx details before we move
+ * the log items. Shutdown is imminent...
+ */
+ tp->t_ticket->t_curr_res -= ctx_res + len;
+ if (WARN_ON(tp->t_ticket->t_curr_res < 0)) {
+ xfs_warn(log->l_mp, "Transaction log reservation overrun:");
+ xfs_warn(log->l_mp,
+ " log items: %d bytes (iov hdrs: %d bytes)",
+ len, iovhdr_res);
+ xfs_warn(log->l_mp, " split region headers: %d bytes",
+ split_res);
+ xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res);
+ xlog_print_trans(tp);
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+ }
}
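
The XLOG_CIL_EMPTY handling above relies on a two-step pattern: an unlocked test_bit() filters the common case so that the expensive locked test_and_clear_bit() only runs when the bit is probably set, and only one committer can win the clear. A small user-space model (GCC atomic builtins standing in for the kernel bitops):

#include <stdbool.h>
#include <stdio.h>

static unsigned long cil_flags = 1UL;	/* bit 0 models XLOG_CIL_EMPTY */

/* cheap unlocked read; no bus-locked RMW in the common case */
static bool test_bit0(unsigned long *w)
{
	return __atomic_load_n(w, __ATOMIC_RELAXED) & 1UL;
}

/* locked RMW, only attempted once the cheap test saw the bit set */
static bool test_and_clear_bit0(unsigned long *w)
{
	return __atomic_fetch_and(w, ~1UL, __ATOMIC_ACQ_REL) & 1UL;
}

int main(void)
{
	if (test_bit0(&cil_flags) && test_and_clear_bit0(&cil_flags))
		printf("first commit: take the checkpoint reservation\n");
	if (!(test_bit0(&cil_flags) && test_and_clear_bit0(&cil_flags)))
		printf("later commits: skip the atomic entirely\n");
	return 0;
}
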
static void
xlog_cil_free_logvec(
- struct xfs_log_vec *log_vector)
+ struct list_head *lv_chain)
{
struct xfs_log_vec *lv;
- for (lv = log_vector; lv; ) {
- struct xfs_log_vec *next = lv->lv_next;
+ while (!list_empty(lv_chain)) {
+ lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list);
+ list_del_init(&lv->lv_list);
kmem_free(lv);
- lv = next;
}
}
@@ -605,7 +745,7 @@ xlog_discard_busy_extents(
error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
- GFP_NOFS, 0, &bio);
+ GFP_NOFS, &bio);
if (error && error != -EOPNOTSUPP) {
xfs_info(mp,
"discard failed for extent [0x%llx,%u], error %d",
@@ -652,7 +792,7 @@ xlog_cil_committed(
spin_unlock(&ctx->cil->xc_push_lock);
}
- xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
+ xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain,
ctx->start_lsn, abort);
xfs_extent_busy_sort(&ctx->busy_extents);
@@ -663,7 +803,7 @@ xlog_cil_committed(
list_del(&ctx->committing);
spin_unlock(&ctx->cil->xc_push_lock);
- xlog_cil_free_logvec(ctx->lv_chain);
+ xlog_cil_free_logvec(&ctx->lv_chain);
if (!list_empty(&ctx->busy_extents))
xlog_discard_busy_extents(mp, ctx);
@@ -822,7 +962,7 @@ restart:
static int
xlog_cil_write_chain(
struct xfs_cil_ctx *ctx,
- struct xfs_log_vec *chain)
+ uint32_t chain_len)
{
struct xlog *log = ctx->cil->xc_log;
int error;
@@ -830,7 +970,7 @@ xlog_cil_write_chain(
error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD);
if (error)
return error;
- return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS);
+ return xlog_write(log, ctx, &ctx->lv_chain, ctx->ticket, chain_len);
}
/*
@@ -844,9 +984,14 @@ xlog_cil_write_commit_record(
struct xfs_cil_ctx *ctx)
{
struct xlog *log = ctx->cil->xc_log;
+ struct xlog_op_header ophdr = {
+ .oh_clientid = XFS_TRANSACTION,
+ .oh_tid = cpu_to_be32(ctx->ticket->t_tid),
+ .oh_flags = XLOG_COMMIT_TRANS,
+ };
struct xfs_log_iovec reg = {
- .i_addr = NULL,
- .i_len = 0,
+ .i_addr = &ophdr,
+ .i_len = sizeof(struct xlog_op_header),
.i_type = XLOG_REG_TYPE_COMMIT,
};
struct xfs_log_vec vec = {
@@ -854,6 +999,8 @@ xlog_cil_write_commit_record(
.lv_iovecp = &reg,
};
int error;
+ LIST_HEAD(lv_chain);
+ list_add(&vec.lv_list, &lv_chain);
if (xlog_is_shutdown(log))
return -EIO;
@@ -862,12 +1009,155 @@ xlog_cil_write_commit_record(
if (error)
return error;
- error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS);
+ /* account for space used by record data */
+ ctx->ticket->t_curr_res -= reg.i_len;
+ error = xlog_write(log, ctx, &lv_chain, ctx->ticket, reg.i_len);
if (error)
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
return error;
}
+struct xlog_cil_trans_hdr {
+ struct xlog_op_header oph[2];
+ struct xfs_trans_header thdr;
+ struct xfs_log_iovec lhdr[2];
+};
+
+/*
+ * Build a checkpoint transaction header to begin the journal transaction. We
+ * need to account for the space used by the transaction header here as it is
+ * not accounted for in xlog_write().
+ *
+ * This is the only place we write a transaction header, so we also build the
+ * log opheaders that indicate the start of a log transaction and wrap the
+ * transaction header. We keep the start record in its own log vector rather
+ * than compacting them into a single region as this ends up making the logic
+ * in xlog_write() for handling empty opheaders for start, commit and unmount
+ * records much simpler.
+ */
+static void
+xlog_cil_build_trans_hdr(
+ struct xfs_cil_ctx *ctx,
+ struct xlog_cil_trans_hdr *hdr,
+ struct xfs_log_vec *lvhdr,
+ int num_iovecs)
+{
+ struct xlog_ticket *tic = ctx->ticket;
+ __be32 tid = cpu_to_be32(tic->t_tid);
+
+ memset(hdr, 0, sizeof(*hdr));
+
+ /* Log start record */
+ hdr->oph[0].oh_tid = tid;
+ hdr->oph[0].oh_clientid = XFS_TRANSACTION;
+ hdr->oph[0].oh_flags = XLOG_START_TRANS;
+
+ /* log iovec region pointer */
+ hdr->lhdr[0].i_addr = &hdr->oph[0];
+ hdr->lhdr[0].i_len = sizeof(struct xlog_op_header);
+ hdr->lhdr[0].i_type = XLOG_REG_TYPE_LRHEADER;
+
+ /* log opheader */
+ hdr->oph[1].oh_tid = tid;
+ hdr->oph[1].oh_clientid = XFS_TRANSACTION;
+ hdr->oph[1].oh_len = cpu_to_be32(sizeof(struct xfs_trans_header));
+
+ /* transaction header in host byte order format */
+ hdr->thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+ hdr->thdr.th_type = XFS_TRANS_CHECKPOINT;
+ hdr->thdr.th_tid = tic->t_tid;
+ hdr->thdr.th_num_items = num_iovecs;
+
+ /* log iovec region pointer */
+ hdr->lhdr[1].i_addr = &hdr->oph[1];
+ hdr->lhdr[1].i_len = sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_trans_header);
+ hdr->lhdr[1].i_type = XLOG_REG_TYPE_TRANSHDR;
+
+ lvhdr->lv_niovecs = 2;
+ lvhdr->lv_iovecp = &hdr->lhdr[0];
+ lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len;
+
+ tic->t_curr_res -= lvhdr->lv_bytes;
+}
+
+/*
+ * CIL item reordering compare function. We want to order in ascending ID order,
+ * but we want to leave items with the same ID in the order they were added to
+ * the list. This is important for operations like reflink where we log 4 order
+ * the list. This is important for operations like reflink where we log 4
+ * order-dependent intents in a single transaction when we overwrite an existing
+ * CUI (inc), BUI(remap)...
+ */
+static int
+xlog_cil_order_cmp(
+ void *priv,
+ const struct list_head *a,
+ const struct list_head *b)
+{
+ struct xfs_log_vec *l1 = container_of(a, struct xfs_log_vec, lv_list);
+ struct xfs_log_vec *l2 = container_of(b, struct xfs_log_vec, lv_list);
+
+ return l1->lv_order_id > l2->lv_order_id;
+}
+
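
list_sort() is a stable merge sort, so the comparator only has to return a positive value when l1 sorts after l2; the boolean `>` leaves equal order IDs in insertion order, which is exactly what preserves the intent ordering described above. A quick user-space check of that property, using a stable insertion sort in place of list_sort():

#include <stdio.h>

struct lv { int order_id; char tag; };

/* mirrors xlog_cil_order_cmp: positive means a sorts after b */
static int order_cmp(const struct lv *a, const struct lv *b)
{
	return a->order_id > b->order_id;
}

/* stable insertion sort standing in for the kernel's stable list_sort() */
static void sort_lvs(struct lv *v, int n)
{
	for (int i = 1; i < n; i++) {
		struct lv key = v[i];
		int j = i - 1;

		while (j >= 0 && order_cmp(&v[j], &key) > 0) {
			v[j + 1] = v[j];
			j--;
		}
		v[j + 1] = key;
	}
}

int main(void)
{
	/* order 2 logged items a, b, c around an order 1 item x */
	struct lv v[] = { {2, 'a'}, {2, 'b'}, {1, 'x'}, {2, 'c'} };

	sort_lvs(v, 4);
	for (int i = 0; i < 4; i++)
		printf("%d%c ", v[i].order_id, v[i].tag);
	printf("\n");	/* 1x 2a 2b 2c: equal IDs keep insertion order */
	return 0;
}
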
+/*
+ * Pull all the log vectors off the items in the CIL, and remove the items from
+ * the CIL. We don't need the CIL lock here because it's only needed on the
+ * transaction commit side which is currently locked out by the flush lock.
+ *
+ * If a log item is marked with a whiteout, we do not need to write it to the
+ * journal and so we just move them to the whiteout list for the caller to
+ * dispose of appropriately.
+ */
+static void
+xlog_cil_build_lv_chain(
+ struct xfs_cil_ctx *ctx,
+ struct list_head *whiteouts,
+ uint32_t *num_iovecs,
+ uint32_t *num_bytes)
+{
+ while (!list_empty(&ctx->log_items)) {
+ struct xfs_log_item *item;
+ struct xfs_log_vec *lv;
+
+ item = list_first_entry(&ctx->log_items,
+ struct xfs_log_item, li_cil);
+
+ if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) {
+ list_move(&item->li_cil, whiteouts);
+ trace_xfs_cil_whiteout_skip(item);
+ continue;
+ }
+
+ lv = item->li_lv;
+ lv->lv_order_id = item->li_order_id;
+
+ /* we don't write ordered log vectors */
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
+ *num_bytes += lv->lv_bytes;
+ *num_iovecs += lv->lv_niovecs;
+ list_add_tail(&lv->lv_list, &ctx->lv_chain);
+
+ list_del_init(&item->li_cil);
+ item->li_order_id = 0;
+ item->li_lv = NULL;
+ }
+}
+
+static void
+xlog_cil_cleanup_whiteouts(
+ struct list_head *whiteouts)
+{
+ while (!list_empty(whiteouts)) {
+ struct xfs_log_item *item = list_first_entry(whiteouts,
+ struct xfs_log_item, li_cil);
+ list_del_init(&item->li_cil);
+ trace_xfs_cil_whiteout_unpin(item);
+ item->li_ops->iop_unpin(item, 1);
+ }
+}
+
/*
* Push the Committed Item List to the log.
*
@@ -890,16 +1180,16 @@ xlog_cil_push_work(
container_of(work, struct xfs_cil_ctx, push_work);
struct xfs_cil *cil = ctx->cil;
struct xlog *log = cil->xc_log;
- struct xfs_log_vec *lv;
struct xfs_cil_ctx *new_ctx;
- struct xlog_ticket *tic;
- int num_iovecs;
+ int num_iovecs = 0;
+ int num_bytes = 0;
int error = 0;
- struct xfs_trans_header thdr;
- struct xfs_log_iovec lhdr;
- struct xfs_log_vec lvhdr = { NULL };
+ struct xlog_cil_trans_hdr thdr;
+ struct xfs_log_vec lvhdr = {};
xfs_csn_t push_seq;
bool push_commit_stable;
+	LIST_HEAD(whiteouts);
+ struct xlog_ticket *ticket;
new_ctx = xlog_cil_ctx_alloc();
new_ctx->ticket = xlog_cil_ticket_alloc(log);
@@ -923,12 +1213,14 @@ xlog_cil_push_work(
if (waitqueue_active(&cil->xc_push_wait))
wake_up_all(&cil->xc_push_wait);
+ xlog_cil_push_pcp_aggregate(cil, ctx);
+
/*
* Check if we've anything to push. If there is nothing, then we don't
* move on to a new sequence number and so we have to be able to push
* this sequence again later.
*/
- if (list_empty(&cil->xc_cil)) {
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) {
cil->xc_push_seq = 0;
spin_unlock(&cil->xc_push_lock);
goto out_skip;
@@ -968,28 +1260,7 @@ xlog_cil_push_work(
list_add(&ctx->committing, &cil->xc_committing);
spin_unlock(&cil->xc_push_lock);
- /*
- * Pull all the log vectors off the items in the CIL, and remove the
- * items from the CIL. We don't need the CIL lock here because it's only
- * needed on the transaction commit side which is currently locked out
- * by the flush lock.
- */
- lv = NULL;
- num_iovecs = 0;
- while (!list_empty(&cil->xc_cil)) {
- struct xfs_log_item *item;
-
- item = list_first_entry(&cil->xc_cil,
- struct xfs_log_item, li_cil);
- list_del_init(&item->li_cil);
- if (!ctx->lv_chain)
- ctx->lv_chain = item->li_lv;
- else
- lv->lv_next = item->li_lv;
- lv = item->li_lv;
- item->li_lv = NULL;
- num_iovecs += lv->lv_niovecs;
- }
+ xlog_cil_build_lv_chain(ctx, &whiteouts, &num_iovecs, &num_bytes);
/*
* Switch the contexts so we can drop the context lock and move out
@@ -1022,29 +1293,30 @@ xlog_cil_push_work(
up_write(&cil->xc_ctx_lock);
/*
+ * Sort the log vector chain before we add the transaction headers.
+ * This ensures we always have the transaction headers at the start
+ * of the chain.
+ */
+ list_sort(NULL, &ctx->lv_chain, xlog_cil_order_cmp);
+
+ /*
* Build a checkpoint transaction header and write it to the log to
* begin the transaction. We need to account for the space used by the
* transaction header here as it is not accounted for in xlog_write().
- *
- * The LSN we need to pass to the log items on transaction commit is
- * the LSN reported by the first log vector write. If we use the commit
- * record lsn then we can move the tail beyond the grant write head.
+ * Add the lvhdr to the head of the lv chain we pass to xlog_write() so
+ * it gets written into the iclog first.
+ */
+ xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs);
+ num_bytes += lvhdr.lv_bytes;
+ list_add(&lvhdr.lv_list, &ctx->lv_chain);
+
+ /*
+ * Take the lvhdr back off the lv_chain immediately after calling
+ * xlog_cil_write_chain() as it should not be passed to log IO
+ * completion.
*/
- tic = ctx->ticket;
- thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
- thdr.th_type = XFS_TRANS_CHECKPOINT;
- thdr.th_tid = tic->t_tid;
- thdr.th_num_items = num_iovecs;
- lhdr.i_addr = &thdr;
- lhdr.i_len = sizeof(xfs_trans_header_t);
- lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
- tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
-
- lvhdr.lv_niovecs = 1;
- lvhdr.lv_iovecp = &lhdr;
- lvhdr.lv_next = ctx->lv_chain;
-
- error = xlog_cil_write_chain(ctx, &lvhdr);
+ error = xlog_cil_write_chain(ctx, num_bytes);
+ list_del(&lvhdr.lv_list);
if (error)
goto out_abort_free_ticket;
@@ -1052,7 +1324,14 @@ xlog_cil_push_work(
if (error)
goto out_abort_free_ticket;
- xfs_log_ticket_ungrant(log, tic);
+ /*
+ * Grab the ticket from the ctx so we can ungrant it after releasing the
+ * commit_iclog. The ctx may be freed by the time we return from
+ * releasing the commit_iclog (i.e. checkpoint has been completed and
+ * callback run) so we can't reference the ctx after the call to
+ * xlog_state_release_iclog().
+ */
+ ticket = ctx->ticket;
/*
* If the checkpoint spans multiple iclogs, wait for all previous iclogs
@@ -1102,11 +1381,14 @@ xlog_cil_push_work(
if (push_commit_stable &&
ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
- xlog_state_release_iclog(log, ctx->commit_iclog);
+ ticket = ctx->ticket;
+ xlog_state_release_iclog(log, ctx->commit_iclog, ticket);
/* Not safe to reference ctx now! */
spin_unlock(&log->l_icloglock);
+ xlog_cil_cleanup_whiteouts(&whiteouts);
+ xfs_log_ticket_ungrant(log, ticket);
return;
out_skip:
@@ -1116,16 +1398,19 @@ out_skip:
return;
out_abort_free_ticket:
- xfs_log_ticket_ungrant(log, tic);
ASSERT(xlog_is_shutdown(log));
+ xlog_cil_cleanup_whiteouts(&whiteouts);
if (!ctx->commit_iclog) {
+ xfs_log_ticket_ungrant(log, ctx->ticket);
xlog_cil_committed(ctx);
return;
}
spin_lock(&log->l_icloglock);
- xlog_state_release_iclog(log, ctx->commit_iclog);
+ ticket = ctx->ticket;
+ xlog_state_release_iclog(log, ctx->commit_iclog, ticket);
/* Not safe to reference ctx now! */
spin_unlock(&log->l_icloglock);
+ xfs_log_ticket_ungrant(log, ticket);
}
/*
@@ -1140,18 +1425,27 @@ xlog_cil_push_background(
struct xlog *log) __releases(cil->xc_ctx_lock)
{
struct xfs_cil *cil = log->l_cilp;
+ int space_used = atomic_read(&cil->xc_ctx->space_used);
/*
* The cil won't be empty because we are called while holding the
- * context lock so whatever we added to the CIL will still be there
+ * context lock so whatever we added to the CIL will still be there.
*/
- ASSERT(!list_empty(&cil->xc_cil));
+ ASSERT(!test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
/*
- * Don't do a background push if we haven't used up all the
- * space available yet.
+ * We are done if:
+ * - we haven't used up all the space available yet; or
+ * - we've already queued up a push; and
+ * - we're not over the hard limit; and
+ * - nothing has been over the hard limit.
+ *
+ * If so, we don't need to take the push lock as there's nothing to do.
*/
- if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
+ if (space_used < XLOG_CIL_SPACE_LIMIT(log) ||
+ (cil->xc_push_seq == cil->xc_current_sequence &&
+ space_used < XLOG_CIL_BLOCKING_SPACE_LIMIT(log) &&
+ !waitqueue_active(&cil->xc_push_wait))) {
up_read(&cil->xc_ctx_lock);
return;
}
@@ -1178,12 +1472,11 @@ xlog_cil_push_background(
* dipping back down under the hard limit.
*
* The ctx->xc_push_lock provides the serialisation necessary for safely
- * using the lockless waitqueue_active() check in this context.
+ * calling xlog_cil_over_hard_limit() in this context.
*/
- if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
- waitqueue_active(&cil->xc_push_wait)) {
+ if (xlog_cil_over_hard_limit(log, space_used)) {
trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
- ASSERT(cil->xc_ctx->space_used < log->l_logsize);
+ ASSERT(space_used < log->l_logsize);
xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
return;
}
@@ -1242,7 +1535,8 @@ xlog_cil_push_now(
* If the CIL is empty or we've already pushed the sequence then
* there's no more work that we need to do.
*/
- if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) ||
+ push_seq <= cil->xc_push_seq) {
spin_unlock(&cil->xc_push_lock);
return;
}
@@ -1260,13 +1554,50 @@ xlog_cil_empty(
bool empty = false;
spin_lock(&cil->xc_push_lock);
- if (list_empty(&cil->xc_cil))
+ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags))
empty = true;
spin_unlock(&cil->xc_push_lock);
return empty;
}
/*
+ * If there are intent done items in this transaction and the related intent was
+ * committed in the current (same) CIL checkpoint, we don't need to write either
+ * the intent or intent done item to the journal as the change will be
+ * journalled atomically within this checkpoint. As we cannot remove items from
+ * the CIL here, mark the related intent with a whiteout so that the CIL push
+ * can remove it rather than writing it to the journal. Then remove the intent
+ * done item from the current transaction and release it so it doesn't get put
+ * into the CIL at all.
+ */
+static uint32_t
+xlog_cil_process_intents(
+ struct xfs_cil *cil,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item *lip, *ilip, *next;
+ uint32_t len = 0;
+
+ list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+ if (!(lip->li_ops->flags & XFS_ITEM_INTENT_DONE))
+ continue;
+
+ ilip = lip->li_ops->iop_intent(lip);
+ if (!ilip || !xlog_item_in_current_chkpt(cil, ilip))
+ continue;
+ set_bit(XFS_LI_WHITEOUT, &ilip->li_flags);
+ trace_xfs_cil_whiteout_mark(ilip);
+ len += ilip->li_lv->lv_bytes;
+ kmem_free(ilip->li_lv);
+ ilip->li_lv = NULL;
+
+ xfs_trans_del_item(lip);
+ lip->li_ops->iop_release(lip);
+ }
+ return len;
+}
+
+/*
* Commit a transaction with the given vector to the Committed Item List.
*
* To do this, we need to format the item, pin it in memory if required and
@@ -1288,6 +1619,7 @@ xlog_cil_commit(
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_log_item *lip, *next;
+ uint32_t released_space = 0;
/*
* Do all necessary memory allocation before we lock the CIL.
@@ -1299,7 +1631,10 @@ xlog_cil_commit(
/* lock out background commit */
down_read(&cil->xc_ctx_lock);
- xlog_cil_insert_items(log, tp);
+ if (tp->t_flags & XFS_TRANS_HAS_INTENT_DONE)
+ released_space = xlog_cil_process_intents(cil, tp);
+
+ xlog_cil_insert_items(log, tp, released_space);
if (regrant && !xlog_is_shutdown(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
@@ -1350,7 +1685,7 @@ xlog_cil_flush(
* If the CIL is empty, make sure that any previous checkpoint that may
* still be in an active iclog is pushed to stable storage.
*/
- if (list_empty(&log->l_cilp->xc_cil))
+ if (test_bit(XLOG_CIL_EMPTY, &log->l_cilp->xc_flags))
xfs_log_force(log->l_mp, 0);
}
@@ -1435,7 +1770,7 @@ restart:
* we would have found the context on the committing list.
*/
if (sequence == cil->xc_current_sequence &&
- !list_empty(&cil->xc_cil)) {
+ !test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) {
spin_unlock(&cil->xc_push_lock);
goto restart;
}
@@ -1456,29 +1791,35 @@ out_shutdown:
}
/*
- * Check if the current log item was first committed in this sequence.
- * We can't rely on just the log item being in the CIL, we have to check
- * the recorded commit sequence number.
+ * Move dead percpu state to the relevant CIL context structures.
*
- * Note: for this to be used in a non-racy manner, it has to be called with
- * CIL flushing locked out. As a result, it should only be used during the
- * transaction commit process when deciding what to format into the item.
+ * We have to lock the CIL context here to ensure that nothing is modifying
+ * the percpu state, either addition or removal. Both of these are done under
+ * the CIL context lock, so grabbing that exclusively here will ensure we can
+ * safely drain the cilpcp for the CPU that is dying.
*/
-bool
-xfs_log_item_in_current_chkpt(
- struct xfs_log_item *lip)
+void
+xlog_cil_pcp_dead(
+ struct xlog *log,
+ unsigned int cpu)
{
- struct xfs_cil *cil = lip->li_log->l_cilp;
-
- if (list_empty(&lip->li_cil))
- return false;
+ struct xfs_cil *cil = log->l_cilp;
+ struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+ struct xfs_cil_ctx *ctx;
- /*
- * li_seq is written on the first commit of a log item to record the
- * first checkpoint it is written to. Hence if it is different to the
- * current sequence, we're in a new checkpoint.
- */
- return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
+ down_write(&cil->xc_ctx_lock);
+ ctx = cil->xc_ctx;
+ if (ctx->ticket)
+ ctx->ticket->t_curr_res += cilpcp->space_reserved;
+ cilpcp->space_reserved = 0;
+
+ if (!list_empty(&cilpcp->log_items))
+ list_splice_init(&cilpcp->log_items, &ctx->log_items);
+ if (!list_empty(&cilpcp->busy_extents))
+ list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents);
+ atomic_add(cilpcp->space_used, &ctx->space_used);
+ cilpcp->space_used = 0;
+ up_write(&cil->xc_ctx_lock);
}
/*
@@ -1486,10 +1827,12 @@ xfs_log_item_in_current_chkpt(
*/
int
xlog_cil_init(
- struct xlog *log)
+ struct xlog *log)
{
- struct xfs_cil *cil;
- struct xfs_cil_ctx *ctx;
+ struct xfs_cil *cil;
+ struct xfs_cil_ctx *ctx;
+ struct xlog_cil_pcp *cilpcp;
+ int cpu;
cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
if (!cil)
@@ -1504,22 +1847,31 @@ xlog_cil_init(
if (!cil->xc_push_wq)
goto out_destroy_cil;
- INIT_LIST_HEAD(&cil->xc_cil);
+ cil->xc_log = log;
+ cil->xc_pcp = alloc_percpu(struct xlog_cil_pcp);
+ if (!cil->xc_pcp)
+ goto out_destroy_wq;
+
+ for_each_possible_cpu(cpu) {
+ cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+ INIT_LIST_HEAD(&cilpcp->busy_extents);
+ INIT_LIST_HEAD(&cilpcp->log_items);
+ }
+
INIT_LIST_HEAD(&cil->xc_committing);
- spin_lock_init(&cil->xc_cil_lock);
spin_lock_init(&cil->xc_push_lock);
init_waitqueue_head(&cil->xc_push_wait);
init_rwsem(&cil->xc_ctx_lock);
init_waitqueue_head(&cil->xc_start_wait);
init_waitqueue_head(&cil->xc_commit_wait);
- cil->xc_log = log;
log->l_cilp = cil;
ctx = xlog_cil_ctx_alloc();
xlog_cil_ctx_switch(cil, ctx);
-
return 0;
+out_destroy_wq:
+ destroy_workqueue(cil->xc_push_wq);
out_destroy_cil:
kmem_free(cil);
return -ENOMEM;
@@ -1529,14 +1881,17 @@ void
xlog_cil_destroy(
struct xlog *log)
{
- if (log->l_cilp->xc_ctx) {
- if (log->l_cilp->xc_ctx->ticket)
- xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
- kmem_free(log->l_cilp->xc_ctx);
+ struct xfs_cil *cil = log->l_cilp;
+
+ if (cil->xc_ctx) {
+ if (cil->xc_ctx->ticket)
+ xfs_log_ticket_put(cil->xc_ctx->ticket);
+ kmem_free(cil->xc_ctx);
}
- ASSERT(list_empty(&log->l_cilp->xc_cil));
- destroy_workqueue(log->l_cilp->xc_push_wq);
- kmem_free(log->l_cilp);
+ ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
+ free_percpu(cil->xc_pcp);
+ destroy_workqueue(cil->xc_push_wq);
+ kmem_free(cil);
}
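
The soft/hard limit decisions spread through xlog_cil_push_background() above reduce to a small decision tree. A user-space sketch with made-up limits (the real ones derive from XLOG_CIL_SPACE_LIMIT() and XLOG_CIL_BLOCKING_SPACE_LIMIT(), which are not shown in this diff):

#include <stdbool.h>
#include <stdio.h>

/* illustrative limits only; real values are computed from the log size */
#define SPACE_LIMIT		(8 << 20)	/* soft: queue a push */
#define BLOCKING_SPACE_LIMIT	(16 << 20)	/* hard: throttle commits */

static const char *push_decision(int space_used, bool push_queued,
				 bool waiters)
{
	if (space_used < SPACE_LIMIT)
		return "done: under the soft limit";
	if (push_queued && space_used < BLOCKING_SPACE_LIMIT && !waiters)
		return "done: push already queued, hard limit not hit";
	if (space_used >= BLOCKING_SPACE_LIMIT || waiters)
		return "throttle: sleep on xc_push_wait";
	return "queue a background push";
}

int main(void)
{
	printf("%s\n", push_decision(4 << 20, false, false));
	printf("%s\n", push_decision(9 << 20, true, false));
	printf("%s\n", push_decision(17 << 20, true, false));
	return 0;
}
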
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 401cdc400980..1bd2963e8fbd 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -51,8 +51,8 @@ enum xlog_iclog_state {
/*
* In core log flags
*/
-#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */
-#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */
+#define XLOG_ICL_NEED_FLUSH (1u << 0) /* iclog needs REQ_PREFLUSH */
+#define XLOG_ICL_NEED_FUA (1u << 1) /* iclog needs REQ_FUA */
#define XLOG_ICL_STRINGS \
{ XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \
@@ -62,7 +62,7 @@ enum xlog_iclog_state {
/*
* Log ticket flags
*/
-#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */
+#define XLOG_TIC_PERM_RESERV (1u << 0) /* permanent reservation */
#define XLOG_TIC_FLAGS \
{ XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
@@ -142,37 +142,17 @@ enum xlog_iclog_state {
#define XLOG_COVER_OPS 5
-/* Ticket reservation region accounting */
-#define XLOG_TIC_LEN_MAX 15
-
-/*
- * Reservation region
- * As would be stored in xfs_log_iovec but without the i_addr which
- * we don't care about.
- */
-typedef struct xlog_res {
- uint r_len; /* region length :4 */
- uint r_type; /* region's transaction type :4 */
-} xlog_res_t;
-
typedef struct xlog_ticket {
- struct list_head t_queue; /* reserve/write queue */
- struct task_struct *t_task; /* task that owns this ticket */
- xlog_tid_t t_tid; /* transaction identifier : 4 */
- atomic_t t_ref; /* ticket reference count : 4 */
- int t_curr_res; /* current reservation in bytes : 4 */
- int t_unit_res; /* unit reservation in bytes : 4 */
- char t_ocnt; /* original count : 1 */
- char t_cnt; /* current count : 1 */
- char t_clientid; /* who does this belong to; : 1 */
- char t_flags; /* properties of reservation : 1 */
-
- /* reservation array fields */
- uint t_res_num; /* num in array : 4 */
- uint t_res_num_ophdrs; /* num op hdrs : 4 */
- uint t_res_arr_sum; /* array sum : 4 */
- uint t_res_o_flow; /* sum overflow : 4 */
- xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */
+ struct list_head t_queue; /* reserve/write queue */
+ struct task_struct *t_task; /* task that owns this ticket */
+ xlog_tid_t t_tid; /* transaction identifier */
+ atomic_t t_ref; /* ticket reference count */
+ int t_curr_res; /* current reservation */
+ int t_unit_res; /* unit reservation */
+ char t_ocnt; /* original unit count */
+ char t_cnt; /* current unit count */
+ uint8_t t_flags; /* properties of reservation */
+ int t_iclog_hdrs; /* iclog hdrs in t_curr_res */
} xlog_ticket_t;
/*
@@ -211,7 +191,7 @@ typedef struct xlog_in_core {
u32 ic_offset;
enum xlog_iclog_state ic_state;
unsigned int ic_flags;
- char *ic_datap; /* pointer to iclog data */
+ void *ic_datap; /* pointer to iclog data */
struct list_head ic_callbacks;
/* reference counts need their own cacheline */
@@ -242,14 +222,25 @@ struct xfs_cil_ctx {
xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
struct xlog_in_core *commit_iclog;
struct xlog_ticket *ticket; /* chkpt ticket */
- int nvecs; /* number of regions */
- int space_used; /* aggregate size of regions */
+ atomic_t space_used; /* aggregate size of regions */
struct list_head busy_extents; /* busy extents in chkpt */
- struct xfs_log_vec *lv_chain; /* logvecs being pushed */
+ struct list_head log_items; /* log items in chkpt */
+ struct list_head lv_chain; /* logvecs being pushed */
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
struct work_struct discard_endio_work;
struct work_struct push_work;
+ atomic_t order_id;
+};
+
+/*
+ * Per-cpu CIL tracking items
+ */
+struct xlog_cil_pcp {
+ int32_t space_used;
+ uint32_t space_reserved;
+ struct list_head busy_extents;
+ struct list_head log_items;
};
/*
@@ -270,8 +261,8 @@ struct xfs_cil_ctx {
*/
struct xfs_cil {
struct xlog *xc_log;
- struct list_head xc_cil;
- spinlock_t xc_cil_lock;
+ unsigned long xc_flags;
+ atomic_t xc_iclog_hdrs;
struct workqueue_struct *xc_push_wq;
struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp;
@@ -285,8 +276,17 @@ struct xfs_cil {
wait_queue_head_t xc_start_wait;
xfs_csn_t xc_current_sequence;
wait_queue_head_t xc_push_wait; /* background push throttle */
+
+ void __percpu *xc_pcp; /* percpu CIL structures */
+#ifdef CONFIG_HOTPLUG_CPU
+ struct list_head xc_pcp_list;
+#endif
} ____cacheline_aligned_in_smp;
+/* xc_flags bit values */
+#define XLOG_CIL_EMPTY 1
+#define XLOG_CIL_PCP_SPACE 2
+
/*
* The amount of log space we allow the CIL to aggregate is difficult to size.
* Whatever we choose, we have to make sure we can get a reservation for the
@@ -441,10 +441,6 @@ struct xlog {
struct xfs_kobj l_kobj;
- /* The following field are used for debugging; need to hold icloglock */
-#ifdef DEBUG
- void *l_iclog_bak[XLOG_MAX_ICLOGS];
-#endif
/* log recovery lsn tracking (for buffer submission */
xfs_lsn_t l_recovery_lsn;
@@ -454,9 +450,6 @@ struct xlog {
struct rw_semaphore l_incompat_users;
};
-#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
- ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
-
/*
* Bits for operational state
*/
@@ -509,33 +502,21 @@ extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
char *dp, int size);
extern struct kmem_cache *xfs_log_ticket_cache;
-struct xlog_ticket *
-xlog_ticket_alloc(
- struct xlog *log,
- int unit_bytes,
- int count,
- char client,
- bool permanent);
-
-static inline void
-xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
-{
- *ptr += bytes;
- *len -= bytes;
- *off += bytes;
-}
+struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,
+ int count, bool permanent);
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx,
- struct xfs_log_vec *log_vector, struct xlog_ticket *tic,
- uint optype);
+ struct list_head *lv_chain, struct xlog_ticket *tic,
+ uint32_t len);
void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
int eventual_size);
-int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog);
+int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
+ struct xlog_ticket *ticket);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
@@ -690,4 +671,43 @@ xlog_valid_lsn(
return valid;
}
+/*
+ * Log vector and shadow buffers can be large, so we need to use kvmalloc() here
+ * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts
+ * to fall back to vmalloc, so we can't actually do anything useful with gfp
+ * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc()
+ * will do direct reclaim and compaction in the slow path, both of which are
+ * horrendously expensive. We just want kmalloc to fail fast and fall back to
+ * vmalloc if it can't get somethign straight away from the free lists or
+ * vmalloc if it can't get something straight away from the free lists or
+ * buddy allocator. Hence we have to open code kvmalloc ourselves here.
+ * This assumes that the caller uses memalloc_nofs_save task context here, so
+ * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS
+ * allocations. This is actually the only way to make vmalloc() do GFP_NOFS
+ * allocations, so let's just all pretend this is a GFP_KERNEL context
+ * operation....
+ */
+static inline void *
+xlog_kvmalloc(
+ size_t buf_size)
+{
+ gfp_t flags = GFP_KERNEL;
+ void *p;
+
+ flags &= ~__GFP_DIRECT_RECLAIM;
+ flags |= __GFP_NOWARN | __GFP_NORETRY;
+ do {
+ p = kmalloc(buf_size, flags);
+ if (!p)
+ p = vmalloc(buf_size);
+ } while (!p);
+
+ return p;
+}
+
+/*
+ * CIL CPU dead notifier
+ */
+void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu);
+
#endif /* __XFS_LOG_PRIV_H__ */
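
The open-coded kvmalloc above amounts to: ask the cheap allocator to fail fast rather than try hard, fall back to the expensive path on failure, and loop until something succeeds. A user-space model of just that control flow (malloc/calloc stand in for kmalloc/vmalloc; the GFP flag manipulation has no user-space equivalent, so it is only described in comments):

#include <stdlib.h>

static void *kvmalloc_model(size_t size)
{
	void *p;

	do {
		/* "kmalloc with __GFP_NORETRY | __GFP_NOWARN": fail fast */
		p = malloc(size);
		if (!p)
			p = calloc(1, size);	/* "vmalloc fallback" */
	} while (!p);

	return p;
}

int main(void)
{
	void *buf = kvmalloc_model(1 << 20);

	free(buf);
	return 0;
}
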
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c4ad4296c540..17e923b9c5fa 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -39,13 +39,6 @@ STATIC int
xlog_clear_stale_blocks(
struct xlog *,
xfs_lsn_t);
-#if defined(DEBUG)
-STATIC void
-xlog_recover_check_summary(
- struct xlog *);
-#else
-#define xlog_recover_check_summary(log)
-#endif
STATIC int
xlog_do_recovery_pass(
struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
@@ -129,7 +122,7 @@ xlog_do_io(
xfs_daddr_t blk_no,
unsigned int nbblks,
char *data,
- unsigned int op)
+ enum req_op op)
{
int error;
@@ -1800,6 +1793,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
&xlog_cud_item_ops,
&xlog_bui_item_ops,
&xlog_bud_item_ops,
+ &xlog_attri_item_ops,
+ &xlog_attrd_item_ops,
};
static const struct xlog_recover_item_ops *
@@ -2634,21 +2629,21 @@ xlog_recover_cancel_intents(
*/
STATIC void
xlog_recover_clear_agi_bucket(
- xfs_mount_t *mp,
- xfs_agnumber_t agno,
- int bucket)
+ struct xfs_perag *pag,
+ int bucket)
{
- xfs_trans_t *tp;
- xfs_agi_t *agi;
- struct xfs_buf *agibp;
- int offset;
- int error;
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_trans *tp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ int offset;
+ int error;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
if (error)
goto out_error;
- error = xfs_read_agi(mp, tp, agno, &agibp);
+ error = xfs_read_agi(pag, tp, &agibp);
if (error)
goto out_abort;
@@ -2667,60 +2662,62 @@ xlog_recover_clear_agi_bucket(
out_abort:
xfs_trans_cancel(tp);
out_error:
- xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
+ xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__,
+ pag->pag_agno);
return;
}
-STATIC xfs_agino_t
-xlog_recover_process_one_iunlink(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agino_t agino,
- int bucket)
+static int
+xlog_recover_iunlink_bucket(
+ struct xfs_perag *pag,
+ struct xfs_agi *agi,
+ int bucket)
{
- struct xfs_buf *ibp;
- struct xfs_dinode *dip;
- struct xfs_inode *ip;
- xfs_ino_t ino;
- int error;
-
- ino = XFS_AGINO_TO_INO(mp, agno, agino);
- error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
- if (error)
- goto fail;
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_inode *prev_ip = NULL;
+ struct xfs_inode *ip;
+ xfs_agino_t prev_agino, agino;
+ int error = 0;
- /*
- * Get the on disk inode to find the next inode in the bucket.
- */
- error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &ibp);
- if (error)
- goto fail_iput;
- dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
+ agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+ while (agino != NULLAGINO) {
+ error = xfs_iget(mp, NULL,
+ XFS_AGINO_TO_INO(mp, pag->pag_agno, agino),
+ 0, 0, &ip);
+ if (error)
+ break;
- xfs_iflags_clear(ip, XFS_IRECOVERY);
- ASSERT(VFS_I(ip)->i_nlink == 0);
- ASSERT(VFS_I(ip)->i_mode != 0);
+ ASSERT(VFS_I(ip)->i_nlink == 0);
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ xfs_iflags_clear(ip, XFS_IRECOVERY);
+ agino = ip->i_next_unlinked;
- /* setup for the next pass */
- agino = be32_to_cpu(dip->di_next_unlinked);
- xfs_buf_relse(ibp);
+ if (prev_ip) {
+ ip->i_prev_unlinked = prev_agino;
+ xfs_irele(prev_ip);
- xfs_irele(ip);
- return agino;
+ /*
+ * Ensure the inode is removed from the unlinked list
+ * before we continue so that it won't race with
+ * building the in-memory list here. This could be
+ * serialised with the agibp lock, but that just
+ * serialises via lockstepping and it's much simpler
+ * just to flush the inodegc queue and wait for it to
+ * complete.
+ */
+ xfs_inodegc_flush(mp);
+ }
- fail_iput:
- xfs_irele(ip);
- fail:
- /*
- * We can't read in the inode this bucket points to, or this inode
- * is messed up. Just ditch this bucket of inodes. We will lose
- * some inodes and space, but at least we won't hang.
- *
- * Call xlog_recover_clear_agi_bucket() to perform a transaction to
- * clear the inode pointer in the bucket.
- */
- xlog_recover_clear_agi_bucket(mp, agno, bucket);
- return NULLAGINO;
+ prev_agino = agino;
+ prev_ip = ip;
+ }
+
+ if (prev_ip) {
+ ip->i_prev_unlinked = prev_agino;
+ xfs_irele(prev_ip);
+ }
+ xfs_inodegc_flush(mp);
+ return error;
}
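
The bucket walk above follows the on-disk forward links while threading a back-pointer (i_prev_unlinked) through each inode, turning the singly linked unlinked bucket into a doubly linked in-memory list in one pass. A compact user-space sketch of that threading, with an array standing in for the inodes and -1 modelling NULLAGINO:

#include <stdio.h>

#define NULLAGINO	(-1)

struct ino {
	int next_unlinked;	/* on-disk forward link */
	int prev_unlinked;	/* in-memory back link, built below */
};

int main(void)
{
	/* bucket head points at inode 0; chain is 0 -> 2 -> 1 -> end */
	struct ino inodes[3] = {
		{ .next_unlinked = 2 },
		{ .next_unlinked = NULLAGINO },
		{ .next_unlinked = 1 },
	};
	int prev = NULLAGINO;

	for (int agino = 0; agino != NULLAGINO;
	     prev = agino, agino = inodes[agino].next_unlinked)
		inodes[agino].prev_unlinked = prev;

	for (int i = 0; i < 3; i++)
		printf("ino %d: prev %2d next %2d\n", i,
		       inodes[i].prev_unlinked, inodes[i].next_unlinked);
	return 0;
}
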
/*
@@ -2746,59 +2743,70 @@ xlog_recover_process_one_iunlink(
* scheduled on this CPU to ensure other scheduled work can run without undue
* latency.
*/
-STATIC void
-xlog_recover_process_iunlinks(
- struct xlog *log)
+static void
+xlog_recover_iunlink_ag(
+ struct xfs_perag *pag)
{
- struct xfs_mount *mp = log->l_mp;
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
struct xfs_agi *agi;
struct xfs_buf *agibp;
- xfs_agino_t agino;
int bucket;
int error;
- for_each_perag(mp, agno, pag) {
- error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp);
+ error = xfs_read_agi(pag, NULL, &agibp);
+ if (error) {
+ /*
+ * AGI is b0rked. Don't process it.
+ *
+ * We should probably mark the filesystem as corrupt after we've
+ * recovered all the ag's we can....
+ */
+ return;
+ }
+
+ /*
+ * Unlock the buffer so that it can be acquired in the normal course of
+ * the transaction to truncate and free each inode. Because we are not
+ * racing with anyone else here for the AGI buffer, we don't even need
+ * to hold it locked to read the initial unlinked bucket entries out of
+	 * the buffer. We keep a buffer reference though, so that it stays pinned
+ * in memory while we need the buffer.
+ */
+ agi = agibp->b_addr;
+ xfs_buf_unlock(agibp);
+
+ for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
+ error = xlog_recover_iunlink_bucket(pag, agi, bucket);
if (error) {
/*
- * AGI is b0rked. Don't process it.
- *
- * We should probably mark the filesystem as corrupt
- * after we've recovered all the ag's we can....
+ * Bucket is unrecoverable, so only a repair scan can
+			 * free the remaining unlinked inodes. Just empty the
+			 * bucket and leave the remaining inodes on it
+			 * unreferenced and unfreeable.
*/
- continue;
+ xfs_inodegc_flush(pag->pag_mount);
+ xlog_recover_clear_agi_bucket(pag, bucket);
}
- /*
- * Unlock the buffer so that it can be acquired in the normal
- * course of the transaction to truncate and free each inode.
- * Because we are not racing with anyone else here for the AGI
- * buffer, we don't even need to hold it locked to read the
- * initial unlinked bucket entries out of the buffer. We keep
- * buffer reference though, so that it stays pinned in memory
- * while we need the buffer.
- */
- agi = agibp->b_addr;
- xfs_buf_unlock(agibp);
-
- for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
- agino = be32_to_cpu(agi->agi_unlinked[bucket]);
- while (agino != NULLAGINO) {
- agino = xlog_recover_process_one_iunlink(mp,
- pag->pag_agno, agino, bucket);
- cond_resched();
- }
- }
- xfs_buf_rele(agibp);
}
+ xfs_buf_rele(agibp);
+}
+
+static void
+xlog_recover_process_iunlinks(
+ struct xlog *log)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ for_each_perag(log->l_mp, agno, pag)
+ xlog_recover_iunlink_ag(pag);
+
/*
* Flush the pending unlinked inodes to ensure that the inactivations
* are fully completed on disk and the incore inodes can be reclaimed
* before we signal that recovery is complete.
*/
- xfs_inodegc_flush(mp);
+ xfs_inodegc_flush(log->l_mp);
}
STATIC void
@@ -3228,7 +3236,7 @@ xlog_do_log_recovery(
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
{
- int error, i;
+ int error;
ASSERT(head_blk != tail_blk);
@@ -3236,37 +3244,25 @@ xlog_do_log_recovery(
* First do a pass to find all of the cancelled buf log items.
* Store them in the buf_cancel_table for use in the second pass.
*/
- log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
- sizeof(struct list_head),
- 0);
- for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
- INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+ error = xlog_alloc_buf_cancel_table(log);
+ if (error)
+ return error;
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
XLOG_RECOVER_PASS1, NULL);
- if (error != 0) {
- kmem_free(log->l_buf_cancel_table);
- log->l_buf_cancel_table = NULL;
- return error;
- }
+ if (error != 0)
+ goto out_cancel;
+
/*
* Then do a second pass to actually recover the items in the log.
* When it is complete free the table of buf cancel items.
*/
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
XLOG_RECOVER_PASS2, NULL);
-#ifdef DEBUG
- if (!error) {
- int i;
-
- for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
- ASSERT(list_empty(&log->l_buf_cancel_table[i]));
- }
-#endif /* DEBUG */
-
- kmem_free(log->l_buf_cancel_table);
- log->l_buf_cancel_table = NULL;
-
+ if (!error)
+ xlog_check_buf_cancel_table(log);
+out_cancel:
+ xlog_free_buf_cancel_table(log);
return error;
}
@@ -3330,15 +3326,14 @@ xlog_do_recover(
/* re-initialise in-core superblock and geometry structures */
mp->m_features |= xfs_sb_version_to_features(sbp);
xfs_reinit_percpu_counters(mp);
- error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+ error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks,
+ &mp->m_maxagi);
if (error) {
xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
return error;
}
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
- xlog_recover_check_summary(log);
-
/* Normal transactions can now occur */
clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
return 0;
@@ -3481,7 +3476,6 @@ xlog_recover_finish(
}
xlog_recover_process_iunlinks(log);
- xlog_recover_check_summary(log);
/*
* Recover any CoW staging blocks that are still referenced by the
@@ -3515,52 +3509,3 @@ xlog_recover_cancel(
xlog_recover_cancel_intents(log);
}
-#if defined(DEBUG)
-/*
- * Read all of the agf and agi counters and check that they
- * are consistent with the superblock counters.
- */
-STATIC void
-xlog_recover_check_summary(
- struct xlog *log)
-{
- struct xfs_mount *mp = log->l_mp;
- struct xfs_perag *pag;
- struct xfs_buf *agfbp;
- struct xfs_buf *agibp;
- xfs_agnumber_t agno;
- uint64_t freeblks;
- uint64_t itotal;
- uint64_t ifree;
- int error;
-
- freeblks = 0LL;
- itotal = 0LL;
- ifree = 0LL;
- for_each_perag(mp, agno, pag) {
- error = xfs_read_agf(mp, NULL, pag->pag_agno, 0, &agfbp);
- if (error) {
- xfs_alert(mp, "%s agf read failed agno %d error %d",
- __func__, pag->pag_agno, error);
- } else {
- struct xfs_agf *agfp = agfbp->b_addr;
-
- freeblks += be32_to_cpu(agfp->agf_freeblks) +
- be32_to_cpu(agfp->agf_flcount);
- xfs_buf_relse(agfbp);
- }
-
- error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp);
- if (error) {
- xfs_alert(mp, "%s agi read failed agno %d error %d",
- __func__, pag->pag_agno, error);
- } else {
- struct xfs_agi *agi = agibp->b_addr;
-
- itotal += be32_to_cpu(agi->agi_count);
- ifree += be32_to_cpu(agi->agi_freecount);
- xfs_buf_relse(agibp);
- }
- }
-}
-#endif /* DEBUG */
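The buf-cancel-table changes above replace the open-coded allocation, DEBUG-only emptiness check, and freeing with three helpers whose bodies live outside this diff. A minimal sketch of the allocation and free helpers, reconstructed from the removed open-coded version (allocation flags and exact layout assumed):

	int
	xlog_alloc_buf_cancel_table(
		struct xlog	*log)
	{
		int		i;

		ASSERT(log->l_buf_cancel_table == NULL);

		/* One list head per hash bucket, as the removed code did. */
		log->l_buf_cancel_table = kmalloc_array(XLOG_BC_TABLE_SIZE,
				sizeof(struct list_head), GFP_KERNEL);
		if (!log->l_buf_cancel_table)
			return -ENOMEM;

		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
			INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
		return 0;
	}

	void
	xlog_free_buf_cancel_table(
		struct xlog	*log)
	{
		kmem_free(log->l_buf_cancel_table);
		log->l_buf_cancel_table = NULL;
	}

This keeps xlog_do_log_recovery() to a single goto-based unwind path instead of duplicating the free-and-NULL sequence on every exit.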
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index bc66d95c8d4c..8f495cc23903 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -27,42 +27,34 @@ __xfs_printk(
printk("%sXFS: %pV\n", level, vaf);
}
-#define define_xfs_printk_level(func, kern_level) \
-void func(const struct xfs_mount *mp, const char *fmt, ...) \
-{ \
- struct va_format vaf; \
- va_list args; \
- int level; \
- \
- va_start(args, fmt); \
- \
- vaf.fmt = fmt; \
- vaf.va = &args; \
- \
- __xfs_printk(kern_level, mp, &vaf); \
- va_end(args); \
- \
- if (!kstrtoint(kern_level, 0, &level) && \
- level <= LOGLEVEL_ERR && \
- xfs_error_level >= XFS_ERRLEVEL_HIGH) \
- xfs_stack_trace(); \
-} \
-
-define_xfs_printk_level(xfs_emerg, KERN_EMERG);
-define_xfs_printk_level(xfs_alert, KERN_ALERT);
-define_xfs_printk_level(xfs_crit, KERN_CRIT);
-define_xfs_printk_level(xfs_err, KERN_ERR);
-define_xfs_printk_level(xfs_warn, KERN_WARNING);
-define_xfs_printk_level(xfs_notice, KERN_NOTICE);
-define_xfs_printk_level(xfs_info, KERN_INFO);
-#ifdef DEBUG
-define_xfs_printk_level(xfs_debug, KERN_DEBUG);
-#endif
+void
+xfs_printk_level(
+ const char *kern_level,
+ const struct xfs_mount *mp,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ int level;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ __xfs_printk(kern_level, mp, &vaf);
+
+ va_end(args);
+
+ if (!kstrtoint(kern_level, 0, &level) &&
+ level <= LOGLEVEL_ERR &&
+ xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+}
void
-xfs_alert_tag(
+_xfs_alert_tag(
const struct xfs_mount *mp,
- int panic_tag,
+ uint32_t panic_tag,
const char *fmt, ...)
{
struct va_format vaf;
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index bb9860ec9a93..cc323775a12c 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -6,33 +6,46 @@
struct xfs_mount;
-extern __printf(2, 3)
-void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...);
extern __printf(3, 4)
-void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_err(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_info(const struct xfs_mount *mp, const char *fmt, ...);
+void xfs_printk_level(const char *kern_level, const struct xfs_mount *mp,
+ const char *fmt, ...);
+#define xfs_printk_index_wrap(kern_level, mp, fmt, ...) \
+({ \
+ printk_index_subsys_emit("%sXFS%s: ", kern_level, fmt); \
+ xfs_printk_level(kern_level, mp, fmt, ##__VA_ARGS__); \
+})
+#define xfs_emerg(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_EMERG, mp, fmt, ##__VA_ARGS__)
+#define xfs_alert(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_ALERT, mp, fmt, ##__VA_ARGS__)
+#define xfs_crit(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_CRIT, mp, fmt, ##__VA_ARGS__)
+#define xfs_err(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_ERR, mp, fmt, ##__VA_ARGS__)
+#define xfs_warn(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_WARNING, mp, fmt, ##__VA_ARGS__)
+#define xfs_notice(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_NOTICE, mp, fmt, ##__VA_ARGS__)
+#define xfs_info(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_INFO, mp, fmt, ##__VA_ARGS__)
#ifdef DEBUG
-extern __printf(2, 3)
-void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...);
+#define xfs_debug(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_DEBUG, mp, fmt, ##__VA_ARGS__)
#else
-static inline __printf(2, 3)
-void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
-{
-}
+#define xfs_debug(mp, fmt, ...) do {} while (0)
#endif
+#define xfs_alert_tag(mp, tag, fmt, ...) \
+({ \
+ printk_index_subsys_emit("%sXFS%s: ", KERN_ALERT, fmt); \
+ _xfs_alert_tag(mp, tag, fmt, ##__VA_ARGS__); \
+})
+
+extern __printf(3, 4)
+void _xfs_alert_tag(const struct xfs_mount *mp, uint32_t tag,
+ const char *fmt, ...);
+
#define xfs_printk_ratelimited(func, dev, fmt, ...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
@@ -62,6 +75,12 @@ do { \
#define xfs_debug_ratelimited(dev, fmt, ...) \
xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__)
+#define xfs_warn_mount(mp, warntag, fmt, ...) \
+do { \
+ if (xfs_should_warn((mp), (warntag))) \
+ xfs_warn((mp), (fmt), ##__VA_ARGS__); \
+} while (0)
+
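The new xfs_warn_mount() macro pairs with xfs_should_warn(), added to xfs_mount.h further down, to emit a given warning at most once per mount: test_and_set_bit() on the opstate word returns false on every call after the first. A hypothetical call site (tag and message chosen for illustration):

	/* Warn only the first time online shrink is used on this mount. */
	xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK,
	"EXPERIMENTAL online shrink feature in use. Use at your own risk!");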
#define xfs_warn_once(dev, fmt, ...) \
xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__)
#define xfs_notice_once(dev, fmt, ...) \
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c5f153c3693f..f10c88cee116 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -468,6 +468,8 @@ STATIC int
xfs_check_summary_counts(
struct xfs_mount *mp)
{
+ int error = 0;
+
/*
* The AG0 superblock verifier rejects in-progress filesystems,
* so we should never see the flag set this far into mounting.
@@ -506,11 +508,32 @@ xfs_check_summary_counts(
* superblock to be correct and we don't need to do anything here.
* Otherwise, recalculate the summary counters.
*/
- if ((!xfs_has_lazysbcount(mp) || xfs_is_clean(mp)) &&
- !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS))
- return 0;
+ if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) ||
+ xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) {
+ error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+ if (error)
+ return error;
+ }
- return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+ /*
+ * Older kernels misused sb_frextents to reflect both incore
+ * reservations made by running transactions and the actual count of
+ * free rt extents in the ondisk metadata. Transactions committed
+ * during runtime can therefore contain a superblock update that
+ * undercounts the number of free rt extents tracked in the rt bitmap.
+ * A clean unmount record will have the correct frextents value since
+ * there can be no other transactions running at that point.
+ *
+ * If we're mounting the rt volume after recovering the log, recompute
+ * frextents from the rtbitmap file to fix the inconsistency.
+ */
+ if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
+ error = xfs_rtalloc_reinit_frextents(mp);
+ if (error)
+ return error;
+ }
+
+ return 0;
}
/*
@@ -755,7 +778,8 @@ xfs_mountfs(
/*
* Allocate and initialize the per-ag data.
*/
- error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+ error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks,
+ &mp->m_maxagi);
if (error) {
xfs_warn(mp, "Failed per-ag init: %d", error);
goto out_free_dir;
@@ -784,11 +808,6 @@ xfs_mountfs(
goto out_inodegc_shrinker;
}
- /* Make sure the summary counts are ok. */
- error = xfs_check_summary_counts(mp);
- if (error)
- goto out_log_dealloc;
-
/* Enable background inode inactivation workers. */
xfs_inodegc_start(mp);
xfs_blockgc_start(mp);
@@ -844,6 +863,11 @@ xfs_mountfs(
goto out_rele_rip;
}
+ /* Make sure the summary counts are ok. */
+ error = xfs_check_summary_counts(mp);
+ if (error)
+ goto out_rtunmount;
+
/*
* If this is a read-only mount defer the superblock updates until
* the next remount into writeable mode. Otherwise we would never
@@ -1087,24 +1111,33 @@ xfs_fs_writable(
return true;
}
+/* Adjust m_fdblocks or m_frextents. */
int
-xfs_mod_fdblocks(
+xfs_mod_freecounter(
struct xfs_mount *mp,
+ struct percpu_counter *counter,
int64_t delta,
bool rsvd)
{
int64_t lcounter;
long long res_used;
+ uint64_t set_aside = 0;
s32 batch;
- uint64_t set_aside;
+ bool has_resv_pool;
+
+ ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
+ has_resv_pool = (counter == &mp->m_fdblocks);
+ if (rsvd)
+ ASSERT(has_resv_pool);
if (delta > 0) {
/*
* If the reserve pool is depleted, put blocks back into it
* first. Most of the time the pool is full.
*/
- if (likely(mp->m_resblks == mp->m_resblks_avail)) {
- percpu_counter_add(&mp->m_fdblocks, delta);
+ if (likely(!has_resv_pool ||
+ mp->m_resblks == mp->m_resblks_avail)) {
+ percpu_counter_add(counter, delta);
return 0;
}
@@ -1116,7 +1149,7 @@ xfs_mod_fdblocks(
} else {
delta -= res_used;
mp->m_resblks_avail = mp->m_resblks;
- percpu_counter_add(&mp->m_fdblocks, delta);
+ percpu_counter_add(counter, delta);
}
spin_unlock(&mp->m_sb_lock);
return 0;
@@ -1130,7 +1163,7 @@ xfs_mod_fdblocks(
* then make everything serialise as we are real close to
* ENOSPC.
*/
- if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
+ if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
XFS_FDBLOCKS_BATCH) < 0)
batch = 1;
else
@@ -1147,9 +1180,10 @@ xfs_mod_fdblocks(
* problems (i.e. transaction abort, pagecache discards, etc.) than
* slightly premature -ENOSPC.
*/
- set_aside = xfs_fdblocks_unavailable(mp);
- percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
- if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
+ if (has_resv_pool)
+ set_aside = xfs_fdblocks_unavailable(mp);
+ percpu_counter_add_batch(counter, delta, batch);
+ if (__percpu_counter_compare(counter, set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
/* we had space! */
return 0;
@@ -1160,8 +1194,8 @@ xfs_mod_fdblocks(
* that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
- percpu_counter_add(&mp->m_fdblocks, -delta);
- if (!rsvd)
+ percpu_counter_add(counter, -delta);
+ if (!has_resv_pool || !rsvd)
goto fdblocks_enospc;
lcounter = (long long)mp->m_resblks_avail + delta;
@@ -1178,24 +1212,6 @@ fdblocks_enospc:
return -ENOSPC;
}
-int
-xfs_mod_frextents(
- struct xfs_mount *mp,
- int64_t delta)
-{
- int64_t lcounter;
- int ret = 0;
-
- spin_lock(&mp->m_sb_lock);
- lcounter = mp->m_sb.sb_frextents + delta;
- if (lcounter < 0)
- ret = -ENOSPC;
- else
- mp->m_sb.sb_frextents = lcounter;
- spin_unlock(&mp->m_sb_lock);
- return ret;
-}
-
/*
* Used to free the superblock along various error paths.
*/
@@ -1341,7 +1357,6 @@ xfs_clear_incompat_log_features(
if (xfs_sb_has_incompat_log_feature(&mp->m_sb,
XFS_SB_FEAT_INCOMPAT_LOG_ALL)) {
- xfs_info(mp, "Clearing log incompat feature flags.");
xfs_sb_remove_incompat_log_features(&mp->m_sb);
ret = true;
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f6dc19de8322..8aca2cc173ac 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -61,7 +61,7 @@ struct xfs_error_cfg {
*/
struct xfs_inodegc {
struct llist_head list;
- struct work_struct work;
+ struct delayed_work work;
/* approximate count of inodes in the list */
unsigned int items;
@@ -183,6 +183,8 @@ typedef struct xfs_mount {
struct percpu_counter m_icount; /* allocated inodes counter */
struct percpu_counter m_ifree; /* free inodes counter */
struct percpu_counter m_fdblocks; /* free block counter */
+ struct percpu_counter m_frextents; /* free rt extent counter */
+
/*
* Count of data device blocks reserved for delayed allocations,
* including indlen blocks. Does not include allocated CoW staging
@@ -276,6 +278,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */
#define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */
#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
+#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
/* Mount features */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
@@ -338,6 +341,7 @@ __XFS_HAS_FEAT(realtime, REALTIME)
__XFS_HAS_FEAT(inobtcounts, INOBTCNT)
__XFS_HAS_FEAT(bigtime, BIGTIME)
__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
+__XFS_HAS_FEAT(large_extent_counts, NREXT64)
/*
* Mount features
@@ -387,6 +391,13 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
*/
#define XFS_OPSTATE_BLOCKGC_ENABLED 6
+/* Kernel has logged a warning about online fsck being used on this fs. */
+#define XFS_OPSTATE_WARNED_SCRUB 7
+/* Kernel has logged a warning about shrink being used on this fs. */
+#define XFS_OPSTATE_WARNED_SHRINK 8
+/* Kernel has logged a warning about logged xattr updates being used. */
+#define XFS_OPSTATE_WARNED_LARP 9
+
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
{ \
@@ -409,6 +420,12 @@ __XFS_IS_OPSTATE(readonly, READONLY)
__XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED)
__XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
+static inline bool
+xfs_should_warn(struct xfs_mount *mp, long nr)
+{
+ return !test_and_set_bit(nr, &mp->m_opstate);
+}
+
#define XFS_OPSTATE_STRINGS \
{ (1UL << XFS_OPSTATE_UNMOUNTING), "unmounting" }, \
{ (1UL << XFS_OPSTATE_CLEAN), "clean" }, \
@@ -416,7 +433,10 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
{ (1UL << XFS_OPSTATE_INODE32), "inode32" }, \
{ (1UL << XFS_OPSTATE_READONLY), "read_only" }, \
{ (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \
- { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }
+ { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \
+ { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \
+ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \
+ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }
/*
* Max and min values for mount-option defined I/O
@@ -425,16 +445,16 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
#define XFS_MAX_IO_LOG 30 /* 1G */
#define XFS_MIN_IO_LOG PAGE_SHIFT
-#define xfs_is_shutdown(mp) xfs_is_shutdown(mp)
-void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
+void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname,
int lnnum);
#define xfs_force_shutdown(m,f) \
xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
-#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
-#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
-#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
-#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
+#define SHUTDOWN_META_IO_ERROR (1u << 0) /* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */
+#define SHUTDOWN_CORRUPT_ONDISK (1u << 4) /* corrupt metadata on device */
#define XFS_SHUTDOWN_STRINGS \
{ SHUTDOWN_META_IO_ERROR, "metadata_io" }, \
@@ -494,9 +514,20 @@ xfs_fdblocks_unavailable(
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
}
-extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
- bool reserved);
-extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
+int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+ int64_t delta, bool rsvd);
+
+static inline int
+xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved)
+{
+ return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved);
+}
+
+static inline int
+xfs_mod_frextents(struct xfs_mount *mp, int64_t delta)
+{
+ return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false);
+}
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
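With xfs_mod_fdblocks() and xfs_mod_frextents() reduced to inline wrappers, both free-space counters share xfs_mod_freecounter()'s batched percpu accounting and its spinlocked ENOSPC backout; only the data device counter retains the reserve-pool handling. A usage sketch (caller context assumed) — a negative delta reserves space and may fail, a positive delta gives it back:

	/* Reserve rt extents for an allocation. */
	error = xfs_mod_frextents(mp, -(int64_t)rtextents);
	if (error)
		return error;	/* -ENOSPC, nothing to undo */

	/* ... if the allocation fails later, return the reservation ... */
	xfs_mod_frextents(mp, (int64_t)rtextents);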
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
new file mode 100644
index 000000000000..5b1f9a24ed59
--- /dev/null
+++ b/fs/xfs/xfs_notify_failure.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Fujitsu. All Rights Reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_alloc.h"
+#include "xfs_bit.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
+
+#include <linux/mm.h>
+#include <linux/dax.h>
+
+struct failure_info {
+ xfs_agblock_t startblock;
+ xfs_extlen_t blockcount;
+ int mf_flags;
+};
+
+static pgoff_t
+xfs_failure_pgoff(
+ struct xfs_mount *mp,
+ const struct xfs_rmap_irec *rec,
+ const struct failure_info *notify)
+{
+ loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset);
+
+ if (notify->startblock > rec->rm_startblock)
+ pos += XFS_FSB_TO_B(mp,
+ notify->startblock - rec->rm_startblock);
+ return pos >> PAGE_SHIFT;
+}
+
+static unsigned long
+xfs_failure_pgcnt(
+ struct xfs_mount *mp,
+ const struct xfs_rmap_irec *rec,
+ const struct failure_info *notify)
+{
+ xfs_agblock_t end_rec;
+ xfs_agblock_t end_notify;
+ xfs_agblock_t start_cross;
+ xfs_agblock_t end_cross;
+
+ start_cross = max(rec->rm_startblock, notify->startblock);
+
+ end_rec = rec->rm_startblock + rec->rm_blockcount;
+ end_notify = notify->startblock + notify->blockcount;
+ end_cross = min(end_rec, end_notify);
+
+ return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
+}
+
+static int
+xfs_dax_failure_fn(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *data)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip;
+ struct failure_info *notify = data;
+ int error = 0;
+
+ if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
+ (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+ }
+
+	/* Get files that are incore; filter out others that are not in use. */
+ error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
+ 0, &ip);
+ /* Continue the rmap query if the inode isn't incore */
+ if (error == -ENODATA)
+ return 0;
+ if (error)
+ return error;
+
+ error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
+ xfs_failure_pgoff(mp, rec, notify),
+ xfs_failure_pgcnt(mp, rec, notify),
+ notify->mf_flags);
+ xfs_irele(ip);
+ return error;
+}
+
+static int
+xfs_dax_notify_ddev_failure(
+ struct xfs_mount *mp,
+ xfs_daddr_t daddr,
+ xfs_daddr_t bblen,
+ int mf_flags)
+{
+ struct xfs_trans *tp = NULL;
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_buf *agf_bp = NULL;
+ int error = 0;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr);
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
+ xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen);
+ xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ for (; agno <= end_agno; agno++) {
+ struct xfs_rmap_irec ri_low = { };
+ struct xfs_rmap_irec ri_high;
+ struct failure_info notify;
+ struct xfs_agf *agf;
+ xfs_agblock_t agend;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
+ if (error) {
+ xfs_perag_put(pag);
+ break;
+ }
+
+ cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
+
+ /*
+ * Set the rmap range from ri_low to ri_high, which represents
+ * a [start, end] where we looking for the files or metadata.
+ */
+ memset(&ri_high, 0xFF, sizeof(ri_high));
+ ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
+ if (agno == end_agno)
+ ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);
+
+ agf = agf_bp->b_addr;
+ agend = min(be32_to_cpu(agf->agf_length),
+ ri_high.rm_startblock);
+ notify.startblock = ri_low.rm_startblock;
+ notify.blockcount = agend - ri_low.rm_startblock;
+
+ error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+ xfs_dax_failure_fn, &notify);
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(tp, agf_bp);
+ xfs_perag_put(pag);
+ if (error)
+ break;
+
+ fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
+ }
+
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+static int
+xfs_dax_notify_failure(
+ struct dax_device *dax_dev,
+ u64 offset,
+ u64 len,
+ int mf_flags)
+{
+ struct xfs_mount *mp = dax_holder(dax_dev);
+ u64 ddev_start;
+ u64 ddev_end;
+
+ if (!(mp->m_super->s_flags & SB_BORN)) {
+ xfs_warn(mp, "filesystem is not ready for notify_failure()!");
+ return -EIO;
+ }
+
+ if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
+ xfs_debug(mp,
+ "notify_failure() not supported on realtime device!");
+ return -EOPNOTSUPP;
+ }
+
+ if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
+ mp->m_logdev_targp != mp->m_ddev_targp) {
+ xfs_err(mp, "ondisk log corrupt, shutting down fs!");
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+ }
+
+ if (!xfs_has_rmapbt(mp)) {
+ xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
+ return -EOPNOTSUPP;
+ }
+
+ ddev_start = mp->m_ddev_targp->bt_dax_part_off;
+ ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
+
+	/* Ignore ranges that lie entirely outside the filesystem area. */
+ if (offset + len < ddev_start)
+ return -ENXIO;
+ if (offset > ddev_end)
+ return -ENXIO;
+
+	/* Trim the range to the part that overlaps the data device. */
+ if (offset > ddev_start)
+ offset -= ddev_start;
+ else {
+ len -= ddev_start - offset;
+ offset = 0;
+ }
+ if (offset + len > ddev_end)
+ len -= ddev_end - offset;
+
+ return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
+ mf_flags);
+}
+
+const struct dax_holder_operations xfs_dax_holder_operations = {
+ .notify_failure = xfs_dax_notify_failure,
+};
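To make the rmap-to-page arithmetic in xfs_failure_pgoff() and xfs_failure_pgcnt() concrete, a worked example (block and page sizes assumed):

	/*
	 * Assume 4096-byte fs blocks and 4096-byte pages. An rmap record
	 * maps file offset 16 FSBs to AG blocks [100, 108), and the failure
	 * covers AG blocks [102, 106):
	 *
	 *   pgoff = ((16 + (102 - 100)) * 4096) >> PAGE_SHIFT = 18
	 *   pgcnt = ((min(108, 106) - max(100, 102)) * 4096) >> PAGE_SHIFT = 4
	 *
	 * so mf_dax_kill_procs() is asked to kill pages [18, 22) of the
	 * owning file's mapping.
	 */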
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 25991923c1a8..758702b9495f 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -132,6 +132,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16);
/*
* The v5 superblock format extended several v4 header structures with
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index f165d1a3de1d..18bb4ec4d7c9 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -582,9 +582,6 @@ xfs_qm_init_timelimits(
defq->blk.time = XFS_QM_BTIMELIMIT;
defq->ino.time = XFS_QM_ITIMELIMIT;
defq->rtb.time = XFS_QM_RTBTIMELIMIT;
- defq->blk.warn = XFS_QM_BWARNLIMIT;
- defq->ino.warn = XFS_QM_IWARNLIMIT;
- defq->rtb.warn = XFS_QM_RTBWARNLIMIT;
/*
* We try to get the limits from the superuser's limits fields.
@@ -608,12 +605,6 @@ xfs_qm_init_timelimits(
defq->ino.time = dqp->q_ino.timer;
if (dqp->q_rtb.timer)
defq->rtb.time = dqp->q_rtb.timer;
- if (dqp->q_blk.warnings)
- defq->blk.warn = dqp->q_blk.warnings;
- if (dqp->q_ino.warnings)
- defq->ino.warn = dqp->q_ino.warnings;
- if (dqp->q_rtb.warnings)
- defq->rtb.warn = dqp->q_rtb.warnings;
xfs_qm_dqdestroy(dqp);
}
@@ -686,7 +677,8 @@ xfs_qm_init_quotainfo(
qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
- error = register_shrinker(&qinf->qi_shrinker);
+ error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s",
+ mp->m_super->s_id);
if (error)
goto out_free_inos;
@@ -1163,7 +1155,7 @@ xfs_qm_dqusage_adjust(
ASSERT(ip->i_delayed_blks == 0);
if (XFS_IS_REALTIME_INODE(ip)) {
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
if (error)
@@ -1238,10 +1230,14 @@ xfs_qm_flush_one(
*/
if (!xfs_dqflock_nowait(dqp)) {
/* buf is pinned in-core by delwri list */
- bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0);
- if (!bp) {
- error = -EINVAL;
+ error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ if (error)
+ goto out_unlock;
+
+ if (!(bp->b_flags & _XBF_DELWRI_Q)) {
+ error = -EAGAIN;
+ xfs_buf_relse(bp);
goto out_unlock;
}
xfs_buf_unlock(bp);
@@ -1317,8 +1313,15 @@ xfs_qm_quotacheck(
error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true,
NULL);
- if (error)
+ if (error) {
+ /*
+ * The inode walk may have partially populated the dquot
+ * caches. We must purge them before disabling quota and
+ * tearing down the quotainfo, or else the dquots will leak.
+ */
+ xfs_qm_dqpurge_all(mp);
goto error_return;
+ }
/*
* We've made all the changes that we need to make incore. Flush them
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 5bb12717ea28..9683f0457d19 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -34,7 +34,6 @@ struct xfs_quota_limits {
xfs_qcnt_t hard; /* default hard limit */
xfs_qcnt_t soft; /* default soft limit */
time64_t time; /* limit for timers */
- xfs_qwarncnt_t warn; /* limit for warnings */
};
/* Defaults for each quota type: time limits, warn limits, usage limits */
@@ -134,10 +133,6 @@ struct xfs_dquot_acct {
#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */
#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */
-#define XFS_QM_BWARNLIMIT 5
-#define XFS_QM_IWARNLIMIT 5
-#define XFS_QM_RTBWARNLIMIT 5
-
extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
/* quota ops */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 7d5a31827681..392cb39cc10c 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -217,8 +217,7 @@ xfs_qm_scall_quotaon(
return 0;
}
-#define XFS_QC_MASK \
- (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
+#define XFS_QC_MASK (QC_LIMIT_MASK | QC_TIMER_MASK)
/*
* Adjust limits of this quota, and the defaults if passed in. Returns true
@@ -251,17 +250,6 @@ xfs_setqlim_limits(
}
static inline void
-xfs_setqlim_warns(
- struct xfs_dquot_res *res,
- struct xfs_quota_limits *qlim,
- int warns)
-{
- res->warnings = warns;
- if (qlim)
- qlim->warn = warns;
-}
-
-static inline void
xfs_setqlim_timer(
struct xfs_mount *mp,
struct xfs_dquot_res *res,
@@ -354,8 +342,6 @@ xfs_qm_scall_setqlim(
if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk"))
xfs_dquot_set_prealloc_limits(dqp);
- if (newlim->d_fieldmask & QC_SPC_WARNS)
- xfs_setqlim_warns(res, qlim, newlim->d_spc_warns);
if (newlim->d_fieldmask & QC_SPC_TIMER)
xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer);
@@ -370,8 +356,6 @@ xfs_qm_scall_setqlim(
qlim = id == 0 ? &defq->rtb : NULL;
xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb");
- if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
- xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns);
if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer);
@@ -386,8 +370,6 @@ xfs_qm_scall_setqlim(
qlim = id == 0 ? &defq->ino : NULL;
xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino");
- if (newlim->d_fieldmask & QC_INO_WARNS)
- xfs_setqlim_warns(res, qlim, newlim->d_ino_warns);
if (newlim->d_fieldmask & QC_INO_TIMER)
xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer);
@@ -428,13 +410,13 @@ xfs_qm_scall_getquota_fill_qc(
dst->d_ino_count = dqp->q_ino.reserved;
dst->d_spc_timer = dqp->q_blk.timer;
dst->d_ino_timer = dqp->q_ino.timer;
- dst->d_ino_warns = dqp->q_ino.warnings;
- dst->d_spc_warns = dqp->q_blk.warnings;
+ dst->d_ino_warns = 0;
+ dst->d_spc_warns = 0;
dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit);
dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit);
dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved);
dst->d_rt_spc_timer = dqp->q_rtb.timer;
- dst->d_rt_spc_warns = dqp->q_rtb.warnings;
+ dst->d_rt_spc_warns = 0;
/*
* Internally, we don't reset all the timers when quota enforcement
@@ -472,9 +454,12 @@ xfs_qm_scall_getquota(
struct xfs_dquot *dqp;
int error;
- /* Flush inodegc work at the start of a quota reporting scan. */
+ /*
+ * Expedite pending inodegc work at the start of a quota reporting
+ * scan but don't block waiting for it to complete.
+ */
if (id == 0)
- xfs_inodegc_flush(mp);
+ xfs_inodegc_push(mp);
/*
* Try to get the dquot. We don't want it allocated on disk, so don't
@@ -516,7 +501,7 @@ xfs_qm_scall_getquota_next(
/* Flush inodegc work at the start of a quota reporting scan. */
if (*id == 0)
- xfs_inodegc_flush(mp);
+ xfs_inodegc_push(mp);
error = xfs_qm_dqget_next(mp, *id, type, &dqp);
if (error)
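The switch from xfs_inodegc_flush() to xfs_inodegc_push() here (and in xfs_fs_statfs() in xfs_super.c below) trades completeness for latency: the quota scan kicks the background inactivation workers but no longer blocks on them. A plausible sketch of the distinction, inferred from the callers (helper and workqueue names assumed; the real bodies are not part of this diff):

	void
	xfs_inodegc_push(
		struct xfs_mount	*mp)
	{
		/* Kick every queued inodegc worker; do not wait. */
		if (xfs_is_inodegc_enabled(mp))
			xfs_inodegc_queue_all(mp);
	}

	void
	xfs_inodegc_flush(
		struct xfs_mount	*mp)
	{
		/* Kick the workers, then block until the queue drains. */
		xfs_inodegc_push(mp);
		flush_workqueue(mp->m_inodegc_wq);
	}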
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 07989bd67728..9c162e69976b 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -40,9 +40,9 @@ xfs_qm_fill_state(
tstate->spc_timelimit = (u32)defq->blk.time;
tstate->ino_timelimit = (u32)defq->ino.time;
tstate->rt_spc_timelimit = (u32)defq->rtb.time;
- tstate->spc_warnlimit = defq->blk.warn;
- tstate->ino_warnlimit = defq->ino.warn;
- tstate->rt_spc_warnlimit = defq->rtb.warn;
+ tstate->spc_warnlimit = 0;
+ tstate->ino_warnlimit = 0;
+ tstate->rt_spc_warnlimit = 0;
if (tempqip)
xfs_irele(ip);
}
@@ -98,7 +98,7 @@ xfs_quota_type(int type)
}
}
-#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK)
+#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK)
/*
* Adjust quota timers & warnings
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 0d868c93144d..7e97bf19793d 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
+ kmem_free(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
@@ -53,10 +54,11 @@ xfs_cui_release(
struct xfs_cui_log_item *cuip)
{
ASSERT(atomic_read(&cuip->cui_refcount) > 0);
- if (atomic_dec_and_test(&cuip->cui_refcount)) {
- xfs_trans_ail_delete(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_cui_item_free(cuip);
- }
+ if (!atomic_dec_and_test(&cuip->cui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&cuip->cui_item, 0);
+ xfs_cui_item_free(cuip);
}
@@ -204,14 +206,24 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
+ kmem_free(cudp->cud_item.li_lv_shadow);
kmem_cache_free(xfs_cud_cache, cudp);
}
+static struct xfs_log_item *
+xfs_cud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &CUD_ITEM(lip)->cud_cuip->cui_item;
+}
+
static const struct xfs_item_ops xfs_cud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_cud_item_size,
.iop_format = xfs_cud_item_format,
.iop_release = xfs_cud_item_release,
+ .iop_intent = xfs_cud_item_intent,
};
static struct xfs_cud_log_item *
@@ -259,7 +271,7 @@ xfs_trans_log_finish_refcount_update(
* 1.) releases the CUI and frees the CUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
return error;
@@ -600,6 +612,7 @@ xfs_cui_item_relog(
}
static const struct xfs_item_ops xfs_cui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_cui_item_size,
.iop_format = xfs_cui_item_format,
.iop_unpin = xfs_cui_item_unpin,
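Tagging the CUD with XFS_ITEM_INTENT_DONE and giving it an .iop_intent callback lets generic log code map a done item back to the intent it cancels without item-type-specific knowledge. A minimal sketch of how a consumer might use the callback (hypothetical helper, not taken from this diff):

	/* Does @done complete @intent? */
	static bool
	xlog_item_completes_intent(
		struct xfs_log_item	*done,
		struct xfs_log_item	*intent)
	{
		if (!(done->li_ops->flags & XFS_ITEM_INTENT_DONE))
			return false;
		return done->li_ops->iop_intent(done) == intent;
	}

The same INTENT/INTENT_DONE pairing is applied to the RUI/RUD items in xfs_rmap_item.c below.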
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 54e68e5693fd..251f20ddd368 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -125,11 +125,10 @@
* shared blocks. If there are no shared extents, fbno and flen will
* be set to NULLAGBLOCK and 0, respectively.
*/
-int
+static int
xfs_reflink_find_shared(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
xfs_agblock_t agbno,
xfs_extlen_t aglen,
xfs_agblock_t *fbno,
@@ -140,11 +139,11 @@ xfs_reflink_find_shared(
struct xfs_btree_cur *cur;
int error;
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
return error;
- cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agbp->b_pag);
+ cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag);
error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
find_end_of_shared);
@@ -171,7 +170,8 @@ xfs_reflink_trim_around_shared(
struct xfs_bmbt_irec *irec,
bool *shared)
{
- xfs_agnumber_t agno;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
xfs_agblock_t agbno;
xfs_extlen_t aglen;
xfs_agblock_t fbno;
@@ -186,12 +186,13 @@ xfs_reflink_trim_around_shared(
trace_xfs_reflink_trim_around_shared(ip, irec);
- agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
- agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
+ agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
aglen = irec->br_blockcount;
- error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno,
- aglen, &fbno, &flen, true);
+ error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen,
+ true);
+ xfs_perag_put(pag);
if (error)
return error;
@@ -340,9 +341,41 @@ xfs_find_trim_cow_extent(
return 0;
}
-/* Allocate all CoW reservations covering a range of blocks in a file. */
-int
-xfs_reflink_allocate_cow(
+static int
+xfs_reflink_convert_unwritten(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool convert_now)
+{
+ xfs_fileoff_t offset_fsb = imap->br_startoff;
+ xfs_filblks_t count_fsb = imap->br_blockcount;
+ int error;
+
+ /*
+	 * cmap might be larger than imap due to the cowextsize hint.
+ */
+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
+
+ /*
+ * COW fork extents are supposed to remain unwritten until we're ready
+ * to initiate a disk write. For direct I/O we are going to write the
+ * data and need the conversion, but for buffered writes we're done.
+ */
+ if (!convert_now || cmap->br_state == XFS_EXT_NORM)
+ return 0;
+
+ trace_xfs_reflink_convert_cow(ip, cmap);
+
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ if (!error)
+ cmap->br_state = XFS_EXT_NORM;
+
+ return error;
+}
+
+static int
+xfs_reflink_fill_cow_hole(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
struct xfs_bmbt_irec *cmap,
@@ -351,25 +384,12 @@ xfs_reflink_allocate_cow(
bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb = imap->br_startoff;
- xfs_filblks_t count_fsb = imap->br_blockcount;
struct xfs_trans *tp;
- int nimaps, error = 0;
- bool found;
xfs_filblks_t resaligned;
- xfs_extlen_t resblks = 0;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (!ip->i_cowfp) {
- ASSERT(!xfs_is_reflink_inode(ip));
- xfs_ifork_init_cow(ip);
- }
-
- error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
- if (error || !*shared)
- return error;
- if (found)
- goto convert;
+ xfs_extlen_t resblks;
+ int nimaps;
+ int error;
+ bool found;
resaligned = xfs_aligned_fsb_count(imap->br_startoff,
imap->br_blockcount, xfs_get_cowextsz_hint(ip));
@@ -385,17 +405,17 @@ xfs_reflink_allocate_cow(
*lockmode = XFS_ILOCK_EXCL;
- /*
- * Check for an overlapping extent again now that we dropped the ilock.
- */
error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
if (error || !*shared)
goto out_trans_cancel;
+
if (found) {
xfs_trans_cancel(tp);
goto convert;
}
+ ASSERT(cmap->br_startoff > imap->br_startoff);
+
/* Allocate the entire reservation as unwritten blocks. */
nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
@@ -415,26 +435,135 @@ xfs_reflink_allocate_cow(
*/
if (nimaps == 0)
return -ENOSPC;
+
convert:
- xfs_trim_extent(cmap, offset_fsb, count_fsb);
- /*
- * COW fork extents are supposed to remain unwritten until we're ready
- * to initiate a disk write. For direct I/O we are going to write the
- * data and need the conversion, but for buffered writes we're done.
- */
- if (!convert_now || cmap->br_state == XFS_EXT_NORM)
- return 0;
- trace_xfs_reflink_convert_cow(ip, cmap);
- error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
- if (!error)
- cmap->br_state = XFS_EXT_NORM;
+ return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
return error;
+}
+
+static int
+xfs_reflink_fill_delalloc(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int nimaps;
+ int error;
+ bool found;
+
+ do {
+ xfs_iunlock(ip, *lockmode);
+ *lockmode = 0;
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
+ false, &tp);
+ if (error)
+ return error;
+
+ *lockmode = XFS_ILOCK_EXCL;
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
+ &found);
+ if (error || !*shared)
+ goto out_trans_cancel;
+
+ if (found) {
+ xfs_trans_cancel(tp);
+ break;
+ }
+
+ ASSERT(isnullstartblock(cmap->br_startblock) ||
+ cmap->br_startblock == DELAYSTARTBLOCK);
+
+ /*
+ * Replace delalloc reservation with an unwritten extent.
+ */
+ nimaps = 1;
+ error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
+ cmap->br_blockcount,
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
+ cmap, &nimaps);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_inode_set_cowblocks_tag(ip);
+ error = xfs_trans_commit(tp);
+ if (error)
+ return error;
+
+ /*
+ * Allocation succeeded but the requested range was not even
+ * partially satisfied? Bail out!
+ */
+ if (nimaps == 0)
+ return -ENOSPC;
+ } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
+
+ return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
out_trans_cancel:
xfs_trans_cancel(tp);
return error;
}
+/* Allocate all CoW reservations covering a range of blocks in a file. */
+int
+xfs_reflink_allocate_cow(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
+ bool *shared,
+ uint *lockmode,
+ bool convert_now)
+{
+ int error;
+ bool found;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
+ if (error || !*shared)
+ return error;
+
+ /* CoW fork has a real extent */
+ if (found)
+ return xfs_reflink_convert_unwritten(ip, imap, cmap,
+ convert_now);
+
+ /*
+ * CoW fork does not have an extent and data extent is shared.
+ * Allocate a real extent in the CoW fork.
+ */
+ if (cmap->br_startoff > imap->br_startoff)
+ return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
+ lockmode, convert_now);
+
+ /*
+ * CoW fork has a delalloc reservation. Replace it with a real extent.
+ * There may or may not be a data fork mapping.
+ */
+ if (isnullstartblock(cmap->br_startblock) ||
+ cmap->br_startblock == DELAYSTARTBLOCK)
+ return xfs_reflink_fill_delalloc(ip, imap, cmap, shared,
+ lockmode, convert_now);
+
+ /* Shouldn't get here. */
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
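After this rewrite xfs_reflink_allocate_cow() is purely a dispatcher; the three cases it distinguishes, summarized from the code above:

	/*
	 * CoW fork state at imap->br_startoff   Handler
	 * ------------------------------------  --------------------------------
	 * real extent already present           xfs_reflink_convert_unwritten()
	 * hole (cmap starts beyond imap)        xfs_reflink_fill_cow_hole()
	 * delalloc reservation                  xfs_reflink_fill_delalloc()
	 * anything else                         ASSERT(0), -EFSCORRUPTED
	 */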
/*
* Cancel CoW reservations for some block range of an inode.
*
@@ -452,7 +581,7 @@ xfs_reflink_cancel_cow_blocks(
xfs_fileoff_t end_fsb,
bool cancel_real)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
int error = 0;
@@ -586,21 +715,21 @@ out:
STATIC int
xfs_reflink_end_cow_extent(
struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb,
- xfs_fileoff_t *end_fsb)
+ xfs_fileoff_t *offset_fsb,
+ xfs_fileoff_t end_fsb)
{
- struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got, del, data;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- xfs_filblks_t rlen;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
unsigned int resblks;
+ int nmaps;
int error;
/* No COW extents? That's easy! */
if (ifp->if_bytes == 0) {
- *end_fsb = offset_fsb;
+ *offset_fsb = end_fsb;
return 0;
}
@@ -620,6 +749,9 @@ xfs_reflink_end_cow_extent(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_REFLINK_END_COW_CNT);
if (error)
goto out_cancel;
@@ -628,42 +760,66 @@ xfs_reflink_end_cow_extent(
* left by the time I/O completes for the loser of the race. In that
* case we are done.
*/
- if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
- got.br_startoff + got.br_blockcount <= offset_fsb) {
- *end_fsb = offset_fsb;
+ if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
+ got.br_startoff >= end_fsb) {
+ *offset_fsb = end_fsb;
goto out_cancel;
}
/*
- * Structure copy @got into @del, then trim @del to the range that we
- * were asked to remap. We preserve @got for the eventual CoW fork
- * deletion; from now on @del represents the mapping that we're
- * actually remapping.
- */
- del = got;
- xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);
-
- ASSERT(del.br_blockcount > 0);
-
- /*
* Only remap real extents that contain data. With AIO, speculative
* preallocations can leak into the range we are called upon, and we
- * need to skip them.
+ * need to skip them. Preserve @got for the eventual CoW fork
+ * deletion; from now on @del represents the mapping that we're
+ * actually remapping.
*/
- if (!xfs_bmap_is_written_extent(&got)) {
- *end_fsb = del.br_startoff;
- goto out_cancel;
+ while (!xfs_bmap_is_written_extent(&got)) {
+ if (!xfs_iext_next_extent(ifp, &icur, &got) ||
+ got.br_startoff >= end_fsb) {
+ *offset_fsb = end_fsb;
+ goto out_cancel;
+ }
}
+ del = got;
- /* Unmap the old blocks in the data fork. */
- rlen = del.br_blockcount;
- error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
+ /* Grab the corresponding mapping in the data fork. */
+ nmaps = 1;
+ error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
+ &nmaps, 0);
if (error)
goto out_cancel;
- /* Trim the extent to whatever got unmapped. */
- xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
- trace_xfs_reflink_cow_remap(ip, &del);
+ /* We can only remap the smaller of the two extent sizes. */
+ data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
+ del.br_blockcount = data.br_blockcount;
+
+ trace_xfs_reflink_cow_remap_from(ip, &del);
+ trace_xfs_reflink_cow_remap_to(ip, &data);
+
+ if (xfs_bmap_is_real_extent(&data)) {
+ /*
+ * If the extent we're remapping is backed by storage (written
+ * or not), unmap the extent and drop its refcount.
+ */
+ xfs_bmap_unmap_extent(tp, ip, &data);
+ xfs_refcount_decrease_extent(tp, &data);
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
+ -data.br_blockcount);
+ } else if (data.br_startblock == DELAYSTARTBLOCK) {
+ int done;
+
+ /*
+ * If the extent we're remapping is a delalloc reservation,
+ * we can use the regular bunmapi function to release the
+ * incore state. Dropping the delalloc reservation takes care
+ * of the quota reservation for us.
+ */
+ error = xfs_bunmapi(NULL, ip, data.br_startoff,
+ data.br_blockcount, 0, 1, &done);
+ if (error)
+ goto out_cancel;
+ ASSERT(done);
+ }
/* Free the CoW orphan record. */
xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
@@ -684,7 +840,7 @@ xfs_reflink_end_cow_extent(
return error;
/* Update the caller about how much progress we made. */
- *end_fsb = del.br_startoff;
+ *offset_fsb = del.br_startoff + del.br_blockcount;
return 0;
out_cancel:
@@ -712,7 +868,7 @@ xfs_reflink_end_cow(
end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
/*
- * Walk backwards until we're out of the I/O range. The loop function
+ * Walk forwards until we've remapped the I/O range. The loop function
* repeatedly cycles the ILOCK to allocate one transaction per remapped
* extent.
*
@@ -744,7 +900,7 @@ xfs_reflink_end_cow(
* blocks will be remapped.
*/
while (end_fsb > offset_fsb && !error)
- error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);
+ error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
if (error)
trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
@@ -1121,6 +1277,8 @@ xfs_reflink_remap_extent(
++iext_delta;
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto out_cancel;
@@ -1133,7 +1291,7 @@ xfs_reflink_remap_extent(
xfs_refcount_decrease_extent(tp, &smap);
qdelta -= smap.br_blockcount;
} else if (smap.br_startblock == DELAYSTARTBLOCK) {
- xfs_filblks_t len = smap.br_blockcount;
+ int done;
/*
* If the extent we're unmapping is a delalloc reservation,
@@ -1141,10 +1299,11 @@ xfs_reflink_remap_extent(
* incore state. Dropping the delalloc reservation takes care
* of the quota reservation for us.
*/
- error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
+ error = xfs_bunmapi(NULL, ip, smap.br_startoff,
+ smap.br_blockcount, 0, 1, &done);
if (error)
goto out_cancel;
- ASSERT(len == 0);
+ ASSERT(done);
}
/*
@@ -1333,12 +1492,16 @@ xfs_reflink_remap_prep(
if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
goto out_unlock;
- /* Don't share DAX file data for now. */
- if (IS_DAX(inode_in) || IS_DAX(inode_out))
+ /* Don't share DAX file data with non-DAX file. */
+ if (IS_DAX(inode_in) != IS_DAX(inode_out))
goto out_unlock;
- ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
- len, remap_flags);
+ if (!IS_DAX(inode_in))
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags);
+ else
+ ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
+ pos_out, len, remap_flags, &xfs_read_iomap_ops);
if (ret || *len == 0)
goto out_unlock;
@@ -1390,16 +1553,11 @@ xfs_reflink_inode_has_shared_extents(
struct xfs_bmbt_irec got;
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- xfs_extlen_t aglen;
- xfs_agblock_t rbno;
- xfs_extlen_t rlen;
struct xfs_iext_cursor icur;
bool found;
int error;
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
if (error)
return error;
@@ -1407,17 +1565,25 @@ xfs_reflink_inode_has_shared_extents(
*has_shared = false;
found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
while (found) {
+ struct xfs_perag *pag;
+ xfs_agblock_t agbno;
+ xfs_extlen_t aglen;
+ xfs_agblock_t rbno;
+ xfs_extlen_t rlen;
+
if (isnullstartblock(got.br_startblock) ||
got.br_state != XFS_EXT_NORM)
goto next;
- agno = XFS_FSB_TO_AGNO(mp, got.br_startblock);
+
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock));
agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
aglen = got.br_blockcount;
-
- error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen,
+ error = xfs_reflink_find_shared(pag, tp, agbno, aglen,
&rbno, &rlen, false);
+ xfs_perag_put(pag);
if (error)
return error;
+
/* Is there still a shared block here? */
if (rbno != NULLAGBLOCK) {
*has_shared = true;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index bea65f2fe657..65c5dfe17ecf 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -16,9 +16,6 @@ static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
}
-extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
- xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared);
int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index a22b2d19ef91..fef92e02f3bb 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
+ kmem_free(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
@@ -53,10 +54,11 @@ xfs_rui_release(
struct xfs_rui_log_item *ruip)
{
ASSERT(atomic_read(&ruip->rui_refcount) > 0);
- if (atomic_dec_and_test(&ruip->rui_refcount)) {
- xfs_trans_ail_delete(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_rui_item_free(ruip);
- }
+ if (!atomic_dec_and_test(&ruip->rui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&ruip->rui_item, 0);
+ xfs_rui_item_free(ruip);
}
STATIC void
@@ -227,14 +229,24 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
+ kmem_free(rudp->rud_item.li_lv_shadow);
kmem_cache_free(xfs_rud_cache, rudp);
}
+static struct xfs_log_item *
+xfs_rud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &RUD_ITEM(lip)->rud_ruip->rui_item;
+}
+
static const struct xfs_item_ops xfs_rud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_rud_item_size,
.iop_format = xfs_rud_item_format,
.iop_release = xfs_rud_item_release,
+ .iop_intent = xfs_rud_item_intent,
};
static struct xfs_rud_log_item *
@@ -327,7 +339,7 @@ xfs_trans_log_finish_rmap_update(
* 1.) releases the RUI and frees the RUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
return error;
@@ -630,6 +642,7 @@ xfs_rui_item_relog(
}
static const struct xfs_item_ops xfs_rui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_rui_item_size,
.iop_format = xfs_rui_item_format,
.iop_unpin = xfs_rui_item_unpin,
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index b8c79ee791af..292d5e54a92c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -806,6 +806,9 @@ xfs_growfs_rt_alloc(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
@@ -1284,6 +1287,44 @@ xfs_rtmount_init(
return 0;
}
+static int
+xfs_rtalloc_count_frextent(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ uint64_t *valp = priv;
+
+ *valp += rec->ar_extcount;
+ return 0;
+}
+
+/*
+ * Reinitialize the number of free realtime extents from the realtime bitmap.
+ * Callers must ensure that there is no other activity in the filesystem.
+ */
+int
+xfs_rtalloc_reinit_frextents(
+ struct xfs_mount *mp)
+{
+ uint64_t val = 0;
+ int error;
+
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent,
+ &val);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_frextents = val;
+ spin_unlock(&mp->m_sb_lock);
+ percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
+ return 0;
+}
+
/*
* Get the bitmap and summary inodes and the summary cache into the mount
* structure at mount time.
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 91b00289509b..62c7ad79cbb6 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -22,6 +22,7 @@ struct xfs_rtalloc_rec {
};
typedef int (*xfs_rtalloc_query_range_fn)(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv);
@@ -123,27 +124,29 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-int xfs_rtalloc_query_range(struct xfs_trans *tp,
+int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
const struct xfs_rtalloc_rec *low_rec,
const struct xfs_rtalloc_rec *high_rec,
xfs_rtalloc_query_range_fn fn, void *priv);
-int xfs_rtalloc_query_all(struct xfs_trans *tp,
+int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
bool *is_free);
+int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
#else
# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
# define xfs_growfs_rt(mp,in) (ENOSYS)
# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
-# define xfs_rtalloc_query_all(t,f,p) (ENOSYS)
+# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS)
# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS)
# define xfs_verify_rtbno(m, r) (false)
# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS)
+# define xfs_rtalloc_reinit_frextents(m) (0)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 54be9d64093e..f029c6702dda 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -38,6 +38,9 @@
#include "xfs_pwork.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
+#include "xfs_attr_item.h"
+#include "xfs_xattr.h"
+#include "xfs_iunlink_item.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -348,8 +351,10 @@ xfs_setup_dax_always(
goto disable_dax;
}
- if (xfs_has_reflink(mp)) {
- xfs_alert(mp, "DAX and reflink cannot be used together!");
+ if (xfs_has_reflink(mp) &&
+ bdev_is_partition(mp->m_ddev_targp->bt_bdev)) {
+ xfs_alert(mp,
+ "DAX and reflink cannot work with multi-partitions!");
return -EINVAL;
}
@@ -648,7 +653,7 @@ xfs_fs_destroy_inode(
static void
xfs_fs_dirty_inode(
struct inode *inode,
- int flag)
+ int flags)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -656,7 +661,13 @@ xfs_fs_dirty_inode(
if (!(inode->i_sb->s_flags & SB_LAZYTIME))
return;
- if (flag != I_DIRTY_SYNC || !(inode->i_state & I_DIRTY_TIME))
+
+ /*
+ * Only do the timestamp update if the inode is dirty (I_DIRTY_SYNC)
+ * and has dirty timestamp (I_DIRTY_TIME). I_DIRTY_TIME can be passed
+ * in flags possibly together with I_DIRTY_SYNC.
+ */
+ if ((flags & ~I_DIRTY_TIME) != I_DIRTY_SYNC || !(flags & I_DIRTY_TIME))
return;
if (xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp))
@@ -795,8 +806,11 @@ xfs_fs_statfs(
xfs_extlen_t lsize;
int64_t ffree;
- /* Wait for whatever inactivations are in progress. */
- xfs_inodegc_flush(mp);
+ /*
+ * Expedite background inodegc but don't wait. We do not want to block
+	 * here waiting hours for a billion-extent file to be truncated.
+ */
+ xfs_inodegc_push(mp);
statp->f_type = XFS_SUPER_MAGIC;
statp->f_namelen = MAXNAMELEN - 1;
@@ -843,9 +857,11 @@ xfs_fs_statfs(
if (XFS_IS_REALTIME_MOUNT(mp) &&
(ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
+ s64 freertx;
+
statp->f_blocks = sbp->sb_rblocks;
- statp->f_bavail = statp->f_bfree =
- sbp->sb_frextents * sbp->sb_rextsize;
+ freertx = percpu_counter_sum_positive(&mp->m_frextents);
+ statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize;
}
return 0;
@@ -1015,8 +1031,14 @@ xfs_init_percpu_counters(
if (error)
goto free_fdblocks;
+ error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
+ if (error)
+ goto free_delalloc;
+
return 0;
+free_delalloc:
+ percpu_counter_destroy(&mp->m_delalloc_blks);
free_fdblocks:
percpu_counter_destroy(&mp->m_fdblocks);
free_ifree:
@@ -1033,6 +1055,7 @@ xfs_reinit_percpu_counters(
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
+ percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
}
static void
@@ -1045,6 +1068,7 @@ xfs_destroy_percpu_counters(
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
+ percpu_counter_destroy(&mp->m_frextents);
}
static int
@@ -1062,7 +1086,7 @@ xfs_inodegc_init_percpu(
gc = per_cpu_ptr(mp->m_inodegc, cpu);
init_llist_head(&gc->list);
gc->items = 0;
- INIT_WORK(&gc->work, xfs_inodegc_worker);
+ INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
}
return 0;
}
@@ -1608,14 +1632,10 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
- if (xfs_has_discard(mp)) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
-
- if (!blk_queue_discard(q)) {
- xfs_warn(mp, "mounting with \"discard\" option, but "
- "the device does not support discard");
- mp->m_features &= ~XFS_FEAT_DISCARD;
- }
+ if (xfs_has_discard(mp) && !bdev_max_discard_sectors(sb->s_bdev)) {
+ xfs_warn(mp,
+ "mounting with \"discard\" option, but the device does not support discard");
+ mp->m_features &= ~XFS_FEAT_DISCARD;
}
if (xfs_has_reflink(mp)) {
@@ -1639,6 +1659,10 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
+ if (xfs_has_large_extent_counts(mp))
+ xfs_warn(mp,
+ "EXPERIMENTAL Large extent counts feature in use. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1951,11 +1975,19 @@ xfs_init_caches(void)
{
int error;
+ xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT |
+ SLAB_MEM_SPREAD,
+ NULL);
+ if (!xfs_buf_cache)
+ goto out;
+
xfs_log_ticket_cache = kmem_cache_create("xfs_log_ticket",
sizeof(struct xlog_ticket),
0, 0, NULL);
if (!xfs_log_ticket_cache)
- goto out;
+ goto out_destroy_buf_cache;
error = xfs_btree_init_cur_caches();
if (error)
@@ -2069,8 +2101,32 @@ xfs_init_caches(void)
if (!xfs_bui_cache)
goto out_destroy_bud_cache;
+ xfs_attrd_cache = kmem_cache_create("xfs_attrd_item",
+ sizeof(struct xfs_attrd_log_item),
+ 0, 0, NULL);
+ if (!xfs_attrd_cache)
+ goto out_destroy_bui_cache;
+
+ xfs_attri_cache = kmem_cache_create("xfs_attri_item",
+ sizeof(struct xfs_attri_log_item),
+ 0, 0, NULL);
+ if (!xfs_attri_cache)
+ goto out_destroy_attrd_cache;
+
+ xfs_iunlink_cache = kmem_cache_create("xfs_iul_item",
+ sizeof(struct xfs_iunlink_item),
+ 0, 0, NULL);
+ if (!xfs_iunlink_cache)
+ goto out_destroy_attri_cache;
+
return 0;
+ out_destroy_attri_cache:
+ kmem_cache_destroy(xfs_attri_cache);
+ out_destroy_attrd_cache:
+ kmem_cache_destroy(xfs_attrd_cache);
+ out_destroy_bui_cache:
+ kmem_cache_destroy(xfs_bui_cache);
out_destroy_bud_cache:
kmem_cache_destroy(xfs_bud_cache);
out_destroy_cui_cache:
@@ -2105,6 +2161,8 @@ xfs_init_caches(void)
xfs_btree_destroy_cur_caches();
out_destroy_log_ticket_cache:
kmem_cache_destroy(xfs_log_ticket_cache);
+ out_destroy_buf_cache:
+ kmem_cache_destroy(xfs_buf_cache);
out:
return -ENOMEM;
}
@@ -2117,6 +2175,9 @@ xfs_destroy_caches(void)
* destroy caches.
*/
rcu_barrier();
+ kmem_cache_destroy(xfs_iunlink_cache);
+ kmem_cache_destroy(xfs_attri_cache);
+ kmem_cache_destroy(xfs_attrd_cache);
kmem_cache_destroy(xfs_bui_cache);
kmem_cache_destroy(xfs_bud_cache);
kmem_cache_destroy(xfs_cui_cache);
@@ -2135,6 +2196,7 @@ xfs_destroy_caches(void)
xfs_defer_destroy_item_caches();
xfs_btree_destroy_cur_caches();
kmem_cache_destroy(xfs_log_ticket_cache);
+ kmem_cache_destroy(xfs_buf_cache);
}
STATIC int __init
@@ -2180,6 +2242,7 @@ xfs_cpu_dead(
list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) {
spin_unlock(&xfs_mount_list_lock);
xfs_inodegc_cpu_dead(mp, cpu);
+ xlog_cil_pcp_dead(mp->m_log, cpu);
spin_lock(&xfs_mount_list_lock);
}
spin_unlock(&xfs_mount_list_lock);
@@ -2239,13 +2302,9 @@ init_xfs_fs(void)
if (error)
goto out_destroy_wq;
- error = xfs_buf_init();
- if (error)
- goto out_mru_cache_uninit;
-
error = xfs_init_procfs();
if (error)
- goto out_buf_terminate;
+ goto out_mru_cache_uninit;
error = xfs_sysctl_register();
if (error)
@@ -2302,8 +2361,6 @@ init_xfs_fs(void)
xfs_sysctl_unregister();
out_cleanup_procfs:
xfs_cleanup_procfs();
- out_buf_terminate:
- xfs_buf_terminate();
out_mru_cache_uninit:
xfs_mru_cache_uninit();
out_destroy_wq:
@@ -2329,7 +2386,6 @@ exit_xfs_fs(void)
kset_unregister(xfs_kset);
xfs_sysctl_unregister();
xfs_cleanup_procfs();
- xfs_buf_terminate();
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
xfs_destroy_caches();
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 167d23f92ffe..364e2c2648a8 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -91,8 +91,8 @@ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
xfs_agnumber_t agcount);
extern const struct export_operations xfs_export_operations;
-extern const struct xattr_handler *xfs_xattr_handlers[];
extern const struct quotactl_ops xfs_quotactl_operations;
+extern const struct dax_holder_operations xfs_dax_holder_operations;
extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index affbedf78160..8389f3ef88ef 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -226,11 +226,6 @@ xfs_symlink(
goto out_trans_cancel;
}
- error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto out_trans_cancel;
-
/*
* Allocate an inode for the symlink.
*/
@@ -261,7 +256,7 @@ xfs_symlink(
/*
* If the symlink will fit into the inode, write it inline.
*/
- if (pathlen <= XFS_IFORK_DSIZE(ip)) {
+ if (pathlen <= xfs_inode_data_fork_size(ip)) {
xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
ip->i_disk_size = pathlen;
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 7692e76ead33..f78ad6b10ea5 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -83,6 +83,7 @@ extern xfs_param_t xfs_params;
struct xfs_globals {
#ifdef DEBUG
int pwork_threads; /* parallel workqueue threads */
+ bool larp; /* log attribute replay */
#endif
int log_recovery_delay; /* log recovery delay (secs) */
int mount_delay; /* mount setup delay (secs) */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 574b80c29fe1..f7faf6e70d7f 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -228,6 +228,29 @@ pwork_threads_show(
return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads);
}
XFS_SYSFS_ATTR_RW(pwork_threads);
+
+static ssize_t
+larp_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &xfs_globals.larp);
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+STATIC ssize_t
+larp_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp);
+}
+XFS_SYSFS_ATTR_RW(larp);
#endif /* DEBUG */
static struct attribute *xfs_dbg_attrs[] = {
@@ -237,6 +260,7 @@ static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(always_cow),
#ifdef DEBUG
ATTR_LIST(pwork_threads),
+ ATTR_LIST(larp),
#endif
NULL,
};
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index b141ef78c755..f9057af6e0c8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -240,6 +240,7 @@ DEFINE_EVENT(xfs_fs_class, name, \
TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
TP_ARGS(mp, caller_ip))
DEFINE_FS_EVENT(xfs_inodegc_flush);
+DEFINE_FS_EVENT(xfs_inodegc_push);
DEFINE_FS_EVENT(xfs_inodegc_start);
DEFINE_FS_EVENT(xfs_inodegc_stop);
DEFINE_FS_EVENT(xfs_inodegc_queue);
@@ -418,6 +419,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__field(unsigned, lockval)
__field(unsigned, flags)
__field(unsigned long, caller_ip)
+ __field(const void *, buf_ops)
),
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
@@ -428,9 +430,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
__entry->caller_ip = caller_ip;
+ __entry->buf_ops = bp->b_ops;
),
TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
- "lock %d flags %s caller %pS",
+ "lock %d flags %s bufops %pS caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
__entry->nblks,
@@ -438,6 +441,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->pincount,
__entry->lockval,
__print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+ __entry->buf_ops,
(void *)__entry->caller_ip)
)
@@ -1096,22 +1100,6 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_done);
DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before);
DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after);
-#define XFS_QMOPT_FLAGS \
- { XFS_QMOPT_UQUOTA, "UQUOTA" }, \
- { XFS_QMOPT_PQUOTA, "PQUOTA" }, \
- { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \
- { XFS_QMOPT_SBVERSION, "SBVERSION" }, \
- { XFS_QMOPT_GQUOTA, "GQUOTA" }, \
- { XFS_QMOPT_INHERIT, "INHERIT" }, \
- { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \
- { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \
- { XFS_QMOPT_BCOUNT, "BCOUNT" }, \
- { XFS_QMOPT_ICOUNT, "ICOUNT" }, \
- { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \
- { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \
- { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \
- { XFS_QMOPT_RES_INOS, "RES_INOS" }
-
TRACE_EVENT(xfs_trans_mod_dquot,
TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp,
unsigned int field, int64_t delta),
@@ -1348,6 +1336,9 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin);
DECLARE_EVENT_CLASS(xfs_ail_class,
TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
@@ -1924,7 +1915,7 @@ DECLARE_EVENT_CLASS(xfs_da_class,
__field(int, namelen)
__field(xfs_dahash_t, hashval)
__field(xfs_ino_t, inumber)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
),
TP_fast_assign(
__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
@@ -1990,7 +1981,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__field(xfs_dahash_t, hashval)
__field(unsigned int, attr_filter)
__field(unsigned int, attr_flags)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
),
TP_fast_assign(
__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
@@ -2097,7 +2088,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_space_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
__field(int, idx)
),
TP_fast_assign(
@@ -2128,7 +2119,7 @@ TRACE_EVENT(xfs_dir2_leafn_moveents,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
__field(int, src_idx)
__field(int, dst_idx)
__field(int, count)
@@ -2169,7 +2160,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__field(int, which)
__field(xfs_ino_t, ino)
__field(int, format)
- __field(int, nex)
+ __field(xfs_extnum_t, nex)
__field(int, broot_size)
__field(int, fork_off)
),
@@ -2180,9 +2171,9 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__entry->format = ip->i_df.if_format;
__entry->nex = ip->i_df.if_nextents;
__entry->broot_size = ip->i_df.if_broot_bytes;
- __entry->fork_off = XFS_IFORK_BOFF(ip);
+ __entry->fork_off = xfs_inode_fork_boff(ip);
),
- TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
+ TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, "
"broot size %d, forkoff 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
@@ -3418,7 +3409,8 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
@@ -3513,7 +3505,7 @@ DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key);
DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key);
DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping);
-TRACE_EVENT(xfs_trans_resv_calc,
+DECLARE_EVENT_CLASS(xfs_trans_resv_class,
TP_PROTO(struct xfs_mount *mp, unsigned int type,
struct xfs_trans_res *res),
TP_ARGS(mp, type, res),
@@ -3537,6 +3529,33 @@ TRACE_EVENT(xfs_trans_resv_calc,
__entry->logres,
__entry->logcount,
__entry->logflags)
+)
+
+#define DEFINE_TRANS_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_trans_resv_class, name, \
+ TP_PROTO(struct xfs_mount *mp, unsigned int type, \
+ struct xfs_trans_res *res), \
+ TP_ARGS(mp, type, res))
+DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc);
+DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc_minlogsize);
+
+TRACE_EVENT(xfs_log_get_max_trans_res,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_trans_res *res),
+ TP_ARGS(mp, res),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint, logres)
+ __field(int, logcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->logres = res->tr_logres;
+ __entry->logcount = res->tr_logcount;
+ ),
+ TP_printk("dev %d:%d logres %u logcount %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->logres,
+ __entry->logcount)
);
DECLARE_EVENT_CLASS(xfs_trans_class,
@@ -3653,7 +3672,6 @@ DEFINE_EVENT(xfs_ag_inode_class, name, \
TP_ARGS(ip))
DEFINE_AGINODE_EVENT(xfs_iunlink);
DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
-DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
TP_PROTO(struct xfs_mount *mp, unsigned int flags),
@@ -4111,6 +4129,27 @@ DEFINE_ICLOG_EVENT(xlog_iclog_want_sync);
DEFINE_ICLOG_EVENT(xlog_iclog_wait_on);
DEFINE_ICLOG_EVENT(xlog_iclog_write);
+TRACE_DEFINE_ENUM(XFS_DAS_UNINIT);
+TRACE_DEFINE_ENUM(XFS_DAS_SF_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_SF_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_OLD);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_ATTR);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_OLD);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_ATTR);
+TRACE_DEFINE_ENUM(XFS_DAS_DONE);
+
DECLARE_EVENT_CLASS(xfs_das_state_class,
TP_PROTO(int das, struct xfs_inode *ip),
TP_ARGS(das, ip),
@@ -4122,8 +4161,9 @@ DECLARE_EVENT_CLASS(xfs_das_state_class,
__entry->das = das;
__entry->ino = ip->i_ino;
),
- TP_printk("state change %d ino 0x%llx",
- __entry->das, __entry->ino)
+ TP_printk("state change %s ino 0x%llx",
+ __print_symbolic(__entry->das, XFS_DAS_STRINGS),
+ __entry->ino)
)
#define DEFINE_DAS_STATE_EVENT(name) \
@@ -4132,9 +4172,15 @@ DEFINE_EVENT(xfs_das_state_class, name, \
TP_ARGS(das, ip))
DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove);
+
TRACE_EVENT(xfs_force_shutdown,
TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname,
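The TRACE_DEFINE_ENUM block added above exports the XFS_DAS_* values to the tracing core so that __print_symbolic() in the das state events can resolve them at format time. The XFS_DAS_STRINGS table it consumes is presumably defined alongside the enum in xfs_da_btree.h; a sketch of its shape, abbreviated:

	/* Sketch of the mapping table used by __print_symbolic() above. */
	#define XFS_DAS_STRINGS	\
		{ XFS_DAS_UNINIT,	"XFS_DAS_UNINIT" }, \
		{ XFS_DAS_SF_ADD,	"XFS_DAS_SF_ADD" }, \
		{ XFS_DAS_SF_REMOVE,	"XFS_DAS_SF_REMOVE" }, \
		{ XFS_DAS_LEAF_ADD,	"XFS_DAS_LEAF_ADD" }, \
		/* ... one entry per XFS_DAS_* state ... */ \
		{ XFS_DAS_DONE,		"XFS_DAS_DONE" }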
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 0ac717aad380..7bd16fbff534 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -32,7 +32,6 @@ static void
xfs_trans_trace_reservations(
struct xfs_mount *mp)
{
- struct xfs_trans_res resv;
struct xfs_trans_res *res;
struct xfs_trans_res *end_res;
int i;
@@ -41,8 +40,6 @@ xfs_trans_trace_reservations(
end_res = (struct xfs_trans_res *)(M_RES(mp) + 1);
for (i = 0; res < end_res; i++, res++)
trace_xfs_trans_resv_calc(mp, i, res);
- xfs_log_get_max_trans_res(mp, &resv);
- trace_xfs_trans_resv_calc(mp, -1, &resv);
}
#else
# define xfs_trans_trace_reservations(mp)
@@ -194,11 +191,9 @@ xfs_trans_reserve(
ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
error = xfs_log_regrant(mp, tp->t_ticket);
} else {
- error = xfs_log_reserve(mp,
- resp->tr_logres,
+ error = xfs_log_reserve(mp, resp->tr_logres,
resp->tr_logcount,
- &tp->t_ticket, XFS_TRANSACTION,
- permanent);
+ &tp->t_ticket, permanent);
}
if (error)
@@ -498,10 +493,31 @@ xfs_trans_apply_sb_deltas(
be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
}
- if (tp->t_frextents_delta)
- be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta);
- if (tp->t_res_frextents_delta)
- be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta);
+ /*
+ * Updating frextents requires careful handling because it does not
+ * behave like the lazysb counters: we cannot rely on log recovery in
+ * older kernels to recompute the value from the rtbitmap.
+ * This means that the ondisk frextents must be consistent with the
+ * rtbitmap.
+ *
+ * Therefore, log the frextents change to the ondisk superblock and
+ * update the incore superblock so that future calls to xfs_log_sb
+ * write the correct value ondisk.
+ *
+ * Don't touch m_frextents because it includes incore reservations,
+ * and those are handled by the unreserve function.
+ */
+ if (tp->t_frextents_delta || tp->t_res_frextents_delta) {
+ struct xfs_mount *mp = tp->t_mountp;
+ int64_t rtxdelta;
+
+ rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
+
+ spin_lock(&mp->m_sb_lock);
+ be64_add_cpu(&sbp->sb_frextents, rtxdelta);
+ mp->m_sb.sb_frextents += rtxdelta;
+ spin_unlock(&mp->m_sb_lock);
+ }
if (tp->t_dblocks_delta) {
be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
@@ -614,7 +630,12 @@ xfs_trans_unreserve_and_mod_sb(
if (ifreedelta)
percpu_counter_add(&mp->m_ifree, ifreedelta);
- if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
+ if (rtxdelta) {
+ error = xfs_mod_frextents(mp, rtxdelta);
+ ASSERT(!error);
+ }
+
+ if (!(tp->t_flags & XFS_TRANS_SB_DIRTY))
return;
/* apply remaining deltas */
@@ -622,7 +643,12 @@ xfs_trans_unreserve_and_mod_sb(
mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta;
mp->m_sb.sb_icount += idelta;
mp->m_sb.sb_ifree += ifreedelta;
- mp->m_sb.sb_frextents += rtxdelta;
+ /*
+ * Do not touch sb_frextents here because we are dealing with incore
+ * reservation. sb_frextents is not part of the lazy sb counters so it
+ * must be consistent with the ondisk rtbitmap and must never include
+ * incore reservations.
+ */
mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
mp->m_sb.sb_agcount += tp->t_agcount_delta;
mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;
@@ -734,7 +760,7 @@ xfs_log_item_batch_insert(
void
xfs_trans_committed_bulk(
struct xfs_ail *ailp,
- struct xfs_log_vec *log_vector,
+ struct list_head *lv_chain,
xfs_lsn_t commit_lsn,
bool aborted)
{
@@ -749,7 +775,7 @@ xfs_trans_committed_bulk(
spin_unlock(&ailp->ail_lock);
/* unpin all the log items */
- for (lv = log_vector; lv; lv = lv->lv_next ) {
+ list_for_each_entry(lv, lv_chain, lv_list) {
struct xfs_log_item *lip = lv->lv_item;
xfs_lsn_t item_lsn;
@@ -819,6 +845,90 @@ xfs_trans_committed_bulk(
}
/*
+ * Sort transaction items prior to running precommit operations. This will
+ * attempt to order the items such that they will always be locked in the same
+ * order. Items that have no sort function are moved to the end of the list
+ * and so are locked last.
+ *
+ * This may need refinement as different types of objects add sort functions.
+ *
+ * The function is more complex than it needs to be because we are comparing
+ * 64 bit values while the list_sort comparison callback can only return
+ * 32 bit values.
+ */
+static int
+xfs_trans_precommit_sort(
+ void *unused_arg,
+ const struct list_head *a,
+ const struct list_head *b)
+{
+ struct xfs_log_item *lia = container_of(a,
+ struct xfs_log_item, li_trans);
+ struct xfs_log_item *lib = container_of(b,
+ struct xfs_log_item, li_trans);
+ int64_t diff;
+
+ /*
+ * If both items are non-sortable, leave them alone. If only one is
+ * sortable, move the non-sortable item towards the end of the list.
+ */
+ if (!lia->li_ops->iop_sort && !lib->li_ops->iop_sort)
+ return 0;
+ if (!lia->li_ops->iop_sort)
+ return 1;
+ if (!lib->li_ops->iop_sort)
+ return -1;
+
+ diff = lia->li_ops->iop_sort(lia) - lib->li_ops->iop_sort(lib);
+ if (diff < 0)
+ return -1;
+ if (diff > 0)
+ return 1;
+ return 0;
+}
+
+/*
+ * Run transaction precommit functions.
+ *
+ * If there is an error in any of the callouts, then stop immediately and
+ * trigger a shutdown to abort the transaction. There is no recovery possible
+ * from errors at this point as the transaction is dirty....
+ */
+static int
+xfs_trans_run_precommits(
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_log_item *lip, *n;
+ int error = 0;
+
+ /*
+ * Sort the item list to avoid ABBA deadlocks with other transactions
+ * running precommit operations that lock multiple shared items such as
+ * inode cluster buffers.
+ */
+ list_sort(NULL, &tp->t_items, xfs_trans_precommit_sort);
+
+ /*
+ * Precommit operations can remove the log item from the transaction
+ * if the log item exists purely to delay modifications until they
+ * can be ordered against other operations. Hence we have to use
+ * list_for_each_entry_safe() here.
+ */
+ list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
+ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
+ continue;
+ if (lip->li_ops->iop_precommit) {
+ error = lip->li_ops->iop_precommit(tp, lip);
+ if (error)
+ break;
+ }
+ }
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+}
+
+/*
* Commit the given transaction to the log.
*
* XFS disk error handling mechanism is not based on a typical
@@ -843,6 +953,13 @@ __xfs_trans_commit(
trace_xfs_trans_commit(tp, _RET_IP_);
+ error = xfs_trans_run_precommits(tp);
+ if (error) {
+ if (tp->t_flags & XFS_TRANS_PERM_LOG_RES)
+ xfs_defer_cancel(tp);
+ goto out_unreserve;
+ }
+
/*
* Finish deferred items on final commit. Only permanent transactions
* should ever have deferred ops.
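As an aside on the comparator caveat noted in xfs_trans_precommit_sort() above: truncating a 64-bit difference to the 32-bit int that list_sort() expects can silently report the wrong order, which is why the code clamps the result to -1/0/1. A standalone illustration:

	#include <stdint.h>
	#include <stdio.h>

	/* Buggy: the 64-bit difference is truncated to int. */
	static int bad_cmp(uint64_t a, uint64_t b)
	{
		return (int)(a - b);
	}

	/* Correct: compute the difference, then clamp to -1/0/1. */
	static int good_cmp(uint64_t a, uint64_t b)
	{
		int64_t diff = (int64_t)(a - b);

		if (diff < 0)
			return -1;
		if (diff > 0)
			return 1;
		return 0;
	}

	int main(void)
	{
		uint64_t a = 1ULL << 32, b = 0;	/* differ only above bit 31 */

		/* bad_cmp wrongly reports "equal" (0); good_cmp reports 1. */
		printf("bad=%d good=%d\n", bad_cmp(a, b), good_cmp(a, b));
		return 0;
	}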
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index de177842b951..55819785941c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -45,6 +45,7 @@ struct xfs_log_item {
struct xfs_log_vec *li_lv; /* active log vector */
struct xfs_log_vec *li_lv_shadow; /* standby vector */
xfs_csn_t li_seq; /* CIL commit seq */
+ uint32_t li_order_id; /* CIL commit order */
};
/*
@@ -55,13 +56,15 @@ struct xfs_log_item {
#define XFS_LI_IN_AIL 0
#define XFS_LI_ABORTED 1
#define XFS_LI_FAILED 2
-#define XFS_LI_DIRTY 3 /* log item dirty in transaction */
+#define XFS_LI_DIRTY 3
+#define XFS_LI_WHITEOUT 4
#define XFS_LI_FLAGS \
- { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \
- { (1 << XFS_LI_ABORTED), "ABORTED" }, \
- { (1 << XFS_LI_FAILED), "FAILED" }, \
- { (1 << XFS_LI_DIRTY), "DIRTY" }
+ { (1u << XFS_LI_IN_AIL), "IN_AIL" }, \
+ { (1u << XFS_LI_ABORTED), "ABORTED" }, \
+ { (1u << XFS_LI_FAILED), "FAILED" }, \
+ { (1u << XFS_LI_DIRTY), "DIRTY" }, \
+ { (1u << XFS_LI_WHITEOUT), "WHITEOUT" }
struct xfs_item_ops {
unsigned flags;
@@ -69,39 +72,43 @@ struct xfs_item_ops {
void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *);
void (*iop_pin)(struct xfs_log_item *);
void (*iop_unpin)(struct xfs_log_item *, int remove);
- uint (*iop_push)(struct xfs_log_item *, struct list_head *);
+ uint64_t (*iop_sort)(struct xfs_log_item *lip);
+ int (*iop_precommit)(struct xfs_trans *tp, struct xfs_log_item *lip);
void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq);
- void (*iop_release)(struct xfs_log_item *);
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
+ uint (*iop_push)(struct xfs_log_item *, struct list_head *);
+ void (*iop_release)(struct xfs_log_item *);
int (*iop_recover)(struct xfs_log_item *lip,
struct list_head *capture_list);
bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
struct xfs_trans *tp);
+ struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done);
};
-/* Is this log item a deferred action intent? */
+/*
+ * Log item ops flags
+ */
+/*
+ * Release the log item when the journal commits instead of inserting into the
+ * AIL for writeback tracking and/or log tail pinning.
+ */
+#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0)
+#define XFS_ITEM_INTENT (1 << 1)
+#define XFS_ITEM_INTENT_DONE (1 << 2)
+
static inline bool
xlog_item_is_intent(struct xfs_log_item *lip)
{
- return lip->li_ops->iop_recover != NULL &&
- lip->li_ops->iop_match != NULL;
+ return lip->li_ops->flags & XFS_ITEM_INTENT;
}
-/* Is this a log intent-done item? */
static inline bool
xlog_item_is_intent_done(struct xfs_log_item *lip)
{
- return lip->li_ops->iop_unpin == NULL &&
- lip->li_ops->iop_push == NULL;
+ return lip->li_ops->flags & XFS_ITEM_INTENT_DONE;
}
-/*
- * Release the log item as soon as committed. This is for items just logging
- * intents that never need to be written back in place.
- */
-#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0)
-
void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
int type, const struct xfs_item_ops *ops);
@@ -175,7 +182,7 @@ xfs_trans_get_buf(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
int numblks,
- uint flags,
+ xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
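With the open-coded callback checks above replaced by ops flags, an intent/done item pair now advertises itself declaratively. A hypothetical sketch; all xfs_foo_* names are invented for illustration, the real users being the EFI/EFD, BUI/BUD, CUI/CUD and RUI/RUD item types:

	/* Intent item: recovered and matched by log recovery. */
	static const struct xfs_item_ops xfs_foo_intent_ops = {
		.flags		= XFS_ITEM_INTENT,
		.iop_size	= xfs_foo_intent_size,
		.iop_format	= xfs_foo_intent_format,
		.iop_unpin	= xfs_foo_intent_unpin,
		.iop_release	= xfs_foo_intent_release,
		.iop_recover	= xfs_foo_intent_recover,
		.iop_match	= xfs_foo_intent_match,
	};

	/* Done item: freed at commit, linked to its intent via iop_intent. */
	static const struct xfs_item_ops xfs_foo_done_ops = {
		.flags		= XFS_ITEM_RELEASE_WHEN_COMMITTED |
				  XFS_ITEM_INTENT_DONE,
		.iop_size	= xfs_foo_done_size,
		.iop_format	= xfs_foo_done_format,
		.iop_release	= xfs_foo_done_release,
		.iop_intent	= xfs_foo_done_intent,
	};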
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 9ba7e6b9bed3..aa00cf67ad72 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -597,13 +597,11 @@ xfs_dqresv_check(
if (softlimit && total_count > softlimit) {
time64_t now = ktime_get_real_seconds();
- if ((res->timer != 0 && now > res->timer) ||
- (res->warnings != 0 && res->warnings >= qlim->warn)) {
+ if (res->timer != 0 && now > res->timer) {
*fatal = true;
return QUOTA_NL_ISOFTLONGWARN;
}
- res->warnings++;
return QUOTA_NL_ISOFTWARN;
}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index f0d79a9050ba..d5400150358e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -19,7 +19,8 @@ void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *);
void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
-void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
+void xfs_trans_committed_bulk(struct xfs_ail *ailp,
+ struct list_head *lv_chain,
xfs_lsn_t commit_lsn, bool aborted);
/*
* AIL traversal cursor.
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 0d050f8829ef..c325a28b89a8 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -12,12 +12,104 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_acl.h"
-#include "xfs_da_btree.h"
+#include "xfs_log.h"
+#include "xfs_xattr.h"
#include <linux/posix_acl_xattr.h>
+/*
+ * Get permission to use log-assisted atomic updates of extended attributes.
+ *
+ * Callers must not be running any transactions or hold any inode locks, and
+ * they must release the permission by calling xlog_drop_incompat_feat
+ * when they're done.
+ */
+static inline int
+xfs_attr_grab_log_assist(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+
+ /*
+ * Protect ourselves from an idle log clearing the logged xattrs log
+ * incompat feature bit.
+ */
+ xlog_use_incompat_feat(mp->m_log);
+
+ /*
+ * If log-assisted xattrs are already enabled, the caller can use the
+ * log assisted swap functions with the log-incompat reference we got.
+ */
+ if (xfs_sb_version_haslogxattrs(&mp->m_sb))
+ return 0;
+
+ /* Enable log-assisted xattrs. */
+ error = xfs_add_incompat_log_feature(mp,
+ XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
+ if (error)
+ goto drop_incompat;
+
+ xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP,
+ "EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!");
+
+ return 0;
+drop_incompat:
+ xlog_drop_incompat_feat(mp->m_log);
+ return error;
+}
+
+static inline void
+xfs_attr_rele_log_assist(
+ struct xfs_mount *mp)
+{
+ xlog_drop_incompat_feat(mp->m_log);
+}
+
+static inline bool
+xfs_attr_want_log_assist(
+ struct xfs_mount *mp)
+{
+#ifdef DEBUG
+ /* Logged xattrs require a V5 super for log_incompat */
+ return xfs_has_crc(mp) && xfs_globals.larp;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Set or remove an xattr, having grabbed the appropriate logging resources
+ * prior to calling libxfs.
+ */
+int
+xfs_attr_change(
+ struct xfs_da_args *args)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+ bool use_logging = false;
+ int error;
+
+ ASSERT(!(args->op_flags & XFS_DA_OP_LOGGED));
+
+ if (xfs_attr_want_log_assist(mp)) {
+ error = xfs_attr_grab_log_assist(mp);
+ if (error)
+ return error;
+
+ args->op_flags |= XFS_DA_OP_LOGGED;
+ use_logging = true;
+ }
+
+ error = xfs_attr_set(args);
+
+ if (use_logging)
+ xfs_attr_rele_log_assist(mp);
+ return error;
+}
+
static int
xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
@@ -56,7 +148,7 @@ xfs_xattr_set(const struct xattr_handler *handler,
};
int error;
- error = xfs_attr_set(&args);
+ error = xfs_attr_change(&args);
if (!error && (handler->flags & XFS_ATTR_ROOT))
xfs_forget_acl(inode, name);
return error;
diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h
new file mode 100644
index 000000000000..2b09133b1b9b
--- /dev/null
+++ b/fs/xfs/xfs_xattr.h
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_XATTR_H__
+#define __XFS_XATTR_H__
+
+int xfs_attr_change(struct xfs_da_args *args);
+
+extern const struct xattr_handler *xfs_xattr_handlers[];
+
+#endif /* __XFS_XATTR_H__ */
diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile
index 33c1a4f1132e..9fe54f5319f2 100644
--- a/fs/zonefs/Makefile
+++ b/fs/zonefs/Makefile
@@ -3,4 +3,4 @@ ccflags-y += -I$(src)
obj-$(CONFIG_ZONEFS_FS) += zonefs.o
-zonefs-y := super.o
+zonefs-y := super.o sysfs.o
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 3614c7834007..860f0b1032c6 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -27,14 +27,57 @@
#define CREATE_TRACE_POINTS
#include "trace.h"
-static inline int zonefs_zone_mgmt(struct inode *inode,
- enum req_opf op)
+/*
+ * Manage the active zone count. Called with zi->i_truncate_mutex held.
+ */
+static void zonefs_account_active(struct inode *inode)
+{
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+
+ lockdep_assert_held(&zi->i_truncate_mutex);
+
+ if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
+ return;
+
+ /*
+ * If the zone is active, that is, if it is explicitly open or
+ * partially written, check if it was already accounted as active.
+ */
+ if ((zi->i_flags & ZONEFS_ZONE_OPEN) ||
+ (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) {
+ if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) {
+ zi->i_flags |= ZONEFS_ZONE_ACTIVE;
+ atomic_inc(&sbi->s_active_seq_files);
+ }
+ return;
+ }
+
+ /* The zone is not active. If it was, update the active count */
+ if (zi->i_flags & ZONEFS_ZONE_ACTIVE) {
+ zi->i_flags &= ~ZONEFS_ZONE_ACTIVE;
+ atomic_dec(&sbi->s_active_seq_files);
+ }
+}
+
+static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
int ret;
lockdep_assert_held(&zi->i_truncate_mutex);
+ /*
+ * With ZNS drives, closing an explicitly open zone that has not been
+ * written will change the zone state to "closed", that is, the zone
+ * will remain active. Since this can then cause explicit open
+ * operations on other zones to fail if the drive active zone resources
+ * are exceeded, make sure that the zone does not remain active by
+ * resetting it.
+ */
+ if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset)
+ op = REQ_OP_ZONE_RESET;
+
trace_zonefs_zone_mgmt(inode, op);
ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
@@ -57,19 +100,60 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
* A full zone is no longer open/active and does not need
* explicit closing.
*/
- if (isize >= zi->i_max_size)
- zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+ if (isize >= zi->i_max_size) {
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+
+ if (zi->i_flags & ZONEFS_ZONE_ACTIVE)
+ atomic_dec(&sbi->s_active_seq_files);
+ zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE);
+ }
+}
+
+static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ loff_t isize;
+
+ /*
+ * All blocks are always mapped below EOF. If reading past EOF,
+ * act as if there is a hole up to the file maximum size.
+ */
+ mutex_lock(&zi->i_truncate_mutex);
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
+ isize = i_size_read(inode);
+ if (iomap->offset >= isize) {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->length = length;
+ } else {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
+ iomap->length = isize - iomap->offset;
+ }
+ mutex_unlock(&zi->i_truncate_mutex);
+
+ trace_zonefs_iomap_begin(inode, iomap);
+
+ return 0;
}
-static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
- unsigned int flags, struct iomap *iomap,
- struct iomap *srcmap)
+static const struct iomap_ops zonefs_read_iomap_ops = {
+ .iomap_begin = zonefs_read_iomap_begin,
+};
+
+static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
struct super_block *sb = inode->i_sb;
loff_t isize;
- /* All I/Os should always be within the file maximum size */
+ /* All write I/Os should always be within the file maximum size */
if (WARN_ON_ONCE(offset + length > zi->i_max_size))
return -EIO;
@@ -79,7 +163,7 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
* operation.
*/
if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
- (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)))
+ !(flags & IOMAP_DIRECT)))
return -EIO;
/*
@@ -88,47 +172,44 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
* write pointer) and unwritten beyond.
*/
mutex_lock(&zi->i_truncate_mutex);
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
+ iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
isize = i_size_read(inode);
- if (offset >= isize)
+ if (iomap->offset >= isize) {
iomap->type = IOMAP_UNWRITTEN;
- else
+ iomap->length = zi->i_max_size - iomap->offset;
+ } else {
iomap->type = IOMAP_MAPPED;
- if (flags & IOMAP_WRITE)
- length = zi->i_max_size - offset;
- else
- length = min(length, isize - offset);
+ iomap->length = isize - iomap->offset;
+ }
mutex_unlock(&zi->i_truncate_mutex);
- iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
- iomap->length = ALIGN(offset + length, sb->s_blocksize) - iomap->offset;
- iomap->bdev = inode->i_sb->s_bdev;
- iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
-
trace_zonefs_iomap_begin(inode, iomap);
return 0;
}
-static const struct iomap_ops zonefs_iomap_ops = {
- .iomap_begin = zonefs_iomap_begin,
+static const struct iomap_ops zonefs_write_iomap_ops = {
+ .iomap_begin = zonefs_write_iomap_begin,
};
-static int zonefs_readpage(struct file *unused, struct page *page)
+static int zonefs_read_folio(struct file *unused, struct folio *folio)
{
- return iomap_readpage(page, &zonefs_iomap_ops);
+ return iomap_read_folio(folio, &zonefs_read_iomap_ops);
}
static void zonefs_readahead(struct readahead_control *rac)
{
- iomap_readahead(rac, &zonefs_iomap_ops);
+ iomap_readahead(rac, &zonefs_read_iomap_ops);
}
/*
* Map blocks for page writeback. This is used only on conventional zone files,
* which implies that the page range can only be within the fixed inode size.
*/
-static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset)
+static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct inode *inode, loff_t offset)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
@@ -142,21 +223,14 @@ static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc,
offset < wpc->iomap.offset + wpc->iomap.length)
return 0;
- return zonefs_iomap_begin(inode, offset, zi->i_max_size - offset,
- IOMAP_WRITE, &wpc->iomap, NULL);
+ return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset,
+ IOMAP_WRITE, &wpc->iomap, NULL);
}
static const struct iomap_writeback_ops zonefs_writeback_ops = {
- .map_blocks = zonefs_map_blocks,
+ .map_blocks = zonefs_write_map_blocks,
};
-static int zonefs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct iomap_writepage_ctx wpc = { };
-
- return iomap_writepage(page, wbc, &wpc, &zonefs_writeback_ops);
-}
-
static int zonefs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -177,18 +251,18 @@ static int zonefs_swap_activate(struct swap_info_struct *sis,
return -EINVAL;
}
- return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops);
+ return iomap_swapfile_activate(sis, swap_file, span,
+ &zonefs_read_iomap_ops);
}
static const struct address_space_operations zonefs_file_aops = {
- .readpage = zonefs_readpage,
+ .read_folio = zonefs_read_folio,
.readahead = zonefs_readahead,
- .writepage = zonefs_writepage,
.writepages = zonefs_writepages,
.dirty_folio = filemap_dirty_folio,
- .releasepage = iomap_releasepage,
+ .release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
- .migratepage = iomap_migrate_page,
+ .migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
.direct_IO = noop_direct_IO,
@@ -386,6 +460,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
zonefs_update_stats(inode, data_size);
zonefs_i_size_write(inode, data_size);
zi->i_wpoffset = data_size;
+ zonefs_account_active(inode);
return 0;
}
@@ -441,7 +516,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
loff_t old_isize;
- enum req_opf op;
+ enum req_op op;
int ret = 0;
/*
@@ -497,6 +572,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize)
zonefs_update_stats(inode, isize);
truncate_setsize(inode, isize);
zi->i_wpoffset = isize;
+ zonefs_account_active(inode);
unlock:
mutex_unlock(&zi->i_truncate_mutex);
@@ -531,7 +607,7 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
!uid_eq(iattr->ia_uid, inode->i_uid)) ||
((iattr->ia_valid & ATTR_GID) &&
!gid_eq(iattr->ia_gid, inode->i_gid))) {
- ret = dquot_transfer(inode, iattr);
+ ret = dquot_transfer(mnt_userns, inode, iattr);
if (ret)
return ret;
}
@@ -596,7 +672,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
/* Serialize against truncates */
filemap_invalidate_lock_shared(inode->i_mapping);
- ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);
@@ -678,13 +754,12 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
struct zonefs_inode_info *zi = ZONEFS_I(inode);
struct block_device *bdev = inode->i_sb->s_bdev;
- unsigned int max;
+ unsigned int max = bdev_max_zone_append_sectors(bdev);
struct bio *bio;
ssize_t size;
int nr_pages;
ssize_t ret;
- max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
iov_iter_truncate(from, max);
@@ -696,7 +771,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
bio->bi_iter.bi_sector = zi->i_zsector;
bio->bi_ioprio = iocb->ki_ioprio;
- if (iocb->ki_flags & IOCB_DSYNC)
+ if (iocb_is_dsync(iocb))
bio->bi_opf |= REQ_FUA;
ret = bio_iov_iter_get_pages(bio, from);
@@ -849,14 +924,21 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
if (append)
ret = zonefs_file_dio_append(iocb, from);
else
- ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
- &zonefs_write_dio_ops, 0, 0);
+ ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
+ &zonefs_write_dio_ops, 0, NULL, 0);
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
if (ret > 0)
count = ret;
+
+ /*
+ * Update the zone write pointer offset assuming the write
+ * operation succeeded. If it did not, the error recovery path
+ * will correct it. Also do active seq file accounting.
+ */
mutex_lock(&zi->i_truncate_mutex);
zi->i_wpoffset += count;
+ zonefs_account_active(inode);
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -891,7 +973,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
if (ret <= 0)
goto inode_unlock;
- ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops);
+ ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
if (ret > 0)
iocb->ki_pos += ret;
else if (ret == -EIO)
@@ -984,8 +1066,8 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
goto inode_unlock;
}
file_accessed(iocb->ki_filp);
- ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
- &zonefs_read_dio_ops, 0, 0);
+ ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
+ &zonefs_read_dio_ops, 0, NULL, 0);
} else {
ret = generic_file_read_iter(iocb, to);
if (ret == -EIO)
@@ -998,13 +1080,13 @@ inode_unlock:
return ret;
}
-static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file)
+/*
+ * Write open accounting is done only for sequential files.
+ */
+static inline bool zonefs_seq_file_need_wro(struct inode *inode,
+ struct file *file)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
-
- if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN))
- return false;
if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
return false;
@@ -1015,28 +1097,35 @@ static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *fi
return true;
}
-static int zonefs_open_zone(struct inode *inode)
+static int zonefs_seq_file_write_open(struct inode *inode)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
int ret = 0;
mutex_lock(&zi->i_truncate_mutex);
if (!zi->i_wr_refcnt) {
- if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
- atomic_dec(&sbi->s_open_zones);
- ret = -EBUSY;
- goto unlock;
- }
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+ unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
- if (i_size_read(inode) < zi->i_max_size) {
- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
- if (ret) {
- atomic_dec(&sbi->s_open_zones);
+ if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
+
+ if (sbi->s_max_wro_seq_files &&
+     wro > sbi->s_max_wro_seq_files) {
+ atomic_dec(&sbi->s_wro_seq_files);
+ ret = -EBUSY;
goto unlock;
}
- zi->i_flags |= ZONEFS_ZONE_OPEN;
+
+ if (i_size_read(inode) < zi->i_max_size) {
+ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
+ if (ret) {
+ atomic_dec(&sbi->s_wro_seq_files);
+ goto unlock;
+ }
+ zi->i_flags |= ZONEFS_ZONE_OPEN;
+ zonefs_account_active(inode);
+ }
}
}
@@ -1056,30 +1145,31 @@ static int zonefs_file_open(struct inode *inode, struct file *file)
if (ret)
return ret;
- if (zonefs_file_use_exp_open(inode, file))
- return zonefs_open_zone(inode);
+ if (zonefs_seq_file_need_wro(inode, file))
+ return zonefs_seq_file_write_open(inode);
return 0;
}
-static void zonefs_close_zone(struct inode *inode)
+static void zonefs_seq_file_write_close(struct inode *inode)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
int ret = 0;
mutex_lock(&zi->i_truncate_mutex);
- zi->i_wr_refcnt--;
- if (!zi->i_wr_refcnt) {
- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
- struct super_block *sb = inode->i_sb;
- /*
- * If the file zone is full, it is not open anymore and we only
- * need to decrement the open count.
- */
- if (!(zi->i_flags & ZONEFS_ZONE_OPEN))
- goto dec;
+ zi->i_wr_refcnt--;
+ if (zi->i_wr_refcnt)
+ goto unlock;
+ /*
+ * The file zone may not be open anymore (e.g. the file was truncated to
+ * its maximum size or it was fully written). For this case, we only
+ * need to decrement the write open count.
+ */
+ if (zi->i_flags & ZONEFS_ZONE_OPEN) {
ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
if (ret) {
__zonefs_io_error(inode, false);
@@ -1091,14 +1181,23 @@ static void zonefs_close_zone(struct inode *inode)
*/
if (zi->i_flags & ZONEFS_ZONE_OPEN &&
!(sb->s_flags & SB_RDONLY)) {
- zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n");
+ zonefs_warn(sb,
+ "closing zone at %llu failed %d\n",
+ zi->i_zsector, ret);
+ zonefs_warn(sb,
+ "remounting filesystem read-only\n");
sb->s_flags |= SB_RDONLY;
}
+ goto unlock;
}
+
zi->i_flags &= ~ZONEFS_ZONE_OPEN;
-dec:
- atomic_dec(&sbi->s_open_zones);
+ zonefs_account_active(inode);
}
+
+ atomic_dec(&sbi->s_wro_seq_files);
+
+unlock:
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -1110,8 +1209,8 @@ static int zonefs_file_release(struct inode *inode, struct file *file)
* the zone has gone offline or read-only). Make sure we don't fail the
* close(2) for user-space.
*/
- if (zonefs_file_use_exp_open(inode, file))
- zonefs_close_zone(inode);
+ if (zonefs_seq_file_need_wro(inode, file))
+ zonefs_seq_file_write_close(inode);
return 0;
}
@@ -1142,6 +1241,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb)
inode_init_once(&zi->i_vnode);
mutex_init(&zi->i_truncate_mutex);
zi->i_wr_refcnt = 0;
+ zi->i_flags = 0;
return &zi->i_vnode;
}
@@ -1285,7 +1385,7 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
{
struct super_block *sb = parent->i_sb;
- inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1;
+ inode->i_ino = bdev_nr_zones(sb->s_bdev) + type + 1;
inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555);
inode->i_op = &zonefs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
@@ -1293,12 +1393,13 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
inc_nlink(parent);
}
-static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
- enum zonefs_ztype type)
+static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
+ enum zonefs_ztype type)
{
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ int ret = 0;
inode->i_ino = zone->start >> sbi->s_zone_sectors_shift;
inode->i_mode = S_IFREG | sbi->s_perm;
@@ -1323,6 +1424,29 @@ static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes);
sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;
+
+ mutex_lock(&zi->i_truncate_mutex);
+
+ /*
+ * For sequential zones, make sure that any open zone is closed first
+ * to ensure that the initial number of open zones is 0, in sync with
+ * the open zone accounting done when the mount option
+ * ZONEFS_MNTOPT_EXPLICIT_OPEN is used.
+ */
+ if (type == ZONEFS_ZTYPE_SEQ &&
+ (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
+ zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
+ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
+ if (ret)
+ goto unlock;
+ }
+
+ zonefs_account_active(inode);
+
+unlock:
+ mutex_unlock(&zi->i_truncate_mutex);
+
+ return ret;
}
static struct dentry *zonefs_create_inode(struct dentry *parent,
@@ -1332,6 +1456,7 @@ static struct dentry *zonefs_create_inode(struct dentry *parent,
struct inode *dir = d_inode(parent);
struct dentry *dentry;
struct inode *inode;
+ int ret;
dentry = d_alloc_name(parent, name);
if (!dentry)
@@ -1342,10 +1467,16 @@ static struct dentry *zonefs_create_inode(struct dentry *parent,
goto dput;
inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
- if (zone)
- zonefs_init_file_inode(inode, zone, type);
- else
+ if (zone) {
+ ret = zonefs_init_file_inode(inode, zone, type);
+ if (ret) {
+ iput(inode);
+ goto dput;
+ }
+ } else {
zonefs_init_dir_inode(dir, inode, type);
+ }
+
d_add(dentry, inode);
dir->i_size++;
@@ -1400,7 +1531,7 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
/*
* The first zone contains the super block: skip it.
*/
- end = zd->zones + blkdev_nr_zones(sb->s_bdev->bd_disk);
+ end = zd->zones + bdev_nr_zones(sb->s_bdev);
for (zone = &zd->zones[1]; zone < end; zone = next) {
next = zone + 1;
@@ -1495,8 +1626,8 @@ static int zonefs_get_zone_info(struct zonefs_zone_data *zd)
struct block_device *bdev = zd->sb->s_bdev;
int ret;
- zd->zones = kvcalloc(blkdev_nr_zones(bdev->bd_disk),
- sizeof(struct blk_zone), GFP_KERNEL);
+ zd->zones = kvcalloc(bdev_nr_zones(bdev), sizeof(struct blk_zone),
+ GFP_KERNEL);
if (!zd->zones)
return -ENOMEM;
@@ -1508,9 +1639,9 @@ static int zonefs_get_zone_info(struct zonefs_zone_data *zd)
return ret;
}
- if (ret != blkdev_nr_zones(bdev->bd_disk)) {
+ if (ret != bdev_nr_zones(bdev)) {
zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n",
- ret, blkdev_nr_zones(bdev->bd_disk));
+ ret, bdev_nr_zones(bdev));
return -EIO;
}
@@ -1547,11 +1678,11 @@ static int zonefs_read_super(struct super_block *sb)
if (ret)
goto free_page;
- super = kmap(page);
+ super = page_address(page);
ret = -EINVAL;
if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC)
- goto unmap;
+ goto free_page;
stored_crc = le32_to_cpu(super->s_crc);
super->s_crc = 0;
@@ -1559,14 +1690,14 @@ static int zonefs_read_super(struct super_block *sb)
if (crc != stored_crc) {
zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)",
crc, stored_crc);
- goto unmap;
+ goto free_page;
}
sbi->s_features = le64_to_cpu(super->s_features);
if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) {
zonefs_err(sb, "Unknown features set 0x%llx\n",
sbi->s_features);
- goto unmap;
+ goto free_page;
}
if (sbi->s_features & ZONEFS_F_UID) {
@@ -1574,7 +1705,7 @@ static int zonefs_read_super(struct super_block *sb)
le32_to_cpu(super->s_uid));
if (!uid_valid(sbi->s_uid)) {
zonefs_err(sb, "Invalid UID feature\n");
- goto unmap;
+ goto free_page;
}
}
@@ -1583,7 +1714,7 @@ static int zonefs_read_super(struct super_block *sb)
le32_to_cpu(super->s_gid));
if (!gid_valid(sbi->s_gid)) {
zonefs_err(sb, "Invalid GID feature\n");
- goto unmap;
+ goto free_page;
}
}
@@ -1592,14 +1723,12 @@ static int zonefs_read_super(struct super_block *sb)
if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) {
zonefs_err(sb, "Reserved area is being used\n");
- goto unmap;
+ goto free_page;
}
import_uuid(&sbi->s_uuid, super->s_uuid);
ret = 0;
-unmap:
- kunmap(page);
free_page:
__free_page(page);
@@ -1652,13 +1781,11 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_gid = GLOBAL_ROOT_GID;
sbi->s_perm = 0640;
sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
- sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev);
- atomic_set(&sbi->s_open_zones, 0);
- if (!sbi->s_max_open_zones &&
- sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
- zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
- sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
- }
+
+ atomic_set(&sbi->s_wro_seq_files, 0);
+ sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev);
+ atomic_set(&sbi->s_active_seq_files, 0);
+ sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev);
ret = zonefs_read_super(sb);
if (ret)
@@ -1674,8 +1801,19 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
if (ret)
goto cleanup;
- zonefs_info(sb, "Mounting %u zones",
- blkdev_nr_zones(sb->s_bdev->bd_disk));
+ ret = zonefs_sysfs_register(sb);
+ if (ret)
+ goto cleanup;
+
+ zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev));
+
+ if (!sbi->s_max_wro_seq_files &&
+ !sbi->s_max_active_seq_files &&
+ sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
+ zonefs_info(sb,
+ "No open and active zone limits. Ignoring explicit_open mount option\n");
+ sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
+ }
/* Create root directory inode */
ret = -ENOMEM;
@@ -1683,7 +1821,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
if (!inode)
goto cleanup;
- inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk);
+ inode->i_ino = bdev_nr_zones(sb->s_bdev);
inode->i_mode = S_IFDIR | 0555;
inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode);
inode->i_op = &zonefs_dir_inode_operations;
@@ -1719,6 +1857,8 @@ static void zonefs_kill_super(struct super_block *sb)
if (sb->s_root)
d_genocide(sb->s_root);
+
+ zonefs_sysfs_unregister(sb);
kill_block_super(sb);
kfree(sbi);
}
@@ -1766,16 +1906,26 @@ static int __init zonefs_init(void)
return ret;
ret = register_filesystem(&zonefs_type);
- if (ret) {
- zonefs_destroy_inodecache();
- return ret;
- }
+ if (ret)
+ goto destroy_inodecache;
+
+ ret = zonefs_sysfs_init();
+ if (ret)
+ goto unregister_fs;
return 0;
+
+unregister_fs:
+ unregister_filesystem(&zonefs_type);
+destroy_inodecache:
+ zonefs_destroy_inodecache();
+
+ return ret;
}
static void __exit zonefs_exit(void)
{
+ zonefs_sysfs_exit();
zonefs_destroy_inodecache();
unregister_filesystem(&zonefs_type);
}
diff --git a/fs/zonefs/sysfs.c b/fs/zonefs/sysfs.c
new file mode 100644
index 000000000000..9cb6755ce39a
--- /dev/null
+++ b/fs/zonefs/sysfs.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Simple file system for zoned block devices exposing zones as files.
+ *
+ * Copyright (C) 2022 Western Digital Corporation or its affiliates.
+ */
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/blkdev.h>
+
+#include "zonefs.h"
+
+struct zonefs_sysfs_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct zonefs_sb_info *sbi, char *buf);
+};
+
+static inline struct zonefs_sysfs_attr *to_attr(struct attribute *attr)
+{
+ return container_of(attr, struct zonefs_sysfs_attr, attr);
+}
+
+#define ZONEFS_SYSFS_ATTR_RO(name) \
+static struct zonefs_sysfs_attr zonefs_sysfs_attr_##name = __ATTR_RO(name)
+
+#define ATTR_LIST(name) &zonefs_sysfs_attr_##name.attr
+
+static ssize_t zonefs_sysfs_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct zonefs_sb_info *sbi =
+ container_of(kobj, struct zonefs_sb_info, s_kobj);
+ struct zonefs_sysfs_attr *zonefs_attr =
+ container_of(attr, struct zonefs_sysfs_attr, attr);
+
+ if (!zonefs_attr->show)
+ return 0;
+
+ return zonefs_attr->show(sbi, buf);
+}
+
+static ssize_t max_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", sbi->s_max_wro_seq_files);
+}
+ZONEFS_SYSFS_ATTR_RO(max_wro_seq_files);
+
+static ssize_t nr_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_wro_seq_files));
+}
+ZONEFS_SYSFS_ATTR_RO(nr_wro_seq_files);
+
+static ssize_t max_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", sbi->s_max_active_seq_files);
+}
+ZONEFS_SYSFS_ATTR_RO(max_active_seq_files);
+
+static ssize_t nr_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_active_seq_files));
+}
+ZONEFS_SYSFS_ATTR_RO(nr_active_seq_files);
+
+static struct attribute *zonefs_sysfs_attrs[] = {
+ ATTR_LIST(max_wro_seq_files),
+ ATTR_LIST(nr_wro_seq_files),
+ ATTR_LIST(max_active_seq_files),
+ ATTR_LIST(nr_active_seq_files),
+ NULL,
+};
+ATTRIBUTE_GROUPS(zonefs_sysfs);
+
+static void zonefs_sysfs_sb_release(struct kobject *kobj)
+{
+ struct zonefs_sb_info *sbi =
+ container_of(kobj, struct zonefs_sb_info, s_kobj);
+
+ complete(&sbi->s_kobj_unregister);
+}
+
+static const struct sysfs_ops zonefs_sysfs_attr_ops = {
+ .show = zonefs_sysfs_attr_show,
+};
+
+static struct kobj_type zonefs_sb_ktype = {
+ .default_groups = zonefs_sysfs_groups,
+ .sysfs_ops = &zonefs_sysfs_attr_ops,
+ .release = zonefs_sysfs_sb_release,
+};
+
+static struct kobject *zonefs_sysfs_root;
+
+int zonefs_sysfs_register(struct super_block *sb)
+{
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ int ret;
+
+ init_completion(&sbi->s_kobj_unregister);
+ ret = kobject_init_and_add(&sbi->s_kobj, &zonefs_sb_ktype,
+ zonefs_sysfs_root, "%s", sb->s_id);
+ if (ret) {
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ return ret;
+ }
+
+ sbi->s_sysfs_registered = true;
+
+ return 0;
+}
+
+void zonefs_sysfs_unregister(struct super_block *sb)
+{
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+
+ if (!sbi || !sbi->s_sysfs_registered)
+ return;
+
+ kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+}
+
+int __init zonefs_sysfs_init(void)
+{
+ zonefs_sysfs_root = kobject_create_and_add("zonefs", fs_kobj);
+ if (!zonefs_sysfs_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void zonefs_sysfs_exit(void)
+{
+ kobject_put(zonefs_sysfs_root);
+ zonefs_sysfs_root = NULL;
+}
diff --git a/fs/zonefs/trace.h b/fs/zonefs/trace.h
index f369d7d50303..42edcfd393ed 100644
--- a/fs/zonefs/trace.h
+++ b/fs/zonefs/trace.h
@@ -20,12 +20,12 @@
#define show_dev(dev) MAJOR(dev), MINOR(dev)
TRACE_EVENT(zonefs_zone_mgmt,
- TP_PROTO(struct inode *inode, enum req_opf op),
+ TP_PROTO(struct inode *inode, enum req_op op),
TP_ARGS(inode, op),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(ino_t, ino)
- __field(int, op)
+ __field(enum req_op, op)
__field(sector_t, sector)
__field(sector_t, nr_sectors)
),
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 7b147907c328..4b3de66c3233 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -12,6 +12,7 @@
#include <linux/uuid.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
+#include <linux/kobject.h>
/*
* Maximum length of file names: this only needs to be large enough to fit
@@ -39,6 +40,7 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
}
#define ZONEFS_ZONE_OPEN (1 << 0)
+#define ZONEFS_ZONE_ACTIVE (1 << 1)
/*
* In-memory inode data.
@@ -182,8 +184,15 @@ struct zonefs_sb_info {
loff_t s_blocks;
loff_t s_used_blocks;
- unsigned int s_max_open_zones;
- atomic_t s_open_zones;
+ unsigned int s_max_wro_seq_files;
+ atomic_t s_wro_seq_files;
+
+ unsigned int s_max_active_seq_files;
+ atomic_t s_active_seq_files;
+
+ bool s_sysfs_registered;
+ struct kobject s_kobj;
+ struct completion s_kobj_unregister;
};
static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
@@ -198,4 +207,9 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
#define zonefs_warn(sb, format, args...) \
pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args)
+int zonefs_sysfs_register(struct super_block *sb);
+void zonefs_sysfs_unregister(struct super_block *sb);
+int zonefs_sysfs_init(void);
+void zonefs_sysfs_exit(void);
+
#endif